Diffstat (limited to 'contrib/llvm/lib')
1291 files changed, 151409 insertions, 61805 deletions
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp index 44d137d..35f2e97 100644 --- a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp @@ -25,9 +25,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" @@ -40,44 +47,72 @@ #include "llvm/Pass.h" using namespace llvm; -// Register the AliasAnalysis interface, providing a nice name to refer to. -INITIALIZE_ANALYSIS_GROUP(AliasAnalysis, "Alias Analysis", NoAA) -char AliasAnalysis::ID = 0; +/// Allow disabling BasicAA from the AA results. This is particularly useful +/// when testing to isolate a single AA implementation. +static cl::opt<bool> DisableBasicAA("disable-basicaa", cl::Hidden, + cl::init(false)); + +AAResults::AAResults(AAResults &&Arg) : AAs(std::move(Arg.AAs)) { + for (auto &AA : AAs) + AA->setAAResults(this); +} + +AAResults &AAResults::operator=(AAResults &&Arg) { + AAs = std::move(Arg.AAs); + for (auto &AA : AAs) + AA->setAAResults(this); + return *this; +} + +AAResults::~AAResults() { +// FIXME: It would be nice to at least clear out the pointers back to this +// aggregation here, but we end up with non-nesting lifetimes in the legacy +// pass manager that prevent this from working. In the legacy pass manager +// we'll end up with dangling references here in some cases.
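The move constructor and move assignment above exist to re-seat each aggregated result's back-pointer after its owner moves. A minimal stand-alone sketch of that idiom follows; the Aggregation and ResultConcept names are illustrative, not LLVM's actual types.

#include <memory>
#include <utility>
#include <vector>

struct Aggregation;

struct ResultConcept {
  virtual ~ResultConcept() = default;
  // Each aggregated result remembers which aggregation currently owns it.
  virtual void setAggregation(Aggregation *A) = 0;
};

struct Aggregation {
  std::vector<std::unique_ptr<ResultConcept>> Results;

  Aggregation() = default;
  Aggregation(Aggregation &&Arg) : Results(std::move(Arg.Results)) {
    // The stolen results still point back at Arg; re-seat them so future
    // queries chain into this object, mirroring AAResults above.
    for (auto &R : Results)
      R->setAggregation(this);
  }
  Aggregation &operator=(Aggregation &&Arg) {
    Results = std::move(Arg.Results);
    for (auto &R : Results)
      R->setAggregation(this);
    return *this;
  }
};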
+#if 0 + for (auto &AA : AAs) + AA->setAAResults(nullptr); +#endif +} //===----------------------------------------------------------------------===// // Default chaining methods //===----------------------------------------------------------------------===// -AliasResult AliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->alias(LocA, LocB); +AliasResult AAResults::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + for (const auto &AA : AAs) { + auto Result = AA->alias(LocA, LocB); + if (Result != MayAlias) + return Result; + } + return MayAlias; } -bool AliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->pointsToConstantMemory(Loc, OrLocal); -} +bool AAResults::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { + for (const auto &AA : AAs) + if (AA->pointsToConstantMemory(Loc, OrLocal)) + return true; -AliasAnalysis::ModRefResult -AliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->getArgModRefInfo(CS, ArgIdx); + return false; } -void AliasAnalysis::deleteValue(Value *V) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - AA->deleteValue(V); -} +ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { + ModRefInfo Result = MRI_ModRef; + + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getArgModRefInfo(CS, ArgIdx)); -void AliasAnalysis::addEscapingUse(Use &U) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - AA->addEscapingUse(U); + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } + + return Result; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(Instruction *I, ImmutableCallSite Call) { +ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) { // We may have two calls if (auto CS = ImmutableCallSite(I)) { // Check if the two calls modify the same memory @@ -88,289 +123,215 @@ AliasAnalysis::getModRefInfo(Instruction *I, ImmutableCallSite Call) { // is that if the call references what this instruction // defines, it must be clobbered by this location. 
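The new AAResults::alias above delegates to each registered implementation in turn and keeps the first answer that improves on the conservative MayAlias default. A self-contained sketch of that delegation loop, with illustrative types standing in for LLVM's:

#include <vector>

enum AliasResult { NoAlias, MayAlias, PartialAlias, MustAlias };

struct AAImpl {
  virtual ~AAImpl() = default;
  virtual AliasResult alias(const void *A, const void *B) = 0;
};

AliasResult queryAll(const std::vector<AAImpl *> &AAs, const void *A,
                     const void *B) {
  for (AAImpl *AA : AAs) {
    AliasResult R = AA->alias(A, B);
    if (R != MayAlias) // anything other than "don't know" is definitive
      return R;
  }
  return MayAlias; // every analysis declined to improve on the default
}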
const MemoryLocation DefLoc = MemoryLocation::get(I); - if (getModRefInfo(Call, DefLoc) != AliasAnalysis::NoModRef) - return AliasAnalysis::ModRef; - } - return AliasAnalysis::NoModRef; -} - -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - - ModRefBehavior MRB = getModRefBehavior(CS); - if (MRB == DoesNotAccessMemory) - return NoModRef; - - ModRefResult Mask = ModRef; - if (onlyReadsMemory(MRB)) - Mask = Ref; - - if (onlyAccessesArgPointees(MRB)) { - bool doesAlias = false; - ModRefResult AllArgsMask = NoModRef; - if (doesAccessArgPointees(MRB)) { - for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) { - const Value *Arg = *AI; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned ArgIdx = std::distance(CS.arg_begin(), AI); - MemoryLocation ArgLoc = - MemoryLocation::getForArgument(CS, ArgIdx, *TLI); - if (!isNoAlias(ArgLoc, Loc)) { - ModRefResult ArgMask = getArgModRefInfo(CS, ArgIdx); - doesAlias = true; - AllArgsMask = ModRefResult(AllArgsMask | ArgMask); - } - } - } - if (!doesAlias) - return NoModRef; - Mask = ModRefResult(Mask & AllArgsMask); + if (getModRefInfo(Call, DefLoc) != MRI_NoModRef) + return MRI_ModRef; } + return MRI_NoModRef; +} - // If Loc is a constant memory location, the call definitely could not - // modify the memory location. - if ((Mask & Mod) && pointsToConstantMemory(Loc)) - Mask = ModRefResult(Mask & ~Mod); - - // If this is the end of the chain, don't forward. - if (!AA) return Mask; - - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any mask we've managed to compute. - return ModRefResult(AA->getModRefInfo(CS, Loc) & Mask); -} - -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - - // If CS1 or CS2 are readnone, they don't interact. - ModRefBehavior CS1B = getModRefBehavior(CS1); - if (CS1B == DoesNotAccessMemory) return NoModRef; - - ModRefBehavior CS2B = getModRefBehavior(CS2); - if (CS2B == DoesNotAccessMemory) return NoModRef; - - // If they both only read from memory, there is no dependence. - if (onlyReadsMemory(CS1B) && onlyReadsMemory(CS2B)) - return NoModRef; - - AliasAnalysis::ModRefResult Mask = ModRef; - - // If CS1 only reads memory, the only dependence on CS2 can be - // from CS1 reading memory written by CS2. - if (onlyReadsMemory(CS1B)) - Mask = ModRefResult(Mask & Ref); - - // If CS2 only access memory through arguments, accumulate the mod/ref - // information from CS1's references to the memory referenced by - // CS2's arguments. - if (onlyAccessesArgPointees(CS2B)) { - AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS2B)) { - for (ImmutableCallSite::arg_iterator - I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { - const Value *Arg = *I; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I); - auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, *TLI); - - // ArgMask indicates what CS2 might do to CS2ArgLoc, and the dependence of - // CS1 on that location is the inverse. 
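The comment above describes an inversion: what CS2 might do to an argument's pointee determines the inverse dependence of CS1 on that location, which the deleted lines that follow implement. A stand-alone rendering of that inversion over the two-bit mod/ref lattice (enumerator values are illustrative):

enum ModRef { NoModRef = 0, Ref = 1, Mod = 2, ModRefBoth = 3 };

ModRef invertForDependence(ModRef ArgMask) {
  if (ArgMask == Mod)
    return ModRefBoth; // a write by CS2 can affect both reads and writes in CS1
  if (ArgMask == Ref)
    return Mod;        // a read by CS2 only matters if CS1 writes the location
  return ArgMask;      // NoModRef and ModRefBoth are fixed points
}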
- ModRefResult ArgMask = getArgModRefInfo(CS2, CS2ArgIdx); - if (ArgMask == Mod) - ArgMask = ModRef; - else if (ArgMask == Ref) - ArgMask = Mod; - - R = ModRefResult((R | (getModRefInfo(CS1, CS2ArgLoc) & ArgMask)) & Mask); - if (R == Mask) - break; - } - } - return R; - } +ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_ModRef; - // If CS1 only accesses memory through arguments, check if CS2 references - // any of the memory referenced by CS1's arguments. If not, return NoModRef. - if (onlyAccessesArgPointees(CS1B)) { - AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS1B)) { - for (ImmutableCallSite::arg_iterator - I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) { - const Value *Arg = *I; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I); - auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, *TLI); - - // ArgMask indicates what CS1 might do to CS1ArgLoc; if CS1 might Mod - // CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If CS1 - // might Ref, then we care only about a Mod by CS2. - ModRefResult ArgMask = getArgModRefInfo(CS1, CS1ArgIdx); - ModRefResult ArgR = getModRefInfo(CS2, CS1ArgLoc); - if (((ArgMask & Mod) != NoModRef && (ArgR & ModRef) != NoModRef) || - ((ArgMask & Ref) != NoModRef && (ArgR & Mod) != NoModRef)) - R = ModRefResult((R | ArgMask) & Mask); - - if (R == Mask) - break; - } - } - return R; - } + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getModRefInfo(CS, Loc)); - // If this is the end of the chain, don't forward. - if (!AA) return Mask; + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any mask we've managed to compute. - return ModRefResult(AA->getModRefInfo(CS1, CS2) & Mask); + return Result; } -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); +ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + ModRefInfo Result = MRI_ModRef; + + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getModRefInfo(CS1, CS2)); + + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } + + return Result; +} - ModRefBehavior Min = UnknownModRefBehavior; +FunctionModRefBehavior AAResults::getModRefBehavior(ImmutableCallSite CS) { + FunctionModRefBehavior Result = FMRB_UnknownModRefBehavior; - // Call back into the alias analysis with the other form of getModRefBehavior - // to see if it can give a better response. - if (const Function *F = CS.getCalledFunction()) - Min = getModRefBehavior(F); + for (const auto &AA : AAs) { + Result = FunctionModRefBehavior(Result & AA->getModRefBehavior(CS)); - // If this is the end of the chain, don't forward. - if (!AA) return Min; + // Early-exit the moment we reach the bottom of the lattice. + if (Result == FMRB_DoesNotAccessMemory) + return Result; + } - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any result we've managed to compute. 
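Each aggregated getModRefInfo overload above computes a meet over the mod/ref lattice: start at the conservative top (MRI_ModRef), intersect in each analysis's answer, and stop once the bottom (MRI_NoModRef) is reached, since no later analysis can change it. A minimal sketch of that loop, with std::function standing in for the per-analysis query:

#include <functional>
#include <vector>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

ModRefInfo meetOverAnalyses(
    const std::vector<std::function<ModRefInfo()>> &Queries) {
  ModRefInfo Result = MRI_ModRef; // conservative top of the lattice
  for (const auto &Q : Queries) {
    Result = ModRefInfo(Result & Q()); // bitwise AND walks down the lattice
    if (Result == MRI_NoModRef)        // bottom: early-exit, as above
      break;
  }
  return Result;
}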
- return ModRefBehavior(AA->getModRefBehavior(CS) & Min); + return Result; } -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(const Function *F) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->getModRefBehavior(F); +FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) { + FunctionModRefBehavior Result = FMRB_UnknownModRefBehavior; + + for (const auto &AA : AAs) { + Result = FunctionModRefBehavior(Result & AA->getModRefBehavior(F)); + + // Early-exit the moment we reach the bottom of the lattice. + if (Result == FMRB_DoesNotAccessMemory) + return Result; + } + + return Result; } //===----------------------------------------------------------------------===// -// AliasAnalysis non-virtual helper method implementation +// Helper method implementation //===----------------------------------------------------------------------===// -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const LoadInst *L, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const LoadInst *L, + const MemoryLocation &Loc) { // Be conservative in the face of volatile/atomic. if (!L->isUnordered()) - return ModRef; + return MRI_ModRef; // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. if (Loc.Ptr && !alias(MemoryLocation::get(L), Loc)) - return NoModRef; + return MRI_NoModRef; // Otherwise, a load just reads. - return Ref; + return MRI_Ref; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const StoreInst *S, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const StoreInst *S, + const MemoryLocation &Loc) { // Be conservative in the face of volatile/atomic. if (!S->isUnordered()) - return ModRef; + return MRI_ModRef; if (Loc.Ptr) { // If the store address cannot alias the pointer in question, then the // specified memory cannot be modified by the store. if (!alias(MemoryLocation::get(S), Loc)) - return NoModRef; + return MRI_NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this store. if (pointsToConstantMemory(Loc)) - return NoModRef; - + return MRI_NoModRef; } // Otherwise, a store just writes. - return Mod; + return MRI_Mod; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const VAArgInst *V, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const VAArgInst *V, + const MemoryLocation &Loc) { if (Loc.Ptr) { // If the va_arg address cannot alias the pointer in question, then the // specified memory cannot be accessed by the va_arg. if (!alias(MemoryLocation::get(V), Loc)) - return NoModRef; + return MRI_NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this va_arg. if (pointsToConstantMemory(Loc)) - return NoModRef; + return MRI_NoModRef; } // Otherwise, a va_arg reads and writes. - return ModRef; + return MRI_ModRef; +} + +ModRefInfo AAResults::getModRefInfo(const CatchPadInst *CatchPad, + const MemoryLocation &Loc) { + if (Loc.Ptr) { + // If the pointer is a pointer to constant memory, + // then it could not have been modified by this catchpad. + if (pointsToConstantMemory(Loc)) + return MRI_NoModRef; + } + + // Otherwise, a catchpad reads and writes. 
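The per-instruction helpers above (load, store, va_arg, catchpad) share one shape: stay conservative for ordered or volatile accesses, report MRI_NoModRef when the addresses provably cannot alias, and otherwise return the access kind. A hedged sketch of the load case, with a callback standing in for the real alias query:

#include <functional>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

struct Loc { const void *Ptr = nullptr; };

ModRefInfo loadModRef(bool IsUnordered, Loc LoadLoc, Loc QueryLoc,
                      const std::function<bool(Loc, Loc)> &MayAlias) {
  if (!IsUnordered)
    return MRI_ModRef; // volatile/atomic: be conservative
  if (QueryLoc.Ptr && !MayAlias(LoadLoc, QueryLoc))
    return MRI_NoModRef; // disjoint memory: the load cannot touch QueryLoc
  return MRI_Ref; // otherwise a load only reads
}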
+ return MRI_ModRef; +} + +ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet, + const MemoryLocation &Loc) { + if (Loc.Ptr) { + // If the pointer is a pointer to constant memory, + // then it could not have been modified by this catchret. + if (pointsToConstantMemory(Loc)) + return MRI_NoModRef; + } + + // Otherwise, a catchret reads and writes. + return MRI_ModRef; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const AtomicCmpXchgInst *CX, - const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, + const MemoryLocation &Loc) { // Acquire/Release cmpxchg has properties that matter for arbitrary addresses. if (CX->getSuccessOrdering() > Monotonic) - return ModRef; + return MRI_ModRef; // If the cmpxchg address does not alias the location, it does not access it. if (Loc.Ptr && !alias(MemoryLocation::get(CX), Loc)) - return NoModRef; + return MRI_NoModRef; - return ModRef; + return MRI_ModRef; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, - const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, + const MemoryLocation &Loc) { // Acquire/Release atomicrmw has properties that matter for arbitrary addresses. if (RMW->getOrdering() > Monotonic) - return ModRef; + return MRI_ModRef; // If the atomicrmw address does not alias the location, it does not access it. if (Loc.Ptr && !alias(MemoryLocation::get(RMW), Loc)) - return NoModRef; + return MRI_NoModRef; - return ModRef; + return MRI_ModRef; } -// FIXME: this is really just shoring-up a deficiency in alias analysis. -// BasicAA isn't willing to spend linear time determining whether an alloca -// was captured before or after this particular call, while we are. However, -// with a smarter AA in place, this test is just wasting compile time. -AliasAnalysis::ModRefResult AliasAnalysis::callCapturesBefore( - const Instruction *I, const MemoryLocation &MemLoc, DominatorTree *DT) { +/// \brief Return information about whether a particular call site modifies +/// or reads the specified memory location \p MemLoc before instruction \p I +/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up +/// instruction-ordering queries inside the BasicBlock containing \p I. +/// FIXME: this is really just shoring-up a deficiency in alias analysis. +/// BasicAA isn't willing to spend linear time determining whether an alloca +/// was captured before or after this particular call, while we are. However, +/// with a smarter AA in place, this test is just wasting compile time.
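The cmpxchg and atomicrmw cases above apply an ordering guard before the address check: anything stronger than monotonic has synchronization effects on other addresses, so the query must stay conservative regardless of aliasing. A tiny illustration (the enumerator order here is illustrative, mirroring the comparison used above):

enum AtomicOrdering { Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

bool mustStayConservative(AtomicOrdering O) {
  // Acquire/release and stronger order *other* memory too, so no address
  // comparison can prove MRI_NoModRef for them.
  return O > Monotonic;
}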
+ModRefInfo AAResults::callCapturesBefore(const Instruction *I, + const MemoryLocation &MemLoc, + DominatorTree *DT, + OrderedBasicBlock *OBB) { if (!DT) - return AliasAnalysis::ModRef; + return MRI_ModRef; - const Value *Object = GetUnderlyingObject(MemLoc.Ptr, *DL); + const Value *Object = + GetUnderlyingObject(MemLoc.Ptr, I->getModule()->getDataLayout()); if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) || isa<Constant>(Object)) - return AliasAnalysis::ModRef; + return MRI_ModRef; ImmutableCallSite CS(I); if (!CS.getInstruction() || CS.getInstruction() == Object) - return AliasAnalysis::ModRef; + return MRI_ModRef; if (llvm::PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true, /* StoreCaptures */ true, I, DT, - /* include Object */ true)) - return AliasAnalysis::ModRef; + /* include Object */ true, + /* OrderedBasicBlock */ OBB)) + return MRI_ModRef; unsigned ArgNo = 0; - AliasAnalysis::ModRefResult R = AliasAnalysis::NoModRef; + ModRefInfo R = MRI_NoModRef; for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); CI != CE; ++CI, ++ArgNo) { // Only look at the no-capture or byval pointer arguments. If this @@ -389,50 +350,20 @@ AliasAnalysis::ModRefResult AliasAnalysis::callCapturesBefore( if (CS.doesNotAccessMemory(ArgNo)) continue; if (CS.onlyReadsMemory(ArgNo)) { - R = AliasAnalysis::Ref; + R = MRI_Ref; continue; } - return AliasAnalysis::ModRef; + return MRI_ModRef; } return R; } -// AliasAnalysis destructor: DO NOT move this to the header file for -// AliasAnalysis or else clients of the AliasAnalysis class may not depend on -// the AliasAnalysis.o file in the current .a file, causing alias analysis -// support to not be included in the tool correctly! -// -AliasAnalysis::~AliasAnalysis() {} - -/// InitializeAliasAnalysis - Subclasses must call this method to initialize the -/// AliasAnalysis interface before any other methods are called. -/// -void AliasAnalysis::InitializeAliasAnalysis(Pass *P, const DataLayout *NewDL) { - DL = NewDL; - auto *TLIP = P->getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &P->getAnalysis<AliasAnalysis>(); -} - -// getAnalysisUsage - All alias analysis implementations should invoke this -// directly (using AliasAnalysis::getAnalysisUsage(AU)). -void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); // All AA's chain -} - -/// getTypeStoreSize - Return the DataLayout store size for the given type, -/// if known, or a conservative value otherwise. -/// -uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) { - return DL ? DL->getTypeStoreSize(Ty) : MemoryLocation::UnknownSize; -} - /// canBasicBlockModify - Return true if it is possible for execution of the /// specified basic block to modify the location Loc. /// -bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB, - const MemoryLocation &Loc) { - return canInstructionRangeModRef(BB.front(), BB.back(), Loc, Mod); +bool AAResults::canBasicBlockModify(const BasicBlock &BB, + const MemoryLocation &Loc) { + return canInstructionRangeModRef(BB.front(), BB.back(), Loc, MRI_Mod); } /// canInstructionRangeModRef - Return true if it is possible for the @@ -440,28 +371,178 @@ bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB, /// mode) the location Loc. The instructions to consider are all /// of the instructions in the range of [I1,I2] INCLUSIVE. /// I1 and I2 must be in the same basic block. 
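canInstructionRangeModRef, documented above and defined just below, is a linear scan that asks getModRefInfo for each instruction in the inclusive range and tests the answer against the queried mode. The same scan over a toy per-instruction result list:

#include <vector>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

bool rangeModRef(const std::vector<ModRefInfo> &PerInstruction,
                 ModRefInfo Mode) {
  for (ModRefInfo MRI : PerInstruction) // every instruction in [I1, I2]
    if (MRI & Mode)                     // any overlap with the queried mode
      return true;
  return false;
}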
-bool AliasAnalysis::canInstructionRangeModRef(const Instruction &I1, - const Instruction &I2, - const MemoryLocation &Loc, - const ModRefResult Mode) { +bool AAResults::canInstructionRangeModRef(const Instruction &I1, + const Instruction &I2, + const MemoryLocation &Loc, + const ModRefInfo Mode) { assert(I1.getParent() == I2.getParent() && "Instructions not in same basic block!"); - BasicBlock::const_iterator I = &I1; - BasicBlock::const_iterator E = &I2; + BasicBlock::const_iterator I = I1.getIterator(); + BasicBlock::const_iterator E = I2.getIterator(); ++E; // Convert from inclusive to exclusive range. for (; I != E; ++I) // Check every instruction in range - if (getModRefInfo(I, Loc) & Mode) + if (getModRefInfo(&*I, Loc) & Mode) return true; return false; } +// Provide a definition for the root virtual destructor. +AAResults::Concept::~Concept() {} + +namespace { +/// A wrapper pass for external alias analyses. This just squirrels away the +/// callback used to run any analyses and register their results. +struct ExternalAAWrapperPass : ImmutablePass { + typedef std::function<void(Pass &, Function &, AAResults &)> CallbackT; + + CallbackT CB; + + static char ID; + + ExternalAAWrapperPass() : ImmutablePass(ID) { + initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + explicit ExternalAAWrapperPass(CallbackT CB) + : ImmutablePass(ID), CB(std::move(CB)) { + initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; +} + +char ExternalAAWrapperPass::ID = 0; +INITIALIZE_PASS(ExternalAAWrapperPass, "external-aa", "External Alias Analysis", + false, true) + +ImmutablePass * +llvm::createExternalAAWrapperPass(ExternalAAWrapperPass::CallbackT Callback) { + return new ExternalAAWrapperPass(std::move(Callback)); +} + +AAResultsWrapperPass::AAResultsWrapperPass() : FunctionPass(ID) { + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +char AAResultsWrapperPass::ID = 0; + +INITIALIZE_PASS_BEGIN(AAResultsWrapperPass, "aa", + "Function Alias Analysis Results", false, true) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(CFLAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ExternalAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScopedNoAliasAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TypeBasedAAWrapperPass) +INITIALIZE_PASS_END(AAResultsWrapperPass, "aa", + "Function Alias Analysis Results", false, true) + +FunctionPass *llvm::createAAResultsWrapperPass() { + return new AAResultsWrapperPass(); +} + +/// Run the wrapper pass to rebuild an aggregation over known AA passes. +/// +/// This is the legacy pass manager's interface to the new-style AA results +/// aggregation object. Because this is somewhat shoe-horned into the legacy +/// pass manager, we hard code all the specific alias analyses available into +/// it. While the particular set enabled is configured via commandline flags, +/// adding a new alias analysis to LLVM will require adding support for it to +/// this list. +bool AAResultsWrapperPass::runOnFunction(Function &F) { + // NB! This *must* be reset before adding new AA results to the new + // AAResults object because in the legacy pass manager, each instance + // of these will refer to the *same* immutable analyses, registering and + // unregistering themselves with them. 
We need to carefully tear down the + // previous object first, in this case replacing it with an empty one, before + // registering new results. + AAR.reset(new AAResults()); + + // BasicAA is always available for function analyses. Also, we add it first + // so that it can trump TBAA results when it proves MustAlias. + // FIXME: TBAA should have an explicit mode to support this and then we + // should reconsider the ordering here. + if (!DisableBasicAA) + AAR->addAAResult(getAnalysis<BasicAAWrapperPass>().getResult()); + + // Populate the results with the currently available AAs. + if (auto *WrapperPass = getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<TypeBasedAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = + getAnalysisIfAvailable<objcarc::ObjCARCAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<GlobalsAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<SCEVAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<CFLAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + + // If available, run an external AA providing callback over the results as + // well. + if (auto *WrapperPass = getAnalysisIfAvailable<ExternalAAWrapperPass>()) + if (WrapperPass->CB) + WrapperPass->CB(*this, F, *AAR); + + // Analyses don't mutate the IR, so return false. + return false; +} + +void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<BasicAAWrapperPass>(); + + // We also need to mark all the alias analysis passes we will potentially + // probe in runOnFunction as used here to ensure the legacy pass manager + // preserves them. This hard coding of lists of alias analyses is specific to + // the legacy pass manager. + AU.addUsedIfAvailable<ScopedNoAliasAAWrapperPass>(); + AU.addUsedIfAvailable<TypeBasedAAWrapperPass>(); + AU.addUsedIfAvailable<objcarc::ObjCARCAAWrapperPass>(); + AU.addUsedIfAvailable<GlobalsAAWrapperPass>(); + AU.addUsedIfAvailable<SCEVAAWrapperPass>(); + AU.addUsedIfAvailable<CFLAAWrapperPass>(); +} + +AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F, + BasicAAResult &BAR) { + AAResults AAR; + + // Add in our explicitly constructed BasicAA results. + if (!DisableBasicAA) + AAR.addAAResult(BAR); + + // Populate the results with the other currently available AAs. + if (auto *WrapperPass = + P.getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<TypeBasedAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = + P.getAnalysisIfAvailable<objcarc::ObjCARCAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<GlobalsAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<SCEVAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<CFLAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + + return AAR; +} + /// isNoAliasCall - Return true if this pointer is returned by a noalias /// function. 
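The ExternalAAWrapperPass added above gives out-of-tree analyses a hook into the aggregation. A hedged usage sketch, assuming a legacy PassManager and leaving the body of the callback hypothetical:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/LegacyPassManager.h"

void registerExternalAA(llvm::legacy::PassManager &PM) {
  PM.add(llvm::createExternalAAWrapperPass(
      [](llvm::Pass &P, llvm::Function &F, llvm::AAResults &AAR) {
        // Hypothetical: construct a custom result object elsewhere and fold
        // it into the aggregation here via AAR.addAAResult(...).
        (void)P; (void)F; (void)AAR;
      }));
}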
bool llvm::isNoAliasCall(const Value *V) { - if (isa<CallInst>(V) || isa<InvokeInst>(V)) - return ImmutableCallSite(cast<Instruction>(V)) - .paramHasAttr(0, Attribute::NoAlias); + if (auto CS = ImmutableCallSite(V)) + return CS.paramHasAttr(0, Attribute::NoAlias); return false; } diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp deleted file mode 100644 index 9b6a5a4..0000000 --- a/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//===- AliasAnalysisCounter.cpp - Alias Analysis Query Counter ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass which can be used to count how many alias queries -// are being made and how the alias analysis implementation being used responds. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -static cl::opt<bool> -PrintAll("count-aa-print-all-queries", cl::ReallyHidden, cl::init(true)); -static cl::opt<bool> -PrintAllFailures("count-aa-print-all-failed-queries", cl::ReallyHidden); - -namespace { - class AliasAnalysisCounter : public ModulePass, public AliasAnalysis { - unsigned No, May, Partial, Must; - unsigned NoMR, JustRef, JustMod, MR; - Module *M; - public: - static char ID; // Class identification, replacement for typeinfo - AliasAnalysisCounter() : ModulePass(ID) { - initializeAliasAnalysisCounterPass(*PassRegistry::getPassRegistry()); - No = May = Partial = Must = 0; - NoMR = JustRef = JustMod = MR = 0; - } - - void printLine(const char *Desc, unsigned Val, unsigned Sum) { - errs() << " " << Val << " " << Desc << " responses (" - << Val*100/Sum << "%)\n"; - } - ~AliasAnalysisCounter() override { - unsigned AASum = No+May+Partial+Must; - unsigned MRSum = NoMR+JustRef+JustMod+MR; - if (AASum + MRSum) { // Print a report if any counted queries occurred... 
- errs() << "\n===== Alias Analysis Counter Report =====\n" - << " Analysis counted:\n" - << " " << AASum << " Total Alias Queries Performed\n"; - if (AASum) { - printLine("no alias", No, AASum); - printLine("may alias", May, AASum); - printLine("partial alias", Partial, AASum); - printLine("must alias", Must, AASum); - errs() << " Alias Analysis Counter Summary: " << No*100/AASum << "%/" - << May*100/AASum << "%/" - << Partial*100/AASum << "%/" - << Must*100/AASum<<"%\n\n"; - } - - errs() << " " << MRSum << " Total Mod/Ref Queries Performed\n"; - if (MRSum) { - printLine("no mod/ref", NoMR, MRSum); - printLine("ref", JustRef, MRSum); - printLine("mod", JustMod, MRSum); - printLine("mod/ref", MR, MRSum); - errs() << " Mod/Ref Analysis Counter Summary: " <<NoMR*100/MRSum - << "%/" << JustRef*100/MRSum << "%/" << JustMod*100/MRSum - << "%/" << MR*100/MRSum <<"%\n\n"; - } - } - } - - bool runOnModule(Module &M) override { - this->M = &M; - InitializeAliasAnalysis(this, &M.getDataLayout()); - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); - AU.setPreservesAll(); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - // FIXME: We could count these too... - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - return getAnalysis<AliasAnalysis>().pointsToConstantMemory(Loc, OrLocal); - } - - // Forwarding functions: just delegate to a real AA implementation, counting - // the number of responses... 
- AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1,CS2); - } - }; -} - -char AliasAnalysisCounter::ID = 0; -INITIALIZE_AG_PASS(AliasAnalysisCounter, AliasAnalysis, "count-aa", - "Count Alias Analysis Query Responses", false, true, false) - -ModulePass *llvm::createAliasAnalysisCounterPass() { - return new AliasAnalysisCounter(); -} - -AliasResult AliasAnalysisCounter::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - AliasResult R = getAnalysis<AliasAnalysis>().alias(LocA, LocB); - - const char *AliasString = nullptr; - switch (R) { - case NoAlias: No++; AliasString = "No alias"; break; - case MayAlias: May++; AliasString = "May alias"; break; - case PartialAlias: Partial++; AliasString = "Partial alias"; break; - case MustAlias: Must++; AliasString = "Must alias"; break; - } - - if (PrintAll || (PrintAllFailures && R == MayAlias)) { - errs() << AliasString << ":\t"; - errs() << "[" << LocA.Size << "B] "; - LocA.Ptr->printAsOperand(errs(), true, M); - errs() << ", "; - errs() << "[" << LocB.Size << "B] "; - LocB.Ptr->printAsOperand(errs(), true, M); - errs() << "\n"; - } - - return R; -} - -AliasAnalysis::ModRefResult -AliasAnalysisCounter::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - ModRefResult R = getAnalysis<AliasAnalysis>().getModRefInfo(CS, Loc); - - const char *MRString = nullptr; - switch (R) { - case NoModRef: NoMR++; MRString = "NoModRef"; break; - case Ref: JustRef++; MRString = "JustRef"; break; - case Mod: JustMod++; MRString = "JustMod"; break; - case ModRef: MR++; MRString = "ModRef"; break; - } - - if (PrintAll || (PrintAllFailures && R == ModRef)) { - errs() << MRString << ": Ptr: "; - errs() << "[" << Loc.Size << "B] "; - Loc.Ptr->printAsOperand(errs(), true, M); - errs() << "\t<->" << *CS.getInstruction() << '\n'; - } - return R; -} diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp index 5d1b001..12917b6 100644 --- a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" @@ -57,7 +59,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.setPreservesAll(); } @@ -81,7 +83,7 @@ namespace { char AAEval::ID = 0; INITIALIZE_PASS_BEGIN(AAEval, "aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AAEval, "aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true) @@ -139,16 +141,17 @@ static inline bool isInterestingPointer(Value *V) { } bool AAEval::runOnFunction(Function &F) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + const DataLayout &DL = F.getParent()->getDataLayout(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); SetVector<Value *> 
Pointers; - SetVector<CallSite> CallSites; + SmallSetVector<CallSite, 16> CallSites; SetVector<Value *> Loads; SetVector<Value *> Stores; - for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) - if (I->getType()->isPointerTy()) // Add all pointer arguments. - Pointers.insert(I); + for (auto &I : F.args()) + if (I.getType()->isPointerTy()) // Add all pointer arguments. + Pointers.insert(&I); for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { if (I->getType()->isPointerTy()) // Add all pointer instructions. @@ -164,10 +167,9 @@ bool AAEval::runOnFunction(Function &F) { if (!isa<Function>(Callee) && isInterestingPointer(Callee)) Pointers.insert(Callee); // Consider formals. - for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) - if (isInterestingPointer(*AI)) - Pointers.insert(*AI); + for (Use &DataOp : CS.data_ops()) + if (isInterestingPointer(DataOp)) + Pointers.insert(DataOp); CallSites.insert(CS); } else { // Consider all operands. @@ -188,12 +190,12 @@ bool AAEval::runOnFunction(Function &F) { I1 != E; ++I1) { uint64_t I1Size = MemoryLocation::UnknownSize; Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType(); - if (I1ElTy->isSized()) I1Size = AA.getTypeStoreSize(I1ElTy); + if (I1ElTy->isSized()) I1Size = DL.getTypeStoreSize(I1ElTy); for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) { uint64_t I2Size = MemoryLocation::UnknownSize; Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType(); - if (I2ElTy->isSized()) I2Size = AA.getTypeStoreSize(I2ElTy); + if (I2ElTy->isSized()) I2Size = DL.getTypeStoreSize(I2ElTy); switch (AA.alias(*I1, I1Size, *I2, I2Size)) { case NoAlias: @@ -281,30 +283,29 @@ bool AAEval::runOnFunction(Function &F) { } // Mod/ref alias analysis: compare all pairs of calls and values - for (SetVector<CallSite>::iterator C = CallSites.begin(), - Ce = CallSites.end(); C != Ce; ++C) { + for (auto C = CallSites.begin(), Ce = CallSites.end(); C != Ce; ++C) { Instruction *I = C->getInstruction(); for (SetVector<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end(); V != Ve; ++V) { uint64_t Size = MemoryLocation::UnknownSize; Type *ElTy = cast<PointerType>((*V)->getType())->getElementType(); - if (ElTy->isSized()) Size = AA.getTypeStoreSize(ElTy); + if (ElTy->isSized()) Size = DL.getTypeStoreSize(ElTy); switch (AA.getModRefInfo(*C, *V, Size)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent()); ++NoModRefCount; break; - case AliasAnalysis::Mod: + case MRI_Mod: PrintModRefResults("Just Mod", PrintMod, I, *V, F.getParent()); ++ModCount; break; - case AliasAnalysis::Ref: + case MRI_Ref: PrintModRefResults("Just Ref", PrintRef, I, *V, F.getParent()); ++RefCount; break; - case AliasAnalysis::ModRef: + case MRI_ModRef: PrintModRefResults("Both ModRef", PrintModRef, I, *V, F.getParent()); ++ModRefCount; break; @@ -313,25 +314,24 @@ bool AAEval::runOnFunction(Function &F) { } // Mod/ref alias analysis: compare all pairs of calls - for (SetVector<CallSite>::iterator C = CallSites.begin(), - Ce = CallSites.end(); C != Ce; ++C) { - for (SetVector<CallSite>::iterator D = CallSites.begin(); D != Ce; ++D) { + for (auto C = CallSites.begin(), Ce = CallSites.end(); C != Ce; ++C) { + for (auto D = CallSites.begin(); D != Ce; ++D) { if (D == C) continue; switch (AA.getModRefInfo(*C, *D)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: PrintModRefResults("NoModRef", PrintNoModRef, *C, *D, 
F.getParent()); ++NoModRefCount; break; - case AliasAnalysis::Mod: + case MRI_Mod: PrintModRefResults("Just Mod", PrintMod, *C, *D, F.getParent()); ++ModCount; break; - case AliasAnalysis::Ref: + case MRI_Ref: PrintModRefResults("Just Ref", PrintRef, *C, *D, F.getParent()); ++RefCount; break; - case AliasAnalysis::ModRef: + case MRI_ModRef: PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent()); ++ModRefCount; break; diff --git a/contrib/llvm/lib/Analysis/AliasDebugger.cpp b/contrib/llvm/lib/Analysis/AliasDebugger.cpp deleted file mode 100644 index e5107b3..0000000 --- a/contrib/llvm/lib/Analysis/AliasDebugger.cpp +++ /dev/null @@ -1,136 +0,0 @@ -//===- AliasDebugger.cpp - Simple Alias Analysis Use Checker --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This simple pass checks alias analysis users to ensure that if they -// create a new value, they do not query AA without informing it of the value. -// It acts as a shim over any other AA pass you want. -// -// Yes keeping track of every value in the program is expensive, but this is -// a debugging pass. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include <set> -using namespace llvm; - -namespace { - - class AliasDebugger : public ModulePass, public AliasAnalysis { - - //What we do is simple. Keep track of every value the AA could - //know about, and verify that queries are one of those. - //A query to a value that didn't exist when the AA was created - //means someone forgot to update the AA when creating new values - - std::set<const Value*> Vals; - - public: - static char ID; // Class identification, replacement for typeinfo - AliasDebugger() : ModulePass(ID) { - initializeAliasDebuggerPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - InitializeAliasAnalysis(this, &M.getDataLayout()); // set up super class - - for(Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - Vals.insert(&*I); - for (User::const_op_iterator OI = I->op_begin(), - OE = I->op_end(); OI != OE; ++OI) - Vals.insert(*OI); - } - - for(Module::iterator I = M.begin(), - E = M.end(); I != E; ++I){ - Vals.insert(&*I); - if(!I->isDeclaration()) { - for (Function::arg_iterator AI = I->arg_begin(), AE = I->arg_end(); - AI != AE; ++AI) - Vals.insert(&*AI); - for (Function::const_iterator FI = I->begin(), FE = I->end(); - FI != FE; ++FI) - for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - Vals.insert(&*BI); - for (User::const_op_iterator OI = BI->op_begin(), - OE = BI->op_end(); OI != OE; ++OI) - Vals.insert(*OI); - } - } - - } - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.setPreservesAll(); // Does not transform code - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. 
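getAdjustedAnalysisPointer, documented above in the deleted AliasDebugger, exists because a pass may implement an analysis interface through multiple inheritance, where the two base subobjects live at different addresses and reusing the wrong base pointer is incorrect. A stand-alone illustration with made-up types:

#include <cassert>

struct PassBase { virtual ~PassBase() = default; int PassData = 0; };
struct Interface { virtual ~Interface() = default; int IfaceData = 0; };
struct Combined : PassBase, Interface {};

int main() {
  Combined C;
  // static_cast adjusts the pointer to the Interface subobject; the two base
  // addresses differ, which is why a blind pointer reuse would be wrong.
  Interface *I = static_cast<Interface *>(&C);
  assert(static_cast<void *>(I) !=
         static_cast<void *>(static_cast<PassBase *>(&C)));
  return 0;
}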
- void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - //------------------------------------------------ - // Implement the AliasAnalysis API - // - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - assert(Vals.find(LocA.Ptr) != Vals.end() && - "Never seen value in AA before"); - assert(Vals.find(LocB.Ptr) != Vals.end() && - "Never seen value in AA before"); - return AliasAnalysis::alias(LocA, LocB); - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override { - assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before"); - return AliasAnalysis::getModRefInfo(CS, Loc); - } - - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1,CS2); - } - - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before"); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - } - - void deleteValue(Value *V) override { - assert(Vals.find(V) != Vals.end() && "Never seen value in AA before"); - AliasAnalysis::deleteValue(V); - } - - }; -} - -char AliasDebugger::ID = 0; -INITIALIZE_AG_PASS(AliasDebugger, AliasAnalysis, "debug-aa", - "AA use debugger", false, true, false) - -Pass *llvm::createAliasDebugger() { return new AliasDebugger(); } - diff --git a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp index 54d0f43..3094049 100644 --- a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -167,8 +168,7 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size, if (!UnknownInsts.empty()) { for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) if (AA.getModRefInfo(UnknownInsts[i], - MemoryLocation(Ptr, Size, AAInfo)) != - AliasAnalysis::NoModRef) + MemoryLocation(Ptr, Size, AAInfo)) != MRI_NoModRef) return true; } @@ -182,16 +182,14 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) { ImmutableCallSite C1(getUnknownInst(i)), C2(Inst); - if (!C1 || !C2 || - AA.getModRefInfo(C1, C2) != AliasAnalysis::NoModRef || - AA.getModRefInfo(C2, C1) != AliasAnalysis::NoModRef) + if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef || + AA.getModRefInfo(C2, C1) != MRI_NoModRef) return true; } for (iterator I = begin(), E = end(); I != E; ++I) - if (AA.getModRefInfo( - Inst, MemoryLocation(I.getPointer(), I.getSize(), I.getAAInfo())) != - AliasAnalysis::NoModRef) + if (AA.getModRefInfo(Inst, MemoryLocation(I.getPointer(), I.getSize(), + I.getAAInfo())) != MRI_NoModRef) return true; return false; @@ -223,7 +221,7 @@ AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr, if (Cur->Forward || !Cur->aliasesPointer(Ptr, Size, AAInfo, AA)) continue; if (!FoundSet) { // If this is the first alias set ptr can go into. - FoundSet = Cur; // Remember it. + FoundSet = &*Cur; // Remember it. } else { // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*Cur, *this); // Merge in contents. 
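findAliasSetForPointer above is a find-or-merge: the first set that aliases the pointer is remembered, and every later aliasing set is merged into it, leaving a Forward marker behind. The same control flow over a toy set type, with integer membership standing in for real alias queries:

#include <vector>

struct ToySet {
  bool Forward = false; // true once this set has been merged away
  std::vector<int> Members;
  bool aliases(int X) const {
    for (int M : Members)
      if (M == X) // toy stand-in for AliasSet::aliasesPointer
        return true;
    return false;
  }
};

ToySet *findOrMerge(std::vector<ToySet> &Sets, int X) {
  ToySet *Found = nullptr;
  for (ToySet &Cur : Sets) {
    if (Cur.Forward || !Cur.aliases(X))
      continue;
    if (!Found) {
      Found = &Cur; // first matching set: remember it
    } else {
      // Merge any further matching set into the first and forward it.
      Found->Members.insert(Found->Members.end(), Cur.Members.begin(),
                            Cur.Members.end());
      Cur.Members.clear();
      Cur.Forward = true;
    }
  }
  return Found;
}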
} @@ -257,7 +255,7 @@ AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) { if (Cur->Forward || !Cur->aliasesUnknownInst(Inst, AA)) continue; if (!FoundSet) // If this is the first alias set ptr can go into. - FoundSet = Cur; // Remember it. + FoundSet = &*Cur; // Remember it. else if (!Cur->Forward) // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*Cur, *this); // Merge in contents. } @@ -309,8 +307,9 @@ bool AliasSetTracker::add(LoadInst *LI) { AliasSet::AccessLattice Access = AliasSet::RefAccess; bool NewPtr; + const DataLayout &DL = LI->getModule()->getDataLayout(); AliasSet &AS = addPointer(LI->getOperand(0), - AA.getTypeStoreSize(LI->getType()), + DL.getTypeStoreSize(LI->getType()), AAInfo, Access, NewPtr); if (LI->isVolatile()) AS.setVolatile(); return NewPtr; @@ -324,9 +323,10 @@ bool AliasSetTracker::add(StoreInst *SI) { AliasSet::AccessLattice Access = AliasSet::ModAccess; bool NewPtr; + const DataLayout &DL = SI->getModule()->getDataLayout(); Value *Val = SI->getOperand(0); AliasSet &AS = addPointer(SI->getOperand(1), - AA.getTypeStoreSize(Val->getType()), + DL.getTypeStoreSize(Val->getType()), AAInfo, Access, NewPtr); if (SI->isVolatile()) AS.setVolatile(); return NewPtr; @@ -372,8 +372,8 @@ bool AliasSetTracker::add(Instruction *I) { } void AliasSetTracker::add(BasicBlock &BB) { - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) - add(I); + for (auto &I : BB) + add(&I); } void AliasSetTracker::add(const AliasSetTracker &AST) { @@ -443,7 +443,8 @@ AliasSetTracker::remove(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) { } bool AliasSetTracker::remove(LoadInst *LI) { - uint64_t Size = AA.getTypeStoreSize(LI->getType()); + const DataLayout &DL = LI->getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -455,7 +456,8 @@ bool AliasSetTracker::remove(LoadInst *LI) { } bool AliasSetTracker::remove(StoreInst *SI) { - uint64_t Size = AA.getTypeStoreSize(SI->getOperand(0)->getType()); + const DataLayout &DL = SI->getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(SI->getOperand(0)->getType()); AAMDNodes AAInfo; SI->getAAMetadata(AAInfo); @@ -505,9 +507,6 @@ bool AliasSetTracker::remove(Instruction *I) { // dangling pointers to deleted instructions. // void AliasSetTracker::deleteValue(Value *PtrVal) { - // Notify the alias analysis implementation that this value is gone. - AA.deleteValue(PtrVal); - // If this is a call instruction, remove the callsite from the appropriate // AliasSet (if present). 
if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) { @@ -650,11 +649,12 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } bool runOnFunction(Function &F) override { - Tracker = new AliasSetTracker(getAnalysis<AliasAnalysis>()); + auto &AAWP = getAnalysis<AAResultsWrapperPass>(); + Tracker = new AliasSetTracker(AAWP.getAAResults()); for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) Tracker->add(&*I); @@ -668,6 +668,6 @@ namespace { char AliasSetPrinter::ID = 0; INITIALIZE_PASS_BEGIN(AliasSetPrinter, "print-alias-sets", "Alias Set Printer", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AliasSetPrinter, "print-alias-sets", "Alias Set Printer", false, true) diff --git a/contrib/llvm/lib/Analysis/Analysis.cpp b/contrib/llvm/lib/Analysis/Analysis.cpp index 842ff0a..9c1ac00 100644 --- a/contrib/llvm/lib/Analysis/Analysis.cpp +++ b/contrib/llvm/lib/Analysis/Analysis.cpp @@ -20,23 +20,23 @@ using namespace llvm; /// initializeAnalysis - Initialize all passes linked into the Analysis library. void llvm::initializeAnalysis(PassRegistry &Registry) { - initializeAliasAnalysisAnalysisGroup(Registry); - initializeAliasAnalysisCounterPass(Registry); initializeAAEvalPass(Registry); - initializeAliasDebuggerPass(Registry); initializeAliasSetPrinterPass(Registry); - initializeNoAAPass(Registry); - initializeBasicAliasAnalysisPass(Registry); - initializeBlockFrequencyInfoPass(Registry); - initializeBranchProbabilityInfoPass(Registry); + initializeBasicAAWrapperPassPass(Registry); + initializeBlockFrequencyInfoWrapperPassPass(Registry); + initializeBranchProbabilityInfoWrapperPassPass(Registry); + initializeCallGraphWrapperPassPass(Registry); + initializeCallGraphPrinterPass(Registry); + initializeCallGraphViewerPass(Registry); initializeCostModelAnalysisPass(Registry); initializeCFGViewerPass(Registry); initializeCFGPrinterPass(Registry); initializeCFGOnlyViewerPass(Registry); initializeCFGOnlyPrinterPass(Registry); - initializeCFLAliasAnalysisPass(Registry); + initializeCFLAAWrapperPassPass(Registry); initializeDependenceAnalysisPass(Registry); initializeDelinearizationPass(Registry); + initializeDemandedBitsPass(Registry); initializeDivergenceAnalysisPass(Registry); initializeDominanceFrontierPass(Registry); initializeDomViewerPass(Registry); @@ -47,34 +47,40 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializePostDomPrinterPass(Registry); initializePostDomOnlyViewerPass(Registry); initializePostDomOnlyPrinterPass(Registry); + initializeAAResultsWrapperPassPass(Registry); + initializeGlobalsAAWrapperPassPass(Registry); initializeIVUsersPass(Registry); initializeInstCountPass(Registry); initializeIntervalPartitionPass(Registry); initializeLazyValueInfoPass(Registry); - initializeLibCallAliasAnalysisPass(Registry); initializeLintPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); initializeMemDerefPrinterPass(Registry); initializeMemoryDependenceAnalysisPass(Registry); initializeModuleDebugInfoPrinterPass(Registry); + initializeObjCARCAAWrapperPassPass(Registry); initializePostDominatorTreePass(Registry); initializeRegionInfoPassPass(Registry); initializeRegionViewerPass(Registry); initializeRegionPrinterPass(Registry); initializeRegionOnlyViewerPass(Registry); initializeRegionOnlyPrinterPass(Registry); - 
initializeScalarEvolutionPass(Registry); - initializeScalarEvolutionAliasAnalysisPass(Registry); + initializeSCEVAAWrapperPassPass(Registry); + initializeScalarEvolutionWrapperPassPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); - initializeTypeBasedAliasAnalysisPass(Registry); - initializeScopedNoAliasAAPass(Registry); + initializeTypeBasedAAWrapperPassPass(Registry); + initializeScopedNoAliasAAWrapperPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { initializeAnalysis(*unwrap(R)); } +void LLVMInitializeIPA(LLVMPassRegistryRef R) { + initializeAnalysis(*unwrap(R)); +} + LLVMBool LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action, char **OutMessages) { raw_ostream *DebugOS = Action != LLVMReturnStatusAction ? &errs() : nullptr; diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 3586354..c3d2803 100644 --- a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -13,24 +13,21 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" @@ -42,6 +39,18 @@ #include <algorithm> using namespace llvm; +/// Enable analysis of recursive PHI nodes. +static cl::opt<bool> EnableRecPhiAnalysis("basicaa-recphi", cl::Hidden, + cl::init(false)); + +/// SearchLimitReached / SearchTimes shows how often the limit of +/// to decompose GEPs is reached. It will affect the precision +/// of basic alias analysis. +#define DEBUG_TYPE "basicaa" +STATISTIC(SearchLimitReached, "Number of times the limit to " + "decompose GEPs is reached"); +STATISTIC(SearchTimes, "Number of times a GEP is decomposed"); + /// Cutoff after which to stop analysing a set of phi nodes potentially involved /// in a cycle. Because we are analysing 'through' phi nodes we need to be /// careful with value equivalence. We use reachability to make sure a value @@ -57,8 +66,8 @@ static const unsigned MaxLookupSearchDepth = 6; // Useful predicates //===----------------------------------------------------------------------===// -/// isNonEscapingLocalObject - Return true if the pointer is to a function-local -/// object that never escapes from the function. +/// Returns true if the pointer is to a function-local object that never +/// escapes from the function. static bool isNonEscapingLocalObject(const Value *V) { // If this is a local allocation, check to see if it escapes. 
if (isa<AllocaInst>(V) || isNoAliasCall(V)) @@ -82,8 +91,8 @@ static bool isNonEscapingLocalObject(const Value *V) { return false; } -/// isEscapeSource - Return true if the pointer is one which would have -/// been considered an escape by isNonEscapingLocalObject. +/// Returns true if the pointer is one which would have been considered an +/// escape by isNonEscapingLocalObject. static bool isEscapeSource(const Value *V) { if (isa<CallInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V)) return true; @@ -97,8 +106,7 @@ static bool isEscapeSource(const Value *V) { return false; } -/// getObjectSize - Return the size of the object specified by V, or -/// UnknownSize if unknown. +/// Returns the size of the object specified by V, or UnknownSize if unknown. static uint64_t getObjectSize(const Value *V, const DataLayout &DL, const TargetLibraryInfo &TLI, bool RoundToAlign = false) { @@ -108,8 +116,8 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL, return MemoryLocation::UnknownSize; } -/// isObjectSmallerThan - Return true if we can prove that the object specified -/// by V is smaller than Size. +/// Returns true if we can prove that the object specified by V is smaller than +/// Size. static bool isObjectSmallerThan(const Value *V, uint64_t Size, const DataLayout &DL, const TargetLibraryInfo &TLI) { @@ -144,15 +152,14 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size, // This function needs to use the aligned object size because we allow // reads a bit past the end given sufficient alignment. - uint64_t ObjectSize = getObjectSize(V, DL, TLI, /*RoundToAlign*/true); + uint64_t ObjectSize = getObjectSize(V, DL, TLI, /*RoundToAlign*/ true); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size; } -/// isObjectSize - Return true if we can prove that the object specified -/// by V has size Size. -static bool isObjectSize(const Value *V, uint64_t Size, - const DataLayout &DL, const TargetLibraryInfo &TLI) { +/// Returns true if we can prove that the object specified by V has size Size. +static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL, + const TargetLibraryInfo &TLI) { uint64_t ObjectSize = getObjectSize(V, DL, TLI); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size; } @@ -161,42 +168,20 @@ static bool isObjectSize(const Value *V, uint64_t Size, // GetElementPtr Instruction Decomposition and Analysis //===----------------------------------------------------------------------===// -namespace { - enum ExtensionKind { - EK_NotExtended, - EK_SignExt, - EK_ZeroExt - }; - - struct VariableGEPIndex { - const Value *V; - ExtensionKind Extension; - int64_t Scale; - - bool operator==(const VariableGEPIndex &Other) const { - return V == Other.V && Extension == Other.Extension && - Scale == Other.Scale; - } - - bool operator!=(const VariableGEPIndex &Other) const { - return !operator==(Other); - } - }; -} - - -/// GetLinearExpression - Analyze the specified value as a linear expression: -/// "A*V + B", where A and B are constant integers. Return the scale and offset -/// values as APInts and return V as a Value*, and return whether we looked -/// through any sign or zero extends. The incoming Value is known to have -/// IntegerType and it may already be sign or zero extended. +/// Analyzes the specified value as a linear expression: "A*V + B", where A and +/// B are constant integers. 
+/// +/// Returns the scale and offset values as APInts, returns V as a Value*, and +/// reports whether we looked through any sign or zero extends. The incoming +/// Value is known to have IntegerType and it may already be sign or zero +/// extended. /// /// Note that this looks through extends, so the high bits may not be /// represented in the result. -static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, - ExtensionKind &Extension, - const DataLayout &DL, unsigned Depth, - AssumptionCache *AC, DominatorTree *DT) { +/*static*/ const Value *BasicAAResult::GetLinearExpression( + const Value *V, APInt &Scale, APInt &Offset, unsigned &ZExtBits, + unsigned &SExtBits, const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW) { assert(V->getType()->isIntegerTy() && "Not an integer value"); // Limit our recursion depth. @@ -206,54 +191,125 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { + if (const ConstantInt *Const = dyn_cast<ConstantInt>(V)) { + // If it's a constant, just convert it to an offset and remove the variable. + // If we've been called recursively the Offset bit width will be greater + // than the constant's (the Offset is always as wide as the outermost call), + // so we'll zext here and process any extension in the isa<SExtInst> & + // isa<ZExtInst> cases below. + Offset += Const->getValue().zextOrSelf(Offset.getBitWidth()); + assert(Scale == 0 && "Constant values don't have a scale"); + return V; + } + + if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { + + // If we've been called recursively then Offset and Scale will be wider + // than the BOp operands. We'll always zext it here as we'll process sign + // extensions below (see the isa<SExtInst> / isa<ZExtInst> cases). + APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth()); + switch (BOp->getOpcode()) { - default: break; + default: + // We don't understand this instruction, so we can't decompose it any + // further. + Scale = 1; + Offset = 0; + return V; case Instruction::Or: // X|C == X+C if all the bits in C are unset in X. Otherwise we can't // analyze it. if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), DL, 0, AC, - BOp, DT)) - break; - // FALL THROUGH. + BOp, DT)) { + Scale = 1; + Offset = 0; + return V; + } + // FALL THROUGH.
case Instruction::Add: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset += RHSC->getValue(); - return V; + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset += RHS; + break; + case Instruction::Sub: + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset -= RHS; + break; case Instruction::Mul: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset *= RHSC->getValue(); - Scale *= RHSC->getValue(); - return V; + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset *= RHS; + Scale *= RHS; + break; case Instruction::Shl: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset <<= RHSC->getValue().getLimitedValue(); - Scale <<= RHSC->getValue().getLimitedValue(); + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset <<= RHS.getLimitedValue(); + Scale <<= RHS.getLimitedValue(); + // The semantics of nsw and nuw for left shifts don't match those of + // multiplications, so we won't propagate them. + NSW = NUW = false; return V; } + + if (isa<OverflowingBinaryOperator>(BOp)) { + NUW &= BOp->hasNoUnsignedWrap(); + NSW &= BOp->hasNoSignedWrap(); + } + return V; } } // Since GEP indices are sign extended anyway, we don't care about the high // bits of a sign or zero extended value - just scales and offsets. The // extensions have to be consistent though. - if ((isa<SExtInst>(V) && Extension != EK_ZeroExt) || - (isa<ZExtInst>(V) && Extension != EK_SignExt)) { + if (isa<SExtInst>(V) || isa<ZExtInst>(V)) { Value *CastOp = cast<CastInst>(V)->getOperand(0); - unsigned OldWidth = Scale.getBitWidth(); + unsigned NewWidth = V->getType()->getPrimitiveSizeInBits(); unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits(); - Scale = Scale.trunc(SmallWidth); - Offset = Offset.trunc(SmallWidth); - Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt; - - Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL, - Depth + 1, AC, DT); - Scale = Scale.zext(OldWidth); - Offset = Offset.zext(OldWidth); + unsigned OldZExtBits = ZExtBits, OldSExtBits = SExtBits; + const Value *Result = + GetLinearExpression(CastOp, Scale, Offset, ZExtBits, SExtBits, DL, + Depth + 1, AC, DT, NSW, NUW); + + // zext(zext(%x)) == zext(%x), and similarly for sext; we'll handle this + // by just incrementing the number of bits we've extended by. + unsigned ExtendedBy = NewWidth - SmallWidth; + + if (isa<SExtInst>(V) && ZExtBits == 0) { + // sext(sext(%x, a), b) == sext(%x, a + b) + + if (NSW) { + // We haven't sign-wrapped, so it's valid to decompose sext(%x + c) + // into sext(%x) + sext(c).
We'll sext the Offset ourselves: + unsigned OldWidth = Offset.getBitWidth(); + Offset = Offset.trunc(SmallWidth).sext(NewWidth).zextOrSelf(OldWidth); + } else { + // We may have signed-wrapped, so don't decompose sext(%x + c) into + // sext(%x) + sext(c). + Scale = 1; + Offset = 0; + Result = CastOp; + ZExtBits = OldZExtBits; + SExtBits = OldSExtBits; + } + SExtBits += ExtendedBy; + } else { + // sext(zext(%x, a), b) = zext(zext(%x, a), b) = zext(%x, a + b) + + if (!NUW) { + // We may have unsigned-wrapped, so don't decompose zext(%x + c) into + // zext(%x) + zext(c). + Scale = 1; + Offset = 0; + Result = CastOp; + ZExtBits = OldZExtBits; + SExtBits = OldSExtBits; + } + ZExtBits += ExtendedBy; + } return Result; } @@ -263,29 +319,27 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } -/// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it -/// into a base pointer with a constant offset and a number of scaled symbolic -/// offsets. +/// If V is a symbolic pointer expression, decompose it into a base pointer +/// with a constant offset and a number of scaled symbolic offsets. /// -/// The scaled symbolic offsets (represented by pairs of a Value* and a scale in -/// the VarIndices vector) are Value*'s that are known to be scaled by the -/// specified amount, but which may have other unrepresented high bits. As such, -/// the gep cannot necessarily be reconstructed from its decomposed form. +/// The scaled symbolic offsets (represented by pairs of a Value* and a scale +/// in the VarIndices vector) are Value*'s that are known to be scaled by the +/// specified amount, but which may have other unrepresented high bits. As +/// such, the gep cannot necessarily be reconstructed from its decomposed form. /// /// When DataLayout is around, this function is capable of analyzing everything /// that GetUnderlyingObject can look through. To be able to do that /// GetUnderlyingObject and DecomposeGEPExpression must use the same search -/// depth (MaxLookupSearchDepth). -/// When DataLayout not is around, it just looks through pointer casts. -/// -static const Value * -DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, - SmallVectorImpl<VariableGEPIndex> &VarIndices, - bool &MaxLookupReached, const DataLayout &DL, - AssumptionCache *AC, DominatorTree *DT) { +/// depth (MaxLookupSearchDepth). When DataLayout is not around, it just looks +/// through pointer casts. +/*static*/ const Value *BasicAAResult::DecomposeGEPExpression( + const Value *V, int64_t &BaseOffs, + SmallVectorImpl<VariableGEPIndex> &VarIndices, bool &MaxLookupReached, + const DataLayout &DL, AssumptionCache *AC, DominatorTree *DT) { // Limit recursion depth to limit compile time in crazy cases. unsigned MaxLookup = MaxLookupSearchDepth; MaxLookupReached = false; + SearchTimes++; BaseOffs = 0; do { @@ -318,7 +372,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // updated when GetUnderlyingObject is updated). TLI should be // provided also. if (const Value *Simplified = - SimplifyInstruction(const_cast<Instruction *>(I), DL)) { + SimplifyInstruction(const_cast<Instruction *>(I), DL)) { V = Simplified; continue; } @@ -333,43 +387,47 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, unsigned AS = GEPOp->getPointerAddressSpace(); // Walk the indices of the GEP, accumulating them into BaseOffs/VarIndices.
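// A hypothetical illustration (names assumed, not from this patch): for "%p = getelementptr [10 x i32], [10 x i32]* %base, i64 1, i64 %i" the constant index contributes 40 bytes (one whole [10 x i32]) to BaseOffs, while the variable index is recorded as a VariableGEPIndex with V == %i and Scale == 4 (the i32 element size).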
gep_type_iterator GTI = gep_type_begin(GEPOp); - for (User::const_op_iterator I = GEPOp->op_begin()+1, - E = GEPOp->op_end(); I != E; ++I) { - Value *Index = *I; + for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end(); + I != E; ++I) { + const Value *Index = *I; // Compute the (potentially symbolic) offset in bytes for this index. if (StructType *STy = dyn_cast<StructType>(*GTI++)) { // For a struct, add the member offset. unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); - if (FieldNo == 0) continue; + if (FieldNo == 0) + continue; BaseOffs += DL.getStructLayout(STy)->getElementOffset(FieldNo); continue; } // For an array/pointer, add the element offset, explicitly scaled. - if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { - if (CIdx->isZero()) continue; + if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { + if (CIdx->isZero()) + continue; BaseOffs += DL.getTypeAllocSize(*GTI) * CIdx->getSExtValue(); continue; } uint64_t Scale = DL.getTypeAllocSize(*GTI); - ExtensionKind Extension = EK_NotExtended; + unsigned ZExtBits = 0, SExtBits = 0; // If the integer type is smaller than the pointer size, it is implicitly // sign extended to pointer size. unsigned Width = Index->getType()->getIntegerBitWidth(); - if (DL.getPointerSizeInBits(AS) > Width) - Extension = EK_SignExt; + unsigned PointerSize = DL.getPointerSizeInBits(AS); + if (PointerSize > Width) + SExtBits += PointerSize - Width; // Use GetLinearExpression to decompose the index into a C1*V+C2 form. APInt IndexScale(Width, 0), IndexOffset(Width, 0); - Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension, DL, - 0, AC, DT); + bool NSW = true, NUW = true; + Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits, + SExtBits, DL, 0, AC, DT, NSW, NUW); // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale. // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale. - BaseOffs += IndexOffset.getSExtValue()*Scale; + BaseOffs += IndexOffset.getSExtValue() * Scale; Scale *= IndexScale.getSExtValue(); // If we already had an occurrence of this index variable, merge this @@ -377,23 +435,23 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // A[x][x] -> x*16 + x*4 -> x*20 // This also ensures that 'x' only appears in the index list once. for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) { - if (VarIndices[i].V == Index && - VarIndices[i].Extension == Extension) { + if (VarIndices[i].V == Index && VarIndices[i].ZExtBits == ZExtBits && + VarIndices[i].SExtBits == SExtBits) { Scale += VarIndices[i].Scale; - VarIndices.erase(VarIndices.begin()+i); + VarIndices.erase(VarIndices.begin() + i); break; } } // Make sure that we have a scale that makes sense for this target's // pointer size. - if (unsigned ShiftBits = 64 - DL.getPointerSizeInBits(AS)) { + if (unsigned ShiftBits = 64 - PointerSize) { Scale <<= ShiftBits; Scale = (int64_t)Scale >> ShiftBits; } if (Scale) { - VariableGEPIndex Entry = {Index, Extension, + VariableGEPIndex Entry = {Index, ZExtBits, SExtBits, static_cast<int64_t>(Scale)}; VarIndices.push_back(Entry); } @@ -405,196 +463,25 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // If the chain of expressions is too deep, just return early. 
MaxLookupReached = true; + SearchLimitReached++; return V; } -//===----------------------------------------------------------------------===// -// BasicAliasAnalysis Pass -//===----------------------------------------------------------------------===// - -#ifndef NDEBUG -static const Function *getParent(const Value *V) { - if (const Instruction *inst = dyn_cast<Instruction>(V)) - return inst->getParent()->getParent(); - - if (const Argument *arg = dyn_cast<Argument>(V)) - return arg->getParent(); - - return nullptr; -} - -static bool notDifferentParent(const Value *O1, const Value *O2) { - - const Function *F1 = getParent(O1); - const Function *F2 = getParent(O2); - - return !F1 || !F2 || F1 == F2; -} -#endif - -namespace { - /// BasicAliasAnalysis - This is the primary alias analysis implementation. - struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis { - static char ID; // Class identification, replacement for typeinfo - BasicAliasAnalysis() : ImmutablePass(ID) { - initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - assert(AliasCache.empty() && "AliasCache must be cleared after use!"); - assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && - "BasicAliasAnalysis doesn't support interprocedural queries."); - AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, - LocB.Ptr, LocB.Size, LocB.AATags); - // AliasCache rarely has more than 1 or 2 elements, always use - // shrink_and_clear so it quickly returns to the inline capacity of the - // SmallDenseMap if it ever grows larger. - // FIXME: This should really be shrink_to_inline_capacity_and_clear(). - AliasCache.shrink_and_clear(); - VisitedPhiBBs.clear(); - return Alias; - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - - /// pointsToConstantMemory - Chase pointers until we find a (constant - /// global) or not. - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - - /// Get the location associated with a pointer argument of a callsite. - ModRefResult getArgModRefInfo(ImmutableCallSite CS, - unsigned ArgIdx) override; - - /// getModRefBehavior - Return the behavior when calling the given - /// call site. - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - - /// getModRefBehavior - Return the behavior when calling the given function. - /// For use when the call site is not known. - ModRefBehavior getModRefBehavior(const Function *F) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - private: - // AliasCache - Track alias queries to guard against recursion. 
- typedef std::pair<MemoryLocation, MemoryLocation> LocPair; - typedef SmallDenseMap<LocPair, AliasResult, 8> AliasCacheTy; - AliasCacheTy AliasCache; - - /// \brief Track phi nodes we have visited. When interpret "Value" pointer - /// equality as value equality we need to make sure that the "Value" is not - /// part of a cycle. Otherwise, two uses could come from different - /// "iterations" of a cycle and see different values for the same "Value" - /// pointer. - /// The following example shows the problem: - /// %p = phi(%alloca1, %addr2) - /// %l = load %ptr - /// %addr1 = gep, %alloca2, 0, %l - /// %addr2 = gep %alloca2, 0, (%l + 1) - /// alias(%p, %addr1) -> MayAlias ! - /// store %l, ... - SmallPtrSet<const BasicBlock*, 8> VisitedPhiBBs; - - // Visited - Track instructions visited by pointsToConstantMemory. - SmallPtrSet<const Value*, 16> Visited; - - /// \brief Check whether two Values can be considered equivalent. - /// - /// In addition to pointer equivalence of \p V1 and \p V2 this checks - /// whether they can not be part of a cycle in the value graph by looking at - /// all visited phi nodes an making sure that the phis cannot reach the - /// value. We have to do this because we are looking through phi nodes (That - /// is we say noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB). - bool isValueEqualInPotentialCycles(const Value *V1, const Value *V2); - - /// \brief Dest and Src are the variable indices from two decomposed - /// GetElementPtr instructions GEP1 and GEP2 which have common base - /// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic - /// difference between the two pointers. - void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest, - const SmallVectorImpl<VariableGEPIndex> &Src); - - // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP - // instruction against another. - AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size, - const AAMDNodes &V1AAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo, - const Value *UnderlyingV1, const Value *UnderlyingV2); - - // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI - // instruction against another. - AliasResult aliasPHI(const PHINode *PN, uint64_t PNSize, - const AAMDNodes &PNAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo); - - /// aliasSelect - Disambiguate a Select instruction against another value. - AliasResult aliasSelect(const SelectInst *SI, uint64_t SISize, - const AAMDNodes &SIAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo); - - AliasResult aliasCheck(const Value *V1, uint64_t V1Size, - AAMDNodes V1AATag, - const Value *V2, uint64_t V2Size, - AAMDNodes V2AATag); - }; -} // End of anonymous namespace - -// Register this pass... -char BasicAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa", - "Basic Alias Analysis (stateless AA impl)", - false, true, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa", - "Basic Alias Analysis (stateless AA impl)", - false, true, false) - - -ImmutablePass *llvm::createBasicAliasAnalysisPass() { - return new BasicAliasAnalysis(); -} - -/// pointsToConstantMemory - Returns whether the given pointer value -/// points to memory that is local to the function, with global constants being -/// considered local to all functions. 
-bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +/// Returns whether the given pointer value points to memory that is local to +/// the function, with global constants being considered local to all +/// functions. +bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { assert(Visited.empty() && "Visited must be cleared after use!"); unsigned MaxLookup = 8; SmallVector<const Value *, 16> Worklist; Worklist.push_back(Loc.Ptr); do { - const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), *DL); + const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL); if (!Visited.insert(V).second) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } // An alloca instruction defines local memory. @@ -608,7 +495,7 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, // others. GV may even be a declaration, not a definition. if (!GV->isConstant()) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } continue; } @@ -626,7 +513,7 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, // Don't bother inspecting phi nodes with many operands. if (PN->getNumIncomingValues() > MaxLookup) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } // Otherwise be conservative. Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } while (!Worklist.empty() && --MaxLookup); @@ -656,66 +543,56 @@ static bool isMemsetPattern16(const Function *MS, isa<IntegerType>(MemsetType->getParamType(2))) return true; } - return false; } -/// getModRefBehavior - Return the behavior when calling the given call site. -AliasAnalysis::ModRefBehavior -BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { +/// Returns the behavior when calling the given call site. +FunctionModRefBehavior BasicAAResult::getModRefBehavior(ImmutableCallSite CS) { if (CS.doesNotAccessMemory()) // Can't do better than this. - return DoesNotAccessMemory; + return FMRB_DoesNotAccessMemory; - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the callsite knows it only reads memory, don't return worse // than that. if (CS.onlyReadsMemory()) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; if (CS.onlyAccessesArgMemory()) - Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees); + Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); - // The AliasAnalysis base class has some smarts, lets use them. - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); + // The AAResultBase base class has some smarts, let's use them. + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min); } -/// getModRefBehavior - Return the behavior when calling the given function. -/// For use when the call site is not known. -AliasAnalysis::ModRefBehavior -BasicAliasAnalysis::getModRefBehavior(const Function *F) { +/// Returns the behavior when calling the given function. For use when the call +/// site is not known.
+FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) { // If the function declares it doesn't access memory, we can't do better. if (F->doesNotAccessMemory()) - return DoesNotAccessMemory; - - // For intrinsics, we can check the table. - if (Intrinsic::ID iid = F->getIntrinsicID()) { -#define GET_INTRINSIC_MODREF_BEHAVIOR -#include "llvm/IR/Intrinsics.gen" -#undef GET_INTRINSIC_MODREF_BEHAVIOR - } + return FMRB_DoesNotAccessMemory; - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the function declares it only reads memory, go with that. if (F->onlyReadsMemory()) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; if (F->onlyAccessesArgMemory()) - Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees); - - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - if (isMemsetPattern16(F, TLI)) - Min = OnlyAccessesArgumentPointees; + Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); // Otherwise be conservative. - return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min); + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min); } -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { +/// Returns true if this is a writeonly (i.e., Mod-only) parameter. Currently, +/// we don't have a writeonly attribute, so this only knows about builtin +/// intrinsics and target library functions. We could consider adding a +/// writeonly attribute in the future and moving all of these facts to either +/// Intrinsics.td or InferFunctionAttr.cpp. +static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx, + const TargetLibraryInfo &TLI) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) switch (II->getIntrinsicID()) { default: @@ -723,50 +600,106 @@ BasicAliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { case Intrinsic::memset: case Intrinsic::memcpy: case Intrinsic::memmove: - assert((ArgIdx == 0 || ArgIdx == 1) && - "Invalid argument index for memory intrinsic"); - return ArgIdx ? Ref : Mod; + // We don't currently have a writeonly attribute. All other properties + // of these intrinsics are nicely described via attributes in + // Intrinsics.td and handled generically. + if (ArgIdx == 0) + return true; } // We can bound the aliasing properties of memset_pattern16 just as we can // for memcpy/memset. This is particularly important because the // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16 - // whenever possible. - if (CS.getCalledFunction() && - isMemsetPattern16(CS.getCalledFunction(), *TLI)) { - assert((ArgIdx == 0 || ArgIdx == 1) && - "Invalid argument index for memset_pattern16"); - return ArgIdx ? Ref : Mod; - } - // FIXME: Handle memset_pattern4 and memset_pattern8 also. + // whenever possible. Note that all but the missing writeonly attribute are + // handled via InferFunctionAttr. + if (CS.getCalledFunction() && isMemsetPattern16(CS.getCalledFunction(), TLI)) + if (ArgIdx == 0) + return true; + + // TODO: memset_pattern4, memset_pattern8 + // TODO: _chk variants + // TODO: strcmp, strcpy + + return false; +} + +ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS, + unsigned ArgIdx) { + + // Emulate the missing writeonly attribute by checking for known builtin + // intrinsics and target library functions.
+ if (isWriteOnlyParam(CS, ArgIdx, TLI)) + return MRI_Mod; - return AliasAnalysis::getArgModRefInfo(CS, ArgIdx); + if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly)) + return MRI_Ref; + + if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadNone)) + return MRI_NoModRef; + + return AAResultBase::getArgModRefInfo(CS, ArgIdx); } static bool isAssumeIntrinsic(ImmutableCallSite CS) { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction()); - if (II && II->getIntrinsicID() == Intrinsic::assume) - return true; + return II && II->getIntrinsicID() == Intrinsic::assume; +} - return false; +#ifndef NDEBUG +static const Function *getParent(const Value *V) { + if (const Instruction *inst = dyn_cast<Instruction>(V)) + return inst->getParent()->getParent(); + + if (const Argument *arg = dyn_cast<Argument>(V)) + return arg->getParent(); + + return nullptr; } -bool BasicAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +static bool notDifferentParent(const Value *O1, const Value *O2) { + + const Function *F1 = getParent(O1); + const Function *F2 = getParent(O2); + + return !F1 || !F2 || F1 == F2; +} +#endif + +AliasResult BasicAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && + "BasicAliasAnalysis doesn't support interprocedural queries."); + + // If we have a directly cached entry for these locations, we have recursed + // through this once, so just return the cached results. Notably, when this + // happens, we don't clear the cache. + auto CacheIt = AliasCache.find(LocPair(LocA, LocB)); + if (CacheIt != AliasCache.end()) + return CacheIt->second; + + AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, LocB.Ptr, + LocB.Size, LocB.AATags); + // AliasCache rarely has more than 1 or 2 elements, always use + // shrink_and_clear so it quickly returns to the inline capacity of the + // SmallDenseMap if it ever grows larger. + // FIXME: This should really be shrink_to_inline_capacity_and_clear(). + AliasCache.shrink_and_clear(); + VisitedPhiBBs.clear(); + return Alias; } -/// getModRefInfo - Check to see if the specified callsite can clobber the -/// specified memory object. Since we only look at local properties of this -/// function, we really can't say much about this query. We do, however, use -/// simple "address taken" analysis on local objects. -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +/// Checks to see if the specified callsite can clobber the specified memory +/// object. +/// +/// Since we only look at local properties of this function, we really can't +/// say much about this query. We do, however, use simple "address taken" +/// analysis on local objects. +ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { assert(notDifferentParent(CS.getInstruction(), Loc.Ptr) && "AliasAnalysis query involving multiple functions!"); - const Value *Object = GetUnderlyingObject(Loc.Ptr, *DL); + const Value *Object = GetUnderlyingObject(Loc.Ptr, DL); // If this is a tail call and Loc.Ptr points to a stack location, we know that // the tail call cannot access or modify the local stack. 
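The ladder above is order-sensitive: the emulated writeonly facts are consulted before the ReadOnly/ReadNone parameter attributes, and MRI_ModRef stays the conservative fallback. What follows is a minimal standalone sketch of that decision order; the types and names are hypothetical stand-ins, not the LLVM API.

  #include <cassert>

  enum ToyModRef { TMR_NoModRef, TMR_Ref, TMR_Mod, TMR_ModRef };

  struct ToyArg {
    bool KnownWriteOnly; // e.g. the dest operand of memcpy or memset_pattern16
    bool ReadOnlyAttr;   // stands in for Attribute::ReadOnly on the parameter
    bool ReadNoneAttr;   // stands in for Attribute::ReadNone on the parameter
  };

  // Builtin knowledge first, then attributes, then the conservative answer.
  ToyModRef classifyArg(const ToyArg &A) {
    if (A.KnownWriteOnly)
      return TMR_Mod;
    if (A.ReadOnlyAttr)
      return TMR_Ref;
    if (A.ReadNoneAttr)
      return TMR_NoModRef;
    return TMR_ModRef;
  }

  int main() {
    assert(classifyArg({true, false, false}) == TMR_Mod); // memcpy dest
    assert(classifyArg({false, true, false}) == TMR_Ref); // memcpy src
    assert(classifyArg({false, false, false}) == TMR_ModRef);
    return 0;
  }

Under these assumptions the dest argument of a memcpy-like call comes back Mod-only and its source Ref-only, which is exactly what the writeonly emulation stands in for.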
@@ -776,7 +709,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, if (isa<AllocaInst>(Object)) if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) if (CI->isTailCall()) - return NoModRef; + return MRI_NoModRef; // If the pointer is to a locally allocated object that does not escape, // then the call can not mod/ref the pointer unless the call takes the pointer @@ -798,41 +731,42 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // is impossible to alias the pointer we're checking. If not, we have to // assume that the call could touch the pointer, even though it doesn't // escape. - if (!isNoAlias(MemoryLocation(*CI), MemoryLocation(Object))) { + AliasResult AR = + getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object)); + if (AR) { PassedAsArg = true; break; } } if (!PassedAsArg) - return NoModRef; + return MRI_NoModRef; } // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. if (isAssumeIntrinsic(CS)) - return NoModRef; + return MRI_NoModRef; - // The AliasAnalysis base class has some smarts, lets use them. - return AliasAnalysis::getModRefInfo(CS, Loc); + // The AAResultBase base class has some smarts, let's use them. + return AAResultBase::getModRefInfo(CS, Loc); } -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { +ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. if (isAssumeIntrinsic(CS1) || isAssumeIntrinsic(CS2)) - return NoModRef; + return MRI_NoModRef; - // The AliasAnalysis base class has some smarts, lets use them. - return AliasAnalysis::getModRefInfo(CS1, CS2); + // The AAResultBase base class has some smarts, let's use them. + return AAResultBase::getModRefInfo(CS1, CS2); } -/// \brief Provide ad-hoc rules to disambiguate accesses through two GEP -/// operators, both having the exact same pointer operand. +/// Provide ad-hoc rules to disambiguate accesses through two GEP operators, +/// both having the exact same pointer operand. static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, uint64_t V1Size, const GEPOperator *GEP2, @@ -860,10 +794,9 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, ConstantInt *C2 = dyn_cast<ConstantInt>(GEP2->getOperand(GEP2->getNumOperands() - 1)); - // If the last (struct) indices aren't constants, we can't say anything. - // If they're identical, the other indices might be also be dynamically - // equal, so the GEPs can alias. - if (!C1 || !C2 || C1 == C2) + // If the last (struct) indices are constants and are equal, the other indices + // might also be dynamically equal, so the GEPs can alias.
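+ // A hypothetical instance: "%a = getelementptr %T, %T* %p, i64 %i, i32 1" and + // "%b = getelementptr %T, %T* %p, i64 %j, i32 1" end in the same constant + // index, so the two addresses coincide whenever %i == %j and MayAlias is the + // only safe answer.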
+ if (C1 && C2 && C1 == C2) return MayAlias; // Find the last-indexed type of the GEP, i.e., the type you'd get if @@ -886,12 +819,49 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, IntermediateIndices.push_back(GEP1->getOperand(i + 1)); } - StructType *LastIndexedStruct = - dyn_cast<StructType>(GetElementPtrInst::getIndexedType( - GEP1->getSourceElementType(), IntermediateIndices)); + auto *Ty = GetElementPtrInst::getIndexedType( + GEP1->getSourceElementType(), IntermediateIndices); + StructType *LastIndexedStruct = dyn_cast<StructType>(Ty); + + if (isa<SequentialType>(Ty)) { + // We know that: + // - both GEPs begin indexing from the exact same pointer; + // - the last indices in both GEPs are constants, indexing into a sequential + // type (array or pointer); + // - both GEPs only index through arrays prior to that. + // + // Because array indices greater than the number of elements are valid in + // GEPs, unless we know the intermediate indices are identical between + // GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't + // partially overlap. We also need to check that the loaded size matches + // the element size, otherwise we could still have overlap. + const uint64_t ElementSize = + DL.getTypeStoreSize(cast<SequentialType>(Ty)->getElementType()); + if (V1Size != ElementSize || V2Size != ElementSize) + return MayAlias; + + for (unsigned i = 0, e = GEP1->getNumIndices() - 1; i != e; ++i) + if (GEP1->getOperand(i + 1) != GEP2->getOperand(i + 1)) + return MayAlias; - if (!LastIndexedStruct) + // Now we know that the array/pointer that GEP1 indexes into and that + // that GEP2 indexes into must either precisely overlap or be disjoint. + // Because they cannot partially overlap and because fields in an array + // cannot overlap, if we can prove the final indices are different between + // GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias. + + // If the last indices are constants, we've already checked they don't + // equal each other so we can exit early. + if (C1 && C2) + return NoAlias; + if (isKnownNonEqual(GEP1->getOperand(GEP1->getNumOperands() - 1), + GEP2->getOperand(GEP2->getNumOperands() - 1), + DL)) + return NoAlias; return MayAlias; + } else if (!LastIndexedStruct || !C1 || !C2) { + return MayAlias; + } // We know that: // - both GEPs begin indexing from the exact same pointer; @@ -925,39 +895,21 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, return MayAlias; } -/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction -/// against another pointer. We know that V1 is a GEP, but we don't know -/// anything about V2. UnderlyingV1 is GetUnderlyingObject(GEP1, DL), -/// UnderlyingV2 is the same for V2. +/// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against +/// another pointer. /// -AliasResult BasicAliasAnalysis::aliasGEP( - const GEPOperator *GEP1, uint64_t V1Size, const AAMDNodes &V1AAInfo, - const Value *V2, uint64_t V2Size, const AAMDNodes &V2AAInfo, - const Value *UnderlyingV1, const Value *UnderlyingV2) { +/// We know that V1 is a GEP, but we don't know anything about V2. +/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for +/// V2. 
+AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, + const AAMDNodes &V1AAInfo, const Value *V2, + uint64_t V2Size, const AAMDNodes &V2AAInfo, + const Value *UnderlyingV1, + const Value *UnderlyingV2) { int64_t GEP1BaseOffset; bool GEP1MaxLookupReached; SmallVector<VariableGEPIndex, 4> GEP1VariableIndices; - // We have to get two AssumptionCaches here because GEP1 and V2 may be from - // different functions. - // FIXME: This really doesn't make any sense. We get a dominator tree below - // that can only refer to a single function. But this function (aliasGEP) is - // a method on an immutable pass that can be called when there *isn't* - // a single function. The old pass management layer makes this "work", but - // this isn't really a clean solution. - AssumptionCacheTracker &ACT = getAnalysis<AssumptionCacheTracker>(); - AssumptionCache *AC1 = nullptr, *AC2 = nullptr; - if (auto *GEP1I = dyn_cast<Instruction>(GEP1)) - AC1 = &ACT.getAssumptionCache( - const_cast<Function &>(*GEP1I->getParent()->getParent())); - if (auto *I2 = dyn_cast<Instruction>(V2)) - AC2 = &ACT.getAssumptionCache( - const_cast<Function &>(*I2->getParent()->getParent())); - - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - // If we have two gep instructions with must-alias or not-alias'ing base // pointers, figure out if the indexes to the GEP tell us anything about the // derived pointer. @@ -971,9 +923,8 @@ AliasResult BasicAliasAnalysis::aliasGEP( // identical. if ((BaseAlias == MayAlias) && V1Size == V2Size) { // Do the base pointers alias assuming type and size. - AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size, - V1AAInfo, UnderlyingV2, - V2Size, V2AAInfo); + AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size, V1AAInfo, + UnderlyingV2, V2Size, V2AAInfo); if (PreciseBaseAlias == NoAlias) { // See if the computed offset from the common pointer tells us about the // relation of the resulting pointer. @@ -982,15 +933,15 @@ AliasResult BasicAliasAnalysis::aliasGEP( SmallVector<VariableGEPIndex, 4> GEP2VariableIndices; const Value *GEP2BasePtr = DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, - GEP2MaxLookupReached, *DL, AC2, DT); + GEP2MaxLookupReached, DL, &AC, DT); const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an + // assert. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If the max search depth is reached the result is undefined @@ -1007,35 +958,35 @@ AliasResult BasicAliasAnalysis::aliasGEP( // If we get a No or May, then return it immediately, no amount of analysis // will improve this situation. - if (BaseAlias != MustAlias) return BaseAlias; + if (BaseAlias != MustAlias) + return BaseAlias; // Otherwise, we have a MustAlias. Since the base pointers alias each other // exactly, see if the computed offset from the common pointer tells us // about the relation of the resulting pointer. 
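// An illustrative case with assumed sizes: once the bases MustAlias, a 4-byte access at offset 8 and an 8-byte access at offset 0 touch the disjoint ranges [8, 12) and [0, 8), so constant offsets alone can already prove NoAlias.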
const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); int64_t GEP2BaseOffset; bool GEP2MaxLookupReached; SmallVector<VariableGEPIndex, 4> GEP2VariableIndices; const Value *GEP2BasePtr = DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, - GEP2MaxLookupReached, *DL, AC2, DT); + GEP2MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an assert. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If we know the two GEPs are based off of the exact same pointer (and not // just the same underlying object), see if that tells us anything about // the resulting pointers. - if (DL && GEP1->getPointerOperand() == GEP2->getPointerOperand()) { - AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, *DL); + if (GEP1->getPointerOperand() == GEP2->getPointerOperand()) { + AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL); // If we couldn't find anything interesting, don't abandon just yet. if (R != MayAlias) return R; @@ -1072,13 +1023,12 @@ AliasResult BasicAliasAnalysis::aliasGEP( const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an assert. if (GEP1BasePtr != UnderlyingV1) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If the max search depth is reached the result is undefined @@ -1124,12 +1074,42 @@ AliasResult BasicAliasAnalysis::aliasGEP( } } - // Try to distinguish something like &A[i][1] against &A[42][0]. - // Grab the least significant bit set in any of the scales. if (!GEP1VariableIndices.empty()) { uint64_t Modulo = 0; - for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) - Modulo |= (uint64_t) GEP1VariableIndices[i].Scale; + bool AllPositive = true; + for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) { + + // Try to distinguish something like &A[i][1] against &A[42][0]. + // Grab the least significant bit set in any of the scales. We + // don't need std::abs here (even if the scale's negative) as we'll + // be ^'ing Modulo with itself later. + Modulo |= (uint64_t)GEP1VariableIndices[i].Scale; + + if (AllPositive) { + // If the Value could change between cycles, then any reasoning about + // the Value this cycle may not hold in the next cycle. We'll just + // give up if we can't determine conditions that hold for every cycle: + const Value *V = GEP1VariableIndices[i].V; + + bool SignKnownZero, SignKnownOne; + ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL, + 0, &AC, nullptr, DT); + + // Zero-extension widens the variable, and so forces the sign + // bit to zero. 
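+ // E.g. (hypothetical): for "%z = zext i32 %x to i64" the top 32 bits of %z + // are zero, so its i64 sign bit is known zero and %z is non-negative.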
+ bool IsZExt = GEP1VariableIndices[i].ZExtBits > 0 || isa<ZExtInst>(V); + SignKnownZero |= IsZExt; + SignKnownOne &= !IsZExt; + + // If the variable begins with a zero then we know it's + // positive, regardless of whether the value is signed or + // unsigned. + int64_t Scale = GEP1VariableIndices[i].Scale; + AllPositive = + (SignKnownZero && Scale >= 0) || (SignKnownOne && Scale < 0); + } + } + Modulo = Modulo ^ (Modulo & (Modulo - 1)); // We can compute the difference between the two addresses @@ -1140,6 +1120,16 @@ AliasResult BasicAliasAnalysis::aliasGEP( V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size && V1Size <= Modulo - ModOffset) return NoAlias; + + // If we know all the variables are positive, then GEP1 >= GEP1BasePtr. + // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers + // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. + if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t)GEP1BaseOffset) + return NoAlias; + + if (constantOffsetHeuristic(GEP1VariableIndices, V1Size, V2Size, + GEP1BaseOffset, &AC, DT)) + return NoAlias; } // Statically, we can see that the base objects are the same, but the @@ -1164,46 +1154,44 @@ static AliasResult MergeAliasResults(AliasResult A, AliasResult B) { return MayAlias; } -/// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select -/// instruction against another. -AliasResult BasicAliasAnalysis::aliasSelect(const SelectInst *SI, - uint64_t SISize, - const AAMDNodes &SIAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo) { +/// Provides a bunch of ad-hoc rules to disambiguate a Select instruction +/// against another. +AliasResult BasicAAResult::aliasSelect(const SelectInst *SI, uint64_t SISize, + const AAMDNodes &SIAAInfo, + const Value *V2, uint64_t V2Size, + const AAMDNodes &V2AAInfo) { // If the values are Selects with the same condition, we can do a more precise // check: just check for aliases between the values on corresponding arms. if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2)) if (SI->getCondition() == SI2->getCondition()) { - AliasResult Alias = - aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, - SI2->getTrueValue(), V2Size, V2AAInfo); + AliasResult Alias = aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, + SI2->getTrueValue(), V2Size, V2AAInfo); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = - aliasCheck(SI->getFalseValue(), SISize, SIAAInfo, - SI2->getFalseValue(), V2Size, V2AAInfo); + aliasCheck(SI->getFalseValue(), SISize, SIAAInfo, + SI2->getFalseValue(), V2Size, V2AAInfo); return MergeAliasResults(ThisAlias, Alias); } // If both arms of the Select node NoAlias or MustAlias V2, then returns // NoAlias / MustAlias. Otherwise, returns MayAlias. AliasResult Alias = - aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo); + aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = - aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo); + aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo); return MergeAliasResults(ThisAlias, Alias); } -// aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction -// against another. 
-AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, - const AAMDNodes &PNAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo) { +/// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against +/// another. +AliasResult BasicAAResult::aliasPHI(const PHINode *PN, uint64_t PNSize, + const AAMDNodes &PNAAInfo, const Value *V2, + uint64_t V2Size, + const AAMDNodes &V2AAInfo) { // Track phi nodes we have visited. We use this information when we determine // value equivalence. VisitedPhiBBs.insert(PN->getParent()); @@ -1232,9 +1220,9 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { AliasResult ThisAlias = - aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo, - PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), - V2Size, V2AAInfo); + aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo, + PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), + V2Size, V2AAInfo); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; @@ -1247,8 +1235,9 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, return Alias; } - SmallPtrSet<Value*, 4> UniqueSrc; - SmallVector<Value*, 4> V1Srcs; + SmallPtrSet<Value *, 4> UniqueSrc; + SmallVector<Value *, 4> V1Srcs; + bool isRecursive = false; for (Value *PV1 : PN->incoming_values()) { if (isa<PHINode>(PV1)) // If any of the source itself is a PHI, return MayAlias conservatively @@ -1256,12 +1245,33 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, // sides are PHI nodes. In which case, this is O(m x n) time where 'm' // and 'n' are the number of PHI sources. return MayAlias; + + if (EnableRecPhiAnalysis) + if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { + // Check whether the incoming value is a GEP that advances the pointer + // result of this PHI node (e.g. in a loop). If this is the case, we + // would recurse and always get a MayAlias. Handle this case specially + // below. + if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && + isa<ConstantInt>(PV1GEP->idx_begin())) { + isRecursive = true; + continue; + } + } + if (UniqueSrc.insert(PV1).second) V1Srcs.push_back(PV1); } - AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, - V1Srcs[0], PNSize, PNAAInfo); + // If this PHI node is recursive, set the size of the accessed memory to + // unknown to represent all the possible values the GEP could advance the + // pointer to. + if (isRecursive) + PNSize = MemoryLocation::UnknownSize; + + AliasResult Alias = + aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], PNSize, PNAAInfo); + // Early exit if the check of the first PHI source against V2 is MayAlias. // Other results are not possible. if (Alias == MayAlias) @@ -1272,8 +1282,8 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) { Value *V = V1Srcs[i]; - AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, - V, PNSize, PNAAInfo); + AliasResult ThisAlias = + aliasCheck(V2, V2Size, V2AAInfo, V, PNSize, PNAAInfo); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; @@ -1282,13 +1292,11 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, return Alias; } -// aliasCheck - Provide a bunch of ad-hoc rules to disambiguate in common cases, -// such as array references. 
-// -AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, - AAMDNodes V1AAInfo, const Value *V2, - uint64_t V2Size, - AAMDNodes V2AAInfo) { +/// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as +/// array references. +AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size, + AAMDNodes V1AAInfo, const Value *V2, + uint64_t V2Size, AAMDNodes V2AAInfo) { // If either of the memory references is empty, it doesn't matter what the // pointer values are. if (V1Size == 0 || V2Size == 0) @@ -1313,11 +1321,11 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, return MustAlias; if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy()) - return NoAlias; // Scalars cannot alias each other + return NoAlias; // Scalars cannot alias each other // Figure out what objects these things are pointing to if we can. - const Value *O1 = GetUnderlyingObject(V1, *DL, MaxLookupSearchDepth); - const Value *O2 = GetUnderlyingObject(V2, *DL, MaxLookupSearchDepth); + const Value *O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth); + const Value *O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth); // Null values in the default address space don't point to any object, so they // don't alias any other pointer. @@ -1366,12 +1374,11 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. - if (DL) - if ((V1Size != MemoryLocation::UnknownSize && - isObjectSmallerThan(O2, V1Size, *DL, *TLI)) || - (V2Size != MemoryLocation::UnknownSize && - isObjectSmallerThan(O1, V2Size, *DL, *TLI))) - return NoAlias; + if ((V1Size != MemoryLocation::UnknownSize && + isObjectSmallerThan(O2, V1Size, DL, TLI)) || + (V2Size != MemoryLocation::UnknownSize && + isObjectSmallerThan(O1, V2Size, DL, TLI))) + return NoAlias; // Check the cache before climbing up use-def chains. This also terminates // otherwise infinitely recursive queries. 
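The hunk below shows the other half of that scheme: the query seeds AliasCache with a conservative MayAlias entry before descending, so a cyclic re-query observes the seed instead of recursing forever. A self-contained sketch of this memoize-with-conservative-seed pattern, with hypothetical toy types standing in for the LLVM containers:

  #include <algorithm>
  #include <cassert>
  #include <map>
  #include <string>
  #include <utility>

  enum ToyAlias { TA_MayAlias, TA_NoAlias };

  using ToyKey = std::pair<std::string, std::string>;
  std::map<ToyKey, ToyAlias> ToyCache;

  ToyAlias toyAliasCheck(std::string A, std::string B) {
    if (A > B)
      std::swap(A, B); // canonicalize the pair, as aliasCheck swaps Locs
    // Seed the cache with the conservative answer before descending; a
    // re-entrant query on the same pair returns the seed instead of looping.
    auto Ins = ToyCache.insert({{A, B}, TA_MayAlias});
    if (!Ins.second)
      return Ins.first->second;
    ToyAlias R = TA_MayAlias; // the real code climbs use-def chains here
    return ToyCache[{A, B}] = R;
  }

  int main() {
    assert(toyAliasCheck("p1", "p2") == TA_MayAlias);
    return 0;
  }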
@@ -1380,7 +1387,7 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, if (V1 > V2) std::swap(Locs.first, Locs.second); std::pair<AliasCacheTy::iterator, bool> Pair = - AliasCache.insert(std::make_pair(Locs, MayAlias)); + AliasCache.insert(std::make_pair(Locs, MayAlias)); if (!Pair.second) return Pair.first->second; @@ -1393,8 +1400,10 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) { - AliasResult Result = aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = + aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } if (isa<PHINode>(V2) && !isa<PHINode>(V1)) { @@ -1403,9 +1412,9 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const PHINode *PN = dyn_cast<PHINode>(V1)) { - AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo, - V2, V2Size, V2AAInfo); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo, V2, V2Size, V2AAInfo); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) { @@ -1414,29 +1423,38 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) { - AliasResult Result = aliasSelect(S1, V1Size, V1AAInfo, - V2, V2Size, V2AAInfo); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = + aliasSelect(S1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } // If both pointers are pointing into the same object and one of the // accesses is accessing the entire object, then the accesses must // overlap in some way. - if (DL && O1 == O2) + if (O1 == O2) if ((V1Size != MemoryLocation::UnknownSize && - isObjectSize(O1, V1Size, *DL, *TLI)) || + isObjectSize(O1, V1Size, DL, TLI)) || (V2Size != MemoryLocation::UnknownSize && - isObjectSize(O2, V2Size, *DL, *TLI))) + isObjectSize(O2, V2Size, DL, TLI))) return AliasCache[Locs] = PartialAlias; - AliasResult Result = - AliasAnalysis::alias(MemoryLocation(V1, V1Size, V1AAInfo), - MemoryLocation(V2, V2Size, V2AAInfo)); + // Recurse back into the best AA results we have, potentially with refined + // memory locations. We have already ensured that BasicAA has a MayAlias + // cache result for these, so any recursion back into BasicAA won't loop. + AliasResult Result = getBestAAResults().alias(Locs.first, Locs.second); return AliasCache[Locs] = Result; } -bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V, - const Value *V2) { +/// Check whether two Values can be considered equivalent. +/// +/// In addition to pointer equivalence of \p V1 and \p V2 this checks whether +/// they cannot be part of a cycle in the value graph by looking at all +/// visited phi nodes and making sure that the phis cannot reach the value. We +/// have to do this because we are looking through phi nodes (that is, we say +/// noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB)).
+bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V, + const Value *V2) { if (V != V2) return false; @@ -1450,28 +1468,21 @@ bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V, if (VisitedPhiBBs.size() > MaxNumPhiBBsValueReachabilityCheck) return false; - // Use dominance or loop info if available. - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - // Make sure that the visited phis cannot reach the Value. This ensures that // the Values cannot come from different iterations of a potential cycle the // phi nodes could be involved in. for (auto *P : VisitedPhiBBs) - if (isPotentiallyReachable(P->begin(), Inst, DT, LI)) + if (isPotentiallyReachable(&P->front(), Inst, DT, LI)) return false; return true; } -/// GetIndexDifference - Dest and Src are the variable indices from two -/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base -/// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic -/// difference between the two pointers. -void BasicAliasAnalysis::GetIndexDifference( +/// Computes the symbolic difference between two decomposed GEPs. +/// +/// Dest and Src are the variable indices from two decomposed GetElementPtr +/// instructions GEP1 and GEP2 which have common base pointers. +void BasicAAResult::GetIndexDifference( SmallVectorImpl<VariableGEPIndex> &Dest, const SmallVectorImpl<VariableGEPIndex> &Src) { if (Src.empty()) @@ -1479,14 +1490,14 @@ void BasicAliasAnalysis::GetIndexDifference( for (unsigned i = 0, e = Src.size(); i != e; ++i) { const Value *V = Src[i].V; - ExtensionKind Extension = Src[i].Extension; + unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits; int64_t Scale = Src[i].Scale; // Find V in Dest. This is N^2, but pointer indices almost never have more // than a few variable indexes. for (unsigned j = 0, e = Dest.size(); j != e; ++j) { if (!isValueEqualInPotentialCycles(Dest[j].V, V) || - Dest[j].Extension != Extension) + Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits) continue; // If we found it, subtract off Scale V's from the entry in Dest. If it @@ -1501,8 +1512,120 @@ void BasicAliasAnalysis::GetIndexDifference( // If we didn't consume this entry, add it to the end of the Dest list. if (Scale) { - VariableGEPIndex Entry = { V, Extension, -Scale }; + VariableGEPIndex Entry = {V, ZExtBits, SExtBits, -Scale}; Dest.push_back(Entry); } } } + +bool BasicAAResult::constantOffsetHeuristic( + const SmallVectorImpl<VariableGEPIndex> &VarIndices, uint64_t V1Size, + uint64_t V2Size, int64_t BaseOffset, AssumptionCache *AC, + DominatorTree *DT) { + if (VarIndices.size() != 2 || V1Size == MemoryLocation::UnknownSize || + V2Size == MemoryLocation::UnknownSize) + return false; + + const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1]; + + if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits || + Var0.Scale != -Var1.Scale) + return false; + + unsigned Width = Var1.V->getType()->getIntegerBitWidth(); + + // We'll strip off the Extensions of Var0 and Var1 and do another round + // of GetLinearExpression decomposition. For example, if Var0 + // is zext(%x + 1) we should get V1 == %x and V1Offset == 1.
+ + APInt V0Scale(Width, 0), V0Offset(Width, 0), V1Scale(Width, 0), + V1Offset(Width, 0); + bool NSW = true, NUW = true; + unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0; + const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits, + V0SExtBits, DL, 0, AC, DT, NSW, NUW); + NSW = true, NUW = true; + const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits, + V1SExtBits, DL, 0, AC, DT, NSW, NUW); + + if (V0Scale != V1Scale || V0ZExtBits != V1ZExtBits || + V0SExtBits != V1SExtBits || !isValueEqualInPotentialCycles(V0, V1)) + return false; + + // We have a hit - Var0 and Var1 only differ by a constant offset! + + // If we've been sext'ed then zext'd the maximum difference between Var0 and + // Var1 is possible to calculate, but we're just interested in the absolute + // minimum difference between the two. The minimum distance may occur due to + // wrapping; consider "add i3 %i, 5": if %i == 7 then 7 + 5 mod 8 == 4, and so + // the minimum distance between %i and %i + 5 is 3. + APInt MinDiff = V0Offset - V1Offset, Wrapped = -MinDiff; + MinDiff = APIntOps::umin(MinDiff, Wrapped); + uint64_t MinDiffBytes = MinDiff.getZExtValue() * std::abs(Var0.Scale); + + // We can't definitely say whether GEP1 is before or after V2 due to wrapping + // arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other + // values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and + // V2Size can fit in the MinDiffBytes gap. + return V1Size + std::abs(BaseOffset) <= MinDiffBytes && + V2Size + std::abs(BaseOffset) <= MinDiffBytes; +} + +//===----------------------------------------------------------------------===// +// BasicAliasAnalysis Pass +//===----------------------------------------------------------------------===// + +char BasicAA::PassID; + +BasicAAResult BasicAA::run(Function &F, AnalysisManager<Function> *AM) { + return BasicAAResult(F.getParent()->getDataLayout(), + AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F), + AM->getCachedResult<DominatorTreeAnalysis>(F), + AM->getCachedResult<LoopAnalysis>(F)); +} + +BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) { + initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +char BasicAAWrapperPass::ID = 0; +void BasicAAWrapperPass::anchor() {} + +INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa", + "Basic Alias Analysis (stateless AA impl)", true, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa", + "Basic Alias Analysis (stateless AA impl)", true, true) + +FunctionPass *llvm::createBasicAAWrapperPass() { + return new BasicAAWrapperPass(); +} + +bool BasicAAWrapperPass::runOnFunction(Function &F) { + auto &ACT = getAnalysis<AssumptionCacheTracker>(); + auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + + Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), TLIWP.getTLI(), + ACT.getAssumptionCache(F), + DTWP ? &DTWP->getDomTree() : nullptr, + LIWP ? 
&LIWP->getLoopInfo() : nullptr)); + + return false; +} + +void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} + +BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) { + return BasicAAResult( + F.getParent()->getDataLayout(), + P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + P.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); +} diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp index 3d819eb..90b7a33 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp @@ -55,7 +55,7 @@ struct GraphTraits<BlockFrequencyInfo *> { typedef Function::const_iterator nodes_iterator; static inline const NodeType *getEntryNode(const BlockFrequencyInfo *G) { - return G->getFunction()->begin(); + return &G->getFunction()->front(); } static ChildIteratorType child_begin(const NodeType *N) { return succ_begin(N); @@ -105,51 +105,36 @@ struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits { } // end namespace llvm #endif -INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", - "Block Frequency Analysis", true, true) -INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", - "Block Frequency Analysis", true, true) - -char BlockFrequencyInfo::ID = 0; - +BlockFrequencyInfo::BlockFrequencyInfo() {} -BlockFrequencyInfo::BlockFrequencyInfo() : FunctionPass(ID) { - initializeBlockFrequencyInfoPass(*PassRegistry::getPassRegistry()); -} - -BlockFrequencyInfo::~BlockFrequencyInfo() {} - -void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<BranchProbabilityInfo>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); +BlockFrequencyInfo::BlockFrequencyInfo(const Function &F, + const BranchProbabilityInfo &BPI, + const LoopInfo &LI) { + calculate(F, BPI, LI); } -bool BlockFrequencyInfo::runOnFunction(Function &F) { - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +void BlockFrequencyInfo::calculate(const Function &F, + const BranchProbabilityInfo &BPI, + const LoopInfo &LI) { if (!BFI) BFI.reset(new ImplType); - BFI->doFunction(&F, &BPI, &LI); + BFI->calculate(F, BPI, LI); #ifndef NDEBUG if (ViewBlockFreqPropagationDAG != GVDT_None) view(); #endif - return false; -} - -void BlockFrequencyInfo::releaseMemory() { BFI.reset(); } - -void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const { - if (BFI) BFI->print(O); } BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const { return BFI ? BFI->getBlockFreq(BB) : 0; } +void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, + uint64_t Freq) { + assert(BFI && "Expected analysis to be available"); + BFI->setBlockFreq(BB, Freq); +} + /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. void BlockFrequencyInfo::view() const { @@ -180,3 +165,49 @@ BlockFrequencyInfo::printBlockFreq(raw_ostream &OS, uint64_t BlockFrequencyInfo::getEntryFreq() const { return BFI ? 
BFI->getEntryFreq() : 0; } + +void BlockFrequencyInfo::releaseMemory() { BFI.reset(); } + +void BlockFrequencyInfo::print(raw_ostream &OS) const { + if (BFI) + BFI->print(OS); +} + + +INITIALIZE_PASS_BEGIN(BlockFrequencyInfoWrapperPass, "block-freq", + "Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(BlockFrequencyInfoWrapperPass, "block-freq", + "Block Frequency Analysis", true, true) + +char BlockFrequencyInfoWrapperPass::ID = 0; + + +BlockFrequencyInfoWrapperPass::BlockFrequencyInfoWrapperPass() + : FunctionPass(ID) { + initializeBlockFrequencyInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +BlockFrequencyInfoWrapperPass::~BlockFrequencyInfoWrapperPass() {} + +void BlockFrequencyInfoWrapperPass::print(raw_ostream &OS, + const Module *) const { + BFI.print(OS); +} + +void BlockFrequencyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BranchProbabilityInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +void BlockFrequencyInfoWrapperPass::releaseMemory() { BFI.releaseMemory(); } + +bool BlockFrequencyInfoWrapperPass::runOnFunction(Function &F) { + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BFI.calculate(F, BPI, LI); + return false; +} diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index 6ceda06..48e23af 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -530,6 +530,13 @@ BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { return Freqs[Node.Index].Scaled; } +void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, + uint64_t Freq) { + assert(Node.isValid() && "Expected valid node"); + assert(Node.Index < Freqs.size() && "Expected legal index"); + Freqs[Node.Index].Integer = Freq; +} + std::string BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { return std::string(); @@ -743,7 +750,10 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; DEBUG(dbgs() << " - Add back edge mass for node " << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); - Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + if (BackedgeMass.getMass() > 0) + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + else + DEBUG(dbgs() << " Nothing added. 
Back edge mass is zero\n"); } DitheringDistributer D(Dist, LoopMass); diff --git a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 430b412..cf0cc8d 100644 --- a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -27,13 +27,13 @@ using namespace llvm; #define DEBUG_TYPE "branch-prob" -INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob", +INITIALIZE_PASS_BEGIN(BranchProbabilityInfoWrapperPass, "branch-prob", "Branch Probability Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob", +INITIALIZE_PASS_END(BranchProbabilityInfoWrapperPass, "branch-prob", "Branch Probability Analysis", false, true) -char BranchProbabilityInfo::ID = 0; +char BranchProbabilityInfoWrapperPass::ID = 0; // Weights are for internal use only. They are used by heuristics to help to // estimate edges' probability. Example: @@ -108,13 +108,6 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1; /// instruction. This is essentially never taken. static const uint32_t IH_NONTAKEN_WEIGHT = 1; -// Standard weight value. Used when none of the heuristics set weight for -// the edge. -static const uint32_t NORMAL_WEIGHT = 16; - -// Minimum weight of an edge. Please note, that weight is NEVER 0. -static const uint32_t MIN_WEIGHT = 1; - /// \brief Calculate edge weights for successors lead to unreachable. /// /// Predict that a successor which leads necessarily to an @@ -147,22 +140,34 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) { if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty()) return false; - uint32_t UnreachableWeight = - std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = UnreachableEdges.begin(), - E = UnreachableEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, UnreachableWeight); + // If the terminator is an InvokeInst, check only the normal destination block + // as the unwind edge of InvokeInst is also very unlikely taken. + if (auto *II = dyn_cast<InvokeInst>(TI)) + if (PostDominatedByUnreachable.count(II->getNormalDest())) { + PostDominatedByUnreachable.insert(BB); + // Return false here so that edge weights for InvokeInst could be decided + // in calcInvokeHeuristics(). 
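// (Returning true here instead would mark the block as fully handled: the
// heuristic chain in calculate() would never reach calcInvokeHeuristics(),
// and the invoke's normal/unwind edges would be left at the default 1/N
// split, since this early-exit path sets no probabilities itself.)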
+ return false; + } - if (ReachableEdges.empty()) + if (ReachableEdges.empty()) { + BranchProbability Prob(1, UnreachableEdges.size()); + for (unsigned SuccIdx : UnreachableEdges) + setEdgeProbability(BB, SuccIdx, Prob); return true; - uint32_t ReachableWeight = - std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(), - NORMAL_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = ReachableEdges.begin(), - E = ReachableEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, ReachableWeight); + } + + BranchProbability UnreachableProb(UR_TAKEN_WEIGHT, + (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * + UnreachableEdges.size()); + BranchProbability ReachableProb(UR_NONTAKEN_WEIGHT, + (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * + ReachableEdges.size()); + + for (unsigned SuccIdx : UnreachableEdges) + setEdgeProbability(BB, SuccIdx, UnreachableProb); + for (unsigned SuccIdx : ReachableEdges) + setEdgeProbability(BB, SuccIdx, ReachableProb); return true; } @@ -213,10 +218,18 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) { WeightSum = 0; for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - uint32_t W = Weights[i] / ScalingFactor; - WeightSum += W; - setEdgeWeight(BB, i, W); + Weights[i] /= ScalingFactor; + WeightSum += Weights[i]; } + + if (WeightSum == 0) { + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + setEdgeProbability(BB, i, {1, e}); + } else { + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + setEdgeProbability(BB, i, {Weights[i], static_cast<uint32_t>(WeightSum)}); + } + assert(WeightSum <= UINT32_MAX && "Expected weights to scale down to 32 bits"); @@ -265,21 +278,24 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) { if (TI->getNumSuccessors() == 1 || ColdEdges.empty()) return false; - uint32_t ColdWeight = - std::max(CC_TAKEN_WEIGHT / (unsigned) ColdEdges.size(), MIN_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = ColdEdges.begin(), - E = ColdEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, ColdWeight); - - if (NormalEdges.empty()) + if (NormalEdges.empty()) { + BranchProbability Prob(1, ColdEdges.size()); + for (unsigned SuccIdx : ColdEdges) + setEdgeProbability(BB, SuccIdx, Prob); return true; - uint32_t NormalWeight = std::max( - CC_NONTAKEN_WEIGHT / (unsigned) NormalEdges.size(), NORMAL_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = NormalEdges.begin(), - E = NormalEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, NormalWeight); + } + + BranchProbability ColdProb(CC_TAKEN_WEIGHT, + (CC_TAKEN_WEIGHT + CC_NONTAKEN_WEIGHT) * + ColdEdges.size()); + BranchProbability NormalProb(CC_NONTAKEN_WEIGHT, + (CC_TAKEN_WEIGHT + CC_NONTAKEN_WEIGHT) * + NormalEdges.size()); + + for (unsigned SuccIdx : ColdEdges) + setEdgeProbability(BB, SuccIdx, ColdProb); + for (unsigned SuccIdx : NormalEdges) + setEdgeProbability(BB, SuccIdx, NormalProb); return true; } @@ -312,15 +328,18 @@ bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, PH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, PH_NONTAKEN_WEIGHT); + BranchProbability TakenProb(PH_TAKEN_WEIGHT, + PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } // Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges // as taken, exiting edges as not-taken. 
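A side note on the representation these hunks migrate to: edge weights become BranchProbability values built over a common denominator, so a block's out-edge probabilities sum to one by construction. A minimal standalone sanity check of that property (weights chosen arbitrarily; only llvm/Support/BranchProbability.h is assumed):

#include "llvm/Support/BranchProbability.h"
#include <cassert>

int main() {
  using llvm::BranchProbability;
  // One "taken" edge and one "not taken" edge, weighted 124:4 as an example.
  BranchProbability Taken(124, 128);
  BranchProbability NotTaken(4, 128);
  // Unlike raw uint32_t weights, the normalized pair sums to exactly one.
  assert(Taken + NotTaken == BranchProbability::getOne());
  assert(Taken.getCompl() == NotTaken);
  return 0;
}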
-bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) { - Loop *L = LI->getLoopFor(BB); +bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB, + const LoopInfo &LI) { + Loop *L = LI.getLoopFor(BB); if (!L) return false; @@ -340,37 +359,35 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) { if (BackEdges.empty() && ExitingEdges.empty()) return false; - if (uint32_t numBackEdges = BackEdges.size()) { - uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges; - if (backWeight < NORMAL_WEIGHT) - backWeight = NORMAL_WEIGHT; + // Collect the sum of probabilities of back-edges/in-edges/exiting-edges, and + // normalize them so that they sum up to one. + SmallVector<BranchProbability, 4> Probs(3, BranchProbability::getZero()); + unsigned Denom = (BackEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) + + (InEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) + + (ExitingEdges.empty() ? 0 : LBH_NONTAKEN_WEIGHT); + if (!BackEdges.empty()) + Probs[0] = BranchProbability(LBH_TAKEN_WEIGHT, Denom); + if (!InEdges.empty()) + Probs[1] = BranchProbability(LBH_TAKEN_WEIGHT, Denom); + if (!ExitingEdges.empty()) + Probs[2] = BranchProbability(LBH_NONTAKEN_WEIGHT, Denom); - for (SmallVectorImpl<unsigned>::iterator EI = BackEdges.begin(), - EE = BackEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, backWeight); - } + if (uint32_t numBackEdges = BackEdges.size()) { + auto Prob = Probs[0] / numBackEdges; + for (unsigned SuccIdx : BackEdges) + setEdgeProbability(BB, SuccIdx, Prob); } if (uint32_t numInEdges = InEdges.size()) { - uint32_t inWeight = LBH_TAKEN_WEIGHT / numInEdges; - if (inWeight < NORMAL_WEIGHT) - inWeight = NORMAL_WEIGHT; - - for (SmallVectorImpl<unsigned>::iterator EI = InEdges.begin(), - EE = InEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, inWeight); - } + auto Prob = Probs[1] / numInEdges; + for (unsigned SuccIdx : InEdges) + setEdgeProbability(BB, SuccIdx, Prob); } if (uint32_t numExitingEdges = ExitingEdges.size()) { - uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numExitingEdges; - if (exitWeight < MIN_WEIGHT) - exitWeight = MIN_WEIGHT; - - for (SmallVectorImpl<unsigned>::iterator EI = ExitingEdges.begin(), - EE = ExitingEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, exitWeight); - } + auto Prob = Probs[2] / numExitingEdges; + for (unsigned SuccIdx : ExitingEdges) + setEdgeProbability(BB, SuccIdx, Prob); } return true; @@ -452,9 +469,10 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, ZH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, ZH_NONTAKEN_WEIGHT); - + BranchProbability TakenProb(ZH_TAKEN_WEIGHT, + ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } @@ -488,9 +506,10 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, FPH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, FPH_NONTAKEN_WEIGHT); - + BranchProbability TakenProb(FPH_TAKEN_WEIGHT, + FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } @@ -499,82 +518,30 @@ bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) { if (!II) return false; - setEdgeWeight(BB, 0/*Index for Normal*/, IH_TAKEN_WEIGHT); - setEdgeWeight(BB, 1/*Index for Unwind*/, IH_NONTAKEN_WEIGHT); + 
BranchProbability TakenProb(IH_TAKEN_WEIGHT, + IH_TAKEN_WEIGHT + IH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, 0 /*Index for Normal*/, TakenProb); + setEdgeProbability(BB, 1 /*Index for Unwind*/, TakenProb.getCompl()); return true; } -void BranchProbabilityInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); -} - -bool BranchProbabilityInfo::runOnFunction(Function &F) { - DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName() - << " ----\n\n"); - LastF = &F; // Store the last function we ran on for printing. - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - assert(PostDominatedByUnreachable.empty()); - assert(PostDominatedByColdCall.empty()); - - // Walk the basic blocks in post-order so that we can build up state about - // the successors of a block iteratively. - for (auto BB : post_order(&F.getEntryBlock())) { - DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n"); - if (calcUnreachableHeuristics(BB)) - continue; - if (calcMetadataWeights(BB)) - continue; - if (calcColdCallHeuristics(BB)) - continue; - if (calcLoopBranchHeuristics(BB)) - continue; - if (calcPointerHeuristics(BB)) - continue; - if (calcZeroHeuristics(BB)) - continue; - if (calcFloatingPointHeuristics(BB)) - continue; - calcInvokeHeuristics(BB); - } - - PostDominatedByUnreachable.clear(); - PostDominatedByColdCall.clear(); - return false; -} - void BranchProbabilityInfo::releaseMemory() { - Weights.clear(); + Probs.clear(); } -void BranchProbabilityInfo::print(raw_ostream &OS, const Module *) const { +void BranchProbabilityInfo::print(raw_ostream &OS) const { OS << "---- Branch Probabilities ----\n"; // We print the probabilities from the last function the analysis ran over, // or the function it is currently running over. 
assert(LastF && "Cannot print prior to running over a function"); - for (Function::const_iterator BI = LastF->begin(), BE = LastF->end(); - BI != BE; ++BI) { - for (succ_const_iterator SI = succ_begin(BI), SE = succ_end(BI); - SI != SE; ++SI) { - printEdgeProbability(OS << " ", BI, *SI); + for (const auto &BI : *LastF) { + for (succ_const_iterator SI = succ_begin(&BI), SE = succ_end(&BI); SI != SE; + ++SI) { + printEdgeProbability(OS << " ", &BI, *SI); } } } -uint32_t BranchProbabilityInfo::getSumForBlock(const BasicBlock *BB) const { - uint32_t Sum = 0; - - for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { - uint32_t Weight = getEdgeWeight(BB, I.getSuccessorIndex()); - uint32_t PrevSum = Sum; - - Sum += Weight; - assert(Sum >= PrevSum); (void) PrevSum; - } - - return Sum; -} - bool BranchProbabilityInfo:: isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% @@ -583,97 +550,74 @@ isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const { } BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const { - uint32_t Sum = 0; - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); BasicBlock *MaxSucc = nullptr; for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { BasicBlock *Succ = *I; - uint32_t Weight = getEdgeWeight(BB, Succ); - uint32_t PrevSum = Sum; - - Sum += Weight; - assert(Sum > PrevSum); (void) PrevSum; - - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(BB, Succ); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = Succ; } } // Hot probability is at least 4/5 = 80% - if (BranchProbability(MaxWeight, Sum) > BranchProbability(4, 5)) + if (MaxProb > BranchProbability(4, 5)) return MaxSucc; return nullptr; } -/// Get the raw edge weight for the edge. If can't find it, return -/// DEFAULT_WEIGHT value. Here an edge is specified using PredBlock and an index -/// to the successors. -uint32_t BranchProbabilityInfo:: -getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const { - DenseMap<Edge, uint32_t>::const_iterator I = - Weights.find(std::make_pair(Src, IndexInSuccessors)); +/// Get the raw edge probability for the edge. If can't find it, return a +/// default probability 1/N where N is the number of successors. Here an edge is +/// specified using PredBlock and an +/// index to the successors. +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + unsigned IndexInSuccessors) const { + auto I = Probs.find(std::make_pair(Src, IndexInSuccessors)); - if (I != Weights.end()) + if (I != Probs.end()) return I->second; - return DEFAULT_WEIGHT; + return {1, + static_cast<uint32_t>(std::distance(succ_begin(Src), succ_end(Src)))}; } -uint32_t BranchProbabilityInfo::getEdgeWeight(const BasicBlock *Src, - succ_const_iterator Dst) const { - return getEdgeWeight(Src, Dst.getSuccessorIndex()); +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); } -/// Get the raw edge weight calculated for the block pair. This returns the sum -/// of all raw edge weights from Src to Dst. -uint32_t BranchProbabilityInfo:: -getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const { - uint32_t Weight = 0; - bool FoundWeight = false; - DenseMap<Edge, uint32_t>::const_iterator MapI; +/// Get the raw edge probability calculated for the block pair. 
This returns the +/// sum of all raw edge probabilities from Src to Dst. +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + const BasicBlock *Dst) const { + auto Prob = BranchProbability::getZero(); + bool FoundProb = false; for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I) if (*I == Dst) { - MapI = Weights.find(std::make_pair(Src, I.getSuccessorIndex())); - if (MapI != Weights.end()) { - FoundWeight = true; - Weight += MapI->second; + auto MapI = Probs.find(std::make_pair(Src, I.getSuccessorIndex())); + if (MapI != Probs.end()) { + FoundProb = true; + Prob += MapI->second; } } - return (!FoundWeight) ? DEFAULT_WEIGHT : Weight; + uint32_t succ_num = std::distance(succ_begin(Src), succ_end(Src)); + return FoundProb ? Prob : BranchProbability(1, succ_num); } -/// Set the edge weight for a given edge specified by PredBlock and an index -/// to the successors. -void BranchProbabilityInfo:: -setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors, - uint32_t Weight) { - Weights[std::make_pair(Src, IndexInSuccessors)] = Weight; - DEBUG(dbgs() << "set edge " << Src->getName() << " -> " - << IndexInSuccessors << " successor weight to " - << Weight << "\n"); -} - -/// Get an edge's probability, relative to other out-edges from Src. -BranchProbability BranchProbabilityInfo:: -getEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors) const { - uint32_t N = getEdgeWeight(Src, IndexInSuccessors); - uint32_t D = getSumForBlock(Src); - - return BranchProbability(N, D); -} - -/// Get the probability of going from Src to Dst. It returns the sum of all -/// probabilities for edges from Src to Dst. -BranchProbability BranchProbabilityInfo:: -getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { - - uint32_t N = getEdgeWeight(Src, Dst); - uint32_t D = getSumForBlock(Src); - - return BranchProbability(N, D); +/// Set the edge probability for a given edge specified by PredBlock and an +/// index to the successors. +void BranchProbabilityInfo::setEdgeProbability(const BasicBlock *Src, + unsigned IndexInSuccessors, + BranchProbability Prob) { + Probs[std::make_pair(Src, IndexInSuccessors)] = Prob; + DEBUG(dbgs() << "set edge " << Src->getName() << " -> " << IndexInSuccessors + << " successor probability to " << Prob << "\n"); } raw_ostream & @@ -688,3 +632,54 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, return OS; } + +void BranchProbabilityInfo::calculate(Function &F, const LoopInfo& LI) { + DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName() + << " ----\n\n"); + LastF = &F; // Store the last function we ran on for printing. + assert(PostDominatedByUnreachable.empty()); + assert(PostDominatedByColdCall.empty()); + + // Walk the basic blocks in post-order so that we can build up state about + // the successors of a block iteratively. 
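// (Post-order is what makes the single pass sound: every successor of BB has
// already been visited by the time BB is processed, so per-successor facts
// such as PostDominatedByUnreachable and PostDominatedByColdCall are
// populated before the heuristics in the loop below consult them.)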
+ for (auto BB : post_order(&F.getEntryBlock())) { + DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n"); + if (calcUnreachableHeuristics(BB)) + continue; + if (calcMetadataWeights(BB)) + continue; + if (calcColdCallHeuristics(BB)) + continue; + if (calcLoopBranchHeuristics(BB, LI)) + continue; + if (calcPointerHeuristics(BB)) + continue; + if (calcZeroHeuristics(BB)) + continue; + if (calcFloatingPointHeuristics(BB)) + continue; + calcInvokeHeuristics(BB); + } + + PostDominatedByUnreachable.clear(); + PostDominatedByColdCall.clear(); +} + +void BranchProbabilityInfoWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) { + const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BPI.calculate(F, LI); + return false; +} + +void BranchProbabilityInfoWrapperPass::releaseMemory() { BPI.releaseMemory(); } + +void BranchProbabilityInfoWrapperPass::print(raw_ostream &OS, + const Module *) const { + BPI.print(OS); +} diff --git a/contrib/llvm/lib/Analysis/CFG.cpp b/contrib/llvm/lib/Analysis/CFG.cpp index e15109b..0dfd57d 100644 --- a/contrib/llvm/lib/Analysis/CFG.cpp +++ b/contrib/llvm/lib/Analysis/CFG.cpp @@ -69,8 +69,9 @@ void llvm::FindFunctionBackedges(const Function &F, /// and return its position in the terminator instruction's list of /// successors. It is an error to call this with a block that is not a /// successor. -unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { - TerminatorInst *Term = BB->getTerminator(); +unsigned llvm::GetSuccessorNumber(const BasicBlock *BB, + const BasicBlock *Succ) { + const TerminatorInst *Term = BB->getTerminator(); #ifndef NDEBUG unsigned e = Term->getNumSuccessors(); #endif @@ -203,7 +204,8 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B, return true; // Linear scan, start at 'A', see whether we hit 'B' or the end first. - for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) { + for (BasicBlock::const_iterator I = A->getIterator(), E = BB->end(); I != E; + ++I) { if (&*I == B) return true; } diff --git a/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp index fe1c088..4843ed6 100644 --- a/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp @@ -27,18 +27,17 @@ // time. 
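One detail worth keeping in mind while reading the cache plumbing in this file: an entry holding an empty Optional means "sets currently being built", which is how recursion during interprocedural analysis is detected. A reduced model of the pattern, with hypothetical names and std::optional standing in for llvm::Optional:

#include <cassert>
#include <map>
#include <optional>

struct Info { int Data = 0; };
static std::map<int, std::optional<Info>> Cache;

// Returns the cache slot for Key, building it on first use. A re-entrant
// call for the same Key during the build observes the empty optional and
// can treat it as "in progress" instead of recursing forever.
const std::optional<Info> &ensure(int Key) {
  auto It = Cache.find(Key);
  if (It == Cache.end()) {
    Cache[Key];          // reserve the slot: marked as being built
    Info Built;          // ...build here, possibly re-entering ensure()...
    Cache[Key] = Built;  // publish the finished result
    It = Cache.find(Key);
  }
  return It->second;
}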
//===----------------------------------------------------------------------===// +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "StratifiedSets.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" @@ -47,7 +46,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> -#include <forward_list> #include <memory> #include <tuple> @@ -55,6 +53,19 @@ using namespace llvm; #define DEBUG_TYPE "cfl-aa" +CFLAAResult::CFLAAResult(const TargetLibraryInfo &TLI) : AAResultBase(TLI) {} +CFLAAResult::CFLAAResult(CFLAAResult &&Arg) : AAResultBase(std::move(Arg)) {} + +// \brief Information we have about a function and would like to keep around +struct CFLAAResult::FunctionInfo { + StratifiedSets<Value *> Sets; + // Lots of functions have < 4 returns. Adjust as necessary. + SmallVector<Value *, 4> ReturnedValues; + + FunctionInfo(StratifiedSets<Value *> &&S, SmallVector<Value *, 4> &&RV) + : Sets(std::move(S)), ReturnedValues(std::move(RV)) {} +}; + // Try to go from a Value* to a Function*. Never returns nullptr. static Optional<Function *> parentFunctionOfValue(Value *); @@ -141,129 +152,13 @@ struct Edge { : From(From), To(To), Weight(W), AdditionalAttrs(A) {} }; -// \brief Information we have about a function and would like to keep around -struct FunctionInfo { - StratifiedSets<Value *> Sets; - // Lots of functions have < 4 returns. Adjust as necessary. - SmallVector<Value *, 4> ReturnedValues; - - FunctionInfo(StratifiedSets<Value *> &&S, SmallVector<Value *, 4> &&RV) - : Sets(std::move(S)), ReturnedValues(std::move(RV)) {} -}; - -struct CFLAliasAnalysis; - -struct FunctionHandle : public CallbackVH { - FunctionHandle(Function *Fn, CFLAliasAnalysis *CFLAA) - : CallbackVH(Fn), CFLAA(CFLAA) { - assert(Fn != nullptr); - assert(CFLAA != nullptr); - } - - ~FunctionHandle() override {} - - void deleted() override { removeSelfFromCache(); } - void allUsesReplacedWith(Value *) override { removeSelfFromCache(); } - -private: - CFLAliasAnalysis *CFLAA; - - void removeSelfFromCache(); -}; - -struct CFLAliasAnalysis : public ImmutablePass, public AliasAnalysis { -private: - /// \brief Cached mapping of Functions to their StratifiedSets. - /// If a function's sets are currently being built, it is marked - /// in the cache as an Optional without a value. This way, if we - /// have any kind of recursion, it is discernable from a function - /// that simply has empty sets. - DenseMap<Function *, Optional<FunctionInfo>> Cache; - std::forward_list<FunctionHandle> Handles; - -public: - static char ID; - - CFLAliasAnalysis() : ImmutablePass(ID) { - initializeCFLAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - ~CFLAliasAnalysis() override {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - } - - void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis *)this; - return this; - } - - /// \brief Inserts the given Function into the cache. 
- void scan(Function *Fn); - - void evict(Function *Fn) { Cache.erase(Fn); } - - /// \brief Ensures that the given function is available in the cache. - /// Returns the appropriate entry from the cache. - const Optional<FunctionInfo> &ensureCached(Function *Fn) { - auto Iter = Cache.find(Fn); - if (Iter == Cache.end()) { - scan(Fn); - Iter = Cache.find(Fn); - assert(Iter != Cache.end()); - assert(Iter->second.hasValue()); - } - return Iter->second; - } - - AliasResult query(const MemoryLocation &LocA, const MemoryLocation &LocB); - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - if (LocA.Ptr == LocB.Ptr) { - if (LocA.Size == LocB.Size) { - return MustAlias; - } else { - return PartialAlias; - } - } - - // Comparisons between global variables and other constants should be - // handled by BasicAA. - // TODO: ConstantExpr handling -- CFLAA may report NoAlias when comparing - // a GlobalValue and ConstantExpr, but every query needs to have at least - // one Value tied to a Function, and neither GlobalValues nor ConstantExprs - // are. - if (isa<Constant>(LocA.Ptr) && isa<Constant>(LocB.Ptr)) { - return AliasAnalysis::alias(LocA, LocB); - } - - AliasResult QueryResult = query(LocA, LocB); - if (QueryResult == MayAlias) - return AliasAnalysis::alias(LocA, LocB); - - return QueryResult; - } - - bool doInitialization(Module &M) override; -}; - -void FunctionHandle::removeSelfFromCache() { - assert(CFLAA != nullptr); - auto *Val = getValPtr(); - CFLAA->evict(cast<Function>(Val)); - setValPtr(nullptr); -} - // \brief Gets the edges our graph should have, based on an Instruction* class GetEdgesVisitor : public InstVisitor<GetEdgesVisitor, void> { - CFLAliasAnalysis &AA; + CFLAAResult &AA; SmallVectorImpl<Edge> &Output; public: - GetEdgesVisitor(CFLAliasAnalysis &AA, SmallVectorImpl<Edge> &Output) + GetEdgesVisitor(CFLAAResult &AA, SmallVectorImpl<Edge> &Output) : AA(AA), Output(Output) {} void visitInstruction(Instruction &) { @@ -480,6 +375,8 @@ public: } template <typename InstT> void visitCallLikeInst(InstT &Inst) { + // TODO: Add support for noalias args/all the other fun function attributes + // that we can tack on. SmallVector<Function *, 4> Targets; if (getPossibleTargets(&Inst, Targets)) { if (tryInterproceduralAnalysis(Targets, &Inst, Inst.arg_operands())) @@ -488,8 +385,16 @@ public: Output.clear(); } + // Because the function is opaque, we need to note that anything + // could have happened to the arguments, and that the result could alias + // just about anything, too. + // The goal of the loop is in part to unify many Values into one set, so we + // don't care if the function is void there. for (Value *V : Inst.arg_operands()) Output.push_back(Edge(&Inst, V, EdgeType::Assign, AttrAll)); + if (Inst.getNumArgOperands() == 0 && + Inst.getType() != Type::getVoidTy(Inst.getContext())) + Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll)); } void visitCallInst(CallInst &Inst) { visitCallLikeInst(Inst); } @@ -624,7 +529,7 @@ public: // ----- Various Edge iterators for the graph ----- // // \brief Iterator for edges. Because this graph is bidirected, we don't - // allow modificaiton of the edges using this iterator. Additionally, the + // allow modification of the edges using this iterator. Additionally, the // iterator becomes invalid if you add edges to or from the node you're // getting the edges of. 
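// (Presumably because the adjacency lists live in contiguous storage:
// appending an edge can reallocate the underlying vector, which strands any
// iterator previously taken from it; the usual vector-iterator caveat
// applies.)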
struct EdgeIterator : public std::iterator<std::forward_iterator_tag, @@ -727,16 +632,6 @@ typedef WeightedBidirectionalGraph<std::pair<EdgeType, StratifiedAttrs>> GraphT; typedef DenseMap<Value *, GraphT::Node> NodeMapT; } -// -- Setting up/registering CFLAA pass -- // -char CFLAliasAnalysis::ID = 0; - -INITIALIZE_AG_PASS(CFLAliasAnalysis, AliasAnalysis, "cfl-aa", - "CFL-Based AA implementation", false, true, false) - -ImmutablePass *llvm::createCFLAliasAnalysisPass() { - return new CFLAliasAnalysis(); -} - //===----------------------------------------------------------------------===// // Function declarations that require types defined in the namespace above //===----------------------------------------------------------------------===// @@ -751,12 +646,10 @@ static Optional<StratifiedAttr> valueToAttrIndex(Value *Val); static EdgeType flipWeight(EdgeType); // Gets edges of the given Instruction*, writing them to the SmallVector*. -static void argsToEdges(CFLAliasAnalysis &, Instruction *, - SmallVectorImpl<Edge> &); +static void argsToEdges(CFLAAResult &, Instruction *, SmallVectorImpl<Edge> &); // Gets edges of the given ConstantExpr*, writing them to the SmallVector*. -static void argsToEdges(CFLAliasAnalysis &, ConstantExpr *, - SmallVectorImpl<Edge> &); +static void argsToEdges(CFLAAResult &, ConstantExpr *, SmallVectorImpl<Edge> &); // Gets the "Level" that one should travel in StratifiedSets // given an EdgeType. @@ -764,13 +657,13 @@ static Level directionOfEdgeType(EdgeType); // Builds the graph needed for constructing the StratifiedSets for the // given function -static void buildGraphFrom(CFLAliasAnalysis &, Function *, +static void buildGraphFrom(CFLAAResult &, Function *, SmallVectorImpl<Value *> &, NodeMapT &, GraphT &); // Gets the edges of a ConstantExpr as if it was an Instruction. This // function also acts on any nested ConstantExprs, adding the edges // of those to the given SmallVector as well. -static void constexprToEdges(CFLAliasAnalysis &, ConstantExpr &, +static void constexprToEdges(CFLAAResult &, ConstantExpr &, SmallVectorImpl<Edge> &); // Given an Instruction, this will add it to the graph, along with any @@ -779,16 +672,13 @@ static void constexprToEdges(CFLAliasAnalysis &, ConstantExpr &, // %0 = load i16* getelementptr ([1 x i16]* @a, 0, 0), align 2 // addInstructionToGraph would add both the `load` and `getelementptr` // instructions to the graph appropriately. -static void addInstructionToGraph(CFLAliasAnalysis &, Instruction &, +static void addInstructionToGraph(CFLAAResult &, Instruction &, SmallVectorImpl<Value *> &, NodeMapT &, GraphT &); // Notes whether it would be pointless to add the given Value to our sets. static bool canSkipAddingToSets(Value *Val); -// Builds the graph + StratifiedSets for a function. -static FunctionInfo buildSetsFrom(CFLAliasAnalysis &, Function *); - static Optional<Function *> parentFunctionOfValue(Value *Val) { if (auto *Inst = dyn_cast<Instruction>(Val)) { auto *Bb = Inst->getParent(); @@ -825,7 +715,7 @@ static bool hasUsefulEdges(Instruction *Inst) { } static bool hasUsefulEdges(ConstantExpr *CE) { - // ConstantExpr doens't have terminators, invokes, or fences, so only needs + // ConstantExpr doesn't have terminators, invokes, or fences, so only needs // to check for compares. 
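// For example, icmp eq (i8* @a, i8* @b) only produces an i1, so it adds
// nothing to the points-to graph, whereas getelementptr and bitcast
// constant expressions do propagate their pointer operand.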
return CE->getOpcode() != Instruction::ICmp && CE->getOpcode() != Instruction::FCmp; @@ -862,7 +752,7 @@ static EdgeType flipWeight(EdgeType Initial) { llvm_unreachable("Incomplete coverage of EdgeType enum"); } -static void argsToEdges(CFLAliasAnalysis &Analysis, Instruction *Inst, +static void argsToEdges(CFLAAResult &Analysis, Instruction *Inst, SmallVectorImpl<Edge> &Output) { assert(hasUsefulEdges(Inst) && "Expected instructions to have 'useful' edges"); @@ -870,7 +760,7 @@ static void argsToEdges(CFLAliasAnalysis &Analysis, Instruction *Inst, v.visit(Inst); } -static void argsToEdges(CFLAliasAnalysis &Analysis, ConstantExpr *CE, +static void argsToEdges(CFLAAResult &Analysis, ConstantExpr *CE, SmallVectorImpl<Edge> &Output) { assert(hasUsefulEdges(CE) && "Expected constant expr to have 'useful' edges"); GetEdgesVisitor v(Analysis, Output); @@ -889,7 +779,7 @@ static Level directionOfEdgeType(EdgeType Weight) { llvm_unreachable("Incomplete switch coverage"); } -static void constexprToEdges(CFLAliasAnalysis &Analysis, +static void constexprToEdges(CFLAAResult &Analysis, ConstantExpr &CExprToCollapse, SmallVectorImpl<Edge> &Results) { SmallVector<ConstantExpr *, 4> Worklist; @@ -919,7 +809,7 @@ static void constexprToEdges(CFLAliasAnalysis &Analysis, } } -static void addInstructionToGraph(CFLAliasAnalysis &Analysis, Instruction &Inst, +static void addInstructionToGraph(CFLAAResult &Analysis, Instruction &Inst, SmallVectorImpl<Value *> &ReturnedValues, NodeMapT &Map, GraphT &Graph) { const auto findOrInsertNode = [&Map, &Graph](Value *Val) { @@ -982,7 +872,7 @@ static void addInstructionToGraph(CFLAliasAnalysis &Analysis, Instruction &Inst, // buy us much that we don't already have. I'd like to add interprocedural // analysis prior to this however, in case that somehow requires the graph // produced by this for efficient execution -static void buildGraphFrom(CFLAliasAnalysis &Analysis, Function *Fn, +static void buildGraphFrom(CFLAAResult &Analysis, Function *Fn, SmallVectorImpl<Value *> &ReturnedValues, NodeMapT &Map, GraphT &Graph) { for (auto &Bb : Fn->getBasicBlockList()) @@ -1012,12 +902,13 @@ static bool canSkipAddingToSets(Value *Val) { return false; } -static FunctionInfo buildSetsFrom(CFLAliasAnalysis &Analysis, Function *Fn) { +// Builds the graph + StratifiedSets for a function. +CFLAAResult::FunctionInfo CFLAAResult::buildSetsFrom(Function *Fn) { NodeMapT Map; GraphT Graph; SmallVector<Value *, 4> ReturnedValues; - buildGraphFrom(Analysis, Fn, ReturnedValues, Map, Graph); + buildGraphFrom(*this, Fn, ReturnedValues, Map, Graph); DenseMap<GraphT::Node, Value *> NodeValueMap; NodeValueMap.resize(Map.size()); @@ -1098,19 +989,35 @@ static FunctionInfo buildSetsFrom(CFLAliasAnalysis &Analysis, Function *Fn) { return FunctionInfo(Builder.build(), std::move(ReturnedValues)); } -void CFLAliasAnalysis::scan(Function *Fn) { +void CFLAAResult::scan(Function *Fn) { auto InsertPair = Cache.insert(std::make_pair(Fn, Optional<FunctionInfo>())); (void)InsertPair; assert(InsertPair.second && "Trying to scan a function that has already been cached"); - FunctionInfo Info(buildSetsFrom(*this, Fn)); + FunctionInfo Info(buildSetsFrom(Fn)); Cache[Fn] = std::move(Info); Handles.push_front(FunctionHandle(Fn, this)); } -AliasResult CFLAliasAnalysis::query(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +void CFLAAResult::evict(Function *Fn) { Cache.erase(Fn); } + +/// \brief Ensures that the given function is available in the cache. +/// Returns the appropriate entry from the cache. 
+const Optional<CFLAAResult::FunctionInfo> & +CFLAAResult::ensureCached(Function *Fn) { + auto Iter = Cache.find(Fn); + if (Iter == Cache.end()) { + scan(Fn); + Iter = Cache.find(Fn); + assert(Iter != Cache.end()); + assert(Iter->second.hasValue()); + } + return Iter->second; +} + +AliasResult CFLAAResult::query(const MemoryLocation &LocA, + const MemoryLocation &LocB) { auto *ValA = const_cast<Value *>(LocA.Ptr); auto *ValB = const_cast<Value *>(LocB.Ptr); @@ -1176,7 +1083,37 @@ AliasResult CFLAliasAnalysis::query(const MemoryLocation &LocA, return NoAlias; } -bool CFLAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +CFLAAResult CFLAA::run(Function &F, AnalysisManager<Function> *AM) { + return CFLAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} + +char CFLAA::PassID; + +char CFLAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis", + false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis", + false, true) + +ImmutablePass *llvm::createCFLAAWrapperPass() { return new CFLAAWrapperPass(); } + +CFLAAWrapperPass::CFLAAWrapperPass() : ImmutablePass(ID) { + initializeCFLAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool CFLAAWrapperPass::doInitialization(Module &M) { + Result.reset( + new CFLAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool CFLAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void CFLAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp b/contrib/llvm/lib/Analysis/CallGraph.cpp index e2799d9..7cec962 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp +++ b/contrib/llvm/lib/Analysis/CallGraph.cpp @@ -22,7 +22,7 @@ using namespace llvm; CallGraph::CallGraph(Module &M) : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)), - CallsExternalNode(new CallGraphNode(nullptr)) { + CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) { // Add every function to the call graph. for (Function &F : M) addToCallGraph(&F); @@ -32,10 +32,19 @@ CallGraph::CallGraph(Module &M) Root = ExternalCallingNode; } +CallGraph::CallGraph(CallGraph &&Arg) + : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root), + ExternalCallingNode(Arg.ExternalCallingNode), + CallsExternalNode(std::move(Arg.CallsExternalNode)) { + Arg.FunctionMap.clear(); + Arg.Root = nullptr; + Arg.ExternalCallingNode = nullptr; +} + CallGraph::~CallGraph() { // CallsExternalNode is not in the function map, delete it explicitly. - CallsExternalNode->allReferencesDropped(); - delete CallsExternalNode; + if (CallsExternalNode) + CallsExternalNode->allReferencesDropped(); // Reset all node's use counts to zero before deleting them to prevent an // assertion from firing. @@ -43,8 +52,6 @@ CallGraph::~CallGraph() { for (auto &I : FunctionMap) I.second->allReferencesDropped(); #endif - for (auto &I : FunctionMap) - delete I.second; } void CallGraph::addToCallGraph(Function *F) { @@ -70,7 +77,7 @@ void CallGraph::addToCallGraph(Function *F) { // If this function is not defined in this translation unit, it could call // anything. 
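// (Both conservative facts funnel into the same sink: a declaration-only
// function gets an edge to CallsExternalNode here, and every indirect call
// site gets one further down, so unknown callees are over-approximated by a
// single node in the graph.)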
if (F->isDeclaration() && !F->isIntrinsic()) - Node->addCalledFunction(CallSite(), CallsExternalNode); + Node->addCalledFunction(CallSite(), CallsExternalNode.get()); // Look for calls by this function. for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB) @@ -83,7 +90,7 @@ void CallGraph::addToCallGraph(Function *F) { // Indirect calls of intrinsics are not allowed so no need to check. // We can be more precise here by using TargetArg returned by // Intrinsic::isLeaf. - Node->addCalledFunction(CS, CallsExternalNode); + Node->addCalledFunction(CS, CallsExternalNode.get()); else if (!Callee->isIntrinsic()) Node->addCalledFunction(CS, getOrInsertFunction(Callee)); } @@ -105,7 +112,7 @@ void CallGraph::print(raw_ostream &OS) const { Nodes.reserve(FunctionMap.size()); for (auto I = begin(), E = end(); I != E; ++I) - Nodes.push_back(I->second); + Nodes.push_back(I->second.get()); std::sort(Nodes.begin(), Nodes.end(), [](CallGraphNode *LHS, CallGraphNode *RHS) { @@ -120,9 +127,8 @@ void CallGraph::print(raw_ostream &OS) const { CN->print(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); } -#endif // removeFunctionFromModule - Unlink the function from this module, returning // it. Because this removes the function from the module, the call graph node @@ -134,7 +140,6 @@ Function *CallGraph::removeFunctionFromModule(CallGraphNode *CGN) { assert(CGN->empty() && "Cannot remove function from call " "graph if it references other functions!"); Function *F = CGN->getFunction(); // Get the function for the call graph node - delete CGN; // Delete the call graph node for this func FunctionMap.erase(F); // Remove the call graph node from the map M.getFunctionList().remove(F); @@ -152,7 +157,7 @@ void CallGraph::spliceFunction(const Function *From, const Function *To) { "Pointing CallGraphNode at a function that already exists"); FunctionMapTy::iterator I = FunctionMap.find(From); I->second->F = const_cast<Function*>(To); - FunctionMap[To] = I->second; + FunctionMap[To] = std::move(I->second); FunctionMap.erase(I); } @@ -160,12 +165,13 @@ void CallGraph::spliceFunction(const Function *From, const Function *To) { // it will insert a new CallGraphNode for the specified function if one does // not already exist. CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) { - CallGraphNode *&CGN = FunctionMap[F]; + auto &CGN = FunctionMap[F]; if (CGN) - return CGN; + return CGN.get(); assert((!F || F->getParent() == &M) && "Function not in current module!"); - return CGN = new CallGraphNode(const_cast<Function*>(F)); + CGN = llvm::make_unique<CallGraphNode>(const_cast<Function *>(F)); + return CGN.get(); } //===----------------------------------------------------------------------===// @@ -190,9 +196,8 @@ void CallGraphNode::print(raw_ostream &OS) const { OS << '\n'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); } -#endif /// removeCallEdgeFor - This method removes the edge in the node for the /// specified call site. 
Note that this method takes linear time, so it @@ -297,6 +302,5 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const { G->print(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); } -#endif diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp index 07b389a..6dd1d0a 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -612,9 +612,10 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override { Out << Banner; for (CallGraphNode *CGN : SCC) { - if (CGN->getFunction()) - CGN->getFunction()->print(Out); - else + if (CGN->getFunction()) { + if (isFunctionInPrintList(CGN->getFunction()->getName())) + CGN->getFunction()->print(Out); + } else Out << "\nPrinting <null> Function\n"; } return false; diff --git a/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp b/contrib/llvm/lib/Analysis/CallPrinter.cpp index 68dcd3c..68dcd3c 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp +++ b/contrib/llvm/lib/Analysis/CallPrinter.cpp diff --git a/contrib/llvm/lib/Analysis/CaptureTracking.cpp b/contrib/llvm/lib/Analysis/CaptureTracking.cpp index 52ef807..1add2fa 100644 --- a/contrib/llvm/lib/Analysis/CaptureTracking.cpp +++ b/contrib/llvm/lib/Analysis/CaptureTracking.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -52,63 +53,6 @@ namespace { bool Captured; }; - struct NumberedInstCache { - SmallDenseMap<const Instruction *, unsigned, 32> NumberedInsts; - BasicBlock::const_iterator LastInstFound; - unsigned LastInstPos; - const BasicBlock *BB; - - NumberedInstCache(const BasicBlock *BasicB) : LastInstPos(0), BB(BasicB) { - LastInstFound = BB->end(); - } - - /// \brief Find the first instruction 'A' or 'B' in 'BB'. Number out - /// instruction while walking 'BB'. - const Instruction *find(const Instruction *A, const Instruction *B) { - const Instruction *Inst = nullptr; - assert(!(LastInstFound == BB->end() && LastInstPos != 0) && - "Instruction supposed to be in NumberedInsts"); - - // Start the search with the instruction found in the last lookup round. - auto II = BB->begin(); - auto IE = BB->end(); - if (LastInstFound != IE) - II = std::next(LastInstFound); - - // Number all instructions up to the point where we find 'A' or 'B'. - for (++LastInstPos; II != IE; ++II, ++LastInstPos) { - Inst = cast<Instruction>(II); - NumberedInsts[Inst] = LastInstPos; - if (Inst == A || Inst == B) - break; - } - - assert(II != IE && "Instruction not found?"); - LastInstFound = II; - return Inst; - } - - /// \brief Find out whether 'A' dominates 'B', meaning whether 'A' - /// comes before 'B' in 'BB'. This is a simplification that considers - /// cached instruction positions and ignores other basic blocks, being - /// only relevant to compare relative instructions positions inside 'BB'. 
- bool dominates(const Instruction *A, const Instruction *B) { - assert(A->getParent() == B->getParent() && - "Instructions must be in the same basic block!"); - - unsigned NA = NumberedInsts.lookup(A); - unsigned NB = NumberedInsts.lookup(B); - if (NA && NB) - return NA < NB; - if (NA) - return true; - if (NB) - return false; - - return A == find(A, B); - } - }; - /// Only find pointer captures which happen before the given instruction. Uses /// the dominator tree to determine whether one instruction is before another. /// Only supports the case where the Value is defined in the same basic block @@ -116,8 +60,8 @@ namespace { struct CapturesBefore : public CaptureTracker { CapturesBefore(bool ReturnCaptures, const Instruction *I, DominatorTree *DT, - bool IncludeI) - : LocalInstCache(I->getParent()), BeforeHere(I), DT(DT), + bool IncludeI, OrderedBasicBlock *IC) + : OrderedBB(IC), BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures), IncludeI(IncludeI), Captured(false) {} void tooManyUses() override { Captured = true; } @@ -131,18 +75,18 @@ namespace { // Compute the case where both instructions are inside the same basic // block. Since instructions in the same BB as BeforeHere are numbered in - // 'LocalInstCache', avoid using 'dominates' and 'isPotentiallyReachable' + // 'OrderedBB', avoid using 'dominates' and 'isPotentiallyReachable' // which are very expensive for large basic blocks. if (BB == BeforeHere->getParent()) { // 'I' dominates 'BeforeHere' => not safe to prune. // - // The value defined by an invoke dominates an instruction only if it - // dominates every instruction in UseBB. A PHI is dominated only if - // the instruction dominates every possible use in the UseBB. Since + // The value defined by an invoke dominates an instruction only + // if it dominates every instruction in UseBB. A PHI is dominated only + // if the instruction dominates every possible use in the UseBB. Since // UseBB == BB, avoid pruning. if (isa<InvokeInst>(BeforeHere) || isa<PHINode>(I) || I == BeforeHere) return false; - if (!LocalInstCache.dominates(BeforeHere, I)) + if (!OrderedBB->dominates(BeforeHere, I)) return false; // 'BeforeHere' comes before 'I', it's safe to prune if we also @@ -157,10 +101,7 @@ namespace { SmallVector<BasicBlock*, 32> Worklist; Worklist.append(succ_begin(BB), succ_end(BB)); - if (!isPotentiallyReachableFromMany(Worklist, BB, DT)) - return true; - - return false; + return !isPotentiallyReachableFromMany(Worklist, BB, DT); } // If the value is defined in the same basic block as use and BeforeHere, @@ -196,7 +137,7 @@ namespace { return true; } - NumberedInstCache LocalInstCache; + OrderedBasicBlock *OrderedBB; const Instruction *BeforeHere; DominatorTree *DT; @@ -238,21 +179,29 @@ bool llvm::PointerMayBeCaptured(const Value *V, /// returning the value (or part of it) from the function counts as capturing /// it or not. The boolean StoreCaptures specifies whether storing the value /// (or part of it) into memory anywhere automatically counts as capturing it -/// or not. +/// or not. An ordered basic block \p OBB can be used in order to speed up +/// queries about relative order among instructions in the same basic block.
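The OrderedBasicBlock utility that replaces the deleted NumberedInstCache keeps the same lazy-numbering idea: number instructions only as far as queries require, then answer order queries from the map. A reduced standalone sketch of that idea (hypothetical types, not the LLVM class):

#include <cassert>
#include <unordered_map>
#include <vector>

// "Instructions" are modelled as ints; a basic block is their sequence.
struct LazyOrder {
  const std::vector<int> &BB;
  std::unordered_map<int, unsigned> Pos; // instruction -> position
  unsigned Next = 0;

  // After the first query touching A and B, comparing them again is an O(1)
  // lookup rather than a fresh linear scan of the block.
  bool comesBefore(int A, int B) {
    while (!Pos.count(A) || !Pos.count(B)) {
      assert(Next < BB.size() && "A and B must both be in this block");
      Pos.emplace(BB[Next], Next);
      ++Next;
    }
    return Pos.at(A) < Pos.at(B);
  }
};

With LazyOrder O{Block}, repeated O.comesBefore queries amortize to a single walk of the block, which is the effect the CapturesBefore changes above are after.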
bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, bool StoreCaptures, const Instruction *I, - DominatorTree *DT, bool IncludeI) { + DominatorTree *DT, bool IncludeI, + OrderedBasicBlock *OBB) { assert(!isa<GlobalValue>(V) && "It doesn't make sense to ask whether a global is captured."); + bool UseNewOBB = OBB == nullptr; if (!DT) return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures); + if (UseNewOBB) + OBB = new OrderedBasicBlock(I->getParent()); // TODO: See comment in PointerMayBeCaptured regarding what could be done // with StoreCaptures. - CapturesBefore CB(ReturnCaptures, I, DT, IncludeI); + CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, OBB); PointerMayBeCaptured(V, &CB); + + if (UseNewOBB) + delete OBB; return CB.Captured; } @@ -300,8 +249,9 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) { // that loading a value from a pointer does not cause the pointer to be // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A) + CallSite::data_operand_iterator B = + CS.data_operands_begin(), E = CS.data_operands_end(); + for (CallSite::data_operand_iterator A = B; A != E; ++A) if (A->get() == V && !CS.doesNotCapture(A - B)) // The parameter is not marked 'nocapture' - captured. if (Tracker->captured(U)) diff --git a/contrib/llvm/lib/Analysis/CodeMetrics.cpp b/contrib/llvm/lib/Analysis/CodeMetrics.cpp index 46a2c43..4090b4c 100644 --- a/contrib/llvm/lib/Analysis/CodeMetrics.cpp +++ b/contrib/llvm/lib/Analysis/CodeMetrics.cpp @@ -45,14 +45,8 @@ static void completeEphemeralValues(SmallVector<const Value *, 16> &WorkSet, continue; // If all uses of this value are ephemeral, then so is this value. - bool FoundNEUse = false; - for (const User *I : V->users()) - if (!EphValues.count(I)) { - FoundNEUse = true; - break; - } - - if (FoundNEUse) + if (!std::all_of(V->user_begin(), V->user_end(), + [&](const User *U) { return EphValues.count(U); })) continue; EphValues.insert(V); @@ -116,7 +110,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { // Skip ephemeral values. - if (EphValues.count(II)) + if (EphValues.count(&*II)) continue; // Special handling for calls. @@ -155,6 +149,9 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy()) ++NumVectorInsts; + if (II->getType()->isTokenTy() && II->isUsedOutsideOfBlock(BB)) + notDuplicatable = true; + if (const CallInst *CI = dyn_cast<CallInst>(II)) if (CI->cannotDuplicate()) notDuplicatable = true; diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp index 02a5aef..ccb5663 100644 --- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp +++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp @@ -248,8 +248,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, // Look through ptr->int and ptr->ptr casts. 
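// (Note that the hunk below stops looking through AddrSpaceCast: unlike a
// bitcast, an address space cast is not guaranteed to preserve the
// pointer's bit pattern, so folding a constant offset from a global
// through it would be unsound.)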
if (CE->getOpcode() == Instruction::PtrToInt || - CE->getOpcode() == Instruction::BitCast || - CE->getOpcode() == Instruction::AddrSpaceCast) + CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL); // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5) @@ -532,6 +531,10 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, if (GV->isConstant() && GV->hasDefinitiveInitializer()) return GV->getInitializer(); + if (auto *GA = dyn_cast<GlobalAlias>(C)) + if (GA->getAliasee() && !GA->mayBeOverridden()) + return ConstantFoldLoadFromConstPtr(GA->getAliasee(), DL); + // If the loaded value isn't a constant expr, we can't handle it. ConstantExpr *CE = dyn_cast<ConstantExpr>(C); if (!CE) @@ -1236,6 +1239,9 @@ bool llvm::canConstantFoldCallTo(const Function *F) { case Intrinsic::sqrt: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: case Intrinsic::pow: case Intrinsic::powi: case Intrinsic::bswap: @@ -1276,24 +1282,30 @@ bool llvm::canConstantFoldCallTo(const Function *F) { // return true for a name like "cos\0blah" which strcmp would return equal to // "cos", but has length 8. switch (Name[0]) { - default: return false; + default: + return false; case 'a': - return Name == "acos" || Name == "asin" || Name == "atan" || Name =="atan2"; + return Name == "acos" || Name == "asin" || Name == "atan" || + Name == "atan2" || Name == "acosf" || Name == "asinf" || + Name == "atanf" || Name == "atan2f"; case 'c': - return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh"; + return Name == "ceil" || Name == "cos" || Name == "cosh" || + Name == "ceilf" || Name == "cosf" || Name == "coshf"; case 'e': - return Name == "exp" || Name == "exp2"; + return Name == "exp" || Name == "exp2" || Name == "expf" || Name == "exp2f"; case 'f': - return Name == "fabs" || Name == "fmod" || Name == "floor"; + return Name == "fabs" || Name == "floor" || Name == "fmod" || + Name == "fabsf" || Name == "floorf" || Name == "fmodf"; case 'l': - return Name == "log" || Name == "log10"; + return Name == "log" || Name == "log10" || Name == "logf" || + Name == "log10f"; case 'p': - return Name == "pow"; + return Name == "pow" || Name == "powf"; case 's': return Name == "sin" || Name == "sinh" || Name == "sqrt" || - Name == "sinf" || Name == "sqrtf"; + Name == "sinf" || Name == "sinhf" || Name == "sqrtf"; case 't': - return Name == "tan" || Name == "tanh"; + return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf"; } } @@ -1422,6 +1434,36 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFP::get(Ty->getContext(), V); } + if (IntrinsicID == Intrinsic::floor) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardNegative); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::ceil) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardPositive); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::trunc) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardZero); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::rint) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmNearestTiesToEven); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::nearbyint) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmNearestTiesToEven); + 
return ConstantFP::get(Ty->getContext(), V); + } + /// We only fold functions with finite arguments. Folding NaN and inf is /// likely to be aborted with an exception anyway, and some host libms /// have known errors raising exceptions. @@ -1448,10 +1490,6 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFoldFP(exp, V, Ty); case Intrinsic::exp2: return ConstantFoldFP(exp2, V, Ty); - case Intrinsic::floor: - return ConstantFoldFP(floor, V, Ty); - case Intrinsic::ceil: - return ConstantFoldFP(ceil, V, Ty); case Intrinsic::sin: return ConstantFoldFP(sin, V, Ty); case Intrinsic::cos: @@ -1463,43 +1501,51 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, switch (Name[0]) { case 'a': - if (Name == "acos" && TLI->has(LibFunc::acos)) + if ((Name == "acos" && TLI->has(LibFunc::acos)) || + (Name == "acosf" && TLI->has(LibFunc::acosf))) return ConstantFoldFP(acos, V, Ty); - else if (Name == "asin" && TLI->has(LibFunc::asin)) + else if ((Name == "asin" && TLI->has(LibFunc::asin)) || + (Name == "asinf" && TLI->has(LibFunc::asinf))) return ConstantFoldFP(asin, V, Ty); - else if (Name == "atan" && TLI->has(LibFunc::atan)) + else if ((Name == "atan" && TLI->has(LibFunc::atan)) || + (Name == "atanf" && TLI->has(LibFunc::atanf))) return ConstantFoldFP(atan, V, Ty); break; case 'c': - if (Name == "ceil" && TLI->has(LibFunc::ceil)) + if ((Name == "ceil" && TLI->has(LibFunc::ceil)) || + (Name == "ceilf" && TLI->has(LibFunc::ceilf))) return ConstantFoldFP(ceil, V, Ty); - else if (Name == "cos" && TLI->has(LibFunc::cos)) + else if ((Name == "cos" && TLI->has(LibFunc::cos)) || + (Name == "cosf" && TLI->has(LibFunc::cosf))) return ConstantFoldFP(cos, V, Ty); - else if (Name == "cosh" && TLI->has(LibFunc::cosh)) + else if ((Name == "cosh" && TLI->has(LibFunc::cosh)) || + (Name == "coshf" && TLI->has(LibFunc::coshf))) return ConstantFoldFP(cosh, V, Ty); - else if (Name == "cosf" && TLI->has(LibFunc::cosf)) - return ConstantFoldFP(cos, V, Ty); break; case 'e': - if (Name == "exp" && TLI->has(LibFunc::exp)) + if ((Name == "exp" && TLI->has(LibFunc::exp)) || + (Name == "expf" && TLI->has(LibFunc::expf))) return ConstantFoldFP(exp, V, Ty); - - if (Name == "exp2" && TLI->has(LibFunc::exp2)) { + if ((Name == "exp2" && TLI->has(LibFunc::exp2)) || + (Name == "exp2f" && TLI->has(LibFunc::exp2f))) // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a // C99 library. 
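                 // (For illustration: exp2(3.0) is folded via pow(2.0, 3.0),
                 // which also yields 8.0 exactly, so the folded result does
                 // not depend on the host libm providing C99 exp2.)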
return ConstantFoldBinaryFP(pow, 2.0, V, Ty); - } break; case 'f': - if (Name == "fabs" && TLI->has(LibFunc::fabs)) + if ((Name == "fabs" && TLI->has(LibFunc::fabs)) || + (Name == "fabsf" && TLI->has(LibFunc::fabsf))) return ConstantFoldFP(fabs, V, Ty); - else if (Name == "floor" && TLI->has(LibFunc::floor)) + else if ((Name == "floor" && TLI->has(LibFunc::floor)) || + (Name == "floorf" && TLI->has(LibFunc::floorf))) return ConstantFoldFP(floor, V, Ty); break; case 'l': - if (Name == "log" && V > 0 && TLI->has(LibFunc::log)) + if ((Name == "log" && V > 0 && TLI->has(LibFunc::log)) || + (Name == "logf" && V > 0 && TLI->has(LibFunc::logf))) return ConstantFoldFP(log, V, Ty); - else if (Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) + else if ((Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) || + (Name == "log10f" && V > 0 && TLI->has(LibFunc::log10f))) return ConstantFoldFP(log10, V, Ty); else if (IntrinsicID == Intrinsic::sqrt && (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) { @@ -1516,21 +1562,22 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, } break; case 's': - if (Name == "sin" && TLI->has(LibFunc::sin)) + if ((Name == "sin" && TLI->has(LibFunc::sin)) || + (Name == "sinf" && TLI->has(LibFunc::sinf))) return ConstantFoldFP(sin, V, Ty); - else if (Name == "sinh" && TLI->has(LibFunc::sinh)) + else if ((Name == "sinh" && TLI->has(LibFunc::sinh)) || + (Name == "sinhf" && TLI->has(LibFunc::sinhf))) return ConstantFoldFP(sinh, V, Ty); - else if (Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) - return ConstantFoldFP(sqrt, V, Ty); - else if (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf)) + else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) || + (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf))) return ConstantFoldFP(sqrt, V, Ty); - else if (Name == "sinf" && TLI->has(LibFunc::sinf)) - return ConstantFoldFP(sin, V, Ty); break; case 't': - if (Name == "tan" && TLI->has(LibFunc::tan)) + if ((Name == "tan" && TLI->has(LibFunc::tan)) || + (Name == "tanf" && TLI->has(LibFunc::tanf))) return ConstantFoldFP(tan, V, Ty); - else if (Name == "tanh" && TLI->has(LibFunc::tanh)) + else if ((Name == "tanh" && TLI->has(LibFunc::tanh)) || + (Name == "tanhf" && TLI->has(LibFunc::tanhf))) return ConstantFoldFP(tanh, V, Ty); break; default: @@ -1633,11 +1680,14 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, if (!TLI) return nullptr; - if (Name == "pow" && TLI->has(LibFunc::pow)) + if ((Name == "pow" && TLI->has(LibFunc::pow)) || + (Name == "powf" && TLI->has(LibFunc::powf))) return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); - if (Name == "fmod" && TLI->has(LibFunc::fmod)) + if ((Name == "fmod" && TLI->has(LibFunc::fmod)) || + (Name == "fmodf" && TLI->has(LibFunc::fmodf))) return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty); - if (Name == "atan2" && TLI->has(LibFunc::atan2)) + if ((Name == "atan2" && TLI->has(LibFunc::atan2)) || + (Name == "atan2f" && TLI->has(LibFunc::atan2f))) return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) { if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) diff --git a/contrib/llvm/lib/Analysis/CostModel.cpp b/contrib/llvm/lib/Analysis/CostModel.cpp index b529c1a..0383cbf 100644 --- a/contrib/llvm/lib/Analysis/CostModel.cpp +++ b/contrib/llvm/lib/Analysis/CostModel.cpp @@ -152,10 +152,7 @@ static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft, Mask[i] = val; SmallVector<int, 16> 
ActualMask = SI->getShuffleMask(); - if (Mask != ActualMask) - return false; - - return true; + return Mask == ActualMask; } static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp, @@ -383,10 +380,8 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { return -1; switch (I->getOpcode()) { - case Instruction::GetElementPtr:{ - Type *ValTy = I->getOperand(0)->getType()->getPointerElementType(); - return TTI->getAddressComputationCost(ValTy); - } + case Instruction::GetElementPtr: + return TTI->getUserCost(I); case Instruction::Ret: case Instruction::PHI: @@ -505,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { } case Instruction::Call: if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - SmallVector<Type*, 4> Tys; + SmallVector<Value *, 4> Args; for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J) - Tys.push_back(II->getArgOperand(J)->getType()); + Args.push_back(II->getArgOperand(J)); return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), - Tys); + Args); } return -1; default: @@ -525,7 +520,7 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const { for (Function::iterator B = F->begin(), BE = F->end(); B != BE; ++B) { for (BasicBlock::iterator it = B->begin(), e = B->end(); it != e; ++it) { - Instruction *Inst = it; + Instruction *Inst = &*it; unsigned Cost = getInstructionCost(Inst); if (Cost != (unsigned)-1) OS << "Cost Model: Found an estimated cost of " << Cost; diff --git a/contrib/llvm/lib/Analysis/Delinearization.cpp b/contrib/llvm/lib/Analysis/Delinearization.cpp index 9d15786..baee8b3 100644 --- a/contrib/llvm/lib/Analysis/Delinearization.cpp +++ b/contrib/llvm/lib/Analysis/Delinearization.cpp @@ -60,12 +60,12 @@ public: void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } bool Delinearization::runOnFunction(Function &F) { this->F = &F; - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); return false; } @@ -102,20 +102,14 @@ void Delinearization::print(raw_ostream &O, const Module *) const { if (!BasePointer) break; AccessFn = SE->getMinusSCEV(AccessFn, BasePointer); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn); - - // Do not try to delinearize memory accesses that are not AddRecs. - if (!AR) - break; - O << "\n"; O << "Inst:" << *Inst << "\n"; O << "In Loop with Header: " << L->getHeader()->getName() << "\n"; - O << "AddRec: " << *AR << "\n"; + O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; - SE->delinearize(AR, Subscripts, Sizes, SE->getElementSize(Inst)); + SE->delinearize(AccessFn, Subscripts, Sizes, SE->getElementSize(Inst)); if (Subscripts.size() == 0 || Sizes.size() == 0 || Subscripts.size() != Sizes.size()) { O << "failed to delinearize\n"; diff --git a/contrib/llvm/lib/Analysis/DemandedBits.cpp b/contrib/llvm/lib/Analysis/DemandedBits.cpp new file mode 100644 index 0000000..912c5ce --- /dev/null +++ b/contrib/llvm/lib/Analysis/DemandedBits.cpp @@ -0,0 +1,392 @@ +//===---- DemandedBits.cpp - Determine demanded bits ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass implements a demanded bits analysis. A demanded bit is one that +// contributes to a result; bits that are not demanded can be either zero or +// one without affecting control or data flow. For example in this sequence: +// +// %1 = add i32 %x, %y +// %2 = trunc i32 %1 to i16 +// +// Only the lowest 16 bits of %1 are demanded; the rest are removed by the +// trunc. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "demanded-bits" + +char DemandedBits::ID = 0; +INITIALIZE_PASS_BEGIN(DemandedBits, "demanded-bits", "Demanded bits analysis", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DemandedBits, "demanded-bits", "Demanded bits analysis", + false, false) + +DemandedBits::DemandedBits() : FunctionPass(ID), F(nullptr), Analyzed(false) { + initializeDemandedBitsPass(*PassRegistry::getPassRegistry()); +} + +void DemandedBits::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesAll(); +} + +static bool isAlwaysLive(Instruction *I) { + return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + I->isEHPad() || I->mayHaveSideEffects(); +} + +void DemandedBits::determineLiveOperandBits( + const Instruction *UserI, const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2) { + unsigned BitWidth = AB.getBitWidth(); + + // We're called once per operand, but for some instructions, we need to + // compute known bits of both operands in order to determine the live bits of + // either (when both operands are instructions themselves). We don't, + // however, want to do this twice, so we cache the result in APInts that live + // in the caller. For the two-relevant-operands case, both operand values are + // provided here. 
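  // (For illustration, an assumed example rather than source text: for
  // %r = and i32 %x, %y, a demanded bit of %r keeps the corresponding bit
  // of %x alive only if that bit of %y is not known zero -- when it is
  // known zero, %y alone fixes the result bit. The And/Or cases below
  // consult KnownZero2/KnownOne2 for exactly this reason.)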
+ auto ComputeKnownBits = + [&](unsigned BitWidth, const Value *V1, const Value *V2) { + const DataLayout &DL = I->getModule()->getDataLayout(); + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, + AC, UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, + 0, AC, UserI, DT); + } + }; + + switch (UserI->getOpcode()) { + default: break; + case Instruction::Call: + case Instruction::Invoke: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: + // The alive bits of the input are the swapped alive bits of + // the output. + AB = AOut.byteSwap(); + break; + case Intrinsic::ctlz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the left of, and including, the leftmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getHighBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + } + break; + case Intrinsic::cttz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the right of, and including, the rightmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getLowBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + } + break; + } + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // Find the highest live output bit. We don't need any more input + // bits than that (adds, and thus subtracts, ripple only to the + // left). + AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); + break; + case Instruction::Shl: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.lshr(ShiftAmt); + + // If the shift is nuw/nsw, then the high bits are not dead + // (because we've promised that they *must* be zero). + const ShlOperator *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::AShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) + .getBoolValue()) + AB.setBit(BitWidth-1); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). 
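          // (Worked example, not from the source: for an i32 'ashr exact
          // %x, 8' with AOut = 0x0000FFFF, AB starts as AOut.shl(8) =
          // 0x00FFFF00; none of the demanded output bits lie in the top
          // eight bits, so the sign bit is not forced live, and the exact
          // flag handled below adds the low eight bits, giving
          // AB = 0x00FFFFFF.)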
+ if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::And: + AB = AOut; + + // For bits that are known zero, the corresponding bits in the + // other operand are dead (unless they're both zero, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownZero2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownZero & ~KnownZero2); + } + break; + case Instruction::Or: + AB = AOut; + + // For bits that are known one, the corresponding bits in the + // other operand are dead (unless they're both one, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownOne2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownOne & ~KnownOne2); + } + break; + case Instruction::Xor: + case Instruction::PHI: + AB = AOut; + break; + case Instruction::Trunc: + AB = AOut.zext(BitWidth); + break; + case Instruction::ZExt: + AB = AOut.trunc(BitWidth); + break; + case Instruction::SExt: + AB = AOut.trunc(BitWidth); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), + AOut.getBitWidth() - BitWidth)) + .getBoolValue()) + AB.setBit(BitWidth-1); + break; + case Instruction::Select: + if (OperandNo != 0) + AB = AOut; + break; + case Instruction::ICmp: + // Count the number of leading zeroes in each operand. + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(), + KnownZero2.countLeadingOnes()); + AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes); + break; + } +} + +bool DemandedBits::runOnFunction(Function& Fn) { + F = &Fn; + Analyzed = false; + return false; +} + +void DemandedBits::performAnalysis() { + if (Analyzed) + // Analysis already completed for this function. + return; + Analyzed = true; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + Visited.clear(); + AliveBits.clear(); + + SmallVector<Instruction*, 128> Worklist; + + // Collect the set of "root" instructions that are known live. + for (Instruction &I : instructions(*F)) { + if (!isAlwaysLive(&I)) + continue; + + DEBUG(dbgs() << "DemandedBits: Root: " << I << "\n"); + // For integer-valued instructions, set up an initial empty set of alive + // bits and add the instruction to the work list. For other instructions + // add their operands to the work list (for integer values operands, mark + // all bits as live). + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (!AliveBits.count(&I)) { + AliveBits[&I] = APInt(IT->getBitWidth(), 0); + Worklist.push_back(&I); + } + + continue; + } + + // Non-integer-typed instructions... + for (Use &OI : I.operands()) { + if (Instruction *J = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) + AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); + Worklist.push_back(J); + } + } + // To save memory, we don't add I to the Visited set here. 
Instead, we + check isAlwaysLive on every instruction when searching for dead + // instructions later (we need to check isAlwaysLive for the + // integer-typed instructions anyway). + } + + // Propagate liveness backwards to operands. + while (!Worklist.empty()) { + Instruction *UserI = Worklist.pop_back_val(); + + DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI); + APInt AOut; + if (UserI->getType()->isIntegerTy()) { + AOut = AliveBits[UserI]; + DEBUG(dbgs() << " Alive Out: " << AOut); + } + DEBUG(dbgs() << "\n"); + + if (!UserI->getType()->isIntegerTy()) + Visited.insert(UserI); + + APInt KnownZero, KnownOne, KnownZero2, KnownOne2; + // Compute the set of alive bits for each operand. These are anded into the + // existing set, if any, and if that changes the set of alive bits, the + // operand is added to the work-list. + for (Use &OI : UserI->operands()) { + if (Instruction *I = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { + unsigned BitWidth = IT->getBitWidth(); + APInt AB = APInt::getAllOnesValue(BitWidth); + if (UserI->getType()->isIntegerTy() && !AOut && + !isAlwaysLive(UserI)) { + // If all bits of the output are dead, then all bits of the input + // are also dead. + AB = APInt(BitWidth, 0); + } else { + // Bits of each operand that are used to compute alive bits of the + // output are alive, all others are dead. + determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, + KnownZero, KnownOne, + KnownZero2, KnownOne2); + } + + // If we've added to the set of alive bits (or the operand has not + // been previously visited), then re-queue the operand to be visited + // again. + APInt ABPrev(BitWidth, 0); + auto ABI = AliveBits.find(I); + if (ABI != AliveBits.end()) + ABPrev = ABI->second; + + APInt ABNew = AB | ABPrev; + if (ABNew != ABPrev || ABI == AliveBits.end()) { + AliveBits[I] = std::move(ABNew); + Worklist.push_back(I); + } + } else if (!Visited.count(I)) { + Worklist.push_back(I); + } + } + } + } +} + +APInt DemandedBits::getDemandedBits(Instruction *I) { + performAnalysis(); + + const DataLayout &DL = I->getParent()->getModule()->getDataLayout(); + if (AliveBits.count(I)) + return AliveBits[I]; + return APInt::getAllOnesValue(DL.getTypeSizeInBits(I->getType())); +} + +bool DemandedBits::isInstructionDead(Instruction *I) { + performAnalysis(); + + return !Visited.count(I) && AliveBits.find(I) == AliveBits.end() && + !isAlwaysLive(I); +} + +void DemandedBits::print(raw_ostream &OS, const Module *M) const { + // This is gross. But the alternative is making all the state mutable + // just because of this one debugging method.
+ const_cast<DemandedBits*>(this)->performAnalysis(); + for (auto &KV : AliveBits) { + OS << "DemandedBits: 0x" << utohexstr(KV.second.getLimitedValue()) << " for " + << *KV.first << "\n"; + } +} + +FunctionPass *llvm::createDemandedBitsPass() { + return new DemandedBits(); +} diff --git a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp index 4826ac4..4040ad3 100644 --- a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -117,8 +117,8 @@ Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore, INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da", "Dependence Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(DependenceAnalysis, "da", "Dependence Analysis", true, true) @@ -132,8 +132,8 @@ FunctionPass *llvm::createDependenceAnalysisPass() { bool DependenceAnalysis::runOnFunction(Function &F) { this->F = &F; - AA = &getAnalysis<AliasAnalysis>(); - SE = &getAnalysis<ScalarEvolution>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); return false; } @@ -145,8 +145,8 @@ void DependenceAnalysis::releaseMemory() { void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequiredTransitive<AliasAnalysis>(); - AU.addRequiredTransitive<ScalarEvolution>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); AU.addRequiredTransitive<LoopInfoWrapperPass>(); } @@ -233,7 +233,8 @@ FullDependence::FullDependence(Instruction *Source, Instruction *Destination, : Dependence(Source, Destination), Levels(CommonLevels), LoopIndependent(PossiblyLoopIndependent) { Consistent = true; - DV = CommonLevels ? new DVEntry[CommonLevels] : nullptr; + if (CommonLevels) + DV = make_unique<DVEntry[]>(CommonLevels); } // The rest are simple getters that hide the implementation. 
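// A minimal sketch (illustrative names, not from this tree) of the ownership
// pattern the FullDependence constructor above switches to: the per-level DV
// array held in a unique_ptr<T[]> rather than a raw new[] pointer, so moves
// transfer the array and no manual delete[] is needed; the tree's
// make_unique<DVEntry[]>(N) plays the role C++14 gives std::make_unique.

#include <memory>

struct DVEntry { unsigned char Direction; }; // stand-in for the real entry

struct Deps {
  std::unique_ptr<DVEntry[]> DV; // null when there are no common levels
  explicit Deps(unsigned Levels) {
    if (Levels)
      DV = std::make_unique<DVEntry[]>(Levels); // value-initialized entries
  }
  // Deps is movable by default, so returning it by value hands off the
  // array without the old "copy, then null out the raw pointer" dance.
};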
@@ -371,7 +372,7 @@ void DependenceAnalysis::Constraint::setLine(const SCEV *AA, void DependenceAnalysis::Constraint::setDistance(const SCEV *D, const Loop *CurLoop) { Kind = Distance; - A = SE->getConstant(D->getType(), 1); + A = SE->getOne(D->getType()); B = SE->getNegativeSCEV(A); C = SE->getNegativeSCEV(D); AssociatedLoop = CurLoop; @@ -500,10 +501,10 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X, if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; - APInt Xtop = C1B2_C2B1->getValue()->getValue(); - APInt Xbot = A1B2_A2B1->getValue()->getValue(); - APInt Ytop = C1A2_C2A1->getValue()->getValue(); - APInt Ybot = A2B1_A1B2->getValue()->getValue(); + APInt Xtop = C1B2_C2B1->getAPInt(); + APInt Xbot = A1B2_A2B1->getAPInt(); + APInt Ytop = C1A2_C2A1->getAPInt(); + APInt Ybot = A2B1_A1B2->getAPInt(); DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n"); DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n"); DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n"); @@ -527,7 +528,7 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X, } if (const SCEVConstant *CUB = collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { - APInt UpperBound = CUB->getValue()->getValue(); + APInt UpperBound = CUB->getAPInt(); DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { X->setEmpty(); @@ -630,8 +631,8 @@ static AliasResult underlyingObjectsAlias(AliasAnalysis *AA, const Value *B) { const Value *AObj = GetUnderlyingObject(A, DL); const Value *BObj = GetUnderlyingObject(B, DL); - return AA->alias(AObj, AA->getTypeStoreSize(AObj->getType()), - BObj, AA->getTypeStoreSize(BObj->getType())); + return AA->alias(AObj, DL.getTypeStoreSize(AObj->getType()), + BObj, DL.getTypeStoreSize(BObj->getType())); } @@ -1114,8 +1115,8 @@ bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff, // Can we compute distance? 
if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { - APInt ConstDelta = cast<SCEVConstant>(Delta)->getValue()->getValue(); - APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getValue()->getValue(); + APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); + APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); @@ -1256,11 +1257,9 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff, assert(SE->isKnownPositive(ConstCoeff) && "ConstCoeff should be positive"); // compute SplitIter for use by DependenceAnalysis::getSplitIteration() - SplitIter = - SE->getUDivExpr(SE->getSMaxExpr(SE->getConstant(Delta->getType(), 0), - Delta), - SE->getMulExpr(SE->getConstant(Delta->getType(), 2), - ConstCoeff)); + SplitIter = SE->getUDivExpr( + SE->getSMaxExpr(SE->getZero(Delta->getType()), Delta), + SE->getMulExpr(SE->getConstant(Delta->getType(), 2), ConstCoeff)); DEBUG(dbgs() << "\t Split iter = " << *SplitIter << "\n"); const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta); @@ -1302,14 +1301,14 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff, return true; } Result.DV[Level].Splitable = false; - Result.DV[Level].Distance = SE->getConstant(Delta->getType(), 0); + Result.DV[Level].Distance = SE->getZero(Delta->getType()); return false; } } // check that Coeff divides Delta - APInt APDelta = ConstDelta->getValue()->getValue(); - APInt APCoeff = ConstCoeff->getValue()->getValue(); + APInt APDelta = ConstDelta->getAPInt(); + APInt APCoeff = ConstCoeff->getAPInt(); APInt Distance = APDelta; // these need to be initialized APInt Remainder = APDelta; APInt::sdivrem(APDelta, APCoeff, Distance, Remainder); @@ -1463,10 +1462,10 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, // find gcd APInt G, X, Y; - APInt AM = ConstSrcCoeff->getValue()->getValue(); - APInt BM = ConstDstCoeff->getValue()->getValue(); + APInt AM = ConstSrcCoeff->getAPInt(); + APInt BM = ConstDstCoeff->getAPInt(); unsigned Bits = AM.getBitWidth(); - if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) { + if (findGCD(Bits, AM, BM, ConstDelta->getAPInt(), G, X, Y)) { // gcd doesn't divide Delta, no dependence ++ExactSIVindependence; ++ExactSIVsuccesses; @@ -1481,7 +1480,7 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, // UM is perhaps unavailable, let's check if (const SCEVConstant *CUB = collectConstantUpperBound(CurLoop, Delta->getType())) { - UM = CUB->getValue()->getValue(); + UM = CUB->getAPInt(); DEBUG(dbgs() << "\t UM = " << UM << "\n"); UMvalid = true; } @@ -1609,8 +1608,8 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, static bool isRemainderZero(const SCEVConstant *Dividend, const SCEVConstant *Divisor) { - APInt ConstDividend = Dividend->getValue()->getValue(); - APInt ConstDivisor = Divisor->getValue()->getValue(); + APInt ConstDividend = Dividend->getAPInt(); + APInt ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } @@ -1665,8 +1664,8 @@ bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff, Level--; Result.Consistent = false; const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); - NewConstraint.setLine(SE->getConstant(Delta->getType(), 0), - DstCoeff, Delta, CurLoop); + NewConstraint.setLine(SE->getZero(Delta->getType()), DstCoeff, Delta, + CurLoop); DEBUG(dbgs() << "\t Delta = " << *Delta << "\n"); if
(isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) { if (Level < CommonLevels) { @@ -1775,8 +1774,8 @@ bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff, Level--; Result.Consistent = false; const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst); - NewConstraint.setLine(SrcCoeff, SE->getConstant(Delta->getType(), 0), - Delta, CurLoop); + NewConstraint.setLine(SrcCoeff, SE->getZero(Delta->getType()), Delta, + CurLoop); DEBUG(dbgs() << "\t Delta = " << *Delta << "\n"); if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) { if (Level < CommonLevels) { @@ -1867,10 +1866,10 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // find gcd APInt G, X, Y; - APInt AM = ConstSrcCoeff->getValue()->getValue(); - APInt BM = ConstDstCoeff->getValue()->getValue(); + APInt AM = ConstSrcCoeff->getAPInt(); + APInt BM = ConstDstCoeff->getAPInt(); unsigned Bits = AM.getBitWidth(); - if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) { + if (findGCD(Bits, AM, BM, ConstDelta->getAPInt(), G, X, Y)) { // gcd doesn't divide Delta, no dependence ++ExactRDIVindependence; return true; @@ -1884,7 +1883,7 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // SrcUM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(SrcLoop, Delta->getType())) { - SrcUM = UpperBound->getValue()->getValue(); + SrcUM = UpperBound->getAPInt(); DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n"); SrcUMvalid = true; } @@ -1894,7 +1893,7 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // UM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(DstLoop, Delta->getType())) { - DstUM = UpperBound->getValue()->getValue(); + DstUM = UpperBound->getAPInt(); DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n"); DstUMvalid = true; } @@ -2307,7 +2306,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); if (!Constant) return false; - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); Coefficients = AddRec->getStart(); } @@ -2328,7 +2327,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); if (!Constant) return false; - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); Coefficients = AddRec->getStart(); } @@ -2352,7 +2351,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, const SCEVConstant *ConstOp = getConstantPart(Product); if (!ConstOp) return false; - APInt ConstOpValue = ConstOp->getValue()->getValue(); + APInt ConstOpValue = ConstOp->getAPInt(); ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOpValue.abs()); } @@ -2362,7 +2361,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, } if (!Constant) return false; - APInt ConstDelta = cast<SCEVConstant>(Constant)->getValue()->getValue(); + APInt ConstDelta = cast<SCEVConstant>(Constant)->getAPInt(); DEBUG(dbgs() << " ConstDelta = " << ConstDelta << "\n"); if (ConstDelta == 0) return false; @@ -2410,7 +2409,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); else Constant = cast<SCEVConstant>(Coeff); - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = 
APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); } Inner = AddRec->getStart(); @@ -2428,7 +2427,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); else Constant = cast<SCEVConstant>(Coeff); - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); } Inner = AddRec->getStart(); @@ -2445,7 +2444,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, // or constant, in which case we give up on this direction. continue; } - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n"); if (RunningGCD != 0) { @@ -2728,10 +2727,10 @@ void DependenceAnalysis::findBoundsALL(CoefficientInfo *A, // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getConstant(A[K].Coeff->getType(), 0); + SE->getZero(A[K].Coeff->getType()); if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart)) Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getConstant(A[K].Coeff->getType(), 0); + SE->getZero(A[K].Coeff->getType()); } } @@ -2800,9 +2799,8 @@ void DependenceAnalysis::findBoundsLT(CoefficientInfo *A, Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - const SCEV *Iter_1 = - SE->getMinusSCEV(Bound[K].Iterations, - SE->getConstant(Bound[K].Iterations->getType(), 1)); + const SCEV *Iter_1 = SE->getMinusSCEV( + Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = @@ -2847,9 +2845,8 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A, Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { - const SCEV *Iter_1 = - SE->getMinusSCEV(Bound[K].Iterations, - SE->getConstant(Bound[K].Iterations->getType(), 1)); + const SCEV *Iter_1 = SE->getMinusSCEV( + Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = @@ -2874,13 +2871,13 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A, // X^+ = max(X, 0) const SCEV *DependenceAnalysis::getPositivePart(const SCEV *X) const { - return SE->getSMaxExpr(X, SE->getConstant(X->getType(), 0)); + return SE->getSMaxExpr(X, SE->getZero(X->getType())); } // X^- = min(X, 0) const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const { - return SE->getSMinExpr(X, SE->getConstant(X->getType(), 0)); + return SE->getSMinExpr(X, SE->getZero(X->getType())); } @@ -2891,7 +2888,7 @@ DependenceAnalysis::CoefficientInfo * DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, const SCEV *&Constant) const { - const SCEV *Zero = SE->getConstant(Subscript->getType(), 0); + const SCEV *Zero = SE->getZero(Subscript->getType()); CoefficientInfo *CI = new CoefficientInfo[MaxLevels + 1]; for (unsigned K = 1; K <= MaxLevels; ++K) { CI[K].Coeff = Zero; @@ -2975,7 +2972,7 @@ const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr, const Loop *TargetLoop) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) - return SE->getConstant(Expr->getType(), 0); + return SE->getZero(Expr->getType()); if (AddRec->getLoop() == TargetLoop) return AddRec->getStepRecurrence(*SE); return findCoefficient(AddRec->getStart(), TargetLoop); @@ -3110,8 +3107,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Bconst || !Cconst) return false; - APInt Beta = Bconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Beta = Bconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); @@ -3125,8 +3122,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Aconst || !Cconst) return false; - APInt Alpha = Aconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Alpha = Aconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A"); const SCEV *A_K = findCoefficient(Src, CurLoop); @@ -3139,8 +3136,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Aconst || !Cconst) return false; - APInt Alpha = Aconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Alpha = Aconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A"); const SCEV *A_K = findCoefficient(Src, CurLoop); @@ -3244,20 +3241,36 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level, /// source and destination array references are recurrences on a nested loop, /// this 
function flattens the nested recurrences into separate recurrences /// for each loop level. -bool DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, - const SCEV *DstSCEV, - SmallVectorImpl<Subscript> &Pair, - const SCEV *ElementSize) { +bool DependenceAnalysis::tryDelinearize(Instruction *Src, + Instruction *Dst, + SmallVectorImpl<Subscript> &Pair) +{ + Value *SrcPtr = getPointerOperand(Src); + Value *DstPtr = getPointerOperand(Dst); + + Loop *SrcLoop = LI->getLoopFor(Src->getParent()); + Loop *DstLoop = LI->getLoopFor(Dst->getParent()); + + // The code below mimics the code in Delinearization.cpp + const SCEV *SrcAccessFn = + SE->getSCEVAtScope(SrcPtr, SrcLoop); + const SCEV *DstAccessFn = + SE->getSCEVAtScope(DstPtr, DstLoop); + const SCEVUnknown *SrcBase = - dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcSCEV)); + dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn)); const SCEVUnknown *DstBase = - dyn_cast<SCEVUnknown>(SE->getPointerBase(DstSCEV)); + dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); if (!SrcBase || !DstBase || SrcBase != DstBase) return false; - SrcSCEV = SE->getMinusSCEV(SrcSCEV, SrcBase); - DstSCEV = SE->getMinusSCEV(DstSCEV, DstBase); + const SCEV *ElementSize = SE->getElementSize(Src); + if (ElementSize != SE->getElementSize(Dst)) + return false; + + const SCEV *SrcSCEV = SE->getMinusSCEV(SrcAccessFn, SrcBase); + const SCEV *DstSCEV = SE->getMinusSCEV(DstAccessFn, DstBase); const SCEVAddRecExpr *SrcAR = dyn_cast<SCEVAddRecExpr>(SrcSCEV); const SCEVAddRecExpr *DstAR = dyn_cast<SCEVAddRecExpr>(DstSCEV); @@ -3330,7 +3343,6 @@ static void dumpSmallBitVector(SmallBitVector &BV) { } #endif - // depends - // Returns NULL if there is no dependence. // Otherwise, return a Dependence with as many details as possible. @@ -3425,10 +3437,11 @@ DependenceAnalysis::depends(Instruction *Src, Instruction *Dst, Pair[0].Dst = DstSCEV; } - if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { - DEBUG(dbgs() << " delinearized GEP\n"); - Pairs = Pair.size(); + if (Delinearize && CommonLevels > 1) { + if (tryDelinearize(Src, Dst, Pair)) { + DEBUG(dbgs() << " delinearized GEP\n"); + Pairs = Pair.size(); + } } for (unsigned P = 0; P < Pairs; ++P) { @@ -3746,9 +3759,7 @@ DependenceAnalysis::depends(Instruction *Src, Instruction *Dst, return nullptr; } - auto Final = make_unique<FullDependence>(Result); - Result.DV = nullptr; - return std::move(Final); + return make_unique<FullDependence>(std::move(Result)); } @@ -3852,10 +3863,11 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence &Dep, Pair[0].Dst = DstSCEV; } - if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { - DEBUG(dbgs() << " delinearized GEP\n"); - Pairs = Pair.size(); + if (Delinearize && CommonLevels > 1) { + if (tryDelinearize(Src, Dst, Pair)) { + DEBUG(dbgs() << " delinearized GEP\n"); + Pairs = Pair.size(); + } } for (unsigned P = 0; P < Pairs; ++P) { diff --git a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp index e5ee295..5ae6d74 100644 --- a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp @@ -1,4 +1,4 @@ -//===- DivergenceAnalysis.cpp ------ Divergence Analysis ------------------===// +//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ //
//===----------------------------------------------------------------------===// // -// This file defines divergence analysis which determines whether a branch in a -// GPU program is divergent. It can help branch optimizations such as jump +// This file implements divergence analysis which determines whether a branch +// in a GPU program is divergent. It can help branch optimizations such as jump // threading and loop unswitching to make better decisions. // // GPU programs typically use the SIMD execution model, where multiple threads @@ -61,75 +61,31 @@ // 2. memory as black box. It conservatively considers values loaded from // generic or local address as divergent. This can be improved by leveraging // pointer analysis. +// //===----------------------------------------------------------------------===// -#include <vector> -#include "llvm/IR/Dominators.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" -#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include <vector> using namespace llvm; -#define DEBUG_TYPE "divergence" - -namespace { -class DivergenceAnalysis : public FunctionPass { -public: - static char ID; - - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); - AU.setPreservesAll(); - } - - bool runOnFunction(Function &F) override; - - // Print all divergent branches in the function. - void print(raw_ostream &OS, const Module *) const override; - - // Returns true if V is divergent. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } - // Returns true if V is uniform/non-divergent. - bool isUniform(const Value *V) const { return !isDivergent(V); } - -private: - // Stores all divergent values. - DenseSet<const Value *> DivergentValues; -}; -} // End of anonymous namespace - -// Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) - namespace { class DivergencePropagator { public: - DivergencePropagator(Function &F, TargetTransformInfo &TTI, - DominatorTree &DT, PostDominatorTree &PDT, - DenseSet<const Value *> &DV) + DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, + PostDominatorTree &PDT, DenseSet<const Value *> &DV) : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} void populateWithSourcesOfDivergence(); void propagate(); @@ -140,7 +96,7 @@ private: // A helper function that explores sync dependents of TI. void exploreSyncDependency(TerminatorInst *TI); // Computes the influence region from Start to End. This region includes all - // basic blocks on any path from Start to End. + // basic blocks on any simple path from Start to End.
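  // (For illustration, an assumed example: if a divergent branch at Start
  // leads to a diamond Start -> {T, F} -> End, with End the immediate
  // post-dominator of Start, the influence region is {T, F}; values defined
  // in T or F but used in or after End, and the PHIs in End, become
  // sync-dependent on the branch.)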
void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End, DenseSet<BasicBlock *> &InfluenceRegion); // Finds all users of I that are outside the influence region, and add these @@ -153,13 +109,13 @@ private: DominatorTree &DT; PostDominatorTree &PDT; std::vector<Value *> Worklist; // Stack for DFS. - DenseSet<const Value *> &DV; // Stores all divergent values. + DenseSet<const Value *> &DV; // Stores all divergent values. }; void DivergencePropagator::populateWithSourcesOfDivergence() { Worklist.clear(); DV.clear(); - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { if (TTI.isSourceOfDivergence(&I)) { Worklist.push_back(&I); DV.insert(&I); @@ -191,8 +147,8 @@ void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) { for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) { // A PHINode is uniform if it returns the same value no matter which path is // taken. - if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(I).second) - Worklist.push_back(I); + if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(&*I).second) + Worklist.push_back(&*I); } // Propagation rule 2: if a value defined in a loop is used outside, the user @@ -242,21 +198,33 @@ void DivergencePropagator::findUsersOutsideInfluenceRegion( } } +// A helper function for computeInfluenceRegion that adds successors of "ThisBB" +// to the influence region. +static void +addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End, + DenseSet<BasicBlock *> &InfluenceRegion, + std::vector<BasicBlock *> &InfluenceStack) { + for (BasicBlock *Succ : successors(ThisBB)) { + if (Succ != End && InfluenceRegion.insert(Succ).second) + InfluenceStack.push_back(Succ); + } +} + void DivergencePropagator::computeInfluenceRegion( BasicBlock *Start, BasicBlock *End, DenseSet<BasicBlock *> &InfluenceRegion) { assert(PDT.properlyDominates(End, Start) && "End does not properly dominate Start"); + + // The influence region starts from the end of "Start" to the beginning of + // "End". Therefore, "Start" should not be in the region unless "Start" is in + // a loop that doesn't contain "End". std::vector<BasicBlock *> InfluenceStack; - InfluenceStack.push_back(Start); - InfluenceRegion.insert(Start); + addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack); while (!InfluenceStack.empty()) { BasicBlock *BB = InfluenceStack.back(); InfluenceStack.pop_back(); - for (BasicBlock *Succ : successors(BB)) { - if (End != Succ && InfluenceRegion.insert(Succ).second) - InfluenceStack.push_back(Succ); - } + addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack); } } @@ -286,10 +254,25 @@ void DivergencePropagator::propagate() { } /// end namespace anonymous +// Register this pass. 
+char DivergenceAnalysis::ID = 0; +INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", + false, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", + false, true) + FunctionPass *llvm::createDivergenceAnalysisPass() { return new DivergenceAnalysis(); } +void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); + AU.setPreservesAll(); +} + bool DivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); if (TTIWP == nullptr) @@ -329,8 +312,8 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { if (DivergentValues.count(&Arg)) OS << "DIVERGENT: " << Arg << "\n"; } - // Iterate instructions using inst_range to ensure a deterministic order. - for (auto &I : inst_range(F)) { + // Iterate instructions using instructions() to ensure a deterministic order. + for (auto &I : instructions(F)) { if (DivergentValues.count(&I)) OS << "DIVERGENT:" << I << "\n"; } diff --git a/contrib/llvm/lib/Analysis/EHPersonalities.cpp b/contrib/llvm/lib/Analysis/EHPersonalities.cpp new file mode 100644 index 0000000..01be8b3 --- /dev/null +++ b/contrib/llvm/lib/Analysis/EHPersonalities.cpp @@ -0,0 +1,106 @@ +//===- EHPersonalities.cpp - Compute EH-related information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +/// See if the given exception handling personality function is one that we +/// understand. If so, return a description of it; otherwise return Unknown. +EHPersonality llvm::classifyEHPersonality(const Value *Pers) { + const Function *F = + Pers ? dyn_cast<Function>(Pers->stripPointerCasts()) : nullptr; + if (!F) + return EHPersonality::Unknown; + return StringSwitch<EHPersonality>(F->getName()) + .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) + .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) + .Case("__gcc_personality_v0", EHPersonality::GNU_C) + .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) + .Case("_except_handler3", EHPersonality::MSVC_X86SEH) + .Case("_except_handler4", EHPersonality::MSVC_X86SEH) + .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH) + .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) + .Case("ProcessCLRException", EHPersonality::CoreCLR) + .Default(EHPersonality::Unknown); +} + +bool llvm::canSimplifyInvokeNoUnwind(const Function *F) { + EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn()); + // We can't simplify any invokes to nounwind functions if the personality + // function wants to catch asynch exceptions. The nounwind attribute only + // implies that the function does not throw synchronous exceptions. 
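  // (Illustrative example: classifyEHPersonality maps "__C_specific_handler"
  // to MSVC_Win64SEH, which isAsynchronousEHPersonality treats as
  // asynchronous, so such invokes are left alone; under
  // "__gxx_personality_v0" (GNU_CXX, synchronous) they may be simplified.)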
+ return !isAsynchronousEHPersonality(Personality); +} + +DenseMap<BasicBlock *, ColorVector> llvm::colorEHFunclets(Function &F) { + SmallVector<std::pair<BasicBlock *, BasicBlock *>, 16> Worklist; + BasicBlock *EntryBlock = &F.getEntryBlock(); + DenseMap<BasicBlock *, ColorVector> BlockColors; + + // Build up the color map, which maps each block to its set of 'colors'. + // For any block B the "colors" of B are the set of funclets F (possibly + // including a root "funclet" representing the main function) such that + // F will need to directly contain B or a copy of B (where the term "directly + // contain" is used to distinguish from being "transitively contained" in + // a nested funclet). + // + // Note: Despite not being a funclet in the truest sense, a catchswitch is + // considered to belong to its own funclet for the purposes of coloring. + + DEBUG_WITH_TYPE("winehprepare-coloring", dbgs() << "\nColoring funclets for " + << F.getName() << "\n"); + + Worklist.push_back({EntryBlock, EntryBlock}); + + while (!Worklist.empty()) { + BasicBlock *Visiting; + BasicBlock *Color; + std::tie(Visiting, Color) = Worklist.pop_back_val(); + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << "Visiting " << Visiting->getName() << ", " + << Color->getName() << "\n"); + Instruction *VisitingHead = Visiting->getFirstNonPHI(); + if (VisitingHead->isEHPad()) { + // Mark this funclet head as a member of itself. + Color = Visiting; + } + // Note that this is a member of the given color. + ColorVector &Colors = BlockColors[Visiting]; + if (std::find(Colors.begin(), Colors.end(), Color) == Colors.end()) + Colors.push_back(Color); + else + continue; + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Assigned color \'" << Color->getName() + << "\' to block \'" << Visiting->getName() + << "\'.\n"); + + BasicBlock *SuccColor = Color; + TerminatorInst *Terminator = Visiting->getTerminator(); + if (auto *CatchRet = dyn_cast<CatchReturnInst>(Terminator)) { + Value *ParentPad = CatchRet->getParentPad(); + if (isa<ConstantTokenNone>(ParentPad)) + SuccColor = EntryBlock; + else + SuccColor = cast<Instruction>(ParentPad)->getParent(); + } + + for (BasicBlock *Succ : successors(Visiting)) + Worklist.push_back({Succ, SuccColor}); + } + return BlockColors; +} diff --git a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp new file mode 100644 index 0000000..1babb82 --- /dev/null +++ b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp @@ -0,0 +1,972 @@ +//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This simple pass provides alias and mod/ref information for global values +// that do not have their address taken, and keeps track of whether functions +// read or write memory (are "pure"). For this simple (but very common) case, +// we can provide pretty accurate and useful information. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "globalsmodref-aa"
+
+STATISTIC(NumNonAddrTakenGlobalVars,
+          "Number of global vars without address taken");
+STATISTIC(NumNonAddrTakenFunctions, "Number of functions without address taken");
+STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory");
+STATISTIC(NumReadMemFunctions, "Number of functions that only read memory");
+STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects");
+
+// An option to enable unsafe alias results from the GlobalsModRef analysis.
+// When enabled, GlobalsModRef will provide no-alias results which in extremely
+// rare cases may not be conservatively correct. In particular, in the face of
+// transforms which cause asymmetry between how effective GetUnderlyingObject
+// is for two pointers, it may produce incorrect results.
+//
+// These unsafe results have been returned by GMR for many years without
+// causing significant issues in the wild and so we provide a mechanism to
+// re-enable them for users of LLVM that have a particular performance
+// sensitivity and no known issues. The option also makes it easy to evaluate
+// the performance impact of these results.
+static cl::opt<bool> EnableUnsafeGlobalsModRefAliasResults(
+    "enable-unsafe-globalsmodref-alias-results", cl::init(false), cl::Hidden);
+
+/// The mod/ref information collected for a particular function.
+///
+/// We collect information about mod/ref behavior of a function here, both in
+/// general and as pertains to specific globals. We only have this detailed
+/// information when we know *something* useful about the behavior. If we
+/// saturate to fully general mod/ref, we remove the info for the function.
+class GlobalsAAResult::FunctionInfo {
+  typedef SmallDenseMap<const GlobalValue *, ModRefInfo, 16> GlobalInfoMapType;
+
+  /// Build a wrapper struct that has 8-byte alignment. All heap allocations
+  /// should provide this much alignment at least, but this makes it clear we
+  /// specifically rely on this amount of alignment.
+  struct LLVM_ALIGNAS(8) AlignedMap {
+    AlignedMap() {}
+    AlignedMap(const AlignedMap &Arg) : Map(Arg.Map) {}
+    GlobalInfoMapType Map;
+  };
+
+  /// Pointer traits for our aligned map.
+  struct AlignedMapPointerTraits {
+    static inline void *getAsVoidPointer(AlignedMap *P) { return P; }
+    static inline AlignedMap *getFromVoidPointer(void *P) {
+      return (AlignedMap *)P;
+    }
+    enum { NumLowBitsAvailable = 3 };
+    static_assert(AlignOf<AlignedMap>::Alignment >= (1 << NumLowBitsAvailable),
+                  "AlignedMap insufficiently aligned to have enough low bits.");
+  };
+
+  /// The bit that flags that this function may read any global. This is
+  /// chosen to mix together with ModRefInfo bits.
+  enum { MayReadAnyGlobal = 4 };
+
+  /// Checks to document the invariants of the bit packing here.
+  static_assert((MayReadAnyGlobal & MRI_ModRef) == 0,
+                "ModRef and the MayReadAnyGlobal flag bits overlap.");
+  static_assert(((MayReadAnyGlobal | MRI_ModRef) >>
+                 AlignedMapPointerTraits::NumLowBitsAvailable) == 0,
+                "Insufficient low bits to store our flag and ModRef info.");
+
+public:
+  FunctionInfo() : Info() {}
+  ~FunctionInfo() {
+    delete Info.getPointer();
+  }
+  // Spell out the copy and move constructors and assignment operators to get
+  // deep copy semantics and correct move semantics in the face of the
+  // pointer-int pair.
+  FunctionInfo(const FunctionInfo &Arg)
+      : Info(nullptr, Arg.Info.getInt()) {
+    if (const auto *ArgPtr = Arg.Info.getPointer())
+      Info.setPointer(new AlignedMap(*ArgPtr));
+  }
+  FunctionInfo(FunctionInfo &&Arg)
+      : Info(Arg.Info.getPointer(), Arg.Info.getInt()) {
+    Arg.Info.setPointerAndInt(nullptr, 0);
+  }
+  FunctionInfo &operator=(const FunctionInfo &RHS) {
+    delete Info.getPointer();
+    Info.setPointerAndInt(nullptr, RHS.Info.getInt());
+    if (const auto *RHSPtr = RHS.Info.getPointer())
+      Info.setPointer(new AlignedMap(*RHSPtr));
+    return *this;
+  }
+  FunctionInfo &operator=(FunctionInfo &&RHS) {
+    delete Info.getPointer();
+    Info.setPointerAndInt(RHS.Info.getPointer(), RHS.Info.getInt());
+    RHS.Info.setPointerAndInt(nullptr, 0);
+    return *this;
+  }
+
+  /// Returns the \c ModRefInfo info for this function.
+  ModRefInfo getModRefInfo() const {
+    return ModRefInfo(Info.getInt() & MRI_ModRef);
+  }
+
+  /// Adds new \c ModRefInfo for this function to its state.
+  void addModRefInfo(ModRefInfo NewMRI) {
+    Info.setInt(Info.getInt() | NewMRI);
+  }
+
+  /// Returns whether this function may read any global variable, and we don't
+  /// know which global.
+  bool mayReadAnyGlobal() const { return Info.getInt() & MayReadAnyGlobal; }
+
+  /// Sets this function as potentially reading from any global.
+  void setMayReadAnyGlobal() { Info.setInt(Info.getInt() | MayReadAnyGlobal); }
+
+  /// Returns the \c ModRefInfo info for this function w.r.t. a particular
+  /// global, which may be more precise than the general information above.
+  ModRefInfo getModRefInfoForGlobal(const GlobalValue &GV) const {
+    ModRefInfo GlobalMRI = mayReadAnyGlobal() ? MRI_Ref : MRI_NoModRef;
+    if (AlignedMap *P = Info.getPointer()) {
+      auto I = P->Map.find(&GV);
+      if (I != P->Map.end())
+        GlobalMRI = ModRefInfo(GlobalMRI | I->second);
+    }
+    return GlobalMRI;
+  }
+
+  /// Add mod/ref info from another function into ours, saturating towards
+  /// MRI_ModRef.
+  void addFunctionInfo(const FunctionInfo &FI) {
+    addModRefInfo(FI.getModRefInfo());
+
+    if (FI.mayReadAnyGlobal())
+      setMayReadAnyGlobal();
+
+    if (AlignedMap *P = FI.Info.getPointer())
+      for (const auto &G : P->Map)
+        addModRefInfoForGlobal(*G.first, G.second);
+  }
+
+  void addModRefInfoForGlobal(const GlobalValue &GV, ModRefInfo NewMRI) {
+    AlignedMap *P = Info.getPointer();
+    if (!P) {
+      P = new AlignedMap();
+      Info.setPointer(P);
+    }
+    auto &GlobalMRI = P->Map[&GV];
+    GlobalMRI = ModRefInfo(GlobalMRI | NewMRI);
+  }
+
+  /// Clear a global's ModRef info. Should be used when a global is being
+  /// deleted.
+  void eraseModRefInfoForGlobal(const GlobalValue &GV) {
+    if (AlignedMap *P = Info.getPointer())
+      P->Map.erase(&GV);
+  }
+
+private:
+  /// All of the information is encoded into a single pointer, with a three bit
+  /// integer in the low three bits. The high bit provides a flag for when this
+  /// function may read any global. The low two bits are the ModRefInfo. And
+  /// the pointer, when non-null, points to a map from GlobalValue to
+  /// ModRefInfo specific to that GlobalValue.
+  PointerIntPair<AlignedMap *, 3, unsigned, AlignedMapPointerTraits> Info;
+};
+
+void GlobalsAAResult::DeletionCallbackHandle::deleted() {
+  Value *V = getValPtr();
+  if (auto *F = dyn_cast<Function>(V))
+    GAR->FunctionInfos.erase(F);
+
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    if (GAR->NonAddressTakenGlobals.erase(GV)) {
+      // This global might be an indirect global. If so, remove it and
+      // remove any AllocRelatedValues for it.
+      if (GAR->IndirectGlobals.erase(GV)) {
+        // Remove any entries in AllocsForIndirectGlobals for this global.
+        for (auto I = GAR->AllocsForIndirectGlobals.begin(),
+                  E = GAR->AllocsForIndirectGlobals.end();
+             I != E; ++I)
+          if (I->second == GV)
+            GAR->AllocsForIndirectGlobals.erase(I);
+      }
+
+      // Scan the function info we have collected and remove this global
+      // from all of them.
+      for (auto &FIPair : GAR->FunctionInfos)
+        FIPair.second.eraseModRefInfoForGlobal(*GV);
+    }
+  }
+
+  // If this is an allocation related to an indirect global, remove it.
+  GAR->AllocsForIndirectGlobals.erase(V);
+
+  // And clear out the handle.
+  setValPtr(nullptr);
+  GAR->Handles.erase(I);
+  // This object is now destroyed!
+}
+
+FunctionModRefBehavior GlobalsAAResult::getModRefBehavior(const Function *F) {
+  FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
+
+  if (FunctionInfo *FI = getFunctionInfo(F)) {
+    if (FI->getModRefInfo() == MRI_NoModRef)
+      Min = FMRB_DoesNotAccessMemory;
+    else if ((FI->getModRefInfo() & MRI_Mod) == 0)
+      Min = FMRB_OnlyReadsMemory;
+  }
+
+  return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min);
+}
+
+FunctionModRefBehavior
+GlobalsAAResult::getModRefBehavior(ImmutableCallSite CS) {
+  FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
+
+  if (const Function *F = CS.getCalledFunction())
+    if (FunctionInfo *FI = getFunctionInfo(F)) {
+      if (FI->getModRefInfo() == MRI_NoModRef)
+        Min = FMRB_DoesNotAccessMemory;
+      else if ((FI->getModRefInfo() & MRI_Mod) == 0)
+        Min = FMRB_OnlyReadsMemory;
+    }
+
+  return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min);
+}
+
+/// Returns the function info for the function, or null if we don't have
+/// anything useful to say about it.
+GlobalsAAResult::FunctionInfo *
+GlobalsAAResult::getFunctionInfo(const Function *F) {
+  auto I = FunctionInfos.find(F);
+  if (I != FunctionInfos.end())
+    return &I->second;
+  return nullptr;
+}
+
+/// AnalyzeGlobals - Scan through the users of all of the internal
+/// GlobalValues in the program. If none of them have their "address taken"
+/// (really, their address passed to something nontrivial), record this fact,
+/// and record the functions that they are used directly in.
+void GlobalsAAResult::AnalyzeGlobals(Module &M) {
+  SmallPtrSet<Function *, 64> TrackedFunctions;
+  for (Function &F : M)
+    if (F.hasLocalLinkage())
+      if (!AnalyzeUsesOfPointer(&F)) {
+        // Remember that we are tracking this global.
+        NonAddressTakenGlobals.insert(&F);
+        TrackedFunctions.insert(&F);
+        Handles.emplace_front(*this, &F);
+        Handles.front().I = Handles.begin();
+        ++NumNonAddrTakenFunctions;
+      }
+
+  SmallPtrSet<Function *, 64> Readers, Writers;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.hasLocalLinkage()) {
+      if (!AnalyzeUsesOfPointer(&GV, &Readers,
+                                GV.isConstant() ? nullptr : &Writers)) {
+        // Remember that we are tracking this global, and the mod/ref fns
+        NonAddressTakenGlobals.insert(&GV);
+        Handles.emplace_front(*this, &GV);
+        Handles.front().I = Handles.begin();
+
+        for (Function *Reader : Readers) {
+          if (TrackedFunctions.insert(Reader).second) {
+            Handles.emplace_front(*this, Reader);
+            Handles.front().I = Handles.begin();
+          }
+          FunctionInfos[Reader].addModRefInfoForGlobal(GV, MRI_Ref);
+        }
+
+        if (!GV.isConstant()) // No need to keep track of writers to constants
+          for (Function *Writer : Writers) {
+            if (TrackedFunctions.insert(Writer).second) {
+              Handles.emplace_front(*this, Writer);
+              Handles.front().I = Handles.begin();
+            }
+            FunctionInfos[Writer].addModRefInfoForGlobal(GV, MRI_Mod);
+          }
+        ++NumNonAddrTakenGlobalVars;
+
+        // If this global holds a pointer type, see if it is an indirect global.
+        if (GV.getType()->getElementType()->isPointerTy() &&
+            AnalyzeIndirectGlobalMemory(&GV))
+          ++NumIndirectGlobalVars;
+      }
+      Readers.clear();
+      Writers.clear();
+    }
+}
+
+/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer.
+/// If this is used by anything complex (i.e., the address escapes), return
+/// true. Also, while we are at it, keep track of those functions that read and
+/// write to the value.
+///
+/// If OkayStoreDest is non-null, stores into this global are allowed.
+bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
+                                           SmallPtrSetImpl<Function *> *Readers,
+                                           SmallPtrSetImpl<Function *> *Writers,
+                                           GlobalValue *OkayStoreDest) {
+  if (!V->getType()->isPointerTy())
+    return true;
+
+  for (Use &U : V->uses()) {
+    User *I = U.getUser();
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (Readers)
+        Readers->insert(LI->getParent()->getParent());
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (V == SI->getOperand(1)) {
+        if (Writers)
+          Writers->insert(SI->getParent()->getParent());
+      } else if (SI->getOperand(1) != OkayStoreDest) {
+        return true; // Storing the pointer
+      }
+    } else if (Operator::getOpcode(I) == Instruction::GetElementPtr) {
+      if (AnalyzeUsesOfPointer(I, Readers, Writers))
+        return true;
+    } else if (Operator::getOpcode(I) == Instruction::BitCast) {
+      if (AnalyzeUsesOfPointer(I, Readers, Writers, OkayStoreDest))
+        return true;
+    } else if (auto CS = CallSite(I)) {
+      // Make sure that this is just the function being called, not a value
+      // being passed into the function.
+      if (CS.isDataOperand(&U)) {
+        // Detect calls to free.
+        if (CS.isArgOperand(&U) && isFreeCall(I, &TLI)) {
+          if (Writers)
+            Writers->insert(CS->getParent()->getParent());
+        } else {
+          return true; // Argument of an unknown call.
+        }
+      }
+    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return true; // Allow comparison against null.
+    } else {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AnalyzeIndirectGlobalMemory - We found a non-address-taken global variable
+/// which holds a pointer type. See if the global always points to non-aliased
+/// heap memory: that is, all initializers of the globals are allocations, and
+/// those allocations have no use other than initialization of the global.
+/// Further, all loads out of GV must directly use the memory, not store the
+/// pointer somewhere. If this is true, we consider the memory pointed to by
+/// GV to be owned by GV and can disambiguate other pointers from it.
+bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {
+  // Keep track of values related to the allocation of the memory, e.g. the
+  // value produced by the malloc call and any casts.
+  std::vector<Value *> AllocRelatedValues;
+
+  // If the initializer is a valid pointer, bail.
+  if (Constant *C = GV->getInitializer())
+    if (!C->isNullValue())
+      return false;
+
+  // Walk the user list of the global. If we find anything other than a direct
+  // load or store, bail out.
+  for (User *U : GV->users()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // The pointer loaded from the global can only be used in simple ways:
+      // we allow addressing of it and loading from and storing to it. We do
+      // *not* allow storing the loaded pointer somewhere else or passing it
+      // to a function.
+      if (AnalyzeUsesOfPointer(LI))
+        return false; // Loaded pointer escapes.
+      // TODO: Could try some IP mod/ref of the loaded pointer.
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Storing the global itself.
+      if (SI->getOperand(0) == GV)
+        return false;
+
+      // If storing the null pointer, ignore it.
+      if (isa<ConstantPointerNull>(SI->getOperand(0)))
+        continue;
+
+      // Check the value being stored.
+      Value *Ptr = GetUnderlyingObject(SI->getOperand(0),
+                                       GV->getParent()->getDataLayout());
+
+      if (!isAllocLikeFn(Ptr, &TLI))
+        return false; // Too hard to analyze.
+
+      // Analyze all uses of the allocation. If any of them are used in a
+      // non-simple way (e.g. stored to another global) bail out.
+      if (AnalyzeUsesOfPointer(Ptr, /*Readers*/ nullptr, /*Writers*/ nullptr,
+                               GV))
+        return false; // Loaded pointer escapes.
+
+      // Remember that this allocation is related to the indirect global.
+      AllocRelatedValues.push_back(Ptr);
+    } else {
+      // Something complex, bail out.
+      return false;
+    }
+  }
+
+  // Okay, this is an indirect global. Remember all of the allocations for
+  // this global in AllocsForIndirectGlobals.
+  while (!AllocRelatedValues.empty()) {
+    AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV;
+    Handles.emplace_front(*this, AllocRelatedValues.back());
+    Handles.front().I = Handles.begin();
+    AllocRelatedValues.pop_back();
+  }
+  IndirectGlobals.insert(GV);
+  Handles.emplace_front(*this, GV);
+  Handles.front().I = Handles.begin();
+  return true;
+}
+
+void GlobalsAAResult::CollectSCCMembership(CallGraph &CG) {
+  // We do a bottom-up SCC traversal of the call graph. In other words, we
+  // visit all callees before callers (leaf-first).
+  unsigned SCCID = 0;
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+
+    for (auto *CGN : SCC)
+      if (Function *F = CGN->getFunction())
+        FunctionToSCCMap[F] = SCCID;
+    ++SCCID;
+  }
+}
+
+/// AnalyzeCallGraph - At this point, we know the functions where globals are
+/// immediately stored to and read from. Propagate this information up the call
+/// graph to all callers and compute the mod/ref info for all memory for each
+/// function.
+void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
+  // We do a bottom-up SCC traversal of the call graph. In other words, we
+  // visit all callees before callers (leaf-first).
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+
+    if (!SCC[0]->getFunction() || SCC[0]->getFunction()->mayBeOverridden()) {
+      // Calls externally or is weak - can't say anything useful. Remove any
+      // existing function records (may have been created when scanning
+      // globals).
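
A hedged sketch of why overridable definitions poison the whole SCC may be useful here; the names (G, Hook, UseHook) are invented, and the weak attribute is shown in GCC/Clang syntax as one way such a symbol arises. A strong definition in another translation unit can replace the weak body at link time and touch globals this body never mentions, hence the erasure loop that follows:

static int G;

__attribute__((weak)) void Hook() {} // mayBeOverridden() is true for Hook

int UseHook() {
  G = 1;
  Hook();   // an overriding Hook that writes G would make any mod/ref
  return G; // info derived from the empty weak body above unsound
}
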
+      for (auto *Node : SCC)
+        FunctionInfos.erase(Node->getFunction());
+      continue;
+    }
+
+    FunctionInfo &FI = FunctionInfos[SCC[0]->getFunction()];
+    bool KnowNothing = false;
+
+    // Collect the mod/ref properties due to called functions. We only compute
+    // one mod-ref set.
+    for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) {
+      Function *F = SCC[i]->getFunction();
+      if (!F) {
+        KnowNothing = true;
+        break;
+      }
+
+      if (F->isDeclaration()) {
+        // Try to get mod/ref behaviour from function attributes.
+        if (F->doesNotAccessMemory()) {
+          // Can't do better than that!
+        } else if (F->onlyReadsMemory()) {
+          FI.addModRefInfo(MRI_Ref);
+          if (!F->isIntrinsic())
+            // This function might call back into the module and read a global -
+            // consider every global as possibly being read by this function.
+            FI.setMayReadAnyGlobal();
+        } else {
+          FI.addModRefInfo(MRI_ModRef);
+          // Can't say anything useful unless it's an intrinsic - they don't
+          // read or write global variables of the kind considered here.
+          KnowNothing = !F->isIntrinsic();
+        }
+        continue;
+      }
+
+      for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end();
+           CI != E && !KnowNothing; ++CI)
+        if (Function *Callee = CI->second->getFunction()) {
+          if (FunctionInfo *CalleeFI = getFunctionInfo(Callee)) {
+            // Propagate function effect up.
+            FI.addFunctionInfo(*CalleeFI);
+          } else {
+            // Can't say anything about it. However, if it is inside our SCC,
+            // then nothing needs to be done.
+            CallGraphNode *CalleeNode = CG[Callee];
+            if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end())
+              KnowNothing = true;
+          }
+        } else {
+          KnowNothing = true;
+        }
+    }
+
+    // If we can't say anything useful about this SCC, remove all SCC functions
+    // from the FunctionInfos map.
+    if (KnowNothing) {
+      for (auto *Node : SCC)
+        FunctionInfos.erase(Node->getFunction());
+      continue;
+    }
+
+    // Scan the function bodies for explicit loads or stores.
+    for (auto *Node : SCC) {
+      if (FI.getModRefInfo() == MRI_ModRef)
+        break; // The mod/ref lattice saturates here.
+      for (Instruction &I : instructions(Node->getFunction())) {
+        if (FI.getModRefInfo() == MRI_ModRef)
+          break; // The mod/ref lattice saturates here.
+
+        // We handle calls specially because the graph-relevant aspects are
+        // handled above.
+        if (auto CS = CallSite(&I)) {
+          if (isAllocationFn(&I, &TLI) || isFreeCall(&I, &TLI)) {
+            // FIXME: It is completely unclear why this is necessary and not
+            // handled by the above graph code.
+            FI.addModRefInfo(MRI_ModRef);
+          } else if (Function *Callee = CS.getCalledFunction()) {
+            // The callgraph doesn't include intrinsic calls.
+            if (Callee->isIntrinsic()) {
+              FunctionModRefBehavior Behaviour =
+                  AAResultBase::getModRefBehavior(Callee);
+              FI.addModRefInfo(ModRefInfo(Behaviour & MRI_ModRef));
+            }
+          }
+          continue;
+        }
+
+        // For all non-call instructions we use the primary predicates for
+        // whether they read or write memory.
+        if (I.mayReadFromMemory())
+          FI.addModRefInfo(MRI_Ref);
+        if (I.mayWriteToMemory())
+          FI.addModRefInfo(MRI_Mod);
+      }
+    }
+
+    if ((FI.getModRefInfo() & MRI_Mod) == 0)
+      ++NumReadMemFunctions;
+    if (FI.getModRefInfo() == MRI_NoModRef)
+      ++NumNoMemFunctions;
+
+    // Finally, now that we know the full effect on this SCC, clone the
+    // information to each function in the SCC.
+    // FI is a reference into FunctionInfos, so copy it now so that it doesn't
+    // get invalidated if DenseMap decides to re-hash.
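
An illustrative-only sketch of the invalidation hazard that this copy avoids (the function Hazard and its contents are invented, not part of the patch): inserting into a DenseMap may trigger a re-hash that moves every entry, leaving previously obtained references dangling.

#include "llvm/ADT/DenseMap.h"

void Hazard() {
  llvm::DenseMap<int, int> M;
  int &Ref = M[0];          // reference into the table
  for (int K = 1; K < 100; ++K)
    M[K] = K;               // growth may re-hash and move every entry
  (void)Ref;                // Ref may now dangle; do not dereference it
}

Copying FI into a local value before assigning into the same map, as the code below does with CachedFI, sidesteps exactly this hazard.
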
+    FunctionInfo CachedFI = FI;
+    for (unsigned i = 1, e = SCC.size(); i != e; ++i)
+      FunctionInfos[SCC[i]->getFunction()] = CachedFI;
+  }
+}
+
+// GV is a non-escaping global. V is a pointer address that has been loaded from.
+// If we can prove that V must escape, we can conclude that a load from V cannot
+// alias GV.
+static bool isNonEscapingGlobalNoAliasWithLoad(const GlobalValue *GV,
+                                               const Value *V,
+                                               int &Depth,
+                                               const DataLayout &DL) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Inputs;
+  Visited.insert(V);
+  Inputs.push_back(V);
+  do {
+    const Value *Input = Inputs.pop_back_val();
+
+    if (isa<GlobalValue>(Input) || isa<Argument>(Input) || isa<CallInst>(Input) ||
+        isa<InvokeInst>(Input))
+      // Arguments to functions or returns from functions are inherently
+      // escaping, so we can immediately classify those as not aliasing any
+      // non-addr-taken globals.
+      //
+      // (Transitive) loads from a global are also safe - if this aliased
+      // another global, its address would escape, so no alias.
+      continue;
+
+    // Recurse through a limited number of selects, loads and PHIs. This is an
+    // arbitrary depth of 4, lower numbers could be used to fix compile time
+    // issues if needed, but this is generally expected to only be important
+    // for small depths.
+    if (++Depth > 4)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(Input)) {
+      Inputs.push_back(GetUnderlyingObject(LI->getPointerOperand(), DL));
+      continue;
+    }
+    if (auto *SI = dyn_cast<SelectInst>(Input)) {
+      const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL);
+      const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL);
+      if (Visited.insert(LHS).second)
+        Inputs.push_back(LHS);
+      if (Visited.insert(RHS).second)
+        Inputs.push_back(RHS);
+      continue;
+    }
+    if (auto *PN = dyn_cast<PHINode>(Input)) {
+      for (const Value *Op : PN->incoming_values()) {
+        Op = GetUnderlyingObject(Op, DL);
+        if (Visited.insert(Op).second)
+          Inputs.push_back(Op);
+      }
+      continue;
+    }
+
+    return false;
+  } while (!Inputs.empty());
+
+  // All inputs were known to be no-alias.
+  return true;
+}
+
+// There are particular cases where we can conclude no-alias between
+// a non-addr-taken global and some other underlying object. Specifically,
+// a non-addr-taken global is known to not be escaped from any function. It is
+// also incorrect for a transformation to introduce an escape of a global in
+// a way that is observable when it was not there previously. One function
+// being transformed to introduce an escape which could possibly be observed
+// (via loading from a global or the return value for example) within another
+// function is never safe. If the observation is made through non-atomic
+// operations on different threads, it is a data-race and UB. If the
+// observation is well defined, by being observed the transformation would have
+// changed program behavior by introducing the observed escape, making it an
+// invalid transform.
+//
+// This property does require that transformations which *temporarily* escape
+// a global that was not previously escaped, prior to restoring it, cannot rely
+// on the results of GMR::alias. This seems a reasonable restriction, although
+// currently there is no way to enforce it. There is also no realistic
+// optimization pass that would make this mistake. The closest example is
+// a transformation pass which does reg2mem of SSA values but stores them into
+// global variables temporarily before restoring the global variable's value.
+// This could be useful to expose "benign" races for example. However, it seems
+// reasonable to require that a pass which introduces escapes of global
+// variables in this way to either not trust AA results while the escape is
+// active, or to be forced to operate as a module pass that cannot co-exist
+// with an alias analysis such as GMR.
+bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV,
+                                                 const Value *V) {
+  // In order to know that the underlying object cannot alias the
+  // non-addr-taken global, we must know that it would have to be an escape.
+  // Thus if the underlying object is a function argument, a load from
+  // a global, or the return of a function, it cannot alias. We can also
+  // recurse through PHI nodes and select nodes provided all of their inputs
+  // resolve to one of these known-escaping roots.
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Inputs;
+  Visited.insert(V);
+  Inputs.push_back(V);
+  int Depth = 0;
+  do {
+    const Value *Input = Inputs.pop_back_val();
+
+    if (auto *InputGV = dyn_cast<GlobalValue>(Input)) {
+      // If one input is the very global we're querying against, then we can't
+      // conclude no-alias.
+      if (InputGV == GV)
+        return false;
+
+      // Distinct GlobalVariables never alias, unless overridden or zero-sized.
+      // FIXME: The condition can be refined, but be conservative for now.
+      auto *GVar = dyn_cast<GlobalVariable>(GV);
+      auto *InputGVar = dyn_cast<GlobalVariable>(InputGV);
+      if (GVar && InputGVar &&
+          !GVar->isDeclaration() && !InputGVar->isDeclaration() &&
+          !GVar->mayBeOverridden() && !InputGVar->mayBeOverridden()) {
+        Type *GVType = GVar->getInitializer()->getType();
+        Type *InputGVType = InputGVar->getInitializer()->getType();
+        if (GVType->isSized() && InputGVType->isSized() &&
+            (DL.getTypeAllocSize(GVType) > 0) &&
+            (DL.getTypeAllocSize(InputGVType) > 0))
+          continue;
+      }
+
+      // Conservatively return false, even though we could be smarter
+      // (e.g. look through GlobalAliases).
+      return false;
+    }
+
+    if (isa<Argument>(Input) || isa<CallInst>(Input) ||
+        isa<InvokeInst>(Input)) {
+      // Arguments to functions or returns from functions are inherently
+      // escaping, so we can immediately classify those as not aliasing any
+      // non-addr-taken globals.
+      continue;
+    }
+
+    // Recurse through a limited number of selects, loads and PHIs. This is an
+    // arbitrary depth of 4, lower numbers could be used to fix compile time
+    // issues if needed, but this is generally expected to only be important
+    // for small depths.
+    if (++Depth > 4)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(Input)) {
+      // A pointer loaded from a global would have been captured, and we know
+      // that the global is non-escaping, so no alias.
+      const Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL);
+      if (isNonEscapingGlobalNoAliasWithLoad(GV, Ptr, Depth, DL))
+        // The load does not alias with GV.
+        continue;
+      // Otherwise, a load could come from anywhere, so bail.
+      return false;
+    }
+    if (auto *SI = dyn_cast<SelectInst>(Input)) {
+      const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL);
+      const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL);
+      if (Visited.insert(LHS).second)
+        Inputs.push_back(LHS);
+      if (Visited.insert(RHS).second)
+        Inputs.push_back(RHS);
+      continue;
+    }
+    if (auto *PN = dyn_cast<PHINode>(Input)) {
+      for (const Value *Op : PN->incoming_values()) {
+        Op = GetUnderlyingObject(Op, DL);
+        if (Visited.insert(Op).second)
+          Inputs.push_back(Op);
+      }
+      continue;
+    }
+
+    // FIXME: It would be good to handle other obvious no-alias cases here, but
+    // it isn't clear how to do so reasonably without building a small version
+    // of BasicAA into this code. We could recurse into AAResultBase::alias
+    // here but that seems likely to go poorly as we're inside the
+    // implementation of such a query. Until then, just conservatively return
+    // false.
+    return false;
+  } while (!Inputs.empty());
+
+  // If all the inputs to V were definitively no-alias, then V is no-alias.
+  return true;
+}
+
+/// alias - If one of the pointers is to a global that we are tracking, and the
+/// other is some random pointer, we know there cannot be an alias, because the
+/// address of the global isn't taken.
+AliasResult GlobalsAAResult::alias(const MemoryLocation &LocA,
+                                   const MemoryLocation &LocB) {
+  // Get the base object these pointers point to.
+  const Value *UV1 = GetUnderlyingObject(LocA.Ptr, DL);
+  const Value *UV2 = GetUnderlyingObject(LocB.Ptr, DL);
+
+  // If either of the underlying values is a global, they may be non-addr-taken
+  // globals, which we can answer queries about.
+  const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1);
+  const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2);
+  if (GV1 || GV2) {
+    // If the global's address is taken, pretend we don't know it's a pointer to
+    // the global.
+    if (GV1 && !NonAddressTakenGlobals.count(GV1))
+      GV1 = nullptr;
+    if (GV2 && !NonAddressTakenGlobals.count(GV2))
+      GV2 = nullptr;
+
+    // If the two pointers are derived from two different non-addr-taken
+    // globals we know these can't alias.
+    if (GV1 && GV2 && GV1 != GV2)
+      return NoAlias;
+
+    // If one is and the other isn't, it isn't strictly safe but we can fake
+    // this result if necessary for performance. This does not appear to be
+    // a common problem in practice.
+    if (EnableUnsafeGlobalsModRefAliasResults)
+      if ((GV1 || GV2) && GV1 != GV2)
+        return NoAlias;
+
+    // Check for a special case where a non-escaping global can be used to
+    // conclude no-alias.
+    if ((GV1 || GV2) && GV1 != GV2) {
+      const GlobalValue *GV = GV1 ? GV1 : GV2;
+      const Value *UV = GV1 ? UV2 : UV1;
+      if (isNonEscapingGlobalNoAlias(GV, UV))
+        return NoAlias;
+    }
+
+    // Otherwise if they are both derived from the same addr-taken global, we
+    // can't know the two accesses don't overlap.
+  }
+
+  // These pointers may be based on the memory owned by an indirect global. If
+  // so, we may be able to handle this. First check to see if the base pointer
+  // is a direct load from an indirect global.
+  GV1 = GV2 = nullptr;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV1))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV1 = GV;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV2))
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV2 = GV;
+
+  // These pointers may also be from an allocation for the indirect global. If
+  // so, also handle them.
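
An illustrative-only example of the "indirect global" pattern being handled (Buf, Init, and Mix are invented names): an internal global pointer whose only stores are fresh allocations, so the memory it points at is considered owned by the global.

#include <cstdlib>

static int *Buf; // internal; only ever stores fresh allocations

void Init() { Buf = static_cast<int *>(std::malloc(16 * sizeof(int))); }

int Mix(int *Other) {
  Buf[0] = 1;    // derived from Buf's owned allocation
  *Other = 2;    // a pointer unrelated to that allocation cannot alias it
  return Buf[0];
}

The lookups below map both direct loads of such a global and its recorded allocations back to the owning global before disambiguating.
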
+ if (!GV1) + GV1 = AllocsForIndirectGlobals.lookup(UV1); + if (!GV2) + GV2 = AllocsForIndirectGlobals.lookup(UV2); + + // Now that we know whether the two pointers are related to indirect globals, + // use this to disambiguate the pointers. If the pointers are based on + // different indirect globals they cannot alias. + if (GV1 && GV2 && GV1 != GV2) + return NoAlias; + + // If one is based on an indirect global and the other isn't, it isn't + // strictly safe but we can fake this result if necessary for performance. + // This does not appear to be a common problem in practice. + if (EnableUnsafeGlobalsModRefAliasResults) + if ((GV1 || GV2) && GV1 != GV2) + return NoAlias; + + return AAResultBase::alias(LocA, LocB); +} + +ModRefInfo GlobalsAAResult::getModRefInfoForArgument(ImmutableCallSite CS, + const GlobalValue *GV) { + if (CS.doesNotAccessMemory()) + return MRI_NoModRef; + ModRefInfo ConservativeResult = CS.onlyReadsMemory() ? MRI_Ref : MRI_ModRef; + + // Iterate through all the arguments to the called function. If any argument + // is based on GV, return the conservative result. + for (auto &A : CS.args()) { + SmallVector<Value*, 4> Objects; + GetUnderlyingObjects(A, Objects, DL); + + // All objects must be identified. + if (!std::all_of(Objects.begin(), Objects.end(), isIdentifiedObject)) + return ConservativeResult; + + if (std::find(Objects.begin(), Objects.end(), GV) != Objects.end()) + return ConservativeResult; + } + + // We identified all objects in the argument list, and none of them were GV. + return MRI_NoModRef; +} + +ModRefInfo GlobalsAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + unsigned Known = MRI_ModRef; + + // If we are asking for mod/ref info of a direct call with a pointer to a + // global we are tracking, return information if we have it. + if (const GlobalValue *GV = + dyn_cast<GlobalValue>(GetUnderlyingObject(Loc.Ptr, DL))) + if (GV->hasLocalLinkage()) + if (const Function *F = CS.getCalledFunction()) + if (NonAddressTakenGlobals.count(GV)) + if (const FunctionInfo *FI = getFunctionInfo(F)) + Known = FI->getModRefInfoForGlobal(*GV) | + getModRefInfoForArgument(CS, GV); + + if (Known == MRI_NoModRef) + return MRI_NoModRef; // No need to query other mod/ref analyses + return ModRefInfo(Known & AAResultBase::getModRefInfo(CS, Loc)); +} + +GlobalsAAResult::GlobalsAAResult(const DataLayout &DL, + const TargetLibraryInfo &TLI) + : AAResultBase(TLI), DL(DL) {} + +GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg) + : AAResultBase(std::move(Arg)), DL(Arg.DL), + NonAddressTakenGlobals(std::move(Arg.NonAddressTakenGlobals)), + IndirectGlobals(std::move(Arg.IndirectGlobals)), + AllocsForIndirectGlobals(std::move(Arg.AllocsForIndirectGlobals)), + FunctionInfos(std::move(Arg.FunctionInfos)), + Handles(std::move(Arg.Handles)) { + // Update the parent for each DeletionCallbackHandle. + for (auto &H : Handles) { + assert(H.GAR == &Arg); + H.GAR = this; + } +} + +/*static*/ GlobalsAAResult +GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI, + CallGraph &CG) { + GlobalsAAResult Result(M.getDataLayout(), TLI); + + // Discover which functions aren't recursive, to feed into AnalyzeGlobals. + Result.CollectSCCMembership(CG); + + // Find non-addr taken globals. + Result.AnalyzeGlobals(M); + + // Propagate on CG. 
+ Result.AnalyzeCallGraph(CG, M); + + return Result; +} + +GlobalsAAResult GlobalsAA::run(Module &M, AnalysisManager<Module> *AM) { + return GlobalsAAResult::analyzeModule(M, + AM->getResult<TargetLibraryAnalysis>(M), + AM->getResult<CallGraphAnalysis>(M)); +} + +char GlobalsAA::PassID; + +char GlobalsAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(GlobalsAAWrapperPass, "globals-aa", + "Globals Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(GlobalsAAWrapperPass, "globals-aa", + "Globals Alias Analysis", false, true) + +ModulePass *llvm::createGlobalsAAWrapperPass() { + return new GlobalsAAWrapperPass(); +} + +GlobalsAAWrapperPass::GlobalsAAWrapperPass() : ModulePass(ID) { + initializeGlobalsAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool GlobalsAAWrapperPass::runOnModule(Module &M) { + Result.reset(new GlobalsAAResult(GlobalsAAResult::analyzeModule( + M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<CallGraphWrapperPass>().getCallGraph()))); + return false; +} + +bool GlobalsAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void GlobalsAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<CallGraphWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} diff --git a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp deleted file mode 100644 index 28fb49c..0000000 --- a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp +++ /dev/null @@ -1,609 +0,0 @@ -//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This simple pass provides alias and mod/ref information for global values -// that do not have their address taken, and keeps track of whether functions -// read or write memory (are "pure"). For this simple (but very common) case, -// we can provide pretty accurate and useful information. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include <set> -using namespace llvm; - -#define DEBUG_TYPE "globalsmodref-aa" - -STATISTIC(NumNonAddrTakenGlobalVars, - "Number of global vars without address taken"); -STATISTIC(NumNonAddrTakenFunctions,"Number of functions without address taken"); -STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory"); -STATISTIC(NumReadMemFunctions, "Number of functions that only read memory"); -STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects"); - -namespace { -/// FunctionRecord - One instance of this structure is stored for every -/// function in the program. 
Later, the entries for these functions are -/// removed if the function is found to call an external function (in which -/// case we know nothing about it. -struct FunctionRecord { - /// GlobalInfo - Maintain mod/ref info for all of the globals without - /// addresses taken that are read or written (transitively) by this - /// function. - std::map<const GlobalValue *, unsigned> GlobalInfo; - - /// MayReadAnyGlobal - May read global variables, but it is not known which. - bool MayReadAnyGlobal; - - unsigned getInfoForGlobal(const GlobalValue *GV) const { - unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0; - std::map<const GlobalValue *, unsigned>::const_iterator I = - GlobalInfo.find(GV); - if (I != GlobalInfo.end()) - Effect |= I->second; - return Effect; - } - - /// FunctionEffect - Capture whether or not this function reads or writes to - /// ANY memory. If not, we can do a lot of aggressive analysis on it. - unsigned FunctionEffect; - - FunctionRecord() : MayReadAnyGlobal(false), FunctionEffect(0) {} -}; - -/// GlobalsModRef - The actual analysis pass. -class GlobalsModRef : public ModulePass, public AliasAnalysis { - /// NonAddressTakenGlobals - The globals that do not have their addresses - /// taken. - std::set<const GlobalValue *> NonAddressTakenGlobals; - - /// IndirectGlobals - The memory pointed to by this global is known to be - /// 'owned' by the global. - std::set<const GlobalValue *> IndirectGlobals; - - /// AllocsForIndirectGlobals - If an instruction allocates memory for an - /// indirect global, this map indicates which one. - std::map<const Value *, const GlobalValue *> AllocsForIndirectGlobals; - - /// FunctionInfo - For each function, keep track of what globals are - /// modified or read. - std::map<const Function *, FunctionRecord> FunctionInfo; - -public: - static char ID; - GlobalsModRef() : ModulePass(ID) { - initializeGlobalsModRefPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - InitializeAliasAnalysis(this, &M.getDataLayout()); - - // Find non-addr taken globals. - AnalyzeGlobals(M); - - // Propagate on CG. - AnalyzeCallGraph(getAnalysis<CallGraphWrapperPass>().getCallGraph(), M); - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.addRequired<CallGraphWrapperPass>(); - AU.setPreservesAll(); // Does not transform code - } - - //------------------------------------------------ - // Implement the AliasAnalysis API - // - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1, CS2); - } - - /// getModRefBehavior - Return the behavior of the specified function if - /// called from the specified call site. The call site may be null in which - /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(const Function *F) override { - ModRefBehavior Min = UnknownModRefBehavior; - - if (FunctionRecord *FR = getFunctionInfo(F)) { - if (FR->FunctionEffect == 0) - Min = DoesNotAccessMemory; - else if ((FR->FunctionEffect & Mod) == 0) - Min = OnlyReadsMemory; - } - - return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min); - } - - /// getModRefBehavior - Return the behavior of the specified function if - /// called from the specified call site. 
The call site may be null in which - /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override { - ModRefBehavior Min = UnknownModRefBehavior; - - if (const Function *F = CS.getCalledFunction()) - if (FunctionRecord *FR = getFunctionInfo(F)) { - if (FR->FunctionEffect == 0) - Min = DoesNotAccessMemory; - else if ((FR->FunctionEffect & Mod) == 0) - Min = OnlyReadsMemory; - } - - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); - } - - void deleteValue(Value *V) override; - void addEscapingUse(Use &U) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis *)this; - return this; - } - -private: - /// getFunctionInfo - Return the function info for the function, or null if - /// we don't have anything useful to say about it. - FunctionRecord *getFunctionInfo(const Function *F) { - std::map<const Function *, FunctionRecord>::iterator I = - FunctionInfo.find(F); - if (I != FunctionInfo.end()) - return &I->second; - return nullptr; - } - - void AnalyzeGlobals(Module &M); - void AnalyzeCallGraph(CallGraph &CG, Module &M); - bool AnalyzeUsesOfPointer(Value *V, std::vector<Function *> &Readers, - std::vector<Function *> &Writers, - GlobalValue *OkayStoreDest = nullptr); - bool AnalyzeIndirectGlobalMemory(GlobalValue *GV); -}; -} - -char GlobalsModRef::ID = 0; -INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis, "globalsmodref-aa", - "Simple mod/ref analysis for globals", false, true, - false) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis, "globalsmodref-aa", - "Simple mod/ref analysis for globals", false, true, - false) - -Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); } - -/// AnalyzeGlobals - Scan through the users of all of the internal -/// GlobalValue's in the program. If none of them have their "address taken" -/// (really, their address passed to something nontrivial), record this fact, -/// and record the functions that they are used directly in. -void GlobalsModRef::AnalyzeGlobals(Module &M) { - std::vector<Function *> Readers, Writers; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasLocalLinkage()) { - if (!AnalyzeUsesOfPointer(I, Readers, Writers)) { - // Remember that we are tracking this global. - NonAddressTakenGlobals.insert(I); - ++NumNonAddrTakenFunctions; - } - Readers.clear(); - Writers.clear(); - } - - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; - ++I) - if (I->hasLocalLinkage()) { - if (!AnalyzeUsesOfPointer(I, Readers, Writers)) { - // Remember that we are tracking this global, and the mod/ref fns - NonAddressTakenGlobals.insert(I); - - for (unsigned i = 0, e = Readers.size(); i != e; ++i) - FunctionInfo[Readers[i]].GlobalInfo[I] |= Ref; - - if (!I->isConstant()) // No need to keep track of writers to constants - for (unsigned i = 0, e = Writers.size(); i != e; ++i) - FunctionInfo[Writers[i]].GlobalInfo[I] |= Mod; - ++NumNonAddrTakenGlobalVars; - - // If this global holds a pointer type, see if it is an indirect global. 
- if (I->getType()->getElementType()->isPointerTy() && - AnalyzeIndirectGlobalMemory(I)) - ++NumIndirectGlobalVars; - } - Readers.clear(); - Writers.clear(); - } -} - -/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer. -/// If this is used by anything complex (i.e., the address escapes), return -/// true. Also, while we are at it, keep track of those functions that read and -/// write to the value. -/// -/// If OkayStoreDest is non-null, stores into this global are allowed. -bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V, - std::vector<Function *> &Readers, - std::vector<Function *> &Writers, - GlobalValue *OkayStoreDest) { - if (!V->getType()->isPointerTy()) - return true; - - for (Use &U : V->uses()) { - User *I = U.getUser(); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - Readers.push_back(LI->getParent()->getParent()); - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (V == SI->getOperand(1)) { - Writers.push_back(SI->getParent()->getParent()); - } else if (SI->getOperand(1) != OkayStoreDest) { - return true; // Storing the pointer - } - } else if (Operator::getOpcode(I) == Instruction::GetElementPtr) { - if (AnalyzeUsesOfPointer(I, Readers, Writers)) - return true; - } else if (Operator::getOpcode(I) == Instruction::BitCast) { - if (AnalyzeUsesOfPointer(I, Readers, Writers, OkayStoreDest)) - return true; - } else if (auto CS = CallSite(I)) { - // Make sure that this is just the function being called, not that it is - // passing into the function. - if (!CS.isCallee(&U)) { - // Detect calls to free. - if (isFreeCall(I, TLI)) - Writers.push_back(CS->getParent()->getParent()); - else - return true; // Argument of an unknown call. - } - } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { - if (!isa<ConstantPointerNull>(ICI->getOperand(1))) - return true; // Allow comparison against null. - } else { - return true; - } - } - - return false; -} - -/// AnalyzeIndirectGlobalMemory - We found an non-address-taken global variable -/// which holds a pointer type. See if the global always points to non-aliased -/// heap memory: that is, all initializers of the globals are allocations, and -/// those allocations have no use other than initialization of the global. -/// Further, all loads out of GV must directly use the memory, not store the -/// pointer somewhere. If this is true, we consider the memory pointed to by -/// GV to be owned by GV and can disambiguate other pointers from it. -bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) { - // Keep track of values related to the allocation of the memory, f.e. the - // value produced by the malloc call and any casts. - std::vector<Value *> AllocRelatedValues; - - // Walk the user list of the global. If we find anything other than a direct - // load or store, bail out. - for (User *U : GV->users()) { - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - // The pointer loaded from the global can only be used in simple ways: - // we allow addressing of it and loading storing to it. We do *not* allow - // storing the loaded pointer somewhere else or passing to a function. - std::vector<Function *> ReadersWriters; - if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters)) - return false; // Loaded pointer escapes. - // TODO: Could try some IP mod/ref of the loaded pointer. - } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - // Storing the global itself. - if (SI->getOperand(0) == GV) - return false; - - // If storing the null pointer, ignore it. 
- if (isa<ConstantPointerNull>(SI->getOperand(0))) - continue; - - // Check the value being stored. - Value *Ptr = GetUnderlyingObject(SI->getOperand(0), - GV->getParent()->getDataLayout()); - - if (!isAllocLikeFn(Ptr, TLI)) - return false; // Too hard to analyze. - - // Analyze all uses of the allocation. If any of them are used in a - // non-simple way (e.g. stored to another global) bail out. - std::vector<Function *> ReadersWriters; - if (AnalyzeUsesOfPointer(Ptr, ReadersWriters, ReadersWriters, GV)) - return false; // Loaded pointer escapes. - - // Remember that this allocation is related to the indirect global. - AllocRelatedValues.push_back(Ptr); - } else { - // Something complex, bail out. - return false; - } - } - - // Okay, this is an indirect global. Remember all of the allocations for - // this global in AllocsForIndirectGlobals. - while (!AllocRelatedValues.empty()) { - AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV; - AllocRelatedValues.pop_back(); - } - IndirectGlobals.insert(GV); - return true; -} - -/// AnalyzeCallGraph - At this point, we know the functions where globals are -/// immediately stored to and read from. Propagate this information up the call -/// graph to all callers and compute the mod/ref info for all memory for each -/// function. -void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { - // We do a bottom-up SCC traversal of the call graph. In other words, we - // visit all callees before callers (leaf-first). - for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) { - const std::vector<CallGraphNode *> &SCC = *I; - assert(!SCC.empty() && "SCC with no functions?"); - - if (!SCC[0]->getFunction()) { - // Calls externally - can't say anything useful. Remove any existing - // function records (may have been created when scanning globals). - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - FunctionInfo.erase(SCC[i]->getFunction()); - continue; - } - - FunctionRecord &FR = FunctionInfo[SCC[0]->getFunction()]; - - bool KnowNothing = false; - unsigned FunctionEffect = 0; - - // Collect the mod/ref properties due to called functions. We only compute - // one mod-ref set. - for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) { - Function *F = SCC[i]->getFunction(); - if (!F) { - KnowNothing = true; - break; - } - - if (F->isDeclaration()) { - // Try to get mod/ref behaviour from function attributes. - if (F->doesNotAccessMemory()) { - // Can't do better than that! - } else if (F->onlyReadsMemory()) { - FunctionEffect |= Ref; - if (!F->isIntrinsic()) - // This function might call back into the module and read a global - - // consider every global as possibly being read by this function. - FR.MayReadAnyGlobal = true; - } else { - FunctionEffect |= ModRef; - // Can't say anything useful unless it's an intrinsic - they don't - // read or write global variables of the kind considered here. - KnowNothing = !F->isIntrinsic(); - } - continue; - } - - for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end(); - CI != E && !KnowNothing; ++CI) - if (Function *Callee = CI->second->getFunction()) { - if (FunctionRecord *CalleeFR = getFunctionInfo(Callee)) { - // Propagate function effect up. - FunctionEffect |= CalleeFR->FunctionEffect; - - // Incorporate callee's effects on globals into our info. - for (const auto &G : CalleeFR->GlobalInfo) - FR.GlobalInfo[G.first] |= G.second; - FR.MayReadAnyGlobal |= CalleeFR->MayReadAnyGlobal; - } else { - // Can't say anything about it. 
However, if it is inside our SCC, - // then nothing needs to be done. - CallGraphNode *CalleeNode = CG[Callee]; - if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end()) - KnowNothing = true; - } - } else { - KnowNothing = true; - } - } - - // If we can't say anything useful about this SCC, remove all SCC functions - // from the FunctionInfo map. - if (KnowNothing) { - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - FunctionInfo.erase(SCC[i]->getFunction()); - continue; - } - - // Scan the function bodies for explicit loads or stores. - for (auto *Node : SCC) { - if (FunctionEffect == ModRef) - break; // The mod/ref lattice saturates here. - for (Instruction &I : inst_range(Node->getFunction())) { - if (FunctionEffect == ModRef) - break; // The mod/ref lattice saturates here. - - // We handle calls specially because the graph-relevant aspects are - // handled above. - if (auto CS = CallSite(&I)) { - if (isAllocationFn(&I, TLI) || isFreeCall(&I, TLI)) { - // FIXME: It is completely unclear why this is necessary and not - // handled by the above graph code. - FunctionEffect |= ModRef; - } else if (Function *Callee = CS.getCalledFunction()) { - // The callgraph doesn't include intrinsic calls. - if (Callee->isIntrinsic()) { - ModRefBehavior Behaviour = - AliasAnalysis::getModRefBehavior(Callee); - FunctionEffect |= (Behaviour & ModRef); - } - } - continue; - } - - // All non-call instructions we use the primary predicates for whether - // thay read or write memory. - if (I.mayReadFromMemory()) - FunctionEffect |= Ref; - if (I.mayWriteToMemory()) - FunctionEffect |= Mod; - } - } - - if ((FunctionEffect & Mod) == 0) - ++NumReadMemFunctions; - if (FunctionEffect == 0) - ++NumNoMemFunctions; - FR.FunctionEffect = FunctionEffect; - - // Finally, now that we know the full effect on this SCC, clone the - // information to each function in the SCC. - for (unsigned i = 1, e = SCC.size(); i != e; ++i) - FunctionInfo[SCC[i]->getFunction()] = FR; - } -} - -/// alias - If one of the pointers is to a global that we are tracking, and the -/// other is some random pointer, we know there cannot be an alias, because the -/// address of the global isn't taken. -AliasResult GlobalsModRef::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - // Get the base object these pointers point to. - const Value *UV1 = GetUnderlyingObject(LocA.Ptr, *DL); - const Value *UV2 = GetUnderlyingObject(LocB.Ptr, *DL); - - // If either of the underlying values is a global, they may be non-addr-taken - // globals, which we can answer queries about. - const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1); - const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2); - if (GV1 || GV2) { - // If the global's address is taken, pretend we don't know it's a pointer to - // the global. - if (GV1 && !NonAddressTakenGlobals.count(GV1)) - GV1 = nullptr; - if (GV2 && !NonAddressTakenGlobals.count(GV2)) - GV2 = nullptr; - - // If the two pointers are derived from two different non-addr-taken - // globals, or if one is and the other isn't, we know these can't alias. - if ((GV1 || GV2) && GV1 != GV2) - return NoAlias; - - // Otherwise if they are both derived from the same addr-taken global, we - // can't know the two accesses don't overlap. - } - - // These pointers may be based on the memory owned by an indirect global. If - // so, we may be able to handle this. First check to see if the base pointer - // is a direct load from an indirect global. 
- GV1 = GV2 = nullptr; - if (const LoadInst *LI = dyn_cast<LoadInst>(UV1)) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) - if (IndirectGlobals.count(GV)) - GV1 = GV; - if (const LoadInst *LI = dyn_cast<LoadInst>(UV2)) - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) - if (IndirectGlobals.count(GV)) - GV2 = GV; - - // These pointers may also be from an allocation for the indirect global. If - // so, also handle them. - if (AllocsForIndirectGlobals.count(UV1)) - GV1 = AllocsForIndirectGlobals[UV1]; - if (AllocsForIndirectGlobals.count(UV2)) - GV2 = AllocsForIndirectGlobals[UV2]; - - // Now that we know whether the two pointers are related to indirect globals, - // use this to disambiguate the pointers. If either pointer is based on an - // indirect global and if they are not both based on the same indirect global, - // they cannot alias. - if ((GV1 || GV2) && GV1 != GV2) - return NoAlias; - - return AliasAnalysis::alias(LocA, LocB); -} - -AliasAnalysis::ModRefResult -GlobalsModRef::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) { - unsigned Known = ModRef; - - // If we are asking for mod/ref info of a direct call with a pointer to a - // global we are tracking, return information if we have it. - const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); - if (const GlobalValue *GV = - dyn_cast<GlobalValue>(GetUnderlyingObject(Loc.Ptr, DL))) - if (GV->hasLocalLinkage()) - if (const Function *F = CS.getCalledFunction()) - if (NonAddressTakenGlobals.count(GV)) - if (const FunctionRecord *FR = getFunctionInfo(F)) - Known = FR->getInfoForGlobal(GV); - - if (Known == NoModRef) - return NoModRef; // No need to query other mod/ref analyses - return ModRefResult(Known & AliasAnalysis::getModRefInfo(CS, Loc)); -} - -//===----------------------------------------------------------------------===// -// Methods to update the analysis as a result of the client transformation. -// -void GlobalsModRef::deleteValue(Value *V) { - if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - if (NonAddressTakenGlobals.erase(GV)) { - // This global might be an indirect global. If so, remove it and remove - // any AllocRelatedValues for it. - if (IndirectGlobals.erase(GV)) { - // Remove any entries in AllocsForIndirectGlobals for this global. - for (std::map<const Value *, const GlobalValue *>::iterator - I = AllocsForIndirectGlobals.begin(), - E = AllocsForIndirectGlobals.end(); - I != E;) { - if (I->second == GV) { - AllocsForIndirectGlobals.erase(I++); - } else { - ++I; - } - } - } - } - } - - // Otherwise, if this is an allocation related to an indirect global, remove - // it. - AllocsForIndirectGlobals.erase(V); - - AliasAnalysis::deleteValue(V); -} - -void GlobalsModRef::addEscapingUse(Use &U) { - // For the purposes of this analysis, it is conservatively correct to treat - // a newly escaping value equivalently to a deleted one. We could perhaps - // be more precise by processing the new use and attempting to update our - // saved analysis results to accommodate it. 
- deleteValue(U); - - AliasAnalysis::addEscapingUse(U); -} diff --git a/contrib/llvm/lib/Analysis/IPA/IPA.cpp b/contrib/llvm/lib/Analysis/IPA/IPA.cpp deleted file mode 100644 index 806bfb8..0000000 --- a/contrib/llvm/lib/Analysis/IPA/IPA.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===-- IPA.cpp -----------------------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the common initialization routines for the IPA library. -// -//===----------------------------------------------------------------------===// - -#include "llvm/InitializePasses.h" -#include "llvm-c/Initialization.h" -#include "llvm/PassRegistry.h" - -using namespace llvm; - -/// initializeIPA - Initialize all passes linked into the IPA library. -void llvm::initializeIPA(PassRegistry &Registry) { - initializeCallGraphWrapperPassPass(Registry); - initializeCallGraphPrinterPass(Registry); - initializeCallGraphViewerPass(Registry); - initializeGlobalsModRefPass(Registry); -} - -void LLVMInitializeIPA(LLVMPassRegistryRef R) { - initializeIPA(*unwrap(R)); -} diff --git a/contrib/llvm/lib/Analysis/IVUsers.cpp b/contrib/llvm/lib/Analysis/IVUsers.cpp index 926787d..e0c5d8f 100644 --- a/contrib/llvm/lib/Analysis/IVUsers.cpp +++ b/contrib/llvm/lib/Analysis/IVUsers.cpp @@ -39,7 +39,7 @@ INITIALIZE_PASS_BEGIN(IVUsers, "iv-users", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(IVUsers, "iv-users", "Induction Variable Users", false, true) @@ -255,7 +255,7 @@ void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.setPreservesAll(); } @@ -266,7 +266,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { *L->getHeader()->getParent()); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Collect ephemeral values so that AddUsersIfInteresting skips them. EphValues.clear(); @@ -276,7 +276,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { // them by stride. Start by finding all of the PHI nodes in the header for // this loop. If they are induction variables, inspect their uses. for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) - (void)AddUsersIfInteresting(I); + (void)AddUsersIfInteresting(&*I); return false; } diff --git a/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp index c0d2e37..a86a703 100644 --- a/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp +++ b/contrib/llvm/lib/Analysis/InlineCost.cpp @@ -115,11 +115,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { /// inlining has the given attribute set either at the call site or the /// function declaration. Primarily used to inspect call site specific /// attributes since these can be more precise than the ones on the callee - /// itself. 
+ /// itself. bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); /// Return true if the given value is known non null within the callee if - /// inlined through this particular callsite. + /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); // Custom analysis routines. @@ -156,6 +156,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { bool visitSwitchInst(SwitchInst &SI); bool visitIndirectBrInst(IndirectBrInst &IBI); bool visitResumeInst(ResumeInst &RI); + bool visitCleanupReturnInst(CleanupReturnInst &RI); + bool visitCatchReturnInst(CatchReturnInst &RI); bool visitUnreachableInst(UnreachableInst &I); public: @@ -832,8 +834,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the - // bonus we want to apply, but don't go below zero. - Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost()); + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); } return Base::visitCallSite(CS); @@ -903,6 +905,18 @@ bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { return false; } +bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { + // FIXME: It's not clear that a single instruction is an accurate model for + // the inline cost of a cleanupret instruction. + return false; +} + +bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { + // FIXME: It's not clear that a single instruction is an accurate model for + // the inline cost of a catchret instruction. + return false; +} + bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { // FIXME: It might be reasonably to discount the cost of instructions leading // to unreachable as they have the lowest possible impact on both runtime and @@ -946,20 +960,21 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB, continue; // Skip ephemeral values. - if (EphValues.count(I)) + if (EphValues.count(&*I)) continue; ++NumInstructions; if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy()) ++NumVectorInstructions; - // If the instruction is floating point, and the target says this operation is - // expensive or the function has the "use-soft-float" attribute, this may - // eventually become a library call. Treat the cost as such. + // If the instruction is floating point, and the target says this operation + // is expensive or the function has the "use-soft-float" attribute, this may + // eventually become a library call. Treat the cost as such. if (I->getType()->isFloatingPointTy()) { bool hasSoftFloatAttr = false; - // If the function has the "use-soft-float" attribute, mark it as expensive. + // If the function has the "use-soft-float" attribute, mark it as + // expensive. if (F.hasFnAttribute("use-soft-float")) { Attribute Attr = F.getFnAttribute("use-soft-float"); StringRef Val = Attr.getValueAsString(); @@ -977,7 +992,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB, // all of the per-instruction logic. The visit tree returns true if we // consumed the instruction in any way, and false if the instruction's base // cost should count against inlining. 
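In visitCallSite above, the bonus for a profitably-inlinable indirect call is now derived from the analyzed callee's own threshold instead of a fixed constant. A numeric sketch of the arithmetic (all values invented for illustration):

#include <algorithm>
#include <cstdio>

// Credit the caller with the callee's unused budget, never going negative.
int applyIndirectCallBonus(int Cost, int CalleeThreshold, int CalleeCost) {
  return Cost - std::max(0, CalleeThreshold - CalleeCost);
}

int main() {
  std::printf("%d\n", applyIndirectCallBonus(100, 300, 50));  // 100 - 250 = -150
  std::printf("%d\n", applyIndirectCallBonus(100, 300, 400)); // no credit: 100
}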
- if (Base::visit(I)) + if (Base::visit(&*I)) ++NumInstructionsSimplified; else Cost += InlineConstants::InstrCost; @@ -1157,15 +1172,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { FAI != FAE; ++FAI, ++CAI) { assert(CAI != CS.arg_end()); if (Constant *C = dyn_cast<Constant>(CAI)) - SimplifiedValues[FAI] = C; + SimplifiedValues[&*FAI] = C; Value *PtrArg = *CAI; if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { - ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue()); + ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); // We can SROA any pointer arguments derived from alloca instructions. if (isa<AllocaInst>(PtrArg)) { - SROAArgValues[FAI] = PtrArg; + SROAArgValues[&*FAI] = PtrArg; SROAArgCosts[PtrArg] = 0; } } @@ -1281,7 +1296,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { else if (NumVectorInstructions <= NumInstructions / 2) Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus); - return Cost < Threshold; + return Cost <= std::max(0, Threshold); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1304,36 +1319,6 @@ void CallAnalyzer::dump() { } #endif -INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", - true, true) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", - true, true) - -char InlineCostAnalysis::ID = 0; - -InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {} - -InlineCostAnalysis::~InlineCostAnalysis() {} - -void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - CallGraphSCCPass::getAnalysisUsage(AU); -} - -bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) { - TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); - ACT = &getAnalysis<AssumptionCacheTracker>(); - return false; -} - -InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) { - return getInlineCost(CS, CS.getCalledFunction(), Threshold); -} - /// \brief Test that two functions either have or have not the given attribute /// at the same time. template<typename AttrKind> @@ -1346,14 +1331,19 @@ static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) { static bool functionsHaveCompatibleAttributes(Function *Caller, Function *Callee, TargetTransformInfo &TTI) { - return TTI.hasCompatibleFunctionAttributes(Caller, Callee) && - attributeMatches(Caller, Callee, Attribute::SanitizeAddress) && - attributeMatches(Caller, Callee, Attribute::SanitizeMemory) && - attributeMatches(Caller, Callee, Attribute::SanitizeThread); + return TTI.areInlineCompatible(Caller, Callee) && + AttributeFuncs::areInlineCompatible(*Caller, *Callee); +} + +InlineCost llvm::getInlineCost(CallSite CS, int Threshold, + TargetTransformInfo &CalleeTTI, + AssumptionCacheTracker *ACT) { + return getInlineCost(CS, CS.getCalledFunction(), Threshold, CalleeTTI, ACT); } -InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, - int Threshold) { +InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, int Threshold, + TargetTransformInfo &CalleeTTI, + AssumptionCacheTracker *ACT) { // Cannot inline indirect calls. 
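Two recurring details in the hunks above: iterator-valued expressions are rewritten as &*I, &*FAI and so on (dereference, then address-of), which yields a plain pointer once the implicit iterator-to-pointer conversion is no longer available (my reading of the diff, which does not state a rationale); and the final verdict becomes Cost <= std::max(0, Threshold). As I read the new comparison, the boundary is now inclusive and a threshold driven negative by penalties is clamped, so free calls still inline:

#include <algorithm>
#include <cstdio>

bool shouldInline(int Cost, int Threshold) {
  return Cost <= std::max(0, Threshold); // clamp, then inclusive compare
}

int main() {
  std::printf("%d\n", shouldInline(0, -5)); // 1: zero-cost call inlines anyway
  std::printf("%d\n", shouldInline(5, 5));  // 1: boundary is now inclusive
  std::printf("%d\n", shouldInline(6, 5));  // 0
}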
if (!Callee) return llvm::InlineCost::getNever(); @@ -1368,8 +1358,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, // Never inline functions with conflicting attributes (unless callee has // always-inline attribute). - if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, - TTIWP->getTTI(*Callee))) + if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) return llvm::InlineCost::getNever(); // Don't inline this call if the caller has the optnone attribute. @@ -1386,7 +1375,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS); + CallAnalyzer CA(CalleeTTI, ACT, *Callee, Threshold, CS); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1400,7 +1389,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); } -bool InlineCostAnalysis::isInlineViable(Function &F) { +bool llvm::isInlineViable(Function &F) { bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Disallow inlining of functions which contain indirect branches or @@ -1408,9 +1397,8 @@ bool InlineCostAnalysis::isInlineViable(Function &F) { if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken()) return false; - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; - ++II) { - CallSite CS(II); + for (auto &II : *BI) { + CallSite CS(&II); if (!CS) continue; diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp index a7f8f5c..6dfe625 100644 --- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp @@ -70,7 +70,7 @@ static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned); static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned); static Value *SimplifyTruncInst(Value *, Type *, const Query &, unsigned); -/// getFalse - For a boolean type, or a vector of boolean type, return false, or +/// For a boolean type, or a vector of boolean type, return false, or /// a vector with every element false, as appropriate for the type. static Constant *getFalse(Type *Ty) { assert(Ty->getScalarType()->isIntegerTy(1) && @@ -78,7 +78,7 @@ static Constant *getFalse(Type *Ty) { return Constant::getNullValue(Ty); } -/// getTrue - For a boolean type, or a vector of boolean type, return true, or +/// For a boolean type, or a vector of boolean type, return true, or /// a vector with every element true, as appropriate for the type. static Constant *getTrue(Type *Ty) { assert(Ty->getScalarType()->isIntegerTy(1) && @@ -100,7 +100,7 @@ static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, CRHS == LHS; } -/// ValueDominatesPHI - Does the given value dominate the specified phi node? +/// Does the given value dominate the specified phi node? 
static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { Instruction *I = dyn_cast<Instruction>(V); if (!I) @@ -122,7 +122,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return DT->dominates(I, P); } - // Otherwise, if the instruction is in the entry block, and is not an invoke, + // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. if (I->getParent() == &I->getParent()->getParent()->getEntryBlock() && !isa<InvokeInst>(I)) @@ -131,8 +131,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return false; } -/// ExpandBinOp - Simplify "A op (B op' C)" by distributing op over op', turning -/// it into "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is +/// Simplify "A op (B op' C)" by distributing op over op', turning it into +/// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is /// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS. /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. @@ -193,8 +193,8 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, return nullptr; } -/// SimplifyAssociativeBinOp - Generic simplifications for associative binary -/// operations. Returns the simpler value, or null if none was found. +/// Generic simplifications for associative binary operations. +/// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc; @@ -290,10 +290,10 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, return nullptr; } -/// ThreadBinOpOverSelect - In the case of a binary operation with a select -/// instruction as an operand, try to simplify the binop by seeing whether -/// evaluating it on both branches of the select results in the same value. -/// Returns the common value if so, otherwise returns null. +/// In the case of a binary operation with a select instruction as an operand, +/// try to simplify the binop by seeing whether evaluating it on both branches +/// of the select results in the same value. Returns the common value if so, +/// otherwise returns null. static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -362,10 +362,9 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, return nullptr; } -/// ThreadCmpOverSelect - In the case of a comparison with a select instruction, -/// try to simplify the comparison by seeing whether both branches of the select -/// result in the same value. Returns the common value if so, otherwise returns -/// null. +/// In the case of a comparison with a select instruction, try to simplify the +/// comparison by seeing whether both branches of the select result in the same +/// value. Returns the common value if so, otherwise returns null. 
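The ExpandBinOp comment rewritten above describes plain distribution. With op = mul and op' = add the two forms are equal for all integers, which is what licenses the rewrite; the real routine keeps the expanded form only when one of the new sub-expressions simplifies further, and returns null otherwise:

#include <cstdio>

int original(int A, int B, int C)    { return A * (B + C); }
int distributed(int A, int B, int C) { return (A * B) + (A * C); }

int main() {
  // "A op (B op' C)" == "(A op B) op' (A op C)" for mul over add.
  std::printf("%d %d\n", original(3, 4, 5), distributed(3, 4, 5)); // 27 27
}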
static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { @@ -444,10 +443,10 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -/// ThreadBinOpOverPHI - In the case of a binary operation with an operand that -/// is a PHI instruction, try to simplify the binop by seeing whether evaluating -/// it on the incoming phi values yields the same result for every value. If so -/// returns the common value, otherwise returns null. +/// In the case of a binary operation with an operand that is a PHI instruction, +/// try to simplify the binop by seeing whether evaluating it on the incoming +/// phi values yields the same result for every value. If so returns the common +/// value, otherwise returns null. static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -486,10 +485,10 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, return CommonValue; } -/// ThreadCmpOverPHI - In the case of a comparison with a PHI instruction, try -/// try to simplify the comparison by seeing whether comparing with all of the -/// incoming phi values yields the same result every time. If so returns the -/// common result, otherwise returns null. +/// In the case of a comparison with a PHI instruction, try to simplify the +/// comparison by seeing whether comparing with all of the incoming phi values +/// yields the same result every time. If so returns the common result, +/// otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -524,8 +523,8 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, return CommonValue; } -/// SimplifyAddInst - Given operands for an Add, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Add, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -656,8 +655,8 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, return ConstantExpr::getSub(LHSOffset, RHSOffset); } -/// SimplifySubInst - Given operands for a Sub, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Sub, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) @@ -889,8 +888,8 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, return nullptr; } -/// SimplifyMulInst - Given operands for a Mul, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Mul, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -989,8 +988,8 @@ Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyDiv - Given operands for an SDiv or UDiv, see if we can -/// fold the result. If not, this returns null. 
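ThreadBinOpOverSelect/PHI and ThreadCmpOverSelect/PHI, whose doc comments are cleaned up above, all share one idea: evaluate the operation against each incoming (or selected) value and simplify only when every path agrees. A C++17 sketch with stand-in types:

#include <functional>
#include <optional>
#include <vector>

std::optional<int> threadOverPhi(const std::vector<int> &Incoming,
                                 const std::function<int(int)> &Op) {
  std::optional<int> Common;
  for (int V : Incoming) {
    int R = Op(V); // evaluate the binop/compare on this incoming value
    if (Common && *Common != R)
      return std::nullopt; // paths disagree: no simplification
    Common = R;
  }
  return Common; // every path produced the same result
}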
+/// Given operands for an SDiv or UDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1075,8 +1074,8 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifySDivInst - Given operands for an SDiv, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse)) @@ -1093,8 +1092,8 @@ Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyUDivInst - Given operands for a UDiv, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a UDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) @@ -1154,8 +1153,8 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, RecursionLimit); } -/// SimplifyRem - Given operands for an SRem or URem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SRem or URem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1215,8 +1214,8 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifySRemInst - Given operands for an SRem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SRem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse)) @@ -1233,8 +1232,8 @@ Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyURemInst - Given operands for a URem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a URem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) @@ -1279,7 +1278,7 @@ Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, RecursionLimit); } -/// isUndefShift - Returns true if a shift by \c Amount always yields undef. +/// Returns true if a shift by \c Amount always yields undef. static bool isUndefShift(Value *Amount) { Constant *C = dyn_cast<Constant>(Amount); if (!C) @@ -1306,8 +1305,8 @@ static bool isUndefShift(Value *Amount) { return false; } -/// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Shl, LShr or AShr, see if we can fold the result. +/// If not, this returns null. 
static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1375,8 +1374,8 @@ static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifyShlInst - Given operands for an Shl, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Shl, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) @@ -1402,8 +1401,8 @@ Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, RecursionLimit); } -/// SimplifyLShrInst - Given operands for an LShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an LShr, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, @@ -1427,8 +1426,8 @@ Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, RecursionLimit); } -/// SimplifyAShrInst - Given operands for an AShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an AShr, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, @@ -1502,8 +1501,8 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, return nullptr; } -// Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range -// of possible values cannot be satisfied. +/// Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range +/// of possible values cannot be satisfied. static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; ConstantInt *CI1, *CI2; @@ -1554,8 +1553,8 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// SimplifyAndInst - Given operands for an And, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an And, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1661,8 +1660,8 @@ Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union -// contains all possible values. +/// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union +/// contains all possible values. static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; ConstantInt *CI1, *CI2; @@ -1713,8 +1712,8 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// SimplifyOrInst - Given operands for an Or, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Or, see if we can fold the result. +/// If not, this returns null. 
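SimplifyAndOfICmps, re-documented above, folds a conjunction of comparisons once the ranges they imply for the common operand cannot both hold. An interval sketch (half-open ranges, invented values):

#include <algorithm>
#include <cstdio>

struct Range { int Lo, Hi; }; // half-open [Lo, Hi)

bool conjunctionSatisfiable(Range A, Range B) {
  return std::max(A.Lo, B.Lo) < std::min(A.Hi, B.Hi); // do the ranges intersect?
}

int main() {
  // (x <u 4) && (x >=u 10): [0,4) and [10,20) are disjoint, provably false.
  std::printf("%d\n", conjunctionSatisfiable({0, 4}, {10, 20})); // 0
}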
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1849,8 +1848,8 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyXorInst - Given operands for a Xor, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Xor, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1910,9 +1909,9 @@ static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } -/// ExtractEquivalentCondition - Rummage around inside V looking for something -/// equivalent to the comparison "LHS Pred RHS". Return such a value if found, -/// otherwise return null. Helper function for analyzing max/min idioms. +/// Rummage around inside V looking for something equivalent to the comparison +/// "LHS Pred RHS". Return such a value if found, otherwise return null. +/// Helper function for analyzing max/min idioms. static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast<SelectInst>(V); @@ -2090,8 +2089,7 @@ static Constant *computePointerICmp(const DataLayout &DL, // Is the set of underlying objects all noalias calls? auto IsNAC = [](SmallVectorImpl<Value *> &Objects) { - return std::all_of(Objects.begin(), Objects.end(), - [](Value *V){ return isNoAliasCall(V); }); + return std::all_of(Objects.begin(), Objects.end(), isNoAliasCall); }; // Is the set of underlying objects all things which must be disjoint from @@ -2101,21 +2099,17 @@ static Constant *computePointerICmp(const DataLayout &DL, // that might be resolve lazily to symbols in another dynamically-loaded // library (and, thus, could be malloc'ed by the implementation). auto IsAllocDisjoint = [](SmallVectorImpl<Value *> &Objects) { - return std::all_of(Objects.begin(), Objects.end(), - [](Value *V){ - if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return AI->getParent() && AI->getParent()->getParent() && - AI->isStaticAlloca(); - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return (GV->hasLocalLinkage() || - GV->hasHiddenVisibility() || - GV->hasProtectedVisibility() || - GV->hasUnnamedAddr()) && - !GV->isThreadLocal(); - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - return false; - }); + return std::all_of(Objects.begin(), Objects.end(), [](Value *V) { + if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) + return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || + GV->hasProtectedVisibility() || GV->hasUnnamedAddr()) && + !GV->isThreadLocal(); + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + return false; + }); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || @@ -2128,8 +2122,8 @@ static Constant *computePointerICmp(const DataLayout &DL, return nullptr; } -/// SimplifyICmpInst - Given operands for an ICmpInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an ICmpInst, see if we can fold the result. +/// If not, this returns null. 
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; @@ -2176,6 +2170,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // X >=u 1 -> X if (match(RHS, m_One())) return LHS; + if (isImpliedCondition(RHS, LHS, Q.DL)) + return getTrue(ITy); + break; + case ICmpInst::ICMP_SGE: + /// For signed comparison, the values for an i1 are 0 and -1 + /// respectively. This maps into a truth table of: + /// LHS | RHS | LHS >=s RHS | LHS implies RHS + /// 0 | 0 | 1 (0 >= 0) | 1 + /// 0 | 1 | 1 (0 >= -1) | 1 + /// 1 | 0 | 0 (-1 >= 0) | 0 + /// 1 | 1 | 1 (-1 >= -1) | 1 + if (isImpliedCondition(LHS, RHS, Q.DL)) + return getTrue(ITy); break; case ICmpInst::ICMP_SLT: // X <s 0 -> X @@ -2187,6 +2194,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (match(RHS, m_One())) return LHS; break; + case ICmpInst::ICMP_ULE: + if (isImpliedCondition(LHS, RHS, Q.DL)) + return getTrue(ITy); + break; } } @@ -2360,9 +2371,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } else if (match(LHS, m_And(m_Value(), m_ConstantInt(CI2)))) { // 'and x, CI2' produces [0, CI2]. Upper = CI2->getValue() + 1; + } else if (match(LHS, m_NUWAdd(m_Value(), m_ConstantInt(CI2)))) { + // 'add nuw x, CI2' produces [CI2, UINT_MAX]. + Lower = CI2->getValue(); } - if (Lower != Upper) { - ConstantRange LHS_CR = ConstantRange(Lower, Upper); + + ConstantRange LHS_CR = Lower != Upper ? ConstantRange(Lower, Upper) + : ConstantRange(Width, true); + + if (auto *I = dyn_cast<Instruction>(LHS)) + if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) + LHS_CR = LHS_CR.intersectWith(getConstantRangeFromMetadata(*Ranges)); + + if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(RHS->getContext()); if (RHS_CR.inverse().contains(LHS_CR)) @@ -2370,6 +2391,30 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // If both operands have range metadata, use the metadata + // to simplify the comparison. + if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) { + auto RHS_Instr = dyn_cast<Instruction>(RHS); + auto LHS_Instr = dyn_cast<Instruction>(LHS); + + if (RHS_Instr->getMetadata(LLVMContext::MD_range) && + LHS_Instr->getMetadata(LLVMContext::MD_range)) { + auto RHS_CR = getConstantRangeFromMetadata( + *RHS_Instr->getMetadata(LLVMContext::MD_range)); + auto LHS_CR = getConstantRangeFromMetadata( + *LHS_Instr->getMetadata(LLVMContext::MD_range)); + + auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR); + if (Satisfied_CR.contains(LHS_CR)) + return ConstantInt::getTrue(RHS->getContext()); + + auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion( + CmpInst::getInversePredicate(Pred), RHS_CR); + if (InversedSatisfied_CR.contains(LHS_CR)) + return ConstantInt::getFalse(RHS->getContext()); + } + } + // Compare of cast, for example (zext X) != 0 -> X != 0 if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) { Instruction *LI = cast<CastInst>(LHS); @@ -2529,6 +2574,14 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // icmp eq|ne X, Y -> false|true if X != Y + if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT)) { + LLVMContext &Ctx = LHS->getType()->getContext(); + return Pred == ICmpInst::ICMP_NE ? 
+ ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); + } + // Special logic for binary operators. BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS); BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS); @@ -3039,13 +3092,13 @@ Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, - Instruction *CxtI) { + const Instruction *CxtI) { return ::SimplifyICmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an FCmpInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const Query &Q, unsigned MaxRecurse) { @@ -3169,8 +3222,7 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is -/// replaced with RepOp. +/// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const Query &Q, unsigned MaxRecurse) { @@ -3253,8 +3305,8 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return nullptr; } -/// SimplifySelectInst - Given operands for a SelectInst, see if we can fold -/// the result. If not, this returns null. +/// Given operands for a SelectInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, Value *FalseVal, const Query &Q, unsigned MaxRecurse) { @@ -3391,8 +3443,8 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an GetElementPtrInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, const Query &Q, unsigned) { // The type of the GEP pointer operand. @@ -3484,8 +3536,8 @@ Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout &DL, Ops, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we -/// can fold the result. If not, this returns null. +/// Given operands for an InsertValueInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const Query &Q, unsigned) { @@ -3521,8 +3573,8 @@ Value *llvm::SimplifyInsertValueInst( RecursionLimit); } -/// SimplifyExtractValueInst - Given operands for an ExtractValueInst, see if we -/// can fold the result. If not, this returns null. +/// Given operands for an ExtractValueInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, const Query &, unsigned) { if (auto *CAgg = dyn_cast<Constant>(Agg)) @@ -3556,8 +3608,8 @@ Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, RecursionLimit); } -/// SimplifyExtractElementInst - Given operands for an ExtractElementInst, see if we -/// can fold the result. If not, this returns null. 
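The range-metadata block added to SimplifyICmpInst above asks whether the region satisfying the predicate contains the entire LHS range (fold to true) or excludes it (fold to false). For icmp ult over non-wrapping ranges that degenerates to an endpoint test; a sketch with invented ranges:

#include <cstdio>

struct Range { long Lo, Hi; }; // half-open [Lo, Hi), non-wrapping

// 'x <u y' holds for every x in A and every y in B exactly when A ends no
// later than B begins (a simplification of the real satisfying-region logic).
bool ultAlwaysTrue(Range A, Range B) { return A.Hi <= B.Lo; }

int main() {
  std::printf("%d\n", ultAlwaysTrue({1, 5}, {10, 20}));  // 1: folds to true
  std::printf("%d\n", ultAlwaysTrue({1, 15}, {10, 20})); // 0: stays a compare
}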
+/// Given operands for an ExtractElementInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, unsigned) { if (auto *CVec = dyn_cast<Constant>(Vec)) { @@ -3588,7 +3640,7 @@ Value *llvm::SimplifyExtractElementInst( RecursionLimit); } -/// SimplifyPHINode - See if we can fold the given phi. If not, returns null. +/// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. @@ -3638,8 +3690,8 @@ Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout &DL, //=== Helper functions for higher up the class hierarchy. -/// SimplifyBinOp - Given operands for a BinaryOperator, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a BinaryOperator, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { switch (Opcode) { @@ -3705,8 +3757,8 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, } } -/// SimplifyFPBinOp - Given operands for a BinaryOperator, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a BinaryOperator, see if we can fold the result. +/// If not, this returns null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, @@ -3741,8 +3793,7 @@ Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, RecursionLimit); } -/// SimplifyCmpInst - Given operands for a CmpInst, see if we can -/// fold the result. +/// Given operands for a CmpInst, see if we can fold the result. static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) @@ -3880,8 +3931,8 @@ Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyInstruction - See if we can compute a simplified version of this -/// instruction. If not, this returns null. +/// See if we can compute a simplified version of this instruction. +/// If not, this returns null. Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { @@ -4024,6 +4075,17 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, break; } + // In general, it is possible for computeKnownBits to determine all bits in a + // value even when the operands are not all constants. + if (!Result && I->getType()->isIntegerTy()) { + unsigned BitWidth = I->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT); + if ((KnownZero | KnownOne).isAllOnesValue()) + Result = ConstantInt::get(I->getContext(), KnownOne); + } + /// If called on unreachable code, the above logic may report that the /// instruction simplified to itself. Make life easier for users by /// detecting that case here, returning a safe value instead. 
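The computeKnownBits fallback added to SimplifyInstruction above folds an instruction whenever bit-tracking pins down every result bit, even though no operand is a constant; the folded value is exactly KnownOne. A bit-level sketch on a uint8_t stand-in:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t KnownZero = 0xF0; // bits proven to be 0
  uint8_t KnownOne  = 0x0F; // bits proven to be 1
  if (uint8_t(KnownZero | KnownOne) == 0xFF)    // every bit is determined
    std::printf("folds to 0x%02X\n", KnownOne); // prints: folds to 0x0F
  return 0;
}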
diff --git a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp index c8d0410..0f0f31e 100644 --- a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp @@ -198,7 +198,8 @@ void LazyCallGraph::SCC::insertOutgoingEdge(Node &CallerN, Node &CalleeN) { assert(CalleeC.isDescendantOf(*this) && "Callee must be a descendant of the Caller."); - // The only change required is to add this SCC to the parent set of the callee. + // The only change required is to add this SCC to the parent set of the + // callee. CalleeC.ParentSCCs.insert(this); } @@ -454,8 +455,7 @@ void LazyCallGraph::SCC::internalDFS( } SmallVector<LazyCallGraph::SCC *, 1> -LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, - Node &CalleeN) { +LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, Node &CalleeN) { // First remove it from the node. CallerN.removeEdgeInternal(CalleeN.getFunction()); @@ -522,7 +522,7 @@ LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, // the leaf SCC list. if (!IsLeafSCC && !ResultSCCs.empty()) G->LeafSCCs.erase(std::remove(G->LeafSCCs.begin(), G->LeafSCCs.end(), this), - G->LeafSCCs.end()); + G->LeafSCCs.end()); // Return the new list of SCCs. return ResultSCCs; diff --git a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp index a6ae7f2..0d1d34e 100644 --- a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" @@ -64,10 +65,10 @@ class LVILatticeVal { enum LatticeValueTy { /// This Value has no known value yet. undefined, - + /// This Value has a specific constant value. constant, - + /// This Value is known to not have the specified value. notconstant, @@ -77,13 +78,13 @@ class LVILatticeVal { /// This value is not known to be constant, and we know that it has a value. overdefined }; - + /// Val: This stores the current lattice value along with the Constant* for /// the constant if this is a 'constant' or 'notconstant' value. LatticeValueTy Tag; Constant *Val; ConstantRange Range; - + public: LVILatticeVal() : Tag(undefined), Val(nullptr), Range(1, true) {} @@ -104,29 +105,34 @@ public: Res.markConstantRange(CR); return Res; } + static LVILatticeVal getOverdefined() { + LVILatticeVal Res; + Res.markOverdefined(); + return Res; + } bool isUndefined() const { return Tag == undefined; } bool isConstant() const { return Tag == constant; } bool isNotConstant() const { return Tag == notconstant; } bool isConstantRange() const { return Tag == constantrange; } bool isOverdefined() const { return Tag == overdefined; } - + Constant *getConstant() const { assert(isConstant() && "Cannot get the constant of a non-constant!"); return Val; } - + Constant *getNotConstant() const { assert(isNotConstant() && "Cannot get the constant of a non-notconstant!"); return Val; } - + ConstantRange getConstantRange() const { assert(isConstantRange() && "Cannot get the constant-range of a non-constant-range!"); return Range; } - + /// Return true if this is a change in status. bool markOverdefined() { if (isOverdefined()) @@ -150,7 +156,7 @@ public: Val = V; return true; } - + /// Return true if this is a change in status. 
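The five LVILatticeVal states listed above admit only a few transitions: undefined may be raised to any informative state, any state may collapse to overdefined, and nothing moves sideways. A compact model of what the mark* methods permit (my reading of the code, not LLVM's types):

enum class LatticeTy { Undefined, Constant, NotConstant, ConstantRange, Overdefined };

bool isLegalTransition(LatticeTy From, LatticeTy To) {
  if (From == To) return true;                   // re-marking the same state
  if (From == LatticeTy::Undefined) return true; // recording the first real fact
  return To == LatticeTy::Overdefined;           // giving up is always legal
}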
bool markNotConstant(Constant *V) { assert(V && "Marking constant with NULL"); @@ -168,27 +174,27 @@ public: Val = V; return true; } - + /// Return true if this is a change in status. bool markConstantRange(const ConstantRange NewR) { if (isConstantRange()) { if (NewR.isEmptySet()) return markOverdefined(); - + bool changed = Range != NewR; Range = NewR; return changed; } - + assert(isUndefined()); if (NewR.isEmptySet()) return markOverdefined(); - + Tag = constantrange; Range = NewR; return true; } - + /// Merge the specified lattice value into this one, updating this /// one and returning true if anything changed. bool mergeIn(const LVILatticeVal &RHS, const DataLayout &DL) { @@ -267,7 +273,7 @@ public: return markConstantRange(NewR); } }; - + } // end anonymous namespace. namespace llvm { @@ -295,9 +301,9 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) { namespace { /// A callback value handle updates the cache when values are erased. class LazyValueInfoCache; - struct LVIValueHandle : public CallbackVH { + struct LVIValueHandle final : public CallbackVH { LazyValueInfoCache *Parent; - + LVIValueHandle(Value *V, LazyValueInfoCache *P) : CallbackVH(V), Parent(P) { } @@ -308,24 +314,27 @@ namespace { }; } -namespace { +namespace { /// This is the cache kept by LazyValueInfo which /// maintains information about queries across the clients' queries. class LazyValueInfoCache { /// This is all of the cached block information for exactly one Value*. /// The entries are sorted by the BasicBlock* of the /// entries, allowing us to do a lookup with a binary search. - typedef std::map<AssertingVH<BasicBlock>, LVILatticeVal> ValueCacheEntryTy; + /// Over-defined lattice values are recorded in OverDefinedCache to reduce + /// memory overhead. + typedef SmallDenseMap<AssertingVH<BasicBlock>, LVILatticeVal, 4> + ValueCacheEntryTy; /// This is all of the cached information for all values, /// mapped from Value* to key information. std::map<LVIValueHandle, ValueCacheEntryTy> ValueCache; - + /// This tracks, on a per-block basis, the set of values that are - /// over-defined at the end of that block. This is required - /// for cache updating. - typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy; - DenseSet<OverDefinedPairTy> OverDefinedCache; + /// over-defined at the end of that block. + typedef DenseMap<AssertingVH<BasicBlock>, SmallPtrSet<Value *, 4>> + OverDefinedCacheTy; + OverDefinedCacheTy OverDefinedCache; /// Keep track of all blocks that we have ever seen, so we /// don't spend time removing unused blocks from our caches. @@ -357,9 +366,13 @@ namespace { void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) { SeenBlocks.insert(BB); - lookup(Val)[BB] = Result; + + // Insert over-defined values into their own cache to reduce memory + // overhead. 
if (Result.isOverdefined()) - OverDefinedCache.insert(std::make_pair(BB, Val)); + OverDefinedCache[BB].insert(Val); + else + lookup(Val)[BB] = Result; } LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB); @@ -382,11 +395,39 @@ namespace { Instruction *BBI); void solve(); - + ValueCacheEntryTy &lookup(Value *V) { return ValueCache[LVIValueHandle(V, this)]; } + bool isOverdefined(Value *V, BasicBlock *BB) const { + auto ODI = OverDefinedCache.find(BB); + + if (ODI == OverDefinedCache.end()) + return false; + + return ODI->second.count(V); + } + + bool hasCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return true; + + LVIValueHandle ValHandle(V, this); + auto I = ValueCache.find(ValHandle); + if (I == ValueCache.end()) + return false; + + return I->second.count(BB); + } + + LVILatticeVal getCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return LVILatticeVal::getOverdefined(); + + return lookup(V)[BB]; + } + public: /// This is the query interface to determine the lattice /// value for the specified Value* at the end of the specified block. @@ -402,15 +443,15 @@ namespace { /// value for the specified Value* that is true on the specified edge. LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB, Instruction *CxtI = nullptr); - + /// This is the update interface to inform the cache that an edge from /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc. void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc); - + /// This is part of the update interface to inform the cache /// that a block has been deleted. void eraseBlock(BasicBlock *BB); - + /// clear - Empty the cache. void clear() { SeenBlocks.clear(); @@ -425,15 +466,17 @@ namespace { } // end anonymous namespace void LVIValueHandle::deleted() { - typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy; - - SmallVector<OverDefinedPairTy, 4> ToErase; - for (const OverDefinedPairTy &P : Parent->OverDefinedCache) - if (P.second == getValPtr()) - ToErase.push_back(P); - for (const OverDefinedPairTy &P : ToErase) - Parent->OverDefinedCache.erase(P); - + SmallVector<AssertingVH<BasicBlock>, 4> ToErase; + for (auto &I : Parent->OverDefinedCache) { + SmallPtrSetImpl<Value *> &ValueSet = I.second; + if (ValueSet.count(getValPtr())) + ValueSet.erase(getValPtr()); + if (ValueSet.empty()) + ToErase.push_back(I.first); + } + for (auto &BB : ToErase) + Parent->OverDefinedCache.erase(BB); + // This erasure deallocates *this, so it MUST happen after we're done // using any and all members of *this. Parent->ValueCache.erase(*this); @@ -446,15 +489,11 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { return; SeenBlocks.erase(I); - SmallVector<OverDefinedPairTy, 4> ToErase; - for (const OverDefinedPairTy& P : OverDefinedCache) - if (P.first == BB) - ToErase.push_back(P); - for (const OverDefinedPairTy &P : ToErase) - OverDefinedCache.erase(P); + auto ODI = OverDefinedCache.find(BB); + if (ODI != OverDefinedCache.end()) + OverDefinedCache.erase(ODI); - for (std::map<LVIValueHandle, ValueCacheEntryTy>::iterator - I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I) + for (auto I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I) I->second.erase(BB); } @@ -466,7 +505,8 @@ void LazyValueInfoCache::solve() { if (solveBlockValue(e.second, e.first)) { // The work item was completely processed. 
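The cache restructuring above moves over-defined results out of the per-value map and into a per-block set: "over-defined" carries no payload, so a set of values is enough, and the lookup helpers consult it first. A structural sketch with standard containers (illustrative key types):

#include <map>
#include <set>
#include <string>

struct LVICacheModel {
  // value -> (block -> lattice value), kept only for informative results
  std::map<std::string, std::map<std::string, int>> ValueCache;
  // block -> values known to be over-defined at the end of that block
  std::map<std::string, std::set<std::string>> OverDefined;

  bool hasInfo(const std::string &V, const std::string &BB) const {
    auto OD = OverDefined.find(BB);
    if (OD != OverDefined.end() && OD->second.count(V))
      return true; // cached, but only as "over-defined"
    auto I = ValueCache.find(V);
    return I != ValueCache.end() && I->second.count(BB) != 0;
  }
};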
assert(BlockValueStack.top() == e && "Nothing should have been pushed!"); - assert(lookup(e.second).count(e.first) && "Result should be in cache!"); + assert(hasCachedValueInfo(e.second, e.first) && + "Result should be in cache!"); BlockValueStack.pop(); BlockValueSet.erase(e); @@ -482,11 +522,7 @@ bool LazyValueInfoCache::hasBlockValue(Value *Val, BasicBlock *BB) { if (isa<Constant>(Val)) return true; - LVIValueHandle ValHandle(Val, this); - std::map<LVIValueHandle, ValueCacheEntryTy>::iterator I = - ValueCache.find(ValHandle); - if (I == ValueCache.end()) return false; - return I->second.count(BB); + return hasCachedValueInfo(Val, BB); } LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { @@ -495,17 +531,36 @@ LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { return LVILatticeVal::get(VC); SeenBlocks.insert(BB); - return lookup(Val)[BB]; + return getCachedValueInfo(Val, BB); +} + +static LVILatticeVal getFromRangeMetadata(Instruction *BBI) { + switch (BBI->getOpcode()) { + default: break; + case Instruction::Load: + case Instruction::Call: + case Instruction::Invoke: + if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range)) + if (isa<IntegerType>(BBI->getType())) { + ConstantRange Result = getConstantRangeFromMetadata(*Ranges); + return LVILatticeVal::getRange(Result); + } + break; + }; + // Nothing known - Note that we do not want overdefined here. We may know + // something else about the value and not having range metadata shouldn't + // cause us to throw away those facts. + return LVILatticeVal(); } bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { if (isa<Constant>(Val)) return true; - if (lookup(Val).count(BB)) { + if (hasCachedValueInfo(Val, BB)) { // If we have a cached value, use that. DEBUG(dbgs() << " reuse BB '" << BB->getName() - << "' val=" << lookup(Val)[BB] << '\n'); + << "' val=" << getCachedValueInfo(Val, BB) << '\n'); // Since we're reusing a cached value, we don't need to update the // OverDefinedCache. The cache will have been properly updated whenever the @@ -516,7 +571,7 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { // Hold off inserting this value into the Cache in case we have to return // false and come back later. LVILatticeVal Res; - + Instruction *BBI = dyn_cast<Instruction>(Val); if (!BBI || BBI->getParent() != BB) { if (!solveBlockValueNonLocal(Res, Val, BB)) @@ -532,12 +587,18 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { return true; } - if (AllocaInst *AI = dyn_cast<AllocaInst>(BBI)) { - Res = LVILatticeVal::getNot(ConstantPointerNull::get(AI->getType())); + // If this value is a nonnull pointer, record it's range and bailout. + PointerType *PT = dyn_cast<PointerType>(BBI->getType()); + if (PT && isKnownNonNull(BBI)) { + Res = LVILatticeVal::getNot(ConstantPointerNull::get(PT)); insertResult(Val, BB, Res); return true; } + // If this is an instruction which supports range metadata, return the + // implied range. TODO: This should be an intersection, not a union. + Res.mergeIn(getFromRangeMetadata(BBI), DL); + // We can only analyze the definitions of certain classes of instructions // (integral binops and casts at the moment), so bail if this isn't one. 
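getFromRangeMetadata above deliberately answers "undefined" rather than "over-defined" when no metadata applies, so facts discovered later can still be merged in; only integer-typed loads, calls and invokes are eligible for !range metadata. A sketch of the dispatch (stand-in types; the returned bounds are invented):

#include <optional>

enum class Opcode { Load, Call, Invoke, Add, Store };
struct Range { long Lo, Hi; };

std::optional<Range> fromRangeMetadata(Opcode Op, bool HasRangeMD, bool IsInt) {
  switch (Op) {
  case Opcode::Load:
  case Opcode::Call:
  case Opcode::Invoke:
    if (HasRangeMD && IsInt)
      return Range{0, 256}; // stand-in for the decoded !range bounds
    break;
  default:
    break;
  }
  return std::nullopt; // nothing known; deliberately not "over-defined"
}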
LVILatticeVal Result; @@ -661,7 +722,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV, PointerType *PTy = cast<PointerType>(Val->getType()); Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy)); } - + BBLV = Result; return true; } @@ -674,7 +735,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV, BBLV = Result; return true; } - + bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB) { LVILatticeVal Result; // Start Undefined. @@ -700,7 +761,7 @@ bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV, if (Result.isOverdefined()) { DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined because of pred.\n"); - + BBLV = Result; return true; } @@ -765,7 +826,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, BBLV.markOverdefined(); return true; } - + ConstantRange LHSRange = LHSVal.getConstantRange(); ConstantRange RHSRange(1); IntegerType *ResultTy = cast<IntegerType>(BBI->getType()); @@ -819,7 +880,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, case Instruction::Or: Result.markConstantRange(LHSRange.binaryOr(RHSRange)); break; - + // Unhandled instructions are overdefined. default: DEBUG(dbgs() << " compute BB '" << BB->getName() @@ -827,7 +888,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, Result.markOverdefined(); break; } - + BBLV = Result; return true; } @@ -877,7 +938,7 @@ bool getValueFromFromCondition(Value *Val, ICmpInst *ICI, /// Val is not constrained on the edge. static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, BasicBlock *BBTo, LVILatticeVal &Result) { - // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we + // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) { // If this is a conditional branch and only one successor goes to BBTo, then @@ -887,7 +948,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, bool isTrueDest = BI->getSuccessor(0) == BBTo; assert(BI->getSuccessor(!isTrueDest) == BBTo && "BBTo isn't a successor of BBFrom"); - + // If V is the condition of the branch itself, then we know exactly what // it is. if (BI->getCondition() == Val) { @@ -895,7 +956,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, Type::getInt1Ty(Val->getContext()), isTrueDest)); return true; } - + // If the condition of the branch is an equality comparison, we may be // able to infer the value. 
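solveBlockValueConstantRange above maps each supported opcode onto the corresponding ConstantRange operation over the operand ranges. The shape of the computation for add, ignoring the wraparound cases that the real ConstantRange arithmetic handles:

#include <cstdio>

struct Range { long Lo, Hi; }; // inclusive bounds, non-wrapping for brevity

Range addRanges(Range A, Range B) { return {A.Lo + B.Lo, A.Hi + B.Hi}; }

int main() {
  Range X{1, 4}, Y{10, 19};
  Range R = addRanges(X, Y);
  std::printf("[%ld, %ld]\n", R.Lo, R.Hi); // [11, 23]
}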
if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) @@ -997,7 +1058,7 @@ LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB, Instruction *CxtI) { DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); - + assert(BlockValueStack.empty() && BlockValueSet.empty()); pushBlockValue(std::make_pair(BB, V)); @@ -1014,6 +1075,8 @@ LVILatticeVal LazyValueInfoCache::getValueAt(Value *V, Instruction *CxtI) { << CxtI->getName() << "'\n"); LVILatticeVal Result; + if (auto *I = dyn_cast<Instruction>(V)) + Result = getFromRangeMetadata(I); mergeAssumeBlockValueConstantRange(V, Result, CxtI); DEBUG(dbgs() << " Result = " << Result << "\n"); @@ -1025,7 +1088,7 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '" << FromBB->getName() << "' to '" << ToBB->getName() << "'\n"); - + LVILatticeVal Result; if (!getEdgeValue(V, FromBB, ToBB, Result, CxtI)) { solve(); @@ -1040,24 +1103,24 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc) { - // When an edge in the graph has been threaded, values that we could not - // determine a value for before (i.e. were marked overdefined) may be possible - // to solve now. We do NOT try to proactively update these values. Instead, - // we clear their entries from the cache, and allow lazy updating to recompute - // them when needed. - + // When an edge in the graph has been threaded, values that we could not + // determine a value for before (i.e. were marked overdefined) may be + // possible to solve now. We do NOT try to proactively update these values. + // Instead, we clear their entries from the cache, and allow lazy updating to + // recompute them when needed. + // The updating process is fairly simple: we need to drop cached info // for all values that were marked overdefined in OldSucc, and for those same // values in any successor of OldSucc (except NewSucc) in which they were // also marked overdefined. std::vector<BasicBlock*> worklist; worklist.push_back(OldSucc); - - DenseSet<Value*> ClearSet; - for (OverDefinedPairTy &P : OverDefinedCache) - if (P.first == OldSucc) - ClearSet.insert(P.second); - + + auto I = OverDefinedCache.find(OldSucc); + if (I == OverDefinedCache.end()) + return; // Nothing to process here. + SmallVector<Value *, 4> ValsToClear(I->second.begin(), I->second.end()); + // Use a worklist to perform a depth-first search of OldSucc's successors. // NOTE: We do not need a visited list since any blocks we have already // visited will have had their overdefined markers cleared already, and we @@ -1065,32 +1128,31 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, while (!worklist.empty()) { BasicBlock *ToUpdate = worklist.back(); worklist.pop_back(); - + // Skip blocks only accessible through NewSucc. if (ToUpdate == NewSucc) continue; - + bool changed = false; - for (Value *V : ClearSet) { + for (Value *V : ValsToClear) { // If a value was marked overdefined in OldSucc, and is here too... - DenseSet<OverDefinedPairTy>::iterator OI = - OverDefinedCache.find(std::make_pair(ToUpdate, V)); - if (OI == OverDefinedCache.end()) continue; - - // Remove it from the caches. 
- ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(V, this)]; - ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate); - - assert(CI != Entry.end() && "Couldn't find entry to update?"); - Entry.erase(CI); - OverDefinedCache.erase(OI); - - // If we removed anything, then we potentially need to update + auto OI = OverDefinedCache.find(ToUpdate); + if (OI == OverDefinedCache.end()) + continue; + SmallPtrSetImpl<Value *> &ValueSet = OI->second; + if (!ValueSet.count(V)) + continue; + + ValueSet.erase(V); + if (ValueSet.empty()) + OverDefinedCache.erase(OI); + + // If we removed anything, then we potentially need to update // blocks successors too. changed = true; } if (!changed) continue; - + worklist.insert(worklist.end(), succ_begin(ToUpdate), succ_end(ToUpdate)); } } @@ -1158,7 +1220,7 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB, } /// Determine whether the specified value is known to be a -/// constant on the specified edge. Return null if not. +/// constant on the specified edge. Return null if not. Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { @@ -1190,26 +1252,26 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True; return LazyValueInfo::Unknown; } - + if (Result.isConstantRange()) { ConstantInt *CI = dyn_cast<ConstantInt>(C); if (!CI) return LazyValueInfo::Unknown; - + ConstantRange CR = Result.getConstantRange(); if (Pred == ICmpInst::ICMP_EQ) { if (!CR.contains(CI->getValue())) return LazyValueInfo::False; - + if (CR.isSingleElement() && CR.contains(CI->getValue())) return LazyValueInfo::True; } else if (Pred == ICmpInst::ICMP_NE) { if (!CR.contains(CI->getValue())) return LazyValueInfo::True; - + if (CR.isSingleElement() && CR.contains(CI->getValue())) return LazyValueInfo::False; } - + // Handle more complex predicates. ConstantRange TrueValues = ICmpInst::makeConstantRange((ICmpInst::Predicate)Pred, CI->getValue()); @@ -1219,7 +1281,7 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, return LazyValueInfo::False; return LazyValueInfo::Unknown; } - + if (Result.isNotConstant()) { // If this is an equality comparison, we can try to fold it knowing that // "V != C1". @@ -1240,7 +1302,7 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, } return LazyValueInfo::Unknown; } - + return LazyValueInfo::Unknown; } @@ -1266,20 +1328,69 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C, if (Ret != Unknown) return Ret; - // TODO: Move this logic inside getValueAt so that it can be cached rather - // than re-queried on each call. This would also allow us to merge the - // underlying lattice values to get more information + // Note: The following bit of code is somewhat distinct from the rest of LVI; + // LVI as a whole tries to compute a lattice value which is conservatively + // correct at a given location. In this case, we have a predicate which we + // weren't able to prove about the merged result, and we're pushing that + // predicate back along each incoming edge to see if we can prove it + // separately for each input. As a motivating example, consider: + // bb1: + // %v1 = ... ; constantrange<1, 5> + // br label %merge + // bb2: + // %v2 = ... 
; constantrange<10, 20> + // br label %merge + // merge: + // %phi = phi [%v1, %v2] ; constantrange<1,20> + // %pred = icmp eq i32 %phi, 8 + // We can't tell from the lattice value for '%phi' that '%pred' is false + // along each path, but by checking the predicate over each input separately, + // we can. + // We limit the search to one step backwards from the current BB and value. + // We could consider extending this to search further backwards through the + // CFG and/or value graph, but there are non-obvious compile time vs quality + // tradeoffs. if (CxtI) { - // For a comparison where the V is outside this block, it's possible - // that we've branched on it before. Look to see if the value is known - // on all incoming edges. BasicBlock *BB = CxtI->getParent(); + + // Function entry or an unreachable block. Bail to avoid confusing + // analysis below. pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - if (PI != PE && - (!isa<Instruction>(V) || - cast<Instruction>(V)->getParent() != BB)) { + if (PI == PE) + return Unknown; + + // If V is a PHI node in the same block as the context, we need to ask + // questions about the predicate as applied to the incoming value along + // each edge. This is useful for eliminating cases where the predicate is + // known along all incoming edges. + if (auto *PHI = dyn_cast<PHINode>(V)) + if (PHI->getParent() == BB) { + Tristate Baseline = Unknown; + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) { + Value *Incoming = PHI->getIncomingValue(i); + BasicBlock *PredBB = PHI->getIncomingBlock(i); + // Note that PredBB may be BB itself. + Tristate Result = getPredicateOnEdge(Pred, Incoming, C, PredBB, BB, + CxtI); + + // Keep going as long as we've seen a consistent known result for + // all inputs. + Baseline = (i == 0) ? Result /* First iteration */ + : (Baseline == Result ? Baseline : Unknown); /* All others */ + if (Baseline == Unknown) + break; + } + if (Baseline != Unknown) + return Baseline; + } + + // For a comparison where the V is outside this block, it's possible + // that we've branched on it before. Look to see if the value is known + // on all incoming edges. + if (!isa<Instruction>(V) || + cast<Instruction>(V)->getParent() != BB) { // For predecessor edge, determine if the comparison is true or false - // on that edge. If they're all true or all false, we can conclude + // on that edge. If they're all true or all false, we can conclude // the value of the comparison in this block. Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI); if (Baseline != Unknown) { diff --git a/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp deleted file mode 100644 index 991a0e3..0000000 --- a/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//===- LibCallAliasAnalysis.cpp - Implement AliasAnalysis for libcalls ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the LibCallAliasAnalysis class. 
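The per-edge predicate push-back added above boils down to a small consensus fold: take the first edge's answer as the baseline and collapse to Unknown on the first disagreement. A standalone sketch of just that fold (a stand-in Tristate, not the LazyValueInfo API):

  #include <initializer_list>
  #include <iostream>

  enum Tristate { False = 0, True = 1, Unknown = -1 };

  Tristate foldEdgeResults(std::initializer_list<Tristate> PerEdge) {
    bool First = true;
    Tristate Baseline = Unknown;
    for (Tristate R : PerEdge) {
      // Keep the answer only while every edge agrees with the first one.
      Baseline = First ? R : (Baseline == R ? Baseline : Unknown);
      First = false;
      if (Baseline == Unknown)
        break; // No point scanning further once the answers disagree.
    }
    return Baseline;
  }

  int main() {
    // Both incoming ranges exclude 8, so "phi == 8" folds to False.
    std::cout << foldEdgeResults({False, False}) << '\n'; // 0 (False)
    std::cout << foldEdgeResults({False, True}) << '\n';  // -1 (Unknown)
  }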
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/LibCallAliasAnalysis.h" -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -using namespace llvm; - -// Register this pass... -char LibCallAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(LibCallAliasAnalysis, AliasAnalysis, "libcall-aa", - "LibCall Alias Analysis", false, true, false) - -FunctionPass *llvm::createLibCallAliasAnalysisPass(LibCallInfo *LCI) { - return new LibCallAliasAnalysis(LCI); -} - -LibCallAliasAnalysis::~LibCallAliasAnalysis() { - delete LCI; -} - -void LibCallAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AliasAnalysis::getAnalysisUsage(AU); - AU.setPreservesAll(); // Does not transform code -} - -bool LibCallAliasAnalysis::runOnFunction(Function &F) { - // set up super class - InitializeAliasAnalysis(this, &F.getParent()->getDataLayout()); - return false; -} - -/// AnalyzeLibCallDetails - Given a call to a function with the specified -/// LibCallFunctionInfo, see if we can improve the mod/ref footprint of the call -/// vs the specified pointer/size. -AliasAnalysis::ModRefResult -LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI, - ImmutableCallSite CS, - const MemoryLocation &Loc) { - // If we have a function, check to see what kind of mod/ref effects it - // has. Start by including any info globally known about the function. - AliasAnalysis::ModRefResult MRInfo = FI->UniversalBehavior; - if (MRInfo == NoModRef) return MRInfo; - - // If that didn't tell us that the function is 'readnone', check to see - // if we have detailed info and if 'P' is any of the locations we know - // about. - const LibCallFunctionInfo::LocationMRInfo *Details = FI->LocationDetails; - if (Details == nullptr) - return MRInfo; - - // If the details array is of the 'DoesNot' kind, we only know something if - // the pointer is a match for one of the locations in 'Details'. If we find a - // match, we can prove some interactions cannot happen. - // - if (FI->DetailsType == LibCallFunctionInfo::DoesNot) { - // Find out if the pointer refers to a known location. - for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) { - const LibCallLocationInfo &LocInfo = - LCI->getLocationInfo(Details[i].LocationID); - LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc); - if (Res != LibCallLocationInfo::Yes) continue; - - // If we find a match against a location that we 'do not' interact with, - // learn this info into MRInfo. - return ModRefResult(MRInfo & ~Details[i].MRInfo); - } - return MRInfo; - } - - // If the details are of the 'DoesOnly' sort, we know something if the pointer - // is a match for one of the locations in 'Details'. Also, if we can prove - // that the pointers is *not* one of the locations in 'Details', we know that - // the call is NoModRef. - assert(FI->DetailsType == LibCallFunctionInfo::DoesOnly); - - // Find out if the pointer refers to a known location. - bool NoneMatch = true; - for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) { - const LibCallLocationInfo &LocInfo = - LCI->getLocationInfo(Details[i].LocationID); - LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc); - if (Res == LibCallLocationInfo::No) continue; - - // If we don't know if this pointer points to the location, then we have to - // assume it might alias in some case. 
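For context on the file being deleted here: its mod/ref merging walks a two-bit lattice, starting at ModRef (top) and clearing bits as each "DoesNot" location rule proves the call cannot interact that way, with NoModRef (bottom) allowing an early exit. A minimal sketch with stand-in enumerators, not the deleted class itself:

  #include <iostream>

  enum ModRefBits { NoModRef = 0, Ref = 1, Mod = 2, ModRef = Ref | Mod };

  // A matched "DoesNot" rule removes the behaviors it rules out.
  int applyDoesNotRule(int Current, int RuleBits) { return Current & ~RuleBits; }

  int main() {
    int MR = ModRef;
    MR = applyDoesNotRule(MR, Mod);        // proved: no write to this location
    std::cout << MR << '\n';               // 1, only Ref remains
    MR = applyDoesNotRule(MR, Ref);        // proved: no read either
    std::cout << (MR == NoModRef) << '\n'; // 1, lattice bottom reached
  }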
- if (Res == LibCallLocationInfo::Unknown) { - NoneMatch = false; - continue; - } - - // If we know that this pointer definitely is pointing into the location, - // merge in this information. - return ModRefResult(MRInfo & Details[i].MRInfo); - } - - // If we found that the pointer is guaranteed to not match any of the - // locations in our 'DoesOnly' rule, then we know that the pointer must point - // to some other location. Since the libcall doesn't mod/ref any other - // locations, return NoModRef. - if (NoneMatch) - return NoModRef; - - // Otherwise, return any other info gained so far. - return MRInfo; -} - -// getModRefInfo - Check to see if the specified callsite can clobber the -// specified memory object. -// -AliasAnalysis::ModRefResult -LibCallAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - ModRefResult MRInfo = ModRef; - - // If this is a direct call to a function that LCI knows about, get the - // information about the runtime function. - if (LCI) { - if (const Function *F = CS.getCalledFunction()) { - if (const LibCallFunctionInfo *FI = LCI->getFunctionInfo(F)) { - MRInfo = ModRefResult(MRInfo & AnalyzeLibCallDetails(FI, CS, Loc)); - if (MRInfo == NoModRef) return NoModRef; - } - } - } - - // The AliasAnalysis base class has some smarts, lets use them. - return (ModRefResult)(MRInfo | AliasAnalysis::getModRefInfo(CS, Loc)); -} diff --git a/contrib/llvm/lib/Analysis/LibCallSemantics.cpp b/contrib/llvm/lib/Analysis/LibCallSemantics.cpp deleted file mode 100644 index 003c81e..0000000 --- a/contrib/llvm/lib/Analysis/LibCallSemantics.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//===- LibCallSemantics.cpp - Describe library semantics ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements interfaces that can be used to describe language -// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM -// optimizers. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/IR/Function.h" -using namespace llvm; - -/// This impl pointer in ~LibCallInfo is actually a StringMap. This -/// helper does the cast. -static StringMap<const LibCallFunctionInfo*> *getMap(void *Ptr) { - return static_cast<StringMap<const LibCallFunctionInfo*> *>(Ptr); -} - -LibCallInfo::~LibCallInfo() { - delete getMap(Impl); -} - -const LibCallLocationInfo &LibCallInfo::getLocationInfo(unsigned LocID) const { - // Get location info on the first call. - if (NumLocations == 0) - NumLocations = getLocationInfo(Locations); - - assert(LocID < NumLocations && "Invalid location ID!"); - return Locations[LocID]; -} - - -/// Return the LibCallFunctionInfo object corresponding to -/// the specified function if we have it. If not, return null. -const LibCallFunctionInfo * -LibCallInfo::getFunctionInfo(const Function *F) const { - StringMap<const LibCallFunctionInfo*> *Map = getMap(Impl); - - /// If this is the first time we are querying for this info, lazily construct - /// the StringMap to index it. 
- if (!Map) { - Impl = Map = new StringMap<const LibCallFunctionInfo*>(); - - const LibCallFunctionInfo *Array = getFunctionInfoArray(); - if (!Array) return nullptr; - - // We now have the array of entries. Populate the StringMap. - for (unsigned i = 0; Array[i].Name; ++i) - (*Map)[Array[i].Name] = Array+i; - } - - // Look up this function in the string map. - return Map->lookup(F->getName()); -} - -/// See if the given exception handling personality function is one that we -/// understand. If so, return a description of it; otherwise return Unknown. -EHPersonality llvm::classifyEHPersonality(const Value *Pers) { - const Function *F = dyn_cast<Function>(Pers->stripPointerCasts()); - if (!F) - return EHPersonality::Unknown; - return StringSwitch<EHPersonality>(F->getName()) - .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) - .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) - .Case("__gcc_personality_v0", EHPersonality::GNU_C) - .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) - .Case("_except_handler3", EHPersonality::MSVC_X86SEH) - .Case("_except_handler4", EHPersonality::MSVC_X86SEH) - .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH) - .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) - .Default(EHPersonality::Unknown); -} - -bool llvm::canSimplifyInvokeNoUnwind(const Function *F) { - EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn()); - // We can't simplify any invokes to nounwind functions if the personality - // function wants to catch asynch exceptions. The nounwind attribute only - // implies that the function does not throw synchronous exceptions. - return !isAsynchronousEHPersonality(Personality); -} diff --git a/contrib/llvm/lib/Analysis/Lint.cpp b/contrib/llvm/lib/Analysis/Lint.cpp index 0b9308a..2dfb09c 100644 --- a/contrib/llvm/lib/Analysis/Lint.cpp +++ b/contrib/llvm/lib/Analysis/Lint.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManager.h" @@ -98,12 +99,13 @@ namespace { void visitInsertElementInst(InsertElementInst &I); void visitUnreachableInst(UnreachableInst &I); - Value *findValue(Value *V, const DataLayout &DL, bool OffsetOk) const; - Value *findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, + Value *findValue(Value *V, bool OffsetOk) const; + Value *findValueImpl(Value *V, bool OffsetOk, SmallPtrSetImpl<Value *> &Visited) const; public: Module *Mod; + const DataLayout *DL; AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; @@ -121,7 +123,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); @@ -165,7 +167,7 @@ INITIALIZE_PASS_BEGIN(Lint, "lint", "Statically lint-checks LLVM IR", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR", false, true) @@ -178,7 +180,8 @@ INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR", // bool Lint::runOnFunction(Function &F) { Mod = F.getParent(); - 
AA = &getAnalysis<AliasAnalysis>(); + DL = &F.getParent()->getDataLayout(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -200,12 +203,11 @@ void Lint::visitFunction(Function &F) { void Lint::visitCallSite(CallSite CS) { Instruction &I = *CS.getInstruction(); Value *Callee = CS.getCalledValue(); - const DataLayout &DL = CS->getModule()->getDataLayout(); visitMemoryReference(I, Callee, MemoryLocation::UnknownSize, 0, nullptr, MemRef::Callee); - if (Function *F = dyn_cast<Function>(findValue(Callee, DL, + if (Function *F = dyn_cast<Function>(findValue(Callee, /*OffsetOk=*/false))) { Assert(CS.getCallingConv() == F->getCallingConv(), "Undefined behavior: Caller and callee calling convention differ", @@ -232,7 +234,7 @@ void Lint::visitCallSite(CallSite CS) { for (; AI != AE; ++AI) { Value *Actual = *AI; if (PI != PE) { - Argument *Formal = PI++; + Argument *Formal = &*PI++; Assert(Formal->getType() == Actual->getType(), "Undefined behavior: Call argument type mismatches " "callee parameter type", @@ -253,8 +255,8 @@ void Lint::visitCallSite(CallSite CS) { if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) { Type *Ty = cast<PointerType>(Formal->getType())->getElementType(); - visitMemoryReference(I, Actual, AA->getTypeStoreSize(Ty), - DL.getABITypeAlignment(Ty), Ty, + visitMemoryReference(I, Actual, DL->getTypeStoreSize(Ty), + DL->getABITypeAlignment(Ty), Ty, MemRef::Read | MemRef::Write); } } @@ -264,7 +266,7 @@ void Lint::visitCallSite(CallSite CS) { if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isTailCall()) for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { - Value *Obj = findValue(*AI, DL, /*OffsetOk=*/true); + Value *Obj = findValue(*AI, /*OffsetOk=*/true); Assert(!isa<AllocaInst>(Obj), "Undefined behavior: Call with \"tail\" keyword references " "alloca", @@ -291,7 +293,7 @@ void Lint::visitCallSite(CallSite CS) { // overlap is not distinguished from the case where nothing is known. 
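Note the guard that follows: the memcpy length only participates in the overlap check when it is a compile-time constant that fits in 32 bits; anything else leaves the size at zero and the check learns nothing. A small sketch of that guard (hypothetical helper, not the Lint API):

  #include <cstdint>
  #include <iostream>
  #include <optional>

  // Returns a usable access size, or 0 when the length is unknown or too wide.
  std::uint64_t checkableSize(std::optional<std::uint64_t> ConstLen) {
    if (ConstLen && *ConstLen <= UINT32_MAX)
      return *ConstLen;
    return 0;
  }

  int main() {
    std::cout << checkableSize(4096) << '\n';         // 4096
    std::cout << checkableSize(std::nullopt) << '\n'; // 0 (not a constant)
    std::cout << checkableSize(1ull << 40) << '\n';   // 0 (over 32 bits)
  }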
uint64_t Size = 0; if (const ConstantInt *Len = - dyn_cast<ConstantInt>(findValue(MCI->getLength(), DL, + dyn_cast<ConstantInt>(findValue(MCI->getLength(), /*OffsetOk=*/false))) if (Len->getValue().isIntN(32)) Size = Len->getValue().getZExtValue(); @@ -343,13 +345,6 @@ void Lint::visitCallSite(CallSite CS) { visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read | MemRef::Write); break; - - case Intrinsic::eh_begincatch: - visitEHBeginCatch(II); - break; - case Intrinsic::eh_endcatch: - visitEHEndCatch(II); - break; } } @@ -367,8 +362,7 @@ void Lint::visitReturnInst(ReturnInst &I) { "Unusual: Return statement in function with noreturn attribute", &I); if (Value *V = I.getReturnValue()) { - Value *Obj = - findValue(V, F->getParent()->getDataLayout(), /*OffsetOk=*/true); + Value *Obj = findValue(V, /*OffsetOk=*/true); Assert(!isa<AllocaInst>(Obj), "Unusual: Returning alloca value", &I); } } @@ -383,8 +377,7 @@ void Lint::visitMemoryReference(Instruction &I, if (Size == 0) return; - Value *UnderlyingObject = - findValue(Ptr, I.getModule()->getDataLayout(), /*OffsetOk=*/true); + Value *UnderlyingObject = findValue(Ptr, /*OffsetOk=*/true); Assert(!isa<ConstantPointerNull>(UnderlyingObject), "Undefined behavior: Null pointer dereference", &I); Assert(!isa<UndefValue>(UnderlyingObject), @@ -423,9 +416,8 @@ void Lint::visitMemoryReference(Instruction &I, // Check for buffer overflows and misalignment. // Only handles memory references that read/write something simple like an // alloca instruction or a global variable. - auto &DL = I.getModule()->getDataLayout(); int64_t Offset = 0; - if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, DL)) { + if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, *DL)) { // OK, so the access is to a constant offset from Ptr. Check that Ptr is // something we can handle and if so extract the size of this base object // along with its alignment. @@ -435,20 +427,20 @@ void Lint::visitMemoryReference(Instruction &I, if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { Type *ATy = AI->getAllocatedType(); if (!AI->isArrayAllocation() && ATy->isSized()) - BaseSize = DL.getTypeAllocSize(ATy); + BaseSize = DL->getTypeAllocSize(ATy); BaseAlign = AI->getAlignment(); if (BaseAlign == 0 && ATy->isSized()) - BaseAlign = DL.getABITypeAlignment(ATy); + BaseAlign = DL->getABITypeAlignment(ATy); } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { // If the global may be defined differently in another compilation unit // then don't warn about funky memory accesses. if (GV->hasDefinitiveInitializer()) { Type *GTy = GV->getType()->getElementType(); if (GTy->isSized()) - BaseSize = DL.getTypeAllocSize(GTy); + BaseSize = DL->getTypeAllocSize(GTy); BaseAlign = GV->getAlignment(); if (BaseAlign == 0 && GTy->isSized()) - BaseAlign = DL.getABITypeAlignment(GTy); + BaseAlign = DL->getABITypeAlignment(GTy); } } @@ -462,7 +454,7 @@ void Lint::visitMemoryReference(Instruction &I, // Accesses that say that the memory is more aligned than it is are not // defined. 
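The rule stated just above, and asserted in the code that follows, comes from arithmetic: an access at Base+Offset can be aligned at most to the largest power of two dividing both the base alignment and the offset, which is what a MinAlign-style helper computes. A standalone sketch, assuming power-of-two base alignments:

  #include <cstdint>
  #include <iostream>

  std::uint64_t minAlign(std::uint64_t BaseAlign, std::uint64_t Offset) {
    std::uint64_t Bits = BaseAlign | Offset;
    return Bits & (~Bits + 1); // lowest set bit: largest common power of two
  }

  int main() {
    std::cout << minAlign(16, 4) << '\n'; // 4: 16-aligned base plus offset 4
    std::cout << minAlign(16, 0) << '\n'; // 16: offset 0 keeps full alignment
    // Claiming align 8 for the first access would be flagged: 8 > 4.
  }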
if (Align == 0 && Ty && Ty->isSized()) - Align = DL.getABITypeAlignment(Ty); + Align = DL->getABITypeAlignment(Ty); Assert(!BaseAlign || Align <= MinAlign(BaseAlign, Offset), "Undefined behavior: Memory reference address is misaligned", &I); } @@ -470,13 +462,13 @@ void Lint::visitMemoryReference(Instruction &I, void Lint::visitLoadInst(LoadInst &I) { visitMemoryReference(I, I.getPointerOperand(), - AA->getTypeStoreSize(I.getType()), I.getAlignment(), + DL->getTypeStoreSize(I.getType()), I.getAlignment(), I.getType(), MemRef::Read); } void Lint::visitStoreInst(StoreInst &I) { visitMemoryReference(I, I.getPointerOperand(), - AA->getTypeStoreSize(I.getOperand(0)->getType()), + DL->getTypeStoreSize(I.getOperand(0)->getType()), I.getAlignment(), I.getOperand(0)->getType(), MemRef::Write); } @@ -492,208 +484,26 @@ void Lint::visitSub(BinaryOperator &I) { } void Lint::visitLShr(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getOperand(1), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getOperand(1), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } void Lint::visitAShr(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue( - I.getOperand(1), I.getModule()->getDataLayout(), /*OffsetOk=*/false))) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } void Lint::visitShl(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue( - I.getOperand(1), I.getModule()->getDataLayout(), /*OffsetOk=*/false))) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } -static bool -allPredsCameFromLandingPad(BasicBlock *BB, - SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - if (BB->isLandingPad()) - return true; - // If we find a block with no predecessors, the search failed. - if (pred_empty(BB)) - return false; - for (BasicBlock *Pred : predecessors(BB)) { - if (VisitedBlocks.count(Pred)) - continue; - if (!allPredsCameFromLandingPad(Pred, VisitedBlocks)) - return false; - } - return true; -} - -static bool -allSuccessorsReachEndCatch(BasicBlock *BB, BasicBlock::iterator InstBegin, - IntrinsicInst **SecondBeginCatch, - SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - for (BasicBlock::iterator I = InstBegin, E = BB->end(); I != E; ++I) { - IntrinsicInst *IC = dyn_cast<IntrinsicInst>(I); - if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch) - return true; - // If we find another begincatch while looking for an endcatch, - // that's also an error. - if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch) { - *SecondBeginCatch = IC; - return false; - } - } - - // If we reach a block with no successors while searching, the - // search has failed. - if (succ_empty(BB)) - return false; - // Otherwise, search all of the successors. 
- for (BasicBlock *Succ : successors(BB)) { - if (VisitedBlocks.count(Succ)) - continue; - if (!allSuccessorsReachEndCatch(Succ, Succ->begin(), SecondBeginCatch, - VisitedBlocks)) - return false; - } - return true; -} - -void Lint::visitEHBeginCatch(IntrinsicInst *II) { - // The checks in this function make a potentially dubious assumption about - // the CFG, namely that any block involved in a catch is only used for the - // catch. This will very likely be true of IR generated by a front end, - // but it may cease to be true, for example, if the IR is run through a - // pass which combines similar blocks. - // - // In general, if we encounter a block the isn't dominated by the catch - // block while we are searching the catch block's successors for a call - // to end catch intrinsic, then it is possible that it will be legal for - // a path through this block to never reach a call to llvm.eh.endcatch. - // An analogous statement could be made about our search for a landing - // pad among the catch block's predecessors. - // - // What is actually required is that no path is possible at runtime that - // reaches a call to llvm.eh.begincatch without having previously visited - // a landingpad instruction and that no path is possible at runtime that - // calls llvm.eh.begincatch and does not subsequently call llvm.eh.endcatch - // (mentally adjusting for the fact that in reality these calls will be - // removed before code generation). - // - // Because this is a lint check, we take a pessimistic approach and warn if - // the control flow is potentially incorrect. - - SmallSet<BasicBlock *, 4> VisitedBlocks; - BasicBlock *CatchBB = II->getParent(); - - // The begin catch must occur in a landing pad block or all paths - // to it must have come from a landing pad. - Assert(allPredsCameFromLandingPad(CatchBB, VisitedBlocks), - "llvm.eh.begincatch may be reachable without passing a landingpad", - II); - - // Reset the visited block list. - VisitedBlocks.clear(); - - IntrinsicInst *SecondBeginCatch = nullptr; - - // This has to be called before it is asserted. Otherwise, the first assert - // below can never be hit. - bool EndCatchFound = allSuccessorsReachEndCatch( - CatchBB, std::next(static_cast<BasicBlock::iterator>(II)), - &SecondBeginCatch, VisitedBlocks); - Assert( - SecondBeginCatch == nullptr, - "llvm.eh.begincatch may be called a second time before llvm.eh.endcatch", - II, SecondBeginCatch); - Assert(EndCatchFound, - "Some paths from llvm.eh.begincatch may not reach llvm.eh.endcatch", - II); -} - -static bool allPredCameFromBeginCatch( - BasicBlock *BB, BasicBlock::reverse_iterator InstRbegin, - IntrinsicInst **SecondEndCatch, SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - // Look for a begincatch in this block. - for (BasicBlock::reverse_iterator RI = InstRbegin, RE = BB->rend(); RI != RE; - ++RI) { - IntrinsicInst *IC = dyn_cast<IntrinsicInst>(&*RI); - if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch) - return true; - // If we find another end catch before we find a begin catch, that's - // an error. - if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch) { - *SecondEndCatch = IC; - return false; - } - // If we encounter a landingpad instruction, the search failed. - if (isa<LandingPadInst>(*RI)) - return false; - } - // If while searching we find a block with no predeccesors, - // the search failed. - if (pred_empty(BB)) - return false; - // Search any predecessors we haven't seen before. 
- for (BasicBlock *Pred : predecessors(BB)) { - if (VisitedBlocks.count(Pred)) - continue; - if (!allPredCameFromBeginCatch(Pred, Pred->rbegin(), SecondEndCatch, - VisitedBlocks)) - return false; - } - return true; -} - -void Lint::visitEHEndCatch(IntrinsicInst *II) { - // The check in this function makes a potentially dubious assumption about - // the CFG, namely that any block involved in a catch is only used for the - // catch. This will very likely be true of IR generated by a front end, - // but it may cease to be true, for example, if the IR is run through a - // pass which combines similar blocks. - // - // In general, if we encounter a block the isn't post-dominated by the - // end catch block while we are searching the end catch block's predecessors - // for a call to the begin catch intrinsic, then it is possible that it will - // be legal for a path to reach the end catch block without ever having - // called llvm.eh.begincatch. - // - // What is actually required is that no path is possible at runtime that - // reaches a call to llvm.eh.endcatch without having previously visited - // a call to llvm.eh.begincatch (mentally adjusting for the fact that in - // reality these calls will be removed before code generation). - // - // Because this is a lint check, we take a pessimistic approach and warn if - // the control flow is potentially incorrect. - - BasicBlock *EndCatchBB = II->getParent(); - - // Alls paths to the end catch call must pass through a begin catch call. - - // If llvm.eh.begincatch wasn't called in the current block, we'll use this - // lambda to recursively look for it in predecessors. - SmallSet<BasicBlock *, 4> VisitedBlocks; - IntrinsicInst *SecondEndCatch = nullptr; - - // This has to be called before it is asserted. Otherwise, the first assert - // below can never be hit. - bool BeginCatchFound = - allPredCameFromBeginCatch(EndCatchBB, BasicBlock::reverse_iterator(II), - &SecondEndCatch, VisitedBlocks); - Assert( - SecondEndCatch == nullptr, - "llvm.eh.endcatch may be called a second time after llvm.eh.begincatch", - II, SecondEndCatch); - Assert(BeginCatchFound, - "llvm.eh.endcatch may be reachable without passing llvm.eh.begincatch", - II); -} - static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC) { // Assume undef could be zero. @@ -777,25 +587,23 @@ void Lint::visitIndirectBrInst(IndirectBrInst &I) { } void Lint::visitExtractElementInst(ExtractElementInst &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getIndexOperand(), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getIndexOperand(), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(I.getVectorOperandType()->getNumElements()), "Undefined result: extractelement index out of range", &I); } void Lint::visitInsertElementInst(InsertElementInst &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getOperand(2), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getOperand(2), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(I.getType()->getNumElements()), "Undefined result: insertelement index out of range", &I); } void Lint::visitUnreachableInst(UnreachableInst &I) { // This isn't undefined behavior, it's merely suspicious. 
- Assert(&I == I.getParent()->begin() || - std::prev(BasicBlock::iterator(&I))->mayHaveSideEffects(), + Assert(&I == &I.getParent()->front() || + std::prev(I.getIterator())->mayHaveSideEffects(), "Unusual: unreachable immediately preceded by instruction without " "side effects", &I); @@ -808,13 +616,13 @@ void Lint::visitUnreachableInst(UnreachableInst &I) { /// Most analysis passes don't require this logic, because instcombine /// will simplify most of these kinds of things away. But it's a goal of /// this Lint pass to be useful even on non-optimized IR. -Value *Lint::findValue(Value *V, const DataLayout &DL, bool OffsetOk) const { +Value *Lint::findValue(Value *V, bool OffsetOk) const { SmallPtrSet<Value *, 4> Visited; - return findValueImpl(V, DL, OffsetOk, Visited); + return findValueImpl(V, OffsetOk, Visited); } /// findValueImpl - Implementation helper for findValue. -Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, +Value *Lint::findValueImpl(Value *V, bool OffsetOk, SmallPtrSetImpl<Value *> &Visited) const { // Detect self-referential values. if (!Visited.insert(V).second) @@ -825,17 +633,18 @@ Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, // TODO: Look through eliminable cast pairs. // TODO: Look through calls with unique return values. // TODO: Look through vector insert/extract/shuffle. - V = OffsetOk ? GetUnderlyingObject(V, DL) : V->stripPointerCasts(); + V = OffsetOk ? GetUnderlyingObject(V, *DL) : V->stripPointerCasts(); if (LoadInst *L = dyn_cast<LoadInst>(V)) { - BasicBlock::iterator BBI = L; + BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); SmallPtrSet<BasicBlock *, 4> VisitedBlocks; for (;;) { if (!VisitedBlocks.insert(BB).second) break; - if (Value *U = FindAvailableLoadedValue(L->getPointerOperand(), - BB, BBI, 6, AA)) - return findValueImpl(U, DL, OffsetOk, Visited); + if (Value *U = + FindAvailableLoadedValue(L->getPointerOperand(), + BB, BBI, DefMaxInstsToScan, AA)) + return findValueImpl(U, OffsetOk, Visited); if (BBI != BB->begin()) break; BB = BB->getUniquePredecessor(); if (!BB) break; @@ -844,38 +653,38 @@ Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, } else if (PHINode *PN = dyn_cast<PHINode>(V)) { if (Value *W = PN->hasConstantValue()) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } else if (CastInst *CI = dyn_cast<CastInst>(V)) { - if (CI->isNoopCast(DL)) - return findValueImpl(CI->getOperand(0), DL, OffsetOk, Visited); + if (CI->isNoopCast(*DL)) + return findValueImpl(CI->getOperand(0), OffsetOk, Visited); } else if (ExtractValueInst *Ex = dyn_cast<ExtractValueInst>(V)) { if (Value *W = FindInsertedValue(Ex->getAggregateOperand(), Ex->getIndices())) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { // Same as above, but for ConstantExpr instead of Instruction. 
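Seen end to end, findValueImpl is a guarded graph walk: keep following "this simplifies to that" links through loads, phis, casts, and constant folds, with the Visited set cutting off self-referential values. A toy model of the traversal shape (a string-keyed map standing in for LLVM Values):

  #include <iostream>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>

  using SimplifyMap = std::unordered_map<std::string, std::string>;

  std::string findValue(const std::string &V, const SimplifyMap &Simpler,
                        std::unordered_set<std::string> &Visited) {
    if (!Visited.insert(V).second)
      return V; // cycle: a self-referential value, stop here
    auto It = Simpler.find(V);
    return It == Simpler.end() ? V : findValue(It->second, Simpler, Visited);
  }

  int main() {
    SimplifyMap Simpler = {
        {"%cast", "%load"}, {"%load", "%stored"}, {"%loop", "%loop"}};
    std::unordered_set<std::string> Visited;
    std::cout << findValue("%cast", Simpler, Visited) << '\n'; // %stored
    Visited.clear();
    std::cout << findValue("%loop", Simpler, Visited) << '\n'; // %loop
  }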
if (Instruction::isCast(CE->getOpcode())) { if (CastInst::isNoopCast(Instruction::CastOps(CE->getOpcode()), CE->getOperand(0)->getType(), CE->getType(), - DL.getIntPtrType(V->getType()))) - return findValueImpl(CE->getOperand(0), DL, OffsetOk, Visited); + DL->getIntPtrType(V->getType()))) + return findValueImpl(CE->getOperand(0), OffsetOk, Visited); } else if (CE->getOpcode() == Instruction::ExtractValue) { ArrayRef<unsigned> Indices = CE->getIndices(); if (Value *W = FindInsertedValue(CE->getOperand(0), Indices)) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } } // As a last resort, try SimplifyInstruction or constant folding. if (Instruction *Inst = dyn_cast<Instruction>(V)) { - if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT, AC)) - return findValueImpl(W, DL, OffsetOk, Visited); + if (Value *W = SimplifyInstruction(Inst, *DL, TLI, DT, AC)) + return findValueImpl(W, OffsetOk, Visited); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { - if (Value *W = ConstantFoldConstantExpression(CE, DL, TLI)) + if (Value *W = ConstantFoldConstantExpression(CE, *DL, TLI)) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } return V; diff --git a/contrib/llvm/lib/Analysis/Loads.cpp b/contrib/llvm/lib/Analysis/Loads.cpp index 624c5a1..4b2fa3c 100644 --- a/contrib/llvm/lib/Analysis/Loads.cpp +++ b/contrib/llvm/lib/Analysis/Loads.cpp @@ -118,7 +118,8 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, // from/to. If so, the previous load or store would have already trapped, // so there is no harm doing an extra load (also, CSE will later eliminate // the load entirely). - BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin(); + BasicBlock::iterator BBI = ScanFrom->getIterator(), + E = ScanFrom->getParent()->begin(); // We can at least always strip pointer casts even though we can't use the // base here. @@ -161,6 +162,18 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, return false; } +/// DefMaxInstsToScan - the default number of maximum instructions +/// to scan in the block, used by FindAvailableLoadedValue(). +/// FindAvailableLoadedValue() was introduced in r60148, to improve jump +/// threading in part by eliminating partially redundant loads. +/// At that point, the value of MaxInstsToScan was already set to '6' +/// without documented explanation. +cl::opt<unsigned> +llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, + cl::desc("Use this to specify the default maximum number of instructions " + "to scan backward from a given instruction, when searching for " + "available loaded value")); + /// \brief Scan the ScanBB block backwards to see if we have the value at the /// memory address *Ptr locally available within a small number of instructions. /// @@ -199,7 +212,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, while (ScanFrom != ScanBB->begin()) { // We must ignore debug info directives when counting (otherwise they // would affect codegen). - Instruction *Inst = --ScanFrom; + Instruction *Inst = &*--ScanFrom; if (isa<DbgInfoIntrinsic>(Inst)) continue; @@ -246,9 +259,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // If we have alias analysis and it says the store won't modify the loaded // value, ignore the store. 
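The DefMaxInstsToScan option introduced above bounds the backward scan in FindAvailableLoadedValue: walk the block in reverse, skip debug intrinsics so they never influence codegen, charge every other instruction against the budget, and bail on a potential clobber. A self-contained sketch with stand-in types, not the real instruction classes:

  #include <iostream>
  #include <string>
  #include <vector>

  struct Inst {
    std::string Name;
    bool IsDebug, MayWriteLoc, DefinesValue;
  };

  const Inst *findAvailableValue(const std::vector<Inst> &Block,
                                 unsigned Budget) {
    for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
      if (It->IsDebug)
        continue;       // debug info must not affect the scan
      if (Budget-- == 0)
        return nullptr; // scan limit reached, give up
      if (It->DefinesValue)
        return &*It;    // found a store/load of the same location
      if (It->MayWriteLoc)
        return nullptr; // a possible clobber makes the old value stale
    }
    return nullptr;
  }

  int main() {
    std::vector<Inst> BB = {{"store %p", false, true, true},
                            {"llvm.dbg.value", true, false, false},
                            {"add", false, false, false}};
    const Inst *Avail = findAvailableValue(BB, 6);
    std::cout << (Avail ? Avail->Name : "none") << '\n'; // store %p
  }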
- if (AA && - (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & - AliasAnalysis::Mod) == 0) + if (AA && (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // Otherwise the store that may or may not alias the pointer, bail out. @@ -261,8 +272,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // If alias analysis claims that it really won't modify the load, // ignore it. if (AA && - (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & - AliasAnalysis::Mod) == 0) + (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // May modify the pointer, bail out. diff --git a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp index becbae4..8bcdcb8 100644 --- a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -58,12 +58,12 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold( /// Maximum SIMD width. const unsigned VectorizerParams::MaxVectorWidth = 64; -/// \brief We collect interesting dependences up to this threshold. -static cl::opt<unsigned> MaxInterestingDependence( - "max-interesting-dependences", cl::Hidden, - cl::desc("Maximum number of interesting dependences collected by " - "loop-access analysis (default = 100)"), - cl::init(100)); +/// \brief We collect dependences up to this threshold. +static cl::opt<unsigned> + MaxDependences("max-dependences", cl::Hidden, + cl::desc("Maximum number of dependences collected by " + "loop-access analysis (default = 100)"), + cl::init(100)); bool VectorizerParams::isInterleaveForced() { return ::VectorizationInterleave.getNumOccurrences() > 0; @@ -87,11 +87,10 @@ Value *llvm::stripIntegerCast(Value *V) { return V; } -const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, +const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); + const SCEV *OrigSCEV = PSE.getSCEV(Ptr); // If there is an entry in the map return the SCEV of the pointer with the // symbolic stride replaced by one. @@ -108,36 +107,82 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap RewriteMap; RewriteMap[StrideVal] = One; - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne + ScalarEvolution *SE = PSE.getSE(); + const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal)); + const auto *CT = + static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType())); + + PSE.addPredicate(*SE->getEqualPredicate(U, CT)); + auto *Expr = PSE.getSCEV(Ptr); + + DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *Expr << "\n"); - return ByOne; + return Expr; } // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); + return OrigSCEV; } void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, - const ValueToValueMap &Strides) { + const ValueToValueMap &Strides, + PredicatedScalarEvolution &PSE) { // Get the stride replaced scev. 
- const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); + ScalarEvolution *SE = PSE.getSE(); const SCEV *Ex = SE->getBackedgeTakenCount(Lp); + + const SCEV *ScStart = AR->getStart(); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.emplace_back(Ptr, AR->getStart(), ScEnd, WritePtr, DepSetId, ASId, - Sc); + const SCEV *Step = AR->getStepRecurrence(*SE); + + // For expressions with negative step, the upper bound is ScStart and the + // lower bound is ScEnd. + if (const SCEVConstant *CStep = dyn_cast<const SCEVConstant>(Step)) { + if (CStep->getValue()->isNegative()) + std::swap(ScStart, ScEnd); + } else { + // Fallback case: the step is not constant, but we can still + // get the upper and lower bounds of the interval by using min/max + // expressions. + ScStart = SE->getUMinExpr(ScStart, ScEnd); + ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + } + + Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); +} + +SmallVector<RuntimePointerChecking::PointerCheck, 4> +RuntimePointerChecking::generateChecks() const { + SmallVector<PointerCheck, 4> Checks; + + for (unsigned I = 0; I < CheckingGroups.size(); ++I) { + for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { + const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I]; + const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J]; + + if (needsChecking(CGI, CGJ)) + Checks.push_back(std::make_pair(&CGI, &CGJ)); + } + } + return Checks; +} + +void RuntimePointerChecking::generateChecks( + MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { + assert(Checks.empty() && "Checks is not empty"); + groupChecks(DepCands, UseDependencies); + Checks = generateChecks(); } -bool RuntimePointerChecking::needsChecking( - const CheckingPtrGroup &M, const CheckingPtrGroup &N, - const SmallVectorImpl<int> *PtrPartition) const { +bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M, + const CheckingPtrGroup &N) const { for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) - if (needsChecking(M.Members[I], N.Members[J], PtrPartition)) + if (needsChecking(M.Members[I], N.Members[J])) return true; return false; } @@ -204,8 +249,31 @@ void RuntimePointerChecking::groupChecks( CheckingGroups.clear(); + // If we need to check two pointers to the same underlying object + // with a non-constant difference, we shouldn't perform any pointer + // grouping with those pointers. This is because we can easily get + // into cases where the resulting check would return false, even when + // the accesses are safe. + // + // The following example shows this: + // for (i = 0; i < 1000; ++i) + // a[5000 + i * m] = a[i] + a[i + 9000] + // + // Here grouping gives a check of (5000, 5000 + 1000 * m) against + // (0, 10000) which is always false. However, if m is 1, there is no + // dependence. Not grouping the checks for a[i] and a[i + 9000] allows + // us to perform an accurate check in this case. + // + // The above case requires that we have an UnknownDependence between + // accesses to the same underlying object. This cannot happen unless + // ShouldRetryWithRuntimeCheck is set, and therefore UseDependencies + // is also false. In this case we will use the fallback path and create + // separate checking groups for all pointers.
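Stepping back to the interval computation at the top of this hunk: it can be shown with plain integers, since for a negative constant step the last iteration produces the lowest address, so the start and end bounds swap roles. A sketch under that simplification (the real code stays symbolic via SCEV and falls back to umin/umax when the step is unknown):

  #include <cstdint>
  #include <iostream>
  #include <utility>

  // Half-open [Low, High) footprint of a strided access over TripCount steps.
  std::pair<std::int64_t, std::int64_t>
  accessBounds(std::int64_t Start, std::int64_t Step, std::int64_t TripCount) {
    std::int64_t End = Start + Step * TripCount; // address after the last step
    if (Step < 0)
      return {End, Start}; // negative stride: the end is the low bound
    return {Start, End};
  }

  int main() {
    auto Up = accessBounds(0, 4, 100);      // [0, 400)
    auto Down = accessBounds(400, -4, 100); // also [0, 400)
    std::cout << Up.first << ' ' << Up.second << '\n';
    std::cout << Down.first << ' ' << Down.second << '\n';
  }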
+ // If we don't have the dependency partitions, construct a new - // checking pointer group for each pointer. + // checking pointer group for each pointer. This is also required + // for correctness, because in this case we can have checking between + // pointers to the same underlying object. if (!UseDependencies) { for (unsigned I = 0; I < Pointers.size(); ++I) CheckingGroups.push_back(CheckingPtrGroup(I, *this)); @@ -222,7 +290,7 @@ void RuntimePointerChecking::groupChecks( // don't process them twice. SmallSet<unsigned, 2> Seen; - // Go through all equivalence classes, get the the "pointer check groups" + // Go through all equivalence classes, get the "pointer check groups" // and add them to the overall solution. We use the order in which accesses // appear in 'Pointers' to enforce determinism. for (unsigned I = 0; I < Pointers.size(); ++I) { @@ -280,8 +348,14 @@ void RuntimePointerChecking::groupChecks( } } -bool RuntimePointerChecking::needsChecking( - unsigned I, unsigned J, const SmallVectorImpl<int> *PtrPartition) const { +bool RuntimePointerChecking::arePointersInSamePartition( + const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, + unsigned PtrIdx2) { + return (PtrToPartition[PtrIdx1] != -1 && + PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); +} + +bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { const PointerInfo &PointerI = Pointers[I]; const PointerInfo &PointerJ = Pointers[J]; @@ -297,85 +371,45 @@ bool RuntimePointerChecking::needsChecking( if (PointerI.AliasSetId != PointerJ.AliasSetId) return false; - // If PtrPartition is set omit checks between pointers of the same partition. - // Partition number -1 means that the pointer is used in multiple partitions. - // In this case we can't omit the check. 
- if (PtrPartition && (*PtrPartition)[I] != -1 && - (*PtrPartition)[I] == (*PtrPartition)[J]) - return false; - return true; } -void RuntimePointerChecking::print( - raw_ostream &OS, unsigned Depth, - const SmallVectorImpl<int> *PtrPartition) const { - - OS.indent(Depth) << "Run-time memory checks:\n"; - +void RuntimePointerChecking::printChecks( + raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks, + unsigned Depth) const { unsigned N = 0; - for (unsigned I = 0; I < CheckingGroups.size(); ++I) - for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) - if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) { - OS.indent(Depth) << "Check " << N++ << ":\n"; - OS.indent(Depth + 2) << "Comparing group " << I << ":\n"; - - for (unsigned K = 0; K < CheckingGroups[I].Members.size(); ++K) { - OS.indent(Depth + 2) - << *Pointers[CheckingGroups[I].Members[K]].PointerValue << "\n"; - if (PtrPartition) - OS << " (Partition: " - << (*PtrPartition)[CheckingGroups[I].Members[K]] << ")" - << "\n"; - } + for (const auto &Check : Checks) { + const auto &First = Check.first->Members, &Second = Check.second->Members; - OS.indent(Depth + 2) << "Against group " << J << ":\n"; + OS.indent(Depth) << "Check " << N++ << ":\n"; - for (unsigned K = 0; K < CheckingGroups[J].Members.size(); ++K) { - OS.indent(Depth + 2) - << *Pointers[CheckingGroups[J].Members[K]].PointerValue << "\n"; - if (PtrPartition) - OS << " (Partition: " - << (*PtrPartition)[CheckingGroups[J].Members[K]] << ")" - << "\n"; - } - } + OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n"; + for (unsigned K = 0; K < First.size(); ++K) + OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n"; - OS.indent(Depth) << "Grouped accesses:\n"; - for (unsigned I = 0; I < CheckingGroups.size(); ++I) { - OS.indent(Depth + 2) << "Group " << I << ":\n"; - OS.indent(Depth + 4) << "(Low: " << *CheckingGroups[I].Low - << " High: " << *CheckingGroups[I].High << ")\n"; - for (unsigned J = 0; J < CheckingGroups[I].Members.size(); ++J) { - OS.indent(Depth + 6) << "Member: " - << *Pointers[CheckingGroups[I].Members[J]].Expr - << "\n"; - } + OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n"; + for (unsigned K = 0; K < Second.size(); ++K) + OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n"; } } -unsigned RuntimePointerChecking::getNumberOfChecks( - const SmallVectorImpl<int> *PtrPartition) const { - - unsigned NumPartitions = CheckingGroups.size(); - unsigned CheckCount = 0; +void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { - for (unsigned I = 0; I < NumPartitions; ++I) - for (unsigned J = I + 1; J < NumPartitions; ++J) - if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) - CheckCount++; - return CheckCount; -} + OS.indent(Depth) << "Run-time memory checks:\n"; + printChecks(OS, Checks, Depth); -bool RuntimePointerChecking::needsAnyChecking( - const SmallVectorImpl<int> *PtrPartition) const { - unsigned NumPointers = Pointers.size(); + OS.indent(Depth) << "Grouped accesses:\n"; + for (unsigned I = 0; I < CheckingGroups.size(); ++I) { + const auto &CG = CheckingGroups[I]; - for (unsigned I = 0; I < NumPointers; ++I) - for (unsigned J = I + 1; J < NumPointers; ++J) - if (needsChecking(I, J, PtrPartition)) - return true; - return false; + OS.indent(Depth + 2) << "Group " << &CG << ":\n"; + OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High + << ")\n"; + for (unsigned J = 0; J < CG.Members.size(); ++J) { + OS.indent(Depth + 
6) << "Member: " << *Pointers[CG.Members[J]].Expr + << "\n"; + } + } } namespace { @@ -390,9 +424,10 @@ public: typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA) - : DL(Dl), AST(*AA), LI(LI), DepCands(DA), - IsRTCheckAnalysisNeeded(false) {} + MemoryDepChecker::DepCandidates &DA, + PredicatedScalarEvolution &PSE) + : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false), + PSE(PSE) {} /// \brief Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, bool IsReadOnly) { @@ -435,7 +470,7 @@ public: /// We decided that no dependence analysis would be used. Reset the state. void resetDepChecks(MemoryDepChecker &DepChecker) { CheckDeps.clear(); - DepChecker.clearInterestingDependences(); + DepChecker.clearDependences(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } @@ -477,14 +512,18 @@ private: /// (i.e. ShouldRetryWithRuntimeCheck), isDependencyCheckNeeded is cleared /// while this remains set if we have potentially dependent accesses. bool IsRTCheckAnalysisNeeded; + + /// The SCEV predicate containing all the SCEV-related assumptions. + PredicatedScalarEvolution &PSE; }; } // end anonymous namespace /// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, - const ValueToValueMap &Strides, Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); +static bool hasComputableBounds(PredicatedScalarEvolution &PSE, + const ValueToValueMap &Strides, Value *Ptr, + Loop *L) { + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) return false; @@ -527,11 +566,11 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, else ++NumReadPtrChecks; - if (hasComputableBounds(SE, StridesMap, Ptr) && + if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) && // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. (!ShouldCheckStride || - isStridedPtr(SE, Ptr, TheLoop, StridesMap) == 1)) { + isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) { // The id of the dependence set. unsigned DepId; @@ -545,7 +584,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // Each access has its own dependence set. DepId = RunningDepId++; - RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); + RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE); DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); } else { @@ -599,9 +638,9 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, } if (NeedRTCheck && CanDoRT) - RtCheck.groupChecks(DepCands, IsDepCheckNeeded); + RtCheck.generateChecks(DepCands, IsDepCheckNeeded); - DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks(nullptr) + DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() << " pointer comparisons.\n"); RtCheck.Need = NeedRTCheck; @@ -706,6 +745,11 @@ void AccessAnalysis::processMemAccesses() { GetUnderlyingObjects(Ptr, TempObjects, DL, LI); DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n"); for (Value *UnderlyingObj : TempObjects) { + // nullptr never alias, don't join sets for pointer that have "null" + // in their UnderlyingObjects list. 
+ if (isa<ConstantPointerNull>(UnderlyingObj)) + continue; + UnderlyingObjToAccessMap::iterator Prev = ObjToLastAccess.find(UnderlyingObj); if (Prev != ObjToLastAccess.end()) @@ -775,20 +819,20 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, } /// \brief Check whether the access through \p Ptr has a constant stride. -int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); +int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, + const Loop *Lp, const ValueToValueMap &StridesMap) { + Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast<PointerType>(Ty); + auto *PtrTy = cast<PointerType>(Ty); if (PtrTy->getElementType()->isAggregateType()) { DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"); return 0; } - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) { @@ -801,6 +845,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -811,16 +856,16 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, SE, Lp); + bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, PSE.getSE(), Lp); bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); + << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); // Calculate the pointer stride and check if it is constant. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -832,7 +877,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, auto &DL = Lp->getHeader()->getModule()->getDataLayout(); int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); + const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. 
if (APStepVal.getBitWidth() > 64) @@ -872,15 +917,15 @@ bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { llvm_unreachable("unexpected DepType!"); } -bool MemoryDepChecker::Dependence::isInterestingDependence(DepType Type) { +bool MemoryDepChecker::Dependence::isBackward() const { switch (Type) { case NoDep: case Forward: + case ForwardButPreventsForwarding: + case Unknown: return false; case BackwardVectorizable: - case Unknown: - case ForwardButPreventsForwarding: case Backward: case BackwardVectorizableButPreventsForwarding: return true; @@ -889,17 +934,21 @@ bool MemoryDepChecker::Dependence::isInterestingDependence(DepType Type) { } bool MemoryDepChecker::Dependence::isPossiblyBackward() const { + return isBackward() || Type == Unknown; +} + +bool MemoryDepChecker::Dependence::isForward() const { switch (Type) { - case NoDep: case Forward: case ForwardButPreventsForwarding: - return false; + return true; + case NoDep: case Unknown: case BackwardVectorizable: case Backward: case BackwardVectorizableButPreventsForwarding: - return true; + return false; } llvm_unreachable("unexpected DepType!"); } @@ -999,11 +1048,11 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, BPtr->getType()->getPointerAddressSpace()) return Dependence::Unknown; - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); + const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr); + const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr); - int StrideAPtr = isStridedPtr(SE, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, BPtr, InnermostLoop, Strides); + int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides); + int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -1020,12 +1069,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::swap(StrideAPtr, StrideBPtr); } - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); + const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src); DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); + << "(Induction step: " << StrideAPtr << ")\n"); DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); + << *InstMap[BIdx] << ": " << *Dist << "\n"); // Need accesses with constant stride. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in @@ -1048,7 +1097,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, unsigned TypeByteSize = DL.getTypeAllocSize(ATy); // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); + const APInt &Val = C->getAPInt(); if (Val.isNegative()) { bool IsTrueDataDependence = (AIsWrite && !BIsWrite); if (IsTrueDataDependence && @@ -1064,7 +1113,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Could be improved to assert type sizes are the same (i32 == float, etc). 
if (Val == 0) { if (ATy == BTy) - return Dependence::NoDep; + return Dependence::Forward; DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n"); return Dependence::Unknown; } @@ -1203,22 +1252,21 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, isDependent(*A.first, A.second, *B.first, B.second, Strides); SafeForVectorization &= Dependence::isSafeForVectorization(Type); - // Gather dependences unless we accumulated MaxInterestingDependence + // Gather dependences unless we accumulated MaxDependences // dependences. In that case return as soon as we find the first // unsafe dependence. This puts a limit on this quadratic // algorithm. - if (RecordInterestingDependences) { - if (Dependence::isInterestingDependence(Type)) - InterestingDependences.push_back( - Dependence(A.second, B.second, Type)); - - if (InterestingDependences.size() >= MaxInterestingDependence) { - RecordInterestingDependences = false; - InterestingDependences.clear(); + if (RecordDependences) { + if (Type != Dependence::NoDep) + Dependences.push_back(Dependence(A.second, B.second, Type)); + + if (Dependences.size() >= MaxDependences) { + RecordDependences = false; + Dependences.clear(); DEBUG(dbgs() << "Too many dependences, stopped recording\n"); } } - if (!RecordInterestingDependences && !SafeForVectorization) + if (!RecordDependences && !SafeForVectorization) return false; } ++OI; @@ -1227,8 +1275,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, } } - DEBUG(dbgs() << "Total Interesting Dependences: " - << InterestingDependences.size() << "\n"); + DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n"); return SafeForVectorization; } @@ -1298,10 +1345,10 @@ bool LoopAccessInfo::canAnalyzeLoop() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(LoopAccessReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(LoopAccessReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n"); return false; } @@ -1370,7 +1417,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); if (!St) { - emitAnalysis(LoopAccessReport(it) << + emitAnalysis(LoopAccessReport(&*it) << "instruction cannot be vectorized"); CanVecMem = false; return; @@ -1402,7 +1449,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { MemoryDepChecker::DepCandidates DependentAccesses; AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(), - AA, LI, DependentAccesses); + AA, LI, DependentAccesses, PSE); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -1453,7 +1500,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. 
bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || !isStridedPtr(SE, Ptr, TheLoop, Strides)) { + if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) { ++NumReads; IsReadOnlyPtr = true; } @@ -1483,7 +1530,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. bool CanDoRTIfNeeded = - Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides); + Accesses.canCheckPtrAtRT(PtrRtChecking, PSE.getSE(), TheLoop, Strides); if (!CanDoRTIfNeeded) { emitAnalysis(LoopAccessReport() << "cannot identify array bounds"); DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " @@ -1510,6 +1557,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { PtrRtChecking.reset(); PtrRtChecking.Need = true; + auto *SE = PSE.getSE(); CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true); @@ -1552,7 +1600,7 @@ void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) { } bool LoopAccessInfo::isUniform(Value *V) const { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return (PSE.getSE()->isLoopInvariant(PSE.getSE()->getSCEV(V), TheLoop)); } // FIXME: this function is currently a duplicate of the one in @@ -1566,86 +1614,115 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V, return nullptr; } -std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck( - Instruction *Loc, const SmallVectorImpl<int> *PtrPartition) const { - if (!PtrRtChecking.Need) - return std::make_pair(nullptr, nullptr); +namespace { +/// \brief IR Values for the lower and upper bounds of a pointer evolution. We +/// need to use value-handles because SCEV expansion can invalidate previously +/// expanded values. Thus expansion of a pointer can invalidate the bounds for +/// a previous one. +struct PointerBounds { + TrackingVH<Value> Start; + TrackingVH<Value> End; +}; +} // end anonymous namespace - SmallVector<TrackingVH<Value>, 2> Starts; - SmallVector<TrackingVH<Value>, 2> Ends; +/// \brief Expand code for the lower and upper bound of the pointer group \p CG +/// in \p TheLoop. \return the values for the bounds. +static PointerBounds +expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop, + Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE, + const RuntimePointerChecking &PtrRtChecking) { + Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, TheLoop)) { + DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr + << "\n"); + return {Ptr, Ptr}; + } else { + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = Loc->getContext(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); + Value *Start = nullptr, *End = nullptr; + + DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); + Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc); + End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc); + DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n"); + return {Start, End}; + } +} - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, DL, "induction"); - Instruction *FirstInst = nullptr; +/// \brief Turns a collection of checks into a collection of expanded upper and +/// lower bounds for both pointers in the check. 
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks, + Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp, + const RuntimePointerChecking &PtrRtChecking) { + SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds; + + // Here we're relying on the SCEV Expander's cache to only emit code for the + // same bounds once. + std::transform( + PointerChecks.begin(), PointerChecks.end(), + std::back_inserter(ChecksWithBounds), + [&](const RuntimePointerChecking::PointerCheck &Check) { + PointerBounds + First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking), + Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking); + return std::make_pair(First, Second); + }); + + return ChecksWithBounds; +} - for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) { - const RuntimePointerChecking::CheckingPtrGroup &CG = - PtrRtChecking.CheckingGroups[i]; - Value *Ptr = PtrRtChecking.Pointers[CG.Members[0]].PointerValue; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, TheLoop)) { - DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr - << "\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - Value *Start = nullptr, *End = nullptr; - - DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); - Start = Exp.expandCodeFor(CG.Low, PtrArithTy, Loc); - End = Exp.expandCodeFor(CG.High, PtrArithTy, Loc); - DEBUG(dbgs() << "Start: " << *CG.Low << " End: " << *CG.High << "\n"); - Starts.push_back(Start); - Ends.push_back(End); - } - } +std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks( + Instruction *Loc, + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks) + const { + auto *SE = PSE.getSE(); + SCEVExpander Exp(*SE, DL, "induction"); + auto ExpandedChecks = + expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, PtrRtChecking); + LLVMContext &Ctx = Loc->getContext(); + Instruction *FirstInst = nullptr; IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. 
Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) { - for (unsigned j = i + 1; j < PtrRtChecking.CheckingGroups.size(); ++j) { - const RuntimePointerChecking::CheckingPtrGroup &CGI = - PtrRtChecking.CheckingGroups[i]; - const RuntimePointerChecking::CheckingPtrGroup &CGJ = - PtrRtChecking.CheckingGroups[j]; - - if (!PtrRtChecking.needsChecking(CGI, CGJ, PtrPartition)) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + for (const auto &Check : ExpandedChecks) { + const PointerBounds &A = Check.first, &B = Check.second; + // Check if two pointers (A and B) conflict where conflict is computed as: + // start(A) <= end(B) && start(B) <= end(A) + unsigned AS0 = A.Start->getType()->getPointerAddressSpace(); + unsigned AS1 = B.Start->getType()->getPointerAddressSpace(); + + assert((AS0 == B.End->getType()->getPointerAddressSpace()) && + (AS1 == A.End->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + FirstInst = getFirstInst(FirstInst, Cmp0, Loc); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + FirstInst = getFirstInst(FirstInst, Cmp1, Loc); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + if (MemoryRuntimeCheck) { + IsConflict = + ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; } + MemoryRuntimeCheck = IsConflict; } if (!MemoryRuntimeCheck) @@ -1661,12 +1738,20 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck( return std::make_pair(FirstInst, Check); } +std::pair<Instruction *, Instruction *> +LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const { + if (!PtrRtChecking.Need) + return std::make_pair(nullptr, nullptr); + + return addRuntimeChecks(Loc, 
PtrRtChecking.getChecks()); +} + LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : PtrRtChecking(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), + : PSE(*SE), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false), StoreToLoopInvariantAddress(false) { @@ -1685,14 +1770,14 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { if (Report) OS.indent(Depth) << "Report: " << Report->str() << "\n"; - if (auto *InterestingDependences = DepChecker.getInterestingDependences()) { - OS.indent(Depth) << "Interesting Dependences:\n"; - for (auto &Dep : *InterestingDependences) { + if (auto *Dependences = DepChecker.getDependences()) { + OS.indent(Depth) << "Dependences:\n"; + for (auto &Dep : *Dependences) { Dep.print(OS, Depth + 2, DepChecker.getMemoryInstructions()); OS << "\n"; } } else - OS.indent(Depth) << "Too many interesting dependences, not recorded\n"; + OS.indent(Depth) << "Too many dependences, not recorded\n"; // List the pair of accesses need run-time checks to prove independence. PtrRtChecking.print(OS, Depth); @@ -1701,6 +1786,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { OS.indent(Depth) << "Store to invariant address was " << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; + + OS.indent(Depth) << "SCEV assumptions:\n"; + PSE.getUnionPredicate().print(OS, Depth); } const LoopAccessInfo & @@ -1714,8 +1802,8 @@ LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) { if (!LAI) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, - Strides); + LAI = + llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, Strides); #ifndef NDEBUG LAI->NumSymbolicStrides = Strides.size(); #endif @@ -1737,10 +1825,10 @@ void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const { } bool LoopAccessAnalysis::runOnFunction(Function &F) { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? 
&TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -1748,8 +1836,8 @@ bool LoopAccessAnalysis::runOnFunction(Function &F) { } void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -1761,8 +1849,8 @@ static const char laa_name[] = "Loop Access Analysis"; #define LAA_NAME "loop-accesses" INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true) diff --git a/contrib/llvm/lib/Analysis/LoopInfo.cpp b/contrib/llvm/lib/Analysis/LoopInfo.cpp index 6b6faf8..0c725fc 100644 --- a/contrib/llvm/lib/Analysis/LoopInfo.cpp +++ b/contrib/llvm/lib/Analysis/LoopInfo.cpp @@ -102,8 +102,8 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed, return false; if (I->mayReadFromMemory()) return false; - // The landingpad instruction is immobile. - if (isa<LandingPadInst>(I)) + // EH block instructions are immobile. + if (I->isEHPad()) return false; // Determine the insertion point, unless one was given. if (!InsertPt) { @@ -120,6 +120,13 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed, // Hoist. I->moveBefore(InsertPt); + + // There is possibility of hoisting this instruction above some arbitrary + // condition. Any metadata defined on it can be control dependent on this + // condition. Conservatively strip it here so that we don't give any wrong + // information to the optimizer. + I->dropUnknownNonDebugMetadata(); + Changed = true; return true; } @@ -172,7 +179,13 @@ PHINode *Loop::getCanonicalInductionVariable() const { bool Loop::isLCSSAForm(DominatorTree &DT) const { for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) { BasicBlock *BB = *BI; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I) { + // Tokens can't be used in PHI nodes and live-out tokens prevent loop + // optimizations, so for the purposes of considered LCSSA form, we + // can ignore them. + if (I->getType()->isTokenTy()) + continue; + for (Use &U : I->uses()) { Instruction *UI = cast<Instruction>(U.getUser()); BasicBlock *UserBB = UI->getParent(); @@ -188,11 +201,21 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { DT.isReachableFromEntry(UserBB)) return false; } + } } return true; } +bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT) const { + if (!isLCSSAForm(DT)) + return false; + + return std::all_of(begin(), end(), [&](const Loop *L) { + return L->isRecursivelyLCSSAForm(DT); + }); +} + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. 
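The Loop::isRecursivelyLCSSAForm hunk above is a compact example of checking a property over a whole loop nest: verify the loop itself, then require the same check of every subloop via std::all_of. Below is a minimal standalone sketch of that shape only; Node and isInForm() are hypothetical stand-ins for llvm::Loop and Loop::isLCSSAForm(DominatorTree &), not LLVM code.

#include <algorithm>
#include <vector>

struct Node {
  bool FormOK;                  // stand-in for isLCSSAForm(DT) on this loop
  std::vector<Node *> Subloops; // stand-in for Loop::begin()/Loop::end()

  bool isInForm() const { return FormOK; }

  // Mirrors the pattern of isRecursivelyLCSSAForm: fail fast on this node,
  // then require the property of every subloop, depth-first.
  bool isRecursivelyInForm() const {
    if (!isInForm())
      return false;
    return std::all_of(Subloops.begin(), Subloops.end(),
                       [](const Node *N) { return N->isRecursivelyInForm(); });
  }
};

int main() {
  Node Leaf{true, {}};
  Node Root{true, {&Leaf}};
  return Root.isRecursivelyInForm() ? 0 : 1;
}

The early return keeps the common failure case cheap, and the recursion only descends into subtrees whose ancestors have already passed.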
@@ -211,15 +234,23 @@ bool Loop::isSafeToClone() const { if (isa<IndirectBrInst>((*I)->getTerminator())) return false; - if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) + if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) { if (II->cannotDuplicate()) return false; + // Return false if any loop blocks contain invokes to EH-pads other than + // landingpads; we don't know how to split those edges yet. + auto *FirstNonPHI = II->getUnwindDest()->getFirstNonPHI(); + if (FirstNonPHI->isEHPad() && !isa<LandingPadInst>(FirstNonPHI)) + return false; + } for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) { if (const CallInst *CI = dyn_cast<CallInst>(BI)) { if (CI->cannotDuplicate()) return false; } + if (BI->getType()->isTokenTy() && BI->isUsedOutsideOfBlock(*I)) + return false; } } return true; @@ -602,14 +633,14 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { return NearLoop; } -/// updateUnloop - The last backedge has been removed from a loop--now the -/// "unloop". Find a new parent for the blocks contained within unloop and -/// update the loop tree. We don't necessarily have valid dominators at this -/// point, but LoopInfo is still valid except for the removal of this loop. -/// -/// Note that Unloop may now be an empty loop. Calling Loop::getHeader without -/// checking first is illegal. -void LoopInfo::updateUnloop(Loop *Unloop) { +LoopInfo::LoopInfo(const DominatorTreeBase<BasicBlock> &DomTree) { + analyze(DomTree); +} + +void LoopInfo::markAsRemoved(Loop *Unloop) { + assert(!Unloop->isInvalid() && "Loop has already been removed"); + Unloop->invalidate(); + RemovedLoops.push_back(Unloop); // First handle the special case of no parent loop to simplify the algorithm. if (!Unloop->getParentLoop()) { @@ -675,7 +706,7 @@ LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> *AM) { // objects. I don't want to add that kind of complexity until the scope of // the problem is better understood. 
LoopInfo LI; - LI.Analyze(AM->getResult<DominatorTreeAnalysis>(F)); + LI.analyze(AM->getResult<DominatorTreeAnalysis>(F)); return LI; } @@ -685,6 +716,20 @@ PreservedAnalyses LoopPrinterPass::run(Function &F, return PreservedAnalyses::all(); } +PrintLoopPass::PrintLoopPass() : OS(dbgs()) {} +PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner) + : OS(OS), Banner(Banner) {} + +PreservedAnalyses PrintLoopPass::run(Loop &L) { + OS << Banner; + for (auto *Block : L.blocks()) + if (Block) + Block->print(OS); + else + OS << "Printing <null> block"; + return PreservedAnalyses::all(); +} + //===----------------------------------------------------------------------===// // LoopInfo implementation // @@ -698,7 +743,7 @@ INITIALIZE_PASS_END(LoopInfoWrapperPass, "loops", "Natural Loop Information", bool LoopInfoWrapperPass::runOnFunction(Function &) { releaseMemory(); - LI.Analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree()); + LI.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree()); return false; } diff --git a/contrib/llvm/lib/Analysis/LoopPass.cpp b/contrib/llvm/lib/Analysis/LoopPass.cpp index e9fcf02..8163231 100644 --- a/contrib/llvm/lib/Analysis/LoopPass.cpp +++ b/contrib/llvm/lib/Analysis/LoopPass.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -27,35 +28,30 @@ namespace { /// PrintLoopPass - Print a Function corresponding to a Loop. /// -class PrintLoopPass : public LoopPass { -private: - std::string Banner; - raw_ostream &Out; // raw_ostream to print on. +class PrintLoopPassWrapper : public LoopPass { + PrintLoopPass P; public: static char ID; - PrintLoopPass(const std::string &B, raw_ostream &o) - : LoopPass(ID), Banner(B), Out(o) {} + PrintLoopPassWrapper() : LoopPass(ID) {} + PrintLoopPassWrapper(raw_ostream &OS, const std::string &Banner) + : LoopPass(ID), P(OS, Banner) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } bool runOnLoop(Loop *L, LPPassManager &) override { - Out << Banner; - for (Loop::block_iterator b = L->block_begin(), be = L->block_end(); - b != be; - ++b) { - if (*b) - (*b)->print(Out); - else - Out << "Printing <null> block"; - } + auto BBI = find_if(L->blocks().begin(), L->blocks().end(), + [](BasicBlock *BB) { return BB; }); + if (BBI != L->blocks().end() && + isFunctionInPrintList((*BBI)->getParent()->getName())) + P.run(*L); return false; } }; -char PrintLoopPass::ID = 0; +char PrintLoopPassWrapper::ID = 0; } //===----------------------------------------------------------------------===// @@ -66,81 +62,34 @@ char LPPassManager::ID = 0; LPPassManager::LPPassManager() : FunctionPass(ID), PMDataManager() { - skipThisLoop = false; - redoThisLoop = false; LI = nullptr; CurrentLoop = nullptr; } -/// Delete loop from the loop queue and loop hierarchy (LoopInfo). -void LPPassManager::deleteLoopFromQueue(Loop *L) { - - LI->updateUnloop(L); - - // Notify passes that the loop is being deleted. - deleteSimpleAnalysisLoop(L); - - // If L is current loop then skip rest of the passes and let - // runOnFunction remove L from LQ. Otherwise, remove L from LQ now - // and continue applying other passes on CurrentLoop. 
- if (CurrentLoop == L) - skipThisLoop = true; - - delete L; - - if (skipThisLoop) - return; - - for (std::deque<Loop *>::iterator I = LQ.begin(), - E = LQ.end(); I != E; ++I) { - if (*I == L) { - LQ.erase(I); - break; - } - } -} - // Inset loop into loop nest (LoopInfo) and loop queue (LQ). -void LPPassManager::insertLoop(Loop *L, Loop *ParentLoop) { - - assert (CurrentLoop != L && "Cannot insert CurrentLoop"); +Loop &LPPassManager::addLoop(Loop *ParentLoop) { + // Create a new loop. LI will take ownership. + Loop *L = new Loop(); - // Insert into loop nest - if (ParentLoop) - ParentLoop->addChildLoop(L); - else + // Insert into the loop nest and the loop queue. + if (!ParentLoop) { + // This is the top level loop. LI->addTopLevelLoop(L); - - insertLoopIntoQueue(L); -} - -void LPPassManager::insertLoopIntoQueue(Loop *L) { - // Insert L into loop queue - if (L == CurrentLoop) - redoLoop(L); - else if (!L->getParentLoop()) - // This is top level loop. LQ.push_front(L); - else { - // Insert L after the parent loop. - for (std::deque<Loop *>::iterator I = LQ.begin(), - E = LQ.end(); I != E; ++I) { - if (*I == L->getParentLoop()) { - // deque does not support insert after. - ++I; - LQ.insert(I, 1, L); - break; - } - } + return *L; } -} -// Reoptimize this loop. LPPassManager will re-insert this loop into the -// queue. This allows LoopPass to change loop nest for the loop. This -// utility may send LPPassManager into infinite loops so use caution. -void LPPassManager::redoLoop(Loop *L) { - assert (CurrentLoop == L && "Can redo only CurrentLoop"); - redoThisLoop = true; + ParentLoop->addChildLoop(L); + // Insert L into the loop queue after the parent loop. + for (auto I = LQ.begin(), E = LQ.end(); I != E; ++I) { + if (*I == L->getParentLoop()) { + // deque does not support insert after. + ++I; + LQ.insert(I, 1, L); + break; + } + } + return *L; } /// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for @@ -229,10 +178,8 @@ bool LPPassManager::runOnFunction(Function &F) { // Walk Loops while (!LQ.empty()) { - - CurrentLoop = LQ.back(); - skipThisLoop = false; - redoThisLoop = false; + bool LoopWasDeleted = false; + CurrentLoop = LQ.back(); // Run all passes on the current Loop. for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { @@ -250,14 +197,18 @@ bool LPPassManager::runOnFunction(Function &F) { Changed |= P->runOnLoop(CurrentLoop, *this); } + LoopWasDeleted = CurrentLoop->isInvalid(); if (Changed) dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, - skipThisLoop ? "<deleted>" : - CurrentLoop->getHeader()->getName()); + LoopWasDeleted ? "<deleted>" + : CurrentLoop->getHeader()->getName()); dumpPreservedSet(P); - if (!skipThisLoop) { + if (LoopWasDeleted) { + // Notify passes that the loop is being deleted. + deleteSimpleAnalysisLoop(CurrentLoop); + } else { // Manually check that this loop is still healthy. This is done // instead of relying on LoopInfo::verifyLoop since LoopInfo // is a function pass and it's really expensive to verify every @@ -276,12 +227,11 @@ bool LPPassManager::runOnFunction(Function &F) { removeNotPreservedAnalysis(P); recordAvailableAnalysis(P); - removeDeadPasses(P, - skipThisLoop ? "<deleted>" : - CurrentLoop->getHeader()->getName(), + removeDeadPasses(P, LoopWasDeleted ? "<deleted>" + : CurrentLoop->getHeader()->getName(), ON_LOOP_MSG); - if (skipThisLoop) + if (LoopWasDeleted) // Do not run other passes on this loop. 
break; } @@ -289,17 +239,15 @@ bool LPPassManager::runOnFunction(Function &F) { // If the loop was deleted, release all the loop passes. This frees up // some memory, and avoids trouble with the pass manager trying to call // verifyAnalysis on them. - if (skipThisLoop) + if (LoopWasDeleted) { for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { Pass *P = getContainedPass(Index); freePass(P, "<deleted>", ON_LOOP_MSG); } + } // Pop the loop from queue after running all passes. LQ.pop_back(); - - if (redoThisLoop) - LQ.push_back(CurrentLoop); } // Finalization @@ -327,7 +275,7 @@ void LPPassManager::dumpPassStructure(unsigned Offset) { Pass *LoopPass::createPrinterPass(raw_ostream &O, const std::string &Banner) const { - return new PrintLoopPass(Banner, O); + return new PrintLoopPassWrapper(O, Banner); } // Check if this pass is suitable for the current LPPassManager, if diff --git a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp index da3b829..078cefe 100644 --- a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp +++ b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp @@ -49,7 +49,7 @@ namespace { void print(raw_ostream &OS, const Module * = nullptr) const override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<AliasAnalysis>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); AU.addRequiredTransitive<MemoryDependenceAnalysis>(); AU.setPreservesAll(); } @@ -96,7 +96,7 @@ bool MemDepPrinter::runOnFunction(Function &F) { // All this code uses non-const interfaces because MemDep is not // const-friendly, though nothing is actually modified. - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { Instruction *Inst = &I; if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory()) @@ -135,7 +135,7 @@ bool MemDepPrinter::runOnFunction(Function &F) { } void MemDepPrinter::print(raw_ostream &OS, const Module *M) const { - for (const auto &I : inst_range(*F)) { + for (const auto &I : instructions(*F)) { const Instruction *Inst = &I; DepSetMap::const_iterator DI = Deps.find(Inst); diff --git a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp index fa292a2..36f1424 100644 --- a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp +++ b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp @@ -22,7 +22,8 @@ using namespace llvm; namespace { struct MemDerefPrinter : public FunctionPass { - SmallVector<Value *, 4> Vec; + SmallVector<Value *, 4> Deref; + SmallPtrSet<Value *, 4> DerefAndAligned; static char ID; // Pass identification, replacement for typeid MemDerefPrinter() : FunctionPass(ID) { @@ -34,7 +35,8 @@ namespace { bool runOnFunction(Function &F) override; void print(raw_ostream &OS, const Module * = nullptr) const override; void releaseMemory() override { - Vec.clear(); + Deref.clear(); + DerefAndAligned.clear(); } }; } @@ -51,11 +53,13 @@ FunctionPass *llvm::createMemDerefPrinter() { bool MemDerefPrinter::runOnFunction(Function &F) { const DataLayout &DL = F.getParent()->getDataLayout(); - for (auto &I: inst_range(F)) { + for (auto &I: instructions(F)) { if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, DL)) - Vec.push_back(PO); + Deref.push_back(PO); + if (isDereferenceableAndAlignedPointer(PO, LI->getAlignment(), DL)) + DerefAndAligned.insert(PO); } } return false; @@ -63,8 +67,12 @@ bool MemDerefPrinter::runOnFunction(Function &F) { void MemDerefPrinter::print(raw_ostream &OS, const Module *M) 
const { OS << "The following are dereferenceable:\n"; - for (auto &V: Vec) { + for (Value *V: Deref) { V->print(OS); + if (DerefAndAligned.count(V)) + OS << "\t(aligned)"; + else + OS << "\t(unaligned)"; OS << "\n\n"; } } diff --git a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp index 8ddac8f..9e896ae 100644 --- a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -31,7 +31,7 @@ using namespace llvm; #define DEBUG_TYPE "memory-builtins" -enum AllocType { +enum AllocType : uint8_t { OpNewLike = 1<<0, // allocates; never returns null MallocLike = 1<<1 | OpNewLike, // allocates; may return null CallocLike = 1<<2, // allocates + bzero @@ -62,6 +62,14 @@ static const AllocFnsTy AllocationFnData[] = { {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long) {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {LibFunc::msvc_new_int, OpNewLike, 1, 0, -1}, // new(unsigned int) + {LibFunc::msvc_new_int_nothrow, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {LibFunc::msvc_new_longlong, OpNewLike, 1, 0, -1}, // new(unsigned long long) + {LibFunc::msvc_new_longlong_nothrow, MallocLike, 2, 0, -1}, // new(unsigned long long, nothrow) + {LibFunc::msvc_new_array_int, OpNewLike, 1, 0, -1}, // new[](unsigned int) + {LibFunc::msvc_new_array_int_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {LibFunc::msvc_new_array_longlong, OpNewLike, 1, 0, -1}, // new[](unsigned long long) + {LibFunc::msvc_new_array_longlong_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned long long, nothrow) {LibFunc::calloc, CallocLike, 2, 0, 1}, {LibFunc::realloc, ReallocLike, 2, 1, -1}, {LibFunc::reallocf, ReallocLike, 2, 1, -1}, @@ -107,18 +115,13 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn)) return nullptr; - unsigned i = 0; - bool found = false; - for ( ; i < array_lengthof(AllocationFnData); ++i) { - if (AllocationFnData[i].Func == TLIFn) { - found = true; - break; - } - } - if (!found) + const AllocFnsTy *FnData = + std::find_if(std::begin(AllocationFnData), std::end(AllocationFnData), + [TLIFn](const AllocFnsTy &Fn) { return Fn.Func == TLIFn; }); + + if (FnData == std::end(AllocationFnData)) return nullptr; - const AllocFnsTy *FnData = &AllocationFnData[i]; if ((FnData->AllocTy & AllocTy) != FnData->AllocTy) return nullptr; @@ -184,20 +187,6 @@ bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI, return getAllocationData(V, AllocLike, TLI, LookThroughBitCast); } -/// \brief Tests if a value is a call or invoke to a library function that -/// reallocates memory (such as realloc). -bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast); -} - -/// \brief Tests if a value is a call or invoke to a library function that -/// allocates memory and never returns null (such as operator new). -bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast); -} - /// extractMallocCall - Returns the corresponding CallInst if the instruction /// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we /// ignore InvokeInst here. 
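The getAllocationData rewrite above swaps a manual index-and-found-flag scan for std::find_if over the static descriptor table, with std::end() as the not-found sentinel. A minimal standalone sketch of the same idiom follows, under the assumption of hypothetical LibFn/FnDesc/Table types standing in for LibFunc, AllocFnsTy and AllocationFnData.

#include <algorithm>
#include <cstdio>
#include <iterator>

enum class LibFn { Malloc, Calloc, Realloc };

struct FnDesc {
  LibFn Func;         // which library function this entry describes
  unsigned NumParams; // arity expected for a matching call
};

static const FnDesc Table[] = {
    {LibFn::Malloc, 1}, {LibFn::Calloc, 2}, {LibFn::Realloc, 2}};

// Returns the table entry for F, or nullptr if F is not in the table.
static const FnDesc *lookup(LibFn F) {
  const FnDesc *It =
      std::find_if(std::begin(Table), std::end(Table),
                   [F](const FnDesc &D) { return D.Func == F; });
  return It == std::end(Table) ? nullptr : It;
}

int main() {
  if (const FnDesc *D = lookup(LibFn::Calloc))
    std::printf("calloc-like, %u params\n", D->NumParams);
}

The iterator form drops the mutable index and the bool flag; a single comparison against std::end(Table) replaces both.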
@@ -313,14 +302,26 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) { unsigned ExpectedNumParams; if (TLIFn == LibFunc::free || TLIFn == LibFunc::ZdlPv || // operator delete(void*) - TLIFn == LibFunc::ZdaPv) // operator delete[](void*) + TLIFn == LibFunc::ZdaPv || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*) ExpectedNumParams = 1; else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint) TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong) TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow) TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint) TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong) - TLIFn == LibFunc::ZdaPvRKSt9nothrow_t) // delete[](void*, nothrow) + TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint) + TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong) + TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint) + TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong) + TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow) ExpectedNumParams = 2; else return nullptr; @@ -621,7 +622,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { // always generate code immediately before the instruction being // processed, so that the generated code dominates the same BBs - Instruction *PrevInsertPoint = Builder.GetInsertPoint(); + BuilderTy::InsertPointGuard Guard(Builder); if (Instruction *I = dyn_cast<Instruction>(V)) Builder.SetInsertPoint(I); @@ -650,9 +651,6 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { Result = unknown(); } - if (PrevInsertPoint) - Builder.SetInsertPoint(PrevInsertPoint); - // Don't reuse CacheIt since it may be invalid at this point. 
CacheMap[V] = Result; return Result; @@ -742,7 +740,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) { // compute offset/size for each PHI incoming pointer for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) { - Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt()); + Builder.SetInsertPoint(&*PHI.getIncomingBlock(i)->getFirstInsertionPt()); SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i)); if (!bothKnown(EdgeData)) { diff --git a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 782a67b..6918360 100644 --- a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -22,7 +22,9 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -49,7 +51,11 @@ STATISTIC(NumCacheCompleteNonLocalPtr, "Number of block queries that were completely cached"); // Limit for the number of instructions to scan in a block. -static const unsigned int BlockScanLimit = 100; + +static cl::opt<unsigned> BlockScanLimit( + "memdep-block-scan-limit", cl::Hidden, cl::init(100), + cl::desc("The number of instructions to scan in a block in memory " + "dependency analysis (default = 100)")); // Limit on the number of memdep results to process. static const unsigned int NumResultsLimit = 100; @@ -60,7 +66,8 @@ char MemoryDependenceAnalysis::ID = 0; INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep", "Memory Dependence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep", "Memory Dependence Analysis", false, true) @@ -87,15 +94,17 @@ void MemoryDependenceAnalysis::releaseMemory() { void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredTransitive<AliasAnalysis>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>(); } bool MemoryDependenceAnalysis::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); return false; } @@ -118,43 +127,43 @@ static void RemoveFromReverseMap(DenseMap<Instruction*, /// location, fill in Loc with the details, otherwise set Loc.Ptr to null. /// Return a ModRefInfo value describing the general behavior of the /// instruction. 
-static AliasAnalysis::ModRefResult -GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { +static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, + const TargetLibraryInfo &TLI) { if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { if (LI->isUnordered()) { Loc = MemoryLocation::get(LI); - return AliasAnalysis::Ref; + return MRI_Ref; } if (LI->getOrdering() == Monotonic) { Loc = MemoryLocation::get(LI); - return AliasAnalysis::ModRef; + return MRI_ModRef; } Loc = MemoryLocation(); - return AliasAnalysis::ModRef; + return MRI_ModRef; } if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (SI->isUnordered()) { Loc = MemoryLocation::get(SI); - return AliasAnalysis::Mod; + return MRI_Mod; } if (SI->getOrdering() == Monotonic) { Loc = MemoryLocation::get(SI); - return AliasAnalysis::ModRef; + return MRI_ModRef; } Loc = MemoryLocation(); - return AliasAnalysis::ModRef; + return MRI_ModRef; } if (const VAArgInst *V = dyn_cast<VAArgInst>(Inst)) { Loc = MemoryLocation::get(V); - return AliasAnalysis::ModRef; + return MRI_ModRef; } - if (const CallInst *CI = isFreeCall(Inst, AA->getTargetLibraryInfo())) { + if (const CallInst *CI = isFreeCall(Inst, &TLI)) { // calls to free() deallocate the entire structure Loc = MemoryLocation(CI->getArgOperand(0)); - return AliasAnalysis::Mod; + return MRI_Mod; } if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -170,7 +179,7 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(), AAInfo); // These intrinsics don't really modify the memory, but returning Mod // will allow them to be handled conservatively. - return AliasAnalysis::Mod; + return MRI_Mod; case Intrinsic::invariant_end: II->getAAMetadata(AAInfo); Loc = MemoryLocation( @@ -178,7 +187,7 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(), AAInfo); // These intrinsics don't really modify the memory, but returning Mod // will allow them to be handled conservatively. - return AliasAnalysis::Mod; + return MRI_Mod; default: break; } @@ -186,10 +195,10 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { // Otherwise, just do the coarse-grained thing that always works. if (Inst->mayWriteToMemory()) - return AliasAnalysis::ModRef; + return MRI_ModRef; if (Inst->mayReadFromMemory()) - return AliasAnalysis::Ref; - return AliasAnalysis::NoModRef; + return MRI_Ref; + return MRI_NoModRef; } /// getCallSiteDependencyFrom - Private helper for finding the local @@ -207,14 +216,14 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, if (!Limit) return MemDepResult::getUnknown(); - Instruction *Inst = --ScanIt; + Instruction *Inst = &*--ScanIt; // If this inst is a memory op, get the pointer it accessed MemoryLocation Loc; - AliasAnalysis::ModRefResult MR = GetLocation(Inst, Loc, AA); + ModRefInfo MR = GetLocation(Inst, Loc, *TLI); if (Loc.Ptr) { // A simple instruction. - if (AA->getModRefInfo(CS, Loc) != AliasAnalysis::NoModRef) + if (AA->getModRefInfo(CS, Loc) != MRI_NoModRef) return MemDepResult::getClobber(Inst); continue; } @@ -224,10 +233,10 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, if (isa<DbgInfoIntrinsic>(Inst)) continue; // If these two calls do not interfere, look past it. 
switch (AA->getModRefInfo(CS, InstCS)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: // If the two calls are the same, return InstCS as a Def, so that // CS can be found redundant and eliminated. - if (isReadOnlyCall && !(MR & AliasAnalysis::Mod) && + if (isReadOnlyCall && !(MR & MRI_Mod) && CS.getInstruction()->isIdenticalToWhenDefined(Inst)) return MemDepResult::getDef(Inst); @@ -241,7 +250,7 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, // If we could not obtain a pointer for the instruction and the instruction // touches memory then assume that this is a dependency. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return MemDepResult::getClobber(Inst); } @@ -371,6 +380,75 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst) { + if (QueryInst != nullptr) { + if (auto *LI = dyn_cast<LoadInst>(QueryInst)) { + MemDepResult invariantGroupDependency = + getInvariantGroupPointerDependency(LI, BB); + + if (invariantGroupDependency.isDef()) + return invariantGroupDependency; + } + } + return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst); +} + +MemDepResult +MemoryDependenceAnalysis::getInvariantGroupPointerDependency(LoadInst *LI, + BasicBlock *BB) { + Value *LoadOperand = LI->getPointerOperand(); + // It's is not safe to walk the use list of global value, because function + // passes aren't allowed to look outside their functions. + if (isa<GlobalValue>(LoadOperand)) + return MemDepResult::getUnknown(); + + auto *InvariantGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group); + if (!InvariantGroupMD) + return MemDepResult::getUnknown(); + + MemDepResult Result = MemDepResult::getUnknown(); + llvm::SmallSet<Value *, 14> Seen; + // Queue to process all pointers that are equivalent to load operand. + llvm::SmallVector<Value *, 8> LoadOperandsQueue; + LoadOperandsQueue.push_back(LoadOperand); + while (!LoadOperandsQueue.empty()) { + Value *Ptr = LoadOperandsQueue.pop_back_val(); + if (isa<GlobalValue>(Ptr)) + continue; + + if (auto *BCI = dyn_cast<BitCastInst>(Ptr)) { + if (!Seen.count(BCI->getOperand(0))) { + LoadOperandsQueue.push_back(BCI->getOperand(0)); + Seen.insert(BCI->getOperand(0)); + } + } + + for (Use &Us : Ptr->uses()) { + auto *U = dyn_cast<Instruction>(Us.getUser()); + if (!U || U == LI || !DT->dominates(U, LI)) + continue; + + if (auto *BCI = dyn_cast<BitCastInst>(U)) { + if (!Seen.count(BCI)) { + LoadOperandsQueue.push_back(BCI); + Seen.insert(BCI); + } + continue; + } + // If we hit load/store with the same invariant.group metadata (and the + // same pointer operand) we can assume that value pointed by pointer + // operand didn't change. + if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB && + U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) + return MemDepResult::getDef(U); + } + } + return Result; +} + +MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom( + const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, + BasicBlock *BB, Instruction *QueryInst) { + const Value *MemLocBase = nullptr; int64_t MemLocOffset = 0; unsigned Limit = BlockScanLimit; @@ -399,7 +477,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // being 42. 
A key property of this program however is that if either // 1 or 4 were missing, there would be a race between the store of 42 // either the store of 0 or the load (making the whole progam racy). - // The paper mentionned above shows that the same property is respected + // The paper mentioned above shows that the same property is respected // by every program that can detect any optimisation of that kind: either // it is racy (undefined) or there is a release followed by an acquire // between the pair of accesses under consideration. @@ -416,9 +494,15 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( const DataLayout &DL = BB->getModule()->getDataLayout(); + // Create a numbered basic block to lazily compute and cache instruction + // positions inside a BB. This is used to provide fast queries for relative + // position between two instructions in a BB and can be used by + // AliasAnalysis::callCapturesBefore. + OrderedBasicBlock OBB(BB); + // Walk backwards through the basic block, looking for dependencies. while (ScanIt != BB->begin()) { - Instruction *Inst = --ScanIt; + Instruction *Inst = &*--ScanIt; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) // Debug intrinsics don't (and can't) cause dependencies. @@ -567,7 +651,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // If alias analysis can tell that this store is guaranteed to not modify // the query pointer, ignore it. Use getModRefInfo to handle cases where // the query pointer points to constant memory etc. - if (AA->getModRefInfo(SI, MemLoc) == AliasAnalysis::NoModRef) + if (AA->getModRefInfo(SI, MemLoc) == MRI_NoModRef) continue; // Ok, this store might clobber the query pointer. Check to see if it is @@ -594,7 +678,6 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // a subsequent bitcast of the malloc call result. There can be stores to // the malloced memory between the malloc call and its bitcast uses, and we // need to continue scanning until the malloc call. - const TargetLibraryInfo *TLI = AA->getTargetLibraryInfo(); if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) { const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, DL); @@ -602,13 +685,13 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( return MemDepResult::getDef(Inst); if (isInvariantLoad) continue; - // Be conservative if the accessed pointer may alias the allocation. - if (AA->alias(Inst, AccessPtr) != NoAlias) - return MemDepResult::getClobber(Inst); - // If the allocation is not aliased and does not read memory (like - // strdup), it is safe to ignore. - if (isa<AllocaInst>(Inst) || - isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI)) + // Be conservative if the accessed pointer may alias the allocation - + // fallback to the generic handling below. + if ((AA->alias(Inst, AccessPtr) == NoAlias) && + // If the allocation is not aliased and does not read memory (like + // strdup), it is safe to ignore. + (isa<AllocaInst>(Inst) || isMallocLikeFn(Inst, TLI) || + isCallocLikeFn(Inst, TLI))) continue; } @@ -616,17 +699,17 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( continue; // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. - AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); + ModRefInfo MR = AA->getModRefInfo(Inst, MemLoc); // If necessary, perform additional analysis. 
- if (MR == AliasAnalysis::ModRef) - MR = AA->callCapturesBefore(Inst, MemLoc, DT); + if (MR == MRI_ModRef) + MR = AA->callCapturesBefore(Inst, MemLoc, DT, &OBB); switch (MR) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: // If the call has no effect on the queried pointer, just ignore it. continue; - case AliasAnalysis::Mod: + case MRI_Mod: return MemDepResult::getClobber(Inst); - case AliasAnalysis::Ref: + case MRI_Ref: // If the call is known to never store to the pointer, and if this is a // load query, we can safely ignore it (scan past it). if (isLoad) @@ -677,20 +760,20 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { LocalCache = MemDepResult::getNonFuncLocal(); } else { MemoryLocation MemLoc; - AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA); + ModRefInfo MR = GetLocation(QueryInst, MemLoc, *TLI); if (MemLoc.Ptr) { // If we can do a pointer scan, make it happen. - bool isLoad = !(MR & AliasAnalysis::Mod); + bool isLoad = !(MR & MRI_Mod); if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst)) isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start; - LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos, - QueryParent, QueryInst); + LocalCache = getPointerDependencyFrom( + MemLoc, isLoad, ScanPos->getIterator(), QueryParent, QueryInst); } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) { CallSite QueryCS(QueryInst); bool isReadOnly = AA->onlyReadsMemory(QueryCS); - LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos, - QueryParent); + LocalCache = getCallSiteDependencyFrom( + QueryCS, isReadOnly, ScanPos->getIterator(), QueryParent); } else // Non-memory instruction. LocalCache = MemDepResult::getUnknown(); @@ -709,10 +792,8 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache, int Count = -1) { if (Count == -1) Count = Cache.size(); - if (Count == 0) return; - - for (unsigned i = 1; i != unsigned(Count); ++i) - assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!"); + assert(std::is_sorted(Cache.begin(), Cache.begin() + Count) && + "Cache isn't sorted!"); } #endif @@ -813,7 +894,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { BasicBlock::iterator ScanPos = DirtyBB->end(); if (ExistingResult) { if (Instruction *Inst = ExistingResult->getResult().getInst()) { - ScanPos = Inst; + ScanPos = Inst->getIterator(); // We're removing QueryInst's use of Inst. RemoveFromReverseMap(ReverseNonLocalDeps, Inst, QueryCS.getInstruction()); @@ -952,11 +1033,11 @@ MemDepResult MemoryDependenceAnalysis::GetNonLocalInfoForBlock( assert(ExistingResult->getResult().getInst()->getParent() == BB && "Instruction invalidated?"); ++NumCacheDirtyNonLocalPtr; - ScanPos = ExistingResult->getResult().getInst(); + ScanPos = ExistingResult->getResult().getInst()->getIterator(); // Eliminating the dirty entry from 'Cache', so update the reverse info. ValueIsLoadPair CacheKey(Loc.Ptr, isLoad); - RemoveFromReverseMap(ReverseNonLocalPtrDeps, ScanPos, CacheKey); + RemoveFromReverseMap(ReverseNonLocalPtrDeps, &*ScanPos, CacheKey); } else { ++NumUncacheNonLocalPtr; } @@ -1507,7 +1588,7 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) { // the entire block to get to this point. 
MemDepResult NewDirtyVal; if (!RemInst->isTerminator()) - NewDirtyVal = MemDepResult::getDirty(++BasicBlock::iterator(RemInst)); + NewDirtyVal = MemDepResult::getDirty(&*++RemInst->getIterator()); ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst); if (ReverseDepIt != ReverseLocalDeps.end()) { @@ -1614,7 +1695,6 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) { assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?"); - AA->deleteValue(RemInst); DEBUG(verifyRemoved(RemInst)); } /// verifyRemoved - Verify that the specified instruction does not occur diff --git a/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp deleted file mode 100644 index 322a9a8..0000000 --- a/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//===- NoAliasAnalysis.cpp - Minimal Alias Analysis Impl ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the default implementation of the Alias Analysis interface -// that simply returns "I don't know" for all queries. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -using namespace llvm; - -namespace { - /// NoAA - This class implements the -no-aa pass, which always returns "I - /// don't know" for alias queries. NoAA is unlike other alias analysis - /// implementations, in that it does not chain to a previous analysis. As - /// such it doesn't follow many of the rules that other alias analyses must. - /// - struct NoAA : public ImmutablePass, public AliasAnalysis { - static char ID; // Class identification, replacement for typeinfo - NoAA() : ImmutablePass(ID) { - initializeNoAAPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override {} - - bool doInitialization(Module &M) override { - // Note: NoAA does not call InitializeAliasAnalysis because it's - // special and does not support chaining. - DL = &M.getDataLayout(); - return true; - } - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - return MayAlias; - } - - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override { - return UnknownModRefBehavior; - } - ModRefBehavior getModRefBehavior(const Function *F) override { - return UnknownModRefBehavior; - } - - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - return false; - } - ModRefResult getArgModRefInfo(ImmutableCallSite CS, - unsigned ArgIdx) override { - return ModRef; - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override { - return ModRef; - } - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return ModRef; - } - - void deleteValue(Value *V) override {} - void addEscapingUse(Use &U) override {} - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - }; -} // End of anonymous namespace - -// Register this pass... -char NoAA::ID = 0; -INITIALIZE_AG_PASS(NoAA, AliasAnalysis, "no-aa", - "No Alias Analysis (always returns 'may' alias)", - true, true, true) - -ImmutablePass *llvm::createNoAAPass() { return new NoAA(); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp index 3893aab..25f660f 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp @@ -18,66 +18,46 @@ /// used. Naive LLVM IR transformations which would otherwise be /// behavior-preserving may break these assumptions. /// +/// TODO: Theoretically we could check for dependencies between objc_* calls +/// and FMRB_OnlyAccessesArgumentPointees calls or other well-behaved calls. +/// //===----------------------------------------------------------------------===// -#include "ObjCARC.h" -#include "ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/PassAnalysisSupport.h" #include "llvm/PassSupport.h" #define DEBUG_TYPE "objc-arc-aa" -namespace llvm { - class Function; - class Value; -} - using namespace llvm; using namespace llvm::objcarc; -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", - "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { - return new ObjCARCAliasAnalysis(); -} - -bool ObjCARCAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult ObjCARCAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { if (!EnableARCOpts) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // First, strip off no-ops, including ObjC-specific no-ops, and try making a // precise alias query. const Value *SA = GetRCIdentityRoot(LocA.Ptr); const Value *SB = GetRCIdentityRoot(LocB.Ptr); AliasResult Result = - AliasAnalysis::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), - MemoryLocation(SB, LocB.Size, LocB.AATags)); + AAResultBase::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), + MemoryLocation(SB, LocB.Size, LocB.AATags)); if (Result != MayAlias) return Result; // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. 
- const Value *UA = GetUnderlyingObjCPtr(SA, *DL); - const Value *UB = GetUnderlyingObjCPtr(SB, *DL); + const Value *UA = GetUnderlyingObjCPtr(SA, DL); + const Value *UB = GetUnderlyingObjCPtr(SB, DL); if (UA != SA || UB != SB) { - Result = AliasAnalysis::alias(MemoryLocation(UA), MemoryLocation(UB)); + Result = AAResultBase::alias(MemoryLocation(UA), MemoryLocation(UB)); // We can't use MustAlias or PartialAlias results here because // GetUnderlyingObjCPtr may return an offsetted pointer value. if (Result == NoAlias) @@ -89,55 +69,47 @@ AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, return MayAlias; } -bool ObjCARCAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +bool ObjCARCAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { if (!EnableARCOpts) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); // First, strip off no-ops, including ObjC-specific no-ops, and try making // a precise alias query. const Value *S = GetRCIdentityRoot(Loc.Ptr); - if (AliasAnalysis::pointsToConstantMemory( + if (AAResultBase::pointsToConstantMemory( MemoryLocation(S, Loc.Size, Loc.AATags), OrLocal)) return true; // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S, *DL); + const Value *U = GetUnderlyingObjCPtr(S, DL); if (U != S) - return AliasAnalysis::pointsToConstantMemory(MemoryLocation(U), OrLocal); + return AAResultBase::pointsToConstantMemory(MemoryLocation(U), OrLocal); // If that failed, fail. We don't need to chain here, since that's covered // by the earlier precise query. return false; } -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - // We have nothing to do. Just chain to the next AliasAnalysis. - return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { +FunctionModRefBehavior ObjCARCAAResult::getModRefBehavior(const Function *F) { if (!EnableARCOpts) - return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); switch (GetFunctionClass(F)) { case ARCInstKind::NoopCast: - return DoesNotAccessMemory; + return FMRB_DoesNotAccessMemory; default: break; } - return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); } -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +ModRefInfo ObjCARCAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { if (!EnableARCOpts) - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); switch (GetBasicARCInstKind(CS.getInstruction())) { case ARCInstKind::Retain: @@ -151,18 +123,48 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // These functions don't access any memory visible to the compiler. // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. 
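// [Illustrative aside, not part of this commit] alias() above is a two-phase
// refinement: query precisely on pointers stripped to their RC-identity
// roots first; only if that yields MayAlias, climb to the underlying objects
// and retry, keeping nothing but a NoAlias answer (the underlying object may
// be offset from the queried pointer, so Must/PartialAlias no longer hold).
// Schematic sketch with hypothetical strip/underlying/query callables:
enum AliasKind { NoAliasK, MayAliasK, PartialAliasK, MustAliasK };
template <typename Strip, typename Under, typename Query>
AliasKind twoPhaseAlias(const void *A, const void *B, Strip strip,
                        Under underlying, Query query) {
  AliasKind R = query(strip(A), strip(B)); // precise: size/metadata apply
  if (R != MayAliasK)
    return R;
  R = query(underlying(A), underlying(B)); // imprecise: offsets unknown
  return R == NoAliasK ? NoAliasK : MayAliasK; // trust only the safe answer
}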
- return NoModRef; + return MRI_NoModRef; default: break; } - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); +} + +ObjCARCAAResult ObjCARCAA::run(Function &F, AnalysisManager<Function> *AM) { + return ObjCARCAAResult(F.getParent()->getDataLayout(), + AM->getResult<TargetLibraryAnalysis>(F)); +} + +char ObjCARCAA::PassID; + +char ObjCARCAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCAAWrapperPass, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ObjCARCAAWrapperPass, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true) + +ImmutablePass *llvm::createObjCARCAAWrapperPass() { + return new ObjCARCAAWrapperPass(); +} + +ObjCARCAAWrapperPass::ObjCARCAAWrapperPass() : ImmutablePass(ID) { + initializeObjCARCAAWrapperPassPass(*PassRegistry::getPassRegistry()); } -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { - // TODO: Theoretically we could check for dependencies between objc_* calls - // and OnlyAccessesArgumentPointees calls or other well-behaved calls. - return AliasAnalysis::getModRefInfo(CS1, CS2); +bool ObjCARCAAWrapperPass::doInitialization(Module &M) { + Result.reset(new ObjCARCAAResult( + M.getDataLayout(), getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool ObjCARCAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void ObjCARCAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp new file mode 100644 index 0000000..e3e74aa --- /dev/null +++ b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp @@ -0,0 +1,28 @@ +//===- ObjCARCAnalysisUtils.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMObjCARCOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// \brief A handy option to enable/disable all ARC Optimizations. 
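// [Illustrative aside, not part of this commit] The option below uses
// cl::opt's external-storage form: the second template argument 'true' says
// the parsed value is written through cl::location() into an ordinary
// global, so hot paths can test a plain bool without pulling in any command
// line machinery. The same pattern with a hypothetical flag:
#include "llvm/Support/CommandLine.h"
static bool MyFeatureEnabled; // plain global; cheap to read anywhere
static llvm::cl::opt<bool, true> MyFeatureFlag(
    "enable-my-feature", llvm::cl::desc("toggle the hypothetical feature"),
    llvm::cl::location(MyFeatureEnabled), llvm::cl::init(true));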
+bool llvm::objcarc::EnableARCOpts; +static cl::opt<bool, true> +EnableARCOptimizations("enable-objc-arc-opts", + cl::desc("enable/disable all ARC Optimizations"), + cl::location(EnableARCOpts), + cl::init(true)); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp index afb873a..133b635 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp +++ b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp @@ -19,7 +19,9 @@ /// //===----------------------------------------------------------------------===// -#include "ObjCARC.h" +#include "llvm/Analysis/ObjCARCInstKind.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Intrinsics.h" using namespace llvm; @@ -91,7 +93,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { .Default(ARCInstKind::CallOrUser); // One argument. - const Argument *A0 = AI++; + const Argument *A0 = &*AI++; if (AI == AE) // Argument is a pointer. if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { @@ -129,7 +131,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { } // Two arguments, first is i8**. - const Argument *A1 = AI++; + const Argument *A1 = &*AI++; if (AI == AE) if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) diff --git a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp new file mode 100644 index 0000000..0f0016f --- /dev/null +++ b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp @@ -0,0 +1,85 @@ +//===- OrderedBasicBlock.cpp --------------------------------- -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the OrderedBasicBlock class. OrderedBasicBlock +// maintains an interface where clients can query if one instruction comes +// before another in a BasicBlock. Since BasicBlock currently lacks a reliable +// way to query relative position between instructions one can use +// OrderedBasicBlock to do such queries. OrderedBasicBlock is lazily built on a +// source BasicBlock and maintains an internal Instruction -> Position map. A +// OrderedBasicBlock instance should be discarded whenever the source +// BasicBlock changes. +// +// It's currently used by the CaptureTracker in order to find relative +// positions of a pair of instructions inside a BasicBlock. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/OrderedBasicBlock.h" +#include "llvm/IR/Instruction.h" +using namespace llvm; + +OrderedBasicBlock::OrderedBasicBlock(const BasicBlock *BasicB) + : NextInstPos(0), BB(BasicB) { + LastInstFound = BB->end(); +} + +/// \brief Given no cached results, find if \p A comes before \p B in \p BB. +/// Cache and number out instruction while walking \p BB. +bool OrderedBasicBlock::comesBefore(const Instruction *A, + const Instruction *B) { + const Instruction *Inst = nullptr; + assert(!(LastInstFound == BB->end() && NextInstPos != 0) && + "Instruction supposed to be in NumberedInsts"); + + // Start the search with the instruction found in the last lookup round. 
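// [Illustrative aside, not part of this commit] comesBefore() numbers
// instructions lazily: a query walks only until it meets A or B, and later
// queries resume from the last numbered position. The same scheme over a
// plain sequence, as a minimal sketch:
#include <cstddef>
#include <unordered_map>
#include <vector>
class LazyOrder {
  const std::vector<int> &Seq;
  std::unordered_map<int, std::size_t> Pos; // element -> position, so far
  std::size_t Next = 0;                     // first unnumbered index
public:
  explicit LazyOrder(const std::vector<int> &S) : Seq(S) {}
  bool comesBefore(int A, int B) { // A, B assumed distinct members of Seq
    auto IA = Pos.find(A), IB = Pos.find(B);
    if (IA != Pos.end() && IB != Pos.end())
      return IA->second < IB->second;
    if (IA != Pos.end())
      return true;  // A numbered already; B must lie further right
    if (IB != Pos.end())
      return false; // symmetric argument
    while (Next != Seq.size()) {
      int V = Seq[Next];
      Pos[V] = Next++; // extend the numbering by one element
      if (V == A) return true;
      if (V == B) return false;
    }
    return false; // unreachable when A and B really are members
  }
};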
+ auto II = BB->begin(); + auto IE = BB->end(); + if (LastInstFound != IE) + II = std::next(LastInstFound); + + // Number all instructions up to the point where we find 'A' or 'B'. + for (; II != IE; ++II) { + Inst = cast<Instruction>(II); + NumberedInsts[Inst] = NextInstPos++; + if (Inst == A || Inst == B) + break; + } + + assert(II != IE && "Instruction not found?"); + assert((Inst == A || Inst == B) && "Should find A or B"); + LastInstFound = II; + return Inst == A; +} + +/// \brief Find out whether \p A dominates \p B, meaning whether \p A +/// comes before \p B in \p BB. This is a simplification that considers +/// cached instruction positions and ignores other basic blocks, being +/// only relevant to compare relative instructions positions inside \p BB. +bool OrderedBasicBlock::dominates(const Instruction *A, const Instruction *B) { + assert(A->getParent() == B->getParent() && + "Instructions must be in the same basic block!"); + + // First we lookup the instructions. If they don't exist, lookup will give us + // back ::end(). If they both exist, we compare the numbers. Otherwise, if NA + // exists and NB doesn't, it means NA must come before NB because we would + // have numbered NB as well if it didn't. The same is true for NB. If it + // exists, but NA does not, NA must come after it. If neither exist, we need + // to number the block and cache the results (by calling comesBefore). + auto NAI = NumberedInsts.find(A); + auto NBI = NumberedInsts.find(B); + if (NAI != NumberedInsts.end() && NBI != NumberedInsts.end()) + return NAI->second < NBI->second; + if (NAI != NumberedInsts.end()) + return true; + if (NBI != NumberedInsts.end()) + return false; + + return comesBefore(A, B); +} diff --git a/contrib/llvm/lib/Analysis/RegionInfo.cpp b/contrib/llvm/lib/Analysis/RegionInfo.cpp index 8cd8534..f59d267 100644 --- a/contrib/llvm/lib/Analysis/RegionInfo.cpp +++ b/contrib/llvm/lib/Analysis/RegionInfo.cpp @@ -21,6 +21,9 @@ #include <algorithm> #include <iterator> #include <set> +#ifndef NDEBUG +#include "llvm/Analysis/RegionPrinter.h" +#endif using namespace llvm; @@ -103,6 +106,12 @@ void RegionInfo::recalculate(Function &F, DominatorTree *DT_, calculate(F); } +#ifndef NDEBUG +void RegionInfo::view() { viewRegion(this); } + +void RegionInfo::viewOnly() { viewRegionOnly(this); } +#endif + //===----------------------------------------------------------------------===// // RegionInfoPass implementation // diff --git a/contrib/llvm/lib/Analysis/RegionPrinter.cpp b/contrib/llvm/lib/Analysis/RegionPrinter.cpp index d7f5109..acb218d 100644 --- a/contrib/llvm/lib/Analysis/RegionPrinter.cpp +++ b/contrib/llvm/lib/Analysis/RegionPrinter.cpp @@ -20,6 +20,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#ifndef NDEBUG +#include "llvm/IR/LegacyPassManager.h" +#endif using namespace llvm; @@ -55,25 +58,22 @@ struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits { } }; -template<> -struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { +template <> +struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { DOTGraphTraits (bool isSimple = false) : DOTGraphTraits<RegionNode*>(isSimple) {} - static std::string getGraphName(RegionInfoPass *DT) { - return "Region Graph"; - } + static std::string getGraphName(const RegionInfo *) { return "Region Graph"; } - std::string getNodeLabel(RegionNode *Node, RegionInfoPass *G) { - RegionInfo &RI = G->getRegionInfo(); - return 
DOTGraphTraits<RegionNode*>::getNodeLabel(Node, - reinterpret_cast<RegionNode*>(RI.getTopLevelRegion())); + std::string getNodeLabel(RegionNode *Node, RegionInfo *G) { + return DOTGraphTraits<RegionNode *>::getNodeLabel( + Node, reinterpret_cast<RegionNode *>(G->getTopLevelRegion())); } std::string getEdgeAttributes(RegionNode *srcNode, - GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfoPass *G) { - RegionInfo &RI = G->getRegionInfo(); + GraphTraits<RegionInfo *>::ChildIteratorType CI, + RegionInfo *G) { RegionNode *destNode = *CI; if (srcNode->isSubRegion() || destNode->isSubRegion()) @@ -83,7 +83,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { BasicBlock *srcBB = srcNode->getNodeAs<BasicBlock>(); BasicBlock *destBB = destNode->getNodeAs<BasicBlock>(); - Region *R = RI.getRegionFor(destBB); + Region *R = G->getRegionFor(destBB); while (R && R->getParent()) if (R->getParent()->getEntry() == destBB) @@ -91,7 +91,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { else break; - if (R->getEntry() == destBB && R->contains(srcBB)) + if (R && R->getEntry() == destBB && R->contains(srcBB)) return "constraint=false"; return ""; @@ -99,8 +99,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { // Print the cluster of the subregions. This groups the single basic blocks // and adds a different background color for each group. - static void printRegionCluster(const Region &R, - GraphWriter<RegionInfoPass*> &GW, + static void printRegionCluster(const Region &R, GraphWriter<RegionInfo *> &GW, unsigned depth = 0) { raw_ostream &O = GW.getOStream(); O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(&R) @@ -132,50 +131,81 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { O.indent(2 * depth) << "}\n"; } - static void addCustomGraphFeatures(const RegionInfoPass* RIP, - GraphWriter<RegionInfoPass*> &GW) { - const RegionInfo &RI = RIP->getRegionInfo(); + static void addCustomGraphFeatures(const RegionInfo *G, + GraphWriter<RegionInfo *> &GW) { raw_ostream &O = GW.getOStream(); O << "\tcolorscheme = \"paired12\"\n"; - printRegionCluster(*RI.getTopLevelRegion(), GW, 4); + printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; } //end namespace llvm namespace { +struct RegionInfoPassGraphTraits { + static RegionInfo *getGraph(RegionInfoPass *RIP) { + return &RIP->getRegionInfo(); + } +}; + +struct RegionPrinter + : public DOTGraphTraitsPrinter<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits> { + static char ID; + RegionPrinter() + : DOTGraphTraitsPrinter<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { + initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); + } +}; +char RegionPrinter::ID = 0; + +struct RegionOnlyPrinter + : public DOTGraphTraitsPrinter<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits> { + static char ID; + RegionOnlyPrinter() + : DOTGraphTraitsPrinter<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { + initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); + } +}; +char RegionOnlyPrinter::ID = 0; + struct RegionViewer - : public DOTGraphTraitsViewer<RegionInfoPass, false> { + : public DOTGraphTraitsViewer<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits> { static char ID; - RegionViewer() : DOTGraphTraitsViewer<RegionInfoPass, false>("reg", ID){ + RegionViewer() + : DOTGraphTraitsViewer<RegionInfoPass, false, 
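// [Illustrative aside, not part of this commit] The printers and viewers
// here are re-keyed from the legacy pass to RegionInfo* itself, bridged by
// a one-function adapter (RegionInfoPassGraphTraits::getGraph). Reduced to
// a sketch with hypothetical Analysis/AnalysisPass types, the shape is:
struct Analysis { /* the graph we actually want to render */ };
struct AnalysisPass {
  Analysis Result;
  Analysis &getResult() { return Result; }
};
struct PassGraphAdapter {
  // The printer/viewer templates call this to unwrap the pass into the
  // graph type that all of the DOT traits are now written against.
  static Analysis *getGraph(AnalysisPass *P) { return &P->getResult(); }
};
template <typename PassT, typename GraphT, typename AdapterT>
void renderViaAdapter(PassT *P) {
  GraphT G = AdapterT::getGraph(P); // every traits lookup now sees GraphT
  (void)G;                          // a real printer would emit DOT here
}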
RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { initializeRegionViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionViewer::ID = 0; struct RegionOnlyViewer - : public DOTGraphTraitsViewer<RegionInfoPass, true> { + : public DOTGraphTraitsViewer<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits> { static char ID; - RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfoPass, true>("regonly", ID) { + RegionOnlyViewer() + : DOTGraphTraitsViewer<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits>("regonly", ID) { initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionOnlyViewer::ID = 0; -struct RegionPrinter - : public DOTGraphTraitsPrinter<RegionInfoPass, false> { - static char ID; - RegionPrinter() : - DOTGraphTraitsPrinter<RegionInfoPass, false>("reg", ID) { - initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); - } -}; -char RegionPrinter::ID = 0; } //end anonymous namespace INITIALIZE_PASS(RegionPrinter, "dot-regions", "Print regions of function to 'dot' file", true, true) +INITIALIZE_PASS( + RegionOnlyPrinter, "dot-regions-only", + "Print regions of function to 'dot' file (with no function bodies)", true, + true) + INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function", true, true) @@ -183,25 +213,12 @@ INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only", "View regions of function (with no function bodies)", true, true) -namespace { - -struct RegionOnlyPrinter - : public DOTGraphTraitsPrinter<RegionInfoPass, true> { - static char ID; - RegionOnlyPrinter() : - DOTGraphTraitsPrinter<RegionInfoPass, true>("reg", ID) { - initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); - } -}; +FunctionPass *llvm::createRegionPrinterPass() { return new RegionPrinter(); } +FunctionPass *llvm::createRegionOnlyPrinterPass() { + return new RegionOnlyPrinter(); } -char RegionOnlyPrinter::ID = 0; -INITIALIZE_PASS(RegionOnlyPrinter, "dot-regions-only", - "Print regions of function to 'dot' file " - "(with no function bodies)", - true, true) - FunctionPass* llvm::createRegionViewerPass() { return new RegionViewer(); } @@ -210,11 +227,41 @@ FunctionPass* llvm::createRegionOnlyViewerPass() { return new RegionOnlyViewer(); } -FunctionPass* llvm::createRegionPrinterPass() { - return new RegionPrinter(); +#ifndef NDEBUG +static void viewRegionInfo(RegionInfo *RI, bool ShortNames) { + assert(RI && "Argument must be non-null"); + + llvm::Function *F = RI->getTopLevelRegion()->getEntry()->getParent(); + std::string GraphName = DOTGraphTraits<RegionInfo *>::getGraphName(RI); + + llvm::ViewGraph(RI, "reg", ShortNames, + Twine(GraphName) + " for '" + F->getName() + "' function"); } -FunctionPass* llvm::createRegionOnlyPrinterPass() { - return new RegionOnlyPrinter(); +static void invokeFunctionPass(const Function *F, FunctionPass *ViewerPass) { + assert(F && "Argument must be non-null"); + assert(!F->isDeclaration() && "Function must have an implementation"); + + // The viewer and analysis passes do not modify anything, so we can safely + // remove the const qualifier + auto NonConstF = const_cast<Function *>(F); + + llvm::legacy::FunctionPassManager FPM(NonConstF->getParent()); + FPM.add(ViewerPass); + FPM.doInitialization(); + FPM.run(*NonConstF); + FPM.doFinalization(); } +void llvm::viewRegion(RegionInfo *RI) { viewRegionInfo(RI, false); } + +void llvm::viewRegion(const Function *F) { + invokeFunctionPass(F, createRegionViewerPass()); +} + +void llvm::viewRegionOnly(RegionInfo *RI) { viewRegionInfo(RI, true); 
} + +void llvm::viewRegionOnly(const Function *F) { + invokeFunctionPass(F, createRegionOnlyViewerPass()); +} +#endif diff --git a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp index 9c7c175..34074ef 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp @@ -83,11 +83,13 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SaveAndRestore.h" #include <algorithm> using namespace llvm; @@ -114,16 +116,6 @@ static cl::opt<bool> VerifySCEV("verify-scev", cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); -INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution", - "Scalar Evolution Analysis", false, true) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ScalarEvolution, "scalar-evolution", - "Scalar Evolution Analysis", false, true) -char ScalarEvolution::ID = 0; - //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -132,12 +124,11 @@ char ScalarEvolution::ID = 0; // Implementation of the SCEV class. // -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SCEV::dump() const { print(dbgs()); dbgs() << '\n'; } -#endif void SCEV::print(raw_ostream &OS) const { switch (static_cast<SCEVTypes>(getSCEVType())) { @@ -303,7 +294,7 @@ bool SCEV::isNonConstantNegative() const { if (!SC) return false; // Return true if the value is negative, this matches things like (-42 * V). - return SC->getValue()->getValue().isNegative(); + return SC->getAPInt().isNegative(); } SCEVCouldNotCompute::SCEVCouldNotCompute() : @@ -455,179 +446,179 @@ bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const { //===----------------------------------------------------------------------===// namespace { - /// SCEVComplexityCompare - Return true if the complexity of the LHS is less - /// than the complexity of the RHS. This comparator is used to canonicalize - /// expressions. - class SCEVComplexityCompare { - const LoopInfo *const LI; - public: - explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - - // Return true or false if LHS is less than, or at least RHS, respectively. - bool operator()(const SCEV *LHS, const SCEV *RHS) const { - return compare(LHS, RHS) < 0; - } - - // Return negative, zero, or positive, if LHS is less than, equal to, or - // greater than RHS, respectively. A three-way result allows recursive - // comparisons to be more efficient. - int compare(const SCEV *LHS, const SCEV *RHS) const { - // Fast-path: SCEVs are uniqued so we can do a quick equality check. - if (LHS == RHS) - return 0; - - // Primarily, sort the SCEVs by their getSCEVType(). 
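// [Illustrative aside, not part of this commit] A large share of the hunks
// in this file are one mechanical cleanup: SCEVConstant::getAPInt() replaces
// the two-step getValue()->getValue() chain, reaching the APInt directly
// instead of detouring through the ConstantInt. Sketched against the
// accessor these hunks switch to:
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
static const llvm::APInt &constVal(const llvm::SCEVConstant *SC) {
  return SC->getAPInt(); // previously spelled SC->getValue()->getValue()
}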
- unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); - if (LType != RType) - return (int)LType - (int)RType; - - // Aside from the getSCEVType() ordering, the particular ordering - // isn't very important except that it's beneficial to be consistent, - // so that (a + b) and (b + a) don't end up as different expressions. - switch (static_cast<SCEVTypes>(LType)) { - case scUnknown: { - const SCEVUnknown *LU = cast<SCEVUnknown>(LHS); - const SCEVUnknown *RU = cast<SCEVUnknown>(RHS); - - // Sort SCEVUnknown values with some loose heuristics. TODO: This is - // not as complete as it could be. - const Value *LV = LU->getValue(), *RV = RU->getValue(); - - // Order pointer values after integer values. This helps SCEVExpander - // form GEPs. - bool LIsPointer = LV->getType()->isPointerTy(), - RIsPointer = RV->getType()->isPointerTy(); - if (LIsPointer != RIsPointer) - return (int)LIsPointer - (int)RIsPointer; - - // Compare getValueID values. - unsigned LID = LV->getValueID(), - RID = RV->getValueID(); - if (LID != RID) - return (int)LID - (int)RID; - - // Sort arguments by their position. - if (const Argument *LA = dyn_cast<Argument>(LV)) { - const Argument *RA = cast<Argument>(RV); - unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); - return (int)LArgNo - (int)RArgNo; - } - - // For instructions, compare their loop depth, and their operand - // count. This is pretty loose. - if (const Instruction *LInst = dyn_cast<Instruction>(LV)) { - const Instruction *RInst = cast<Instruction>(RV); - - // Compare loop depths. - const BasicBlock *LParent = LInst->getParent(), - *RParent = RInst->getParent(); - if (LParent != RParent) { - unsigned LDepth = LI->getLoopDepth(LParent), - RDepth = LI->getLoopDepth(RParent); - if (LDepth != RDepth) - return (int)LDepth - (int)RDepth; - } - - // Compare the number of operands. - unsigned LNumOps = LInst->getNumOperands(), - RNumOps = RInst->getNumOperands(); - return (int)LNumOps - (int)RNumOps; - } +/// SCEVComplexityCompare - Return true if the complexity of the LHS is less +/// than the complexity of the RHS. This comparator is used to canonicalize +/// expressions. +class SCEVComplexityCompare { + const LoopInfo *const LI; +public: + explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - return 0; - } + // Return true or false if LHS is less than, or at least RHS, respectively. + bool operator()(const SCEV *LHS, const SCEV *RHS) const { + return compare(LHS, RHS) < 0; + } - case scConstant: { - const SCEVConstant *LC = cast<SCEVConstant>(LHS); - const SCEVConstant *RC = cast<SCEVConstant>(RHS); - - // Compare constant values. - const APInt &LA = LC->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); - unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); - if (LBitWidth != RBitWidth) - return (int)LBitWidth - (int)RBitWidth; - return LA.ult(RA) ? -1 : 1; + // Return negative, zero, or positive, if LHS is less than, equal to, or + // greater than RHS, respectively. A three-way result allows recursive + // comparisons to be more efficient. + int compare(const SCEV *LHS, const SCEV *RHS) const { + // Fast-path: SCEVs are uniqued so we can do a quick equality check. + if (LHS == RHS) + return 0; + + // Primarily, sort the SCEVs by their getSCEVType(). 
+ unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); + if (LType != RType) + return (int)LType - (int)RType; + + // Aside from the getSCEVType() ordering, the particular ordering + // isn't very important except that it's beneficial to be consistent, + // so that (a + b) and (b + a) don't end up as different expressions. + switch (static_cast<SCEVTypes>(LType)) { + case scUnknown: { + const SCEVUnknown *LU = cast<SCEVUnknown>(LHS); + const SCEVUnknown *RU = cast<SCEVUnknown>(RHS); + + // Sort SCEVUnknown values with some loose heuristics. TODO: This is + // not as complete as it could be. + const Value *LV = LU->getValue(), *RV = RU->getValue(); + + // Order pointer values after integer values. This helps SCEVExpander + // form GEPs. + bool LIsPointer = LV->getType()->isPointerTy(), + RIsPointer = RV->getType()->isPointerTy(); + if (LIsPointer != RIsPointer) + return (int)LIsPointer - (int)RIsPointer; + + // Compare getValueID values. + unsigned LID = LV->getValueID(), + RID = RV->getValueID(); + if (LID != RID) + return (int)LID - (int)RID; + + // Sort arguments by their position. + if (const Argument *LA = dyn_cast<Argument>(LV)) { + const Argument *RA = cast<Argument>(RV); + unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); + return (int)LArgNo - (int)RArgNo; } - case scAddRecExpr: { - const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); - const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); - - // Compare addrec loop depths. - const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); - if (LLoop != RLoop) { - unsigned LDepth = LLoop->getLoopDepth(), - RDepth = RLoop->getLoopDepth(); + // For instructions, compare their loop depth, and their operand + // count. This is pretty loose. + if (const Instruction *LInst = dyn_cast<Instruction>(LV)) { + const Instruction *RInst = cast<Instruction>(RV); + + // Compare loop depths. + const BasicBlock *LParent = LInst->getParent(), + *RParent = RInst->getParent(); + if (LParent != RParent) { + unsigned LDepth = LI->getLoopDepth(LParent), + RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } - // Addrec complexity grows with operand count. - unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; + // Compare the number of operands. + unsigned LNumOps = LInst->getNumOperands(), + RNumOps = RInst->getNumOperands(); + return (int)LNumOps - (int)RNumOps; + } - // Lexicographically compare. - for (unsigned i = 0; i != LNumOps; ++i) { - long X = compare(LA->getOperand(i), RA->getOperand(i)); - if (X != 0) - return X; - } + return 0; + } - return 0; + case scConstant: { + const SCEVConstant *LC = cast<SCEVConstant>(LHS); + const SCEVConstant *RC = cast<SCEVConstant>(RHS); + + // Compare constant values. + const APInt &LA = LC->getAPInt(); + const APInt &RA = RC->getAPInt(); + unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); + if (LBitWidth != RBitWidth) + return (int)LBitWidth - (int)RBitWidth; + return LA.ult(RA) ? -1 : 1; + } + + case scAddRecExpr: { + const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); + const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); + + // Compare addrec loop depths. 
+ const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); + if (LLoop != RLoop) { + unsigned LDepth = LLoop->getLoopDepth(), + RDepth = RLoop->getLoopDepth(); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; } - case scAddExpr: - case scMulExpr: - case scSMaxExpr: - case scUMaxExpr: { - const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); - const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); - - // Lexicographically compare n-ary expressions. - unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; - - for (unsigned i = 0; i != LNumOps; ++i) { - if (i >= RNumOps) - return 1; - long X = compare(LC->getOperand(i), RC->getOperand(i)); - if (X != 0) - return X; - } + // Addrec complexity grows with operand count. + unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); + if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; + + // Lexicographically compare. + for (unsigned i = 0; i != LNumOps; ++i) { + long X = compare(LA->getOperand(i), RA->getOperand(i)); + if (X != 0) + return X; } - case scUDivExpr: { - const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS); - const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS); + return 0; + } + + case scAddExpr: + case scMulExpr: + case scSMaxExpr: + case scUMaxExpr: { + const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); + const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); - // Lexicographically compare udiv expressions. - long X = compare(LC->getLHS(), RC->getLHS()); + // Lexicographically compare n-ary expressions. + unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); + if (LNumOps != RNumOps) + return (int)LNumOps - (int)RNumOps; + + for (unsigned i = 0; i != LNumOps; ++i) { + if (i >= RNumOps) + return 1; + long X = compare(LC->getOperand(i), RC->getOperand(i)); if (X != 0) return X; - return compare(LC->getRHS(), RC->getRHS()); } + return (int)LNumOps - (int)RNumOps; + } - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS); - const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS); + case scUDivExpr: { + const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS); + const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS); - // Compare cast expressions by operand. - return compare(LC->getOperand(), RC->getOperand()); - } + // Lexicographically compare udiv expressions. + long X = compare(LC->getLHS(), RC->getLHS()); + if (X != 0) + return X; + return compare(LC->getRHS(), RC->getRHS()); + } - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - } - llvm_unreachable("Unknown SCEV kind!"); + case scTruncate: + case scZeroExtend: + case scSignExtend: { + const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS); + const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS); + + // Compare cast expressions by operand. + return compare(LC->getOperand(), RC->getOperand()); } - }; -} + + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + llvm_unreachable("Unknown SCEV kind!"); + } +}; +} // end anonymous namespace /// GroupByComplexity - Given a list of SCEV objects, order them by their /// complexity, and group objects of the same complexity together by value. @@ -675,24 +666,22 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops, } } -namespace { -struct FindSCEVSize { - int Size; - FindSCEVSize() : Size(0) {} - - bool follow(const SCEV *S) { - ++Size; - // Keep looking at all operands of S. 
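// [Illustrative aside, not part of this commit] SCEVComplexityCompare above
// derives its sorting predicate from a three-way compare() so recursive
// operand comparisons can short-circuit on the first difference. The
// skeleton of that idiom over a hypothetical uniqued tree node:
#include <cstddef>
#include <vector>
struct Node {
  int Kind;
  std::vector<const Node *> Ops;
};
struct NodeCompare {
  int compare(const Node *L, const Node *R) const {
    if (L == R)
      return 0; // uniqued nodes: pointer equality settles it immediately
    if (L->Kind != R->Kind)
      return L->Kind - R->Kind; // primary key, mirroring getSCEVType()
    if (L->Ops.size() != R->Ops.size())
      return int(L->Ops.size()) - int(R->Ops.size());
    for (std::size_t I = 0; I != L->Ops.size(); ++I)
      if (int X = compare(L->Ops[I], R->Ops[I]))
        return X; // lexicographic over operands
    return 0;
  }
  bool operator()(const Node *L, const Node *R) const {
    return compare(L, R) < 0; // adapt three-way result to a strict ordering
  }
};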
- return true; - } - bool isDone() const { - return false; - } -}; -} - // Returns the size of the SCEV S. static inline int sizeOfSCEV(const SCEV *S) { + struct FindSCEVSize { + int Size; + FindSCEVSize() : Size(0) {} + + bool follow(const SCEV *S) { + ++Size; + // Keep looking at all operands of S. + return true; + } + bool isDone() const { + return false; + } + }; + FindSCEVSize F; SCEVTraversal<FindSCEVSize> ST(F); ST.visitAll(S); @@ -771,8 +760,8 @@ public: void visitConstant(const SCEVConstant *Numerator) { if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) { - APInt NumeratorVal = Numerator->getValue()->getValue(); - APInt DenominatorVal = D->getValue()->getValue(); + APInt NumeratorVal = Numerator->getAPInt(); + APInt DenominatorVal = D->getAPInt(); uint32_t NumeratorBW = NumeratorVal.getBitWidth(); uint32_t DenominatorBW = DenominatorVal.getBitWidth(); @@ -792,17 +781,15 @@ public: void visitAddRecExpr(const SCEVAddRecExpr *Numerator) { const SCEV *StartQ, *StartR, *StepQ, *StepR; - assert(Numerator->isAffine() && "Numerator should be affine"); + if (!Numerator->isAffine()) + return cannotDivide(Numerator); divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR); divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR); // Bail out if the types do not match. Type *Ty = Denominator->getType(); if (Ty != StartQ->getType() || Ty != StartR->getType() || - Ty != StepQ->getType() || Ty != StepR->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + Ty != StepQ->getType() || Ty != StepR->getType()) + return cannotDivide(Numerator); Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(), Numerator->getNoWrapFlags()); Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(), @@ -818,11 +805,8 @@ public: divide(SE, Op, Denominator, &Q, &R); // Bail out if types do not match. - if (Ty != Q->getType() || Ty != R->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Q->getType() || Ty != R->getType()) + return cannotDivide(Numerator); Qs.push_back(Q); Rs.push_back(R); @@ -845,11 +829,8 @@ public: bool FoundDenominatorTerm = false; for (const SCEV *Op : Numerator->operands()) { // Bail out if types do not match. - if (Ty != Op->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Op->getType()) + return cannotDivide(Numerator); if (FoundDenominatorTerm) { Qs.push_back(Op); @@ -865,11 +846,8 @@ public: } // Bail out if types do not match. - if (Ty != Q->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Q->getType()) + return cannotDivide(Numerator); FoundDenominatorTerm = true; Qs.push_back(Q); @@ -884,11 +862,8 @@ public: return; } - if (!isa<SCEVUnknown>(Denominator)) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (!isa<SCEVUnknown>(Denominator)) + return cannotDivide(Numerator); // The Remainder is obtained by replacing Denominator by 0 in Numerator. ValueToValueMap RewriteMap; @@ -908,15 +883,12 @@ public: // Quotient is (Numerator - Remainder) divided by Denominator. const SCEV *Q, *R; const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder); - if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) { - // This SCEV does not seem to simplify: fail the division here. - Quotient = Zero; - Remainder = Numerator; - return; - } + // This SCEV does not seem to simplify: fail the division here. 
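// [Illustrative aside, not part of this commit] The repeated
// "return cannotDivide(Numerator);" bail-outs in these hunks lean on a small
// C++ rule: 'return f();' is valid when both caller and callee return void,
// so setting the failure state and leaving collapses to one statement.
// A minimal sketch:
struct IntDivider {
  int Quotient = 0, Remainder = 0;
  void cannotDivide(int Numerator) { // centralised "give up" result
    Quotient = 0;
    Remainder = Numerator;
  }
  void divide(int Num, int Den) {
    if (Den == 0)
      return cannotDivide(Num); // void return-forwarding, as in the patch
    Quotient = Num / Den;
    Remainder = Num % Den;
  }
};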
+ if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) + return cannotDivide(Numerator); divide(SE, Diff, Denominator, &Q, &R); - assert(R == Zero && - "(Numerator - Remainder) should evenly divide Denominator"); + if (R != Zero) + return cannotDivide(Numerator); Quotient = Q; } @@ -924,11 +896,18 @@ private: SCEVDivision(ScalarEvolution &S, const SCEV *Numerator, const SCEV *Denominator) : SE(S), Denominator(Denominator) { - Zero = SE.getConstant(Denominator->getType(), 0); - One = SE.getConstant(Denominator->getType(), 1); + Zero = SE.getZero(Denominator->getType()); + One = SE.getOne(Denominator->getType()); + + // We generally do not know how to divide Expr by Denominator. We + // initialize the division to a "cannot divide" state to simplify the rest + // of the code. + cannotDivide(Numerator); + } - // By default, we don't know how to divide Expr by Denominator. - // Providing the default here simplifies the rest of the code. + // Convenience function for giving up on the division. We set the quotient to + // be equal to zero and the remainder to be equal to the numerator. + void cannotDivide(const SCEV *Numerator) { Quotient = Zero; Remainder = Numerator; } @@ -1151,8 +1130,8 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, // If the input value is a chrec scev, truncate the chrec's operands. if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) - Operands.push_back(getTruncateExpr(AddRec->getOperand(i), Ty)); + for (const SCEV *Op : AddRec->operands()) + Operands.push_back(getTruncateExpr(Op, Ty)); return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } @@ -1287,7 +1266,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, // `Step`: // 1. NSW/NUW flags on the step increment. - const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags()); + auto PreStartFlags = + ScalarEvolution::maskFlags(SA->getNoWrapFlags(), SCEV::FlagNUW); + const SCEV *PreStart = SE->getAddExpr(DiffOps, PreStartFlags); const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>( SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap)); @@ -1322,9 +1303,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, ExtendOpTraits<ExtendOpTy>::getOverflowLimitForStep(Step, &Pred, SE); if (OverflowLimit && - SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) { + SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) return PreStart; - } + return nullptr; } @@ -1390,24 +1371,22 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, if (!StartC) return false; - APInt StartAI = StartC->getValue()->getValue(); + APInt StartAI = StartC->getAPInt(); for (unsigned Delta : {-2, -1, 1, 2}) { const SCEV *PreStart = getConstant(StartAI - Delta); + FoldingSetNodeID ID; + ID.AddInteger(scAddRecExpr); + ID.AddPointer(PreStart); + ID.AddPointer(Step); + ID.AddPointer(L); + void *IP = nullptr; + const auto *PreAR = + static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + // Give up if we don't already have the add recurrence we need because // actually constructing an add recurrence is relatively expensive. 
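// [Illustrative aside, not part of this commit] The lookup above profiles
// the would-be add recurrence into a FoldingSetNodeID and calls
// FindNodeOrInsertPos purely as a find, discarding the insert hint, because
// actually constructing the node on a miss is the expensive part. The
// general shape of a find-only FoldingSet query:
#include "llvm/ADT/FoldingSet.h"
struct RecNode : llvm::FoldingSetNode {
  unsigned Kind;
  void *Op;
  RecNode(unsigned K, void *O) : Kind(K), Op(O) {}
  void Profile(llvm::FoldingSetNodeID &ID) const {
    ID.AddInteger(Kind); // uniquing fields, in a fixed order
    ID.AddPointer(Op);
  }
};
static bool alreadyBuilt(llvm::FoldingSet<RecNode> &Set, unsigned Kind,
                         void *Op) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(Kind); // must mirror Profile() exactly
  ID.AddPointer(Op);
  void *IP = nullptr; // insert position: requested but deliberately unused
  return Set.FindNodeOrInsertPos(ID, IP) != nullptr;
}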
- const SCEVAddRecExpr *PreAR = [&]() { - FoldingSetNodeID ID; - ID.AddInteger(scAddRecExpr); - ID.AddPointer(PreStart); - ID.AddPointer(Step); - ID.AddPointer(L); - void *IP = nullptr; - return static_cast<SCEVAddRecExpr *>( - this->UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); - }(); - if (PreAR && PreAR->getNoWrapFlags(WrapType)) { // proves (2) const SCEV *DeltaS = getConstant(StartC->getType(), Delta); ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; @@ -1578,6 +1557,18 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, } } + if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) { + // zext((A + B + ...)<nuw>) --> (zext(A) + zext(B) + ...)<nuw> + if (SA->getNoWrapFlags(SCEV::FlagNUW)) { + // If the addition does not unsign overflow then we can, by definition, + // commute the zero extension with the addition operation. + SmallVector<const SCEV *, 4> Ops; + for (const auto *Op : SA->operands()) + Ops.push_back(getZeroExtendExpr(Op, Ty)); + return getAddExpr(Ops, SCEV::FlagNUW); + } + } + // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; @@ -1635,14 +1626,14 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2 - if (auto SA = dyn_cast<SCEVAddExpr>(Op)) { + if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) { if (SA->getNumOperands() == 2) { - auto SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0)); - auto SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1)); + auto *SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0)); + auto *SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1)); if (SMul && SC1) { - if (auto SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) { - const APInt &C1 = SC1->getValue()->getValue(); - const APInt &C2 = SC2->getValue()->getValue(); + if (auto *SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) { + const APInt &C1 = SC1->getAPInt(); + const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) return getAddExpr(getSignExtendExpr(SC1, Ty), @@ -1650,6 +1641,16 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } } } + + // sext((A + B + ...)<nsw>) --> (sext(A) + sext(B) + ...)<nsw> + if (SA->getNoWrapFlags(SCEV::FlagNSW)) { + // If the addition does not sign overflow then we can, by definition, + // commute the sign extension with the addition operation. 
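// [Illustrative aside, not part of this commit] Both this sext rule and the
// zext rule above rest on the same arithmetic fact: if the narrow addition
// cannot wrap, extending first and adding wide computes the same value.
// Checked concretely for the unsigned case in i8 -> i16:
#include <cassert>
#include <cstdint>
int main() {
  uint8_t A = 200, B = 40; // 240 fits in i8: the add is <nuw>
  assert(uint16_t(uint8_t(A + B)) == uint16_t(A) + uint16_t(B));
  uint8_t C = 200, D = 100; // 300 wraps to 44: distribution would be wrong
  assert(uint16_t(uint8_t(C + D)) != uint16_t(C) + uint16_t(D));
}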
+ SmallVector<const SCEV *, 4> Ops; + for (const auto *Op : SA->operands()) + Ops.push_back(getSignExtendExpr(Op, Ty)); + return getAddExpr(Ops, SCEV::FlagNSW); + } } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can sign extend all of the @@ -1754,16 +1755,16 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // If Start and Step are constants, check if we can apply this // transformation: // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2 - auto SC1 = dyn_cast<SCEVConstant>(Start); - auto SC2 = dyn_cast<SCEVConstant>(Step); + auto *SC1 = dyn_cast<SCEVConstant>(Start); + auto *SC2 = dyn_cast<SCEVConstant>(Step); if (SC1 && SC2) { - const APInt &C1 = SC1->getValue()->getValue(); - const APInt &C2 = SC2->getValue()->getValue(); + const APInt &C1 = SC1->getAPInt(); + const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) { Start = getSignExtendExpr(Start, Ty); - const SCEV *NewAR = getAddRecExpr(getConstant(AR->getType(), 0), Step, - L, AR->getNoWrapFlags()); + const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L, + AR->getNoWrapFlags()); return getAddExpr(Start, getSignExtendExpr(NewAR, Ty)); } } @@ -1798,7 +1799,7 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, // Sign-extend negative constants. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) - if (SC->getValue()->getValue().isNegative()) + if (SC->getAPInt().isNegative()) return getSignExtendExpr(Op, Ty); // Peel off a truncate cast. @@ -1876,7 +1877,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, // Pull a buried constant out to the outside. if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) Interesting = true; - AccumulatedConstant += Scale * C->getValue()->getValue(); + AccumulatedConstant += Scale * C->getAPInt(); } // Next comes everything else. We're especially interested in multiplies @@ -1885,7 +1886,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[i]); if (Mul && isa<SCEVConstant>(Mul->getOperand(0))) { APInt NewScale = - Scale * cast<SCEVConstant>(Mul->getOperand(0))->getValue()->getValue(); + Scale * cast<SCEVConstant>(Mul->getOperand(0))->getAPInt(); if (Mul->getNumOperands() == 2 && isa<SCEVAddExpr>(Mul->getOperand(1))) { // A multiplication of a constant with another add; recurse. const SCEVAddExpr *Add = cast<SCEVAddExpr>(Mul->getOperand(1)); @@ -1898,8 +1899,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, // the map. SmallVector<const SCEV *, 4> MulOps(Mul->op_begin()+1, Mul->op_end()); const SCEV *Key = SE.getMulExpr(MulOps); - std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair = - M.insert(std::make_pair(Key, NewScale)); + auto Pair = M.insert(std::make_pair(Key, NewScale)); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { @@ -1927,22 +1927,15 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, return Interesting; } -namespace { - struct APIntCompare { - bool operator()(const APInt &LHS, const APInt &RHS) const { - return LHS.ult(RHS); - } - }; -} - // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. 
static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, const SmallVectorImpl<const SCEV *> &Ops, - SCEV::NoWrapFlags OldFlags) { + SCEV::NoWrapFlags Flags) { using namespace std::placeholders; + typedef OverflowingBinaryOperator OBO; bool CanAnalyze = Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr; @@ -1951,18 +1944,42 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW; SCEV::NoWrapFlags SignOrUnsignWrap = - ScalarEvolution::maskFlags(OldFlags, SignOrUnsignMask); + ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); // If FlagNSW is true and all the operands are non-negative, infer FlagNUW. - auto IsKnownNonNegative = - std::bind(std::mem_fn(&ScalarEvolution::isKnownNonNegative), SE, _1); + auto IsKnownNonNegative = [&](const SCEV *S) { + return SE->isKnownNonNegative(S); + }; + + if (SignOrUnsignWrap == SCEV::FlagNSW && all_of(Ops, IsKnownNonNegative)) + Flags = + ScalarEvolution::setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask); - if (SignOrUnsignWrap == SCEV::FlagNSW && - std::all_of(Ops.begin(), Ops.end(), IsKnownNonNegative)) - return ScalarEvolution::setFlags(OldFlags, - (SCEV::NoWrapFlags)SignOrUnsignMask); + SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); + + if (SignOrUnsignWrap != SignOrUnsignMask && Type == scAddExpr && + Ops.size() == 2 && isa<SCEVConstant>(Ops[0])) { + + // (A + C) --> (A + C)<nsw> if the addition does not sign overflow + // (A + C) --> (A + C)<nuw> if the addition does not unsign overflow + + const APInt &C = cast<SCEVConstant>(Ops[0])->getAPInt(); + if (!(SignOrUnsignWrap & SCEV::FlagNSW)) { + auto NSWRegion = + ConstantRange::makeNoWrapRegion(Instruction::Add, C, OBO::NoSignedWrap); + if (NSWRegion.contains(SE->getSignedRange(Ops[1]))) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); + } + if (!(SignOrUnsignWrap & SCEV::FlagNUW)) { + auto NUWRegion = + ConstantRange::makeNoWrapRegion(Instruction::Add, C, + OBO::NoUnsignedWrap); + if (NUWRegion.contains(SE->getUnsignedRange(Ops[1]))) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); + } + } - return OldFlags; + return Flags; } /// getAddExpr - Get a canonical add expression, or something simpler if @@ -1980,10 +1997,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, "SCEVAddExpr operand types don't match!"); #endif - Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); - // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); + + Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); // If there are any constants, fold them together. unsigned Idx = 0; @@ -1992,8 +2009,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! 
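// [Illustrative aside, not part of this commit] The makeNoWrapRegion()
// checks above encode simple interval arithmetic: for (x + C) with C > 0,
// signed overflow is impossible exactly when x <= SINT_MAX - C, i.e. the
// signed range of the variable operand fits the shifted interval. A scalar
// sketch in i8 terms, with a hypothetical known range [Lo, Hi]:
#include <cassert>
static bool addIsNswI8(int Lo, int Hi, int C) {
  (void)Lo;             // for C > 0 only the upper end can overflow
  return Hi <= 127 - C; // x + C <= 127 must hold across the whole range
}
int main() {
  assert(addIsNswI8(-128, 117, 10));  // whole range safe: set FlagNSW
  assert(!addIsNswI8(-128, 118, 10)); // 118 + 10 == 128 would wrap
}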
- Ops[0] = getConstant(LHSC->getValue()->getValue() + - RHSC->getValue()->getValue()); + Ops[0] = getConstant(LHSC->getAPInt() + RHSC->getAPInt()); if (Ops.size() == 2) return Ops[0]; Ops.erase(Ops.begin()+1); // Erase the folded element LHSC = cast<SCEVConstant>(Ops[0]); @@ -2063,8 +2079,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, break; } LargeMulOps.push_back(T->getOperand()); - } else if (const SCEVConstant *C = - dyn_cast<SCEVConstant>(M->getOperand(j))) { + } else if (const auto *C = dyn_cast<SCEVConstant>(M->getOperand(j))) { LargeMulOps.push_back(getAnyExtendExpr(C, SrcType)); } else { Ok = false; @@ -2123,24 +2138,28 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops.data(), Ops.size(), APInt(BitWidth, 1), *this)) { + struct APIntCompare { + bool operator()(const APInt &LHS, const APInt &RHS) const { + return LHS.ult(RHS); + } + }; + // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists; - for (SmallVectorImpl<const SCEV *>::const_iterator I = NewOps.begin(), - E = NewOps.end(); I != E; ++I) - MulOpLists[M.find(*I)->second].push_back(*I); + for (const SCEV *NewOp : NewOps) + MulOpLists[M.find(NewOp)->second].push_back(NewOp); // Re-generate the operands list. Ops.clear(); if (AccumulatedConstant != 0) Ops.push_back(getConstant(AccumulatedConstant)); - for (std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare>::iterator - I = MulOpLists.begin(), E = MulOpLists.end(); I != E; ++I) - if (I->first != 0) - Ops.push_back(getMulExpr(getConstant(I->first), - getAddExpr(I->second))); + for (auto &MulOp : MulOpLists) + if (MulOp.first != 0) + Ops.push_back(getMulExpr(getConstant(MulOp.first), + getAddExpr(MulOp.second))); if (Ops.empty()) - return getConstant(Ty, 0); + return getZero(Ty); if (Ops.size() == 1) return Ops[0]; return getAddExpr(Ops); @@ -2168,7 +2187,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul = getMulExpr(MulOps); } - const SCEV *One = getConstant(Ty, 1); + const SCEV *One = getOne(Ty); const SCEV *AddOne = getAddExpr(One, InnerMul); const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV); if (Ops.size() == 2) return OuterMul; @@ -2279,8 +2298,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, AddRec->op_end()); for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); ++OtherIdx) - if (const SCEVAddRecExpr *OtherAddRec = - dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) + if (const auto *OtherAddRec = dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) if (OtherAddRec->getLoop() == AddRecLoop) { for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { @@ -2388,10 +2406,10 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, "SCEVMulExpr operand types don't match!"); #endif - Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); - // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); + + Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); // If there are any constants, fold them together. 
unsigned Idx = 0; @@ -2410,9 +2428,8 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, ++Idx; while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - LHSC->getValue()->getValue() * - RHSC->getValue()->getValue()); + ConstantInt *Fold = + ConstantInt::get(getContext(), LHSC->getAPInt() * RHSC->getAPInt()); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -2433,23 +2450,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1])) { SmallVector<const SCEV *, 4> NewOps; bool AnyFolded = false; - for (SCEVAddRecExpr::op_iterator I = Add->op_begin(), - E = Add->op_end(); I != E; ++I) { - const SCEV *Mul = getMulExpr(Ops[0], *I); + for (const SCEV *AddOp : Add->operands()) { + const SCEV *Mul = getMulExpr(Ops[0], AddOp); if (!isa<SCEVMulExpr>(Mul)) AnyFolded = true; NewOps.push_back(Mul); } if (AnyFolded) return getAddExpr(NewOps); - } - else if (const SCEVAddRecExpr * - AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) { + } else if (const auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) { // Negation preserves a recurrence's no self-wrap property. SmallVector<const SCEV *, 4> Operands; - for (SCEVAddRecExpr::op_iterator I = AddRec->op_begin(), - E = AddRec->op_end(); I != E; ++I) { - Operands.push_back(getMulExpr(Ops[0], *I)); - } + for (const SCEV *AddRecOp : AddRec->operands()) + Operands.push_back(getMulExpr(Ops[0], AddRecOp)); + return getAddRecExpr(Operands, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); } @@ -2560,7 +2573,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, SmallVector<const SCEV*, 7> AddRecOps; for (int x = 0, xe = AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { - const SCEV *Term = getConstant(Ty, 0); + const SCEV *Term = getZero(Ty); for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), @@ -2638,11 +2651,11 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // its operands. // TODO: Generalize this to non-constants by using known-bits information. Type *Ty = LHS->getType(); - unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros(); + unsigned LZ = RHSC->getAPInt().countLeadingZeros(); unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1; // For non-power-of-two values, effectively round the value up to the // nearest power of two. - if (!RHSC->getValue()->getValue().isPowerOf2()) + if (!RHSC->getAPInt().isPowerOf2()) ++MaxShiftAmt; IntegerType *ExtTy = IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt); @@ -2650,18 +2663,17 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, if (const SCEVConstant *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this))) { // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded. 
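// [Illustrative aside, not part of this commit] A concrete instance of the
// {X,+,N}/C --> {X/C,+,N/C} fold above: {0,+,8} (0, 8, 16, ...) divided by 4
// is exactly {0,+,2}, because the step is divisible by the divisor; whereas
// {0,+,6} divided by 4 is not a recurrence at all (0, 1, 3, 4, ...).
// Checked in plain integers:
#include <cassert>
int main() {
  for (int I = 0, V = 0, W = 0; I < 8; ++I, V += 8, W += 2)
    assert(V / 4 == W);     // step 8 % 4 == 0: the fold is value-exact
  assert((2 * 6) / 4 == 3); // step 6: iteration 2 gives 3, not {0,+,1}'s 2
}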
- const APInt &StepInt = Step->getValue()->getValue(); - const APInt &DivInt = RHSC->getValue()->getValue(); + const APInt &StepInt = Step->getAPInt(); + const APInt &DivInt = RHSC->getAPInt(); if (!StepInt.urem(DivInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = AR->getNumOperands(); i != e; ++i) - Operands.push_back(getUDivExpr(AR->getOperand(i), RHS)); - return getAddRecExpr(Operands, AR->getLoop(), - SCEV::FlagNW); + for (const SCEV *Op : AR->operands()) + Operands.push_back(getUDivExpr(Op, RHS)); + return getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagNW); } /// Get a canonical UDivExpr for a recurrence. /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0. @@ -2672,7 +2684,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { - const APInt &StartInt = StartC->getValue()->getValue(); + const APInt &StartInt = StartC->getAPInt(); const APInt &StartRem = StartInt.urem(StepInt); if (StartRem != 0) LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step, @@ -2682,8 +2694,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // (A*B)/C --> A*(B/C) if safe and B/C can be folded. if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) - Operands.push_back(getZeroExtendExpr(M->getOperand(i), ExtTy)); + for (const SCEV *Op : M->operands()) + Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands)) // Find an operand that's safely divisible. for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { @@ -2700,8 +2712,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded. if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) - Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy)); + for (const SCEV *Op : A->operands()) + Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) { Operands.clear(); for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) { @@ -2739,8 +2751,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { - APInt A = C1->getValue()->getValue().abs(); - APInt B = C2->getValue()->getValue().abs(); + APInt A = C1->getAPInt().abs(); + APInt B = C2->getAPInt().abs(); uint32_t ABW = A.getBitWidth(); uint32_t BBW = B.getBitWidth(); @@ -2769,8 +2781,7 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, if (const SCEVConstant *RHSCst = dyn_cast<SCEVConstant>(RHS)) { // If the mulexpr multiplies by a constant, then that constant must be the // first element of the mulexpr. - if (const SCEVConstant *LHSCst = - dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + if (const auto *LHSCst = dyn_cast<SCEVConstant>(Mul->getOperand(0))) { if (LHSCst == RHSCst) { SmallVector<const SCEV *, 2> Operands; Operands.append(Mul->op_begin() + 1, Mul->op_end()); @@ -2782,10 +2793,10 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, // check. 
APInt Factor = gcd(LHSCst, RHSCst); if (!Factor.isIntN(1)) { - LHSCst = cast<SCEVConstant>( - getConstant(LHSCst->getValue()->getValue().udiv(Factor))); - RHSCst = cast<SCEVConstant>( - getConstant(RHSCst->getValue()->getValue().udiv(Factor))); + LHSCst = + cast<SCEVConstant>(getConstant(LHSCst->getAPInt().udiv(Factor))); + RHSCst = + cast<SCEVConstant>(getConstant(RHSCst->getAPInt().udiv(Factor))); SmallVector<const SCEV *, 2> Operands; Operands.push_back(LHSCst); Operands.append(Mul->op_begin() + 1, Mul->op_end()); @@ -2859,22 +2870,19 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands, // Canonicalize nested AddRecs by nesting them in order of loop depth. if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) { const Loop *NestedLoop = NestedAR->getLoop(); - if (L->contains(NestedLoop) ? - (L->getLoopDepth() < NestedLoop->getLoopDepth()) : - (!NestedLoop->contains(L) && - DT->dominates(L->getHeader(), NestedLoop->getHeader()))) { + if (L->contains(NestedLoop) + ? (L->getLoopDepth() < NestedLoop->getLoopDepth()) + : (!NestedLoop->contains(L) && + DT.dominates(L->getHeader(), NestedLoop->getHeader()))) { SmallVector<const SCEV *, 4> NestedOperands(NestedAR->op_begin(), NestedAR->op_end()); Operands[0] = NestedAR->getStart(); // AddRecs require their operands be loop-invariant with respect to their // loops. Don't perform this transformation if it would break this // requirement. - bool AllInvariant = true; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - if (!isLoopInvariant(Operands[i], L)) { - AllInvariant = false; - break; - } + bool AllInvariant = all_of( + Operands, [&](const SCEV *Op) { return isLoopInvariant(Op, L); }); + if (AllInvariant) { // Create a recurrence for the outer loop with the same step size. // @@ -2884,12 +2892,10 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands, maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags()); NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags); - AllInvariant = true; - for (unsigned i = 0, e = NestedOperands.size(); i != e; ++i) - if (!isLoopInvariant(NestedOperands[i], NestedLoop)) { - AllInvariant = false; - break; - } + AllInvariant = all_of(NestedOperands, [&](const SCEV *Op) { + return isLoopInvariant(Op, NestedLoop); + }); + if (AllInvariant) { // Ok, both add recurrences are valid after the transformation. // @@ -2936,10 +2942,11 @@ ScalarEvolution::getGEPExpr(Type *PointeeType, const SCEV *BaseExpr, // FIXME(PR23527): Don't blindly transfer the inbounds flag from the GEP // instruction to its SCEV, because the Instruction may be guarded by control // flow and the no-overflow bits may not be valid for the expression in any - // context. + // context. This can be fixed similarly to how these flags are handled for + // adds. SCEV::NoWrapFlags Wrap = InBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap; - const SCEV *TotalOffset = getConstant(IntPtrTy, 0); + const SCEV *TotalOffset = getZero(IntPtrTy); // The address space is unimportant. The first thing we do on CurTy is getting // its element type. Type *CurTy = PointerType::getUnqual(PointeeType); @@ -2996,7 +3003,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. 
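(Editorial sketch of the fold performed just below, assuming only LLVM's APInt API; foldSMax is a hypothetical name.)

#include "llvm/ADT/APInt.h"
using namespace llvm;
// Two leading constants collapse via APIntOps::smax, mirroring the
// ConstantInt::get call in the loop below: smax(-3, 5) == 5.
APInt foldSMax() {
  APInt A(32, -3, /*isSigned=*/true), B(32, 5);
  return APIntOps::smax(A, B);
}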
unsigned Idx = 0; @@ -3005,9 +3012,8 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - APIntOps::smax(LHSC->getValue()->getValue(), - RHSC->getValue()->getValue())); + ConstantInt *Fold = ConstantInt::get( + getContext(), APIntOps::smax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -3100,7 +3106,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. unsigned Idx = 0; @@ -3109,9 +3115,8 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - APIntOps::umax(LHSC->getValue()->getValue(), - RHSC->getValue()->getValue())); + ConstantInt *Fold = ConstantInt::get( + getContext(), APIntOps::umax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -3200,8 +3205,7 @@ const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) { // We can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. - return getConstant(IntTy, - F->getParent()->getDataLayout().getTypeAllocSize(AllocTy)); + return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy)); } const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, @@ -3211,9 +3215,7 @@ const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. return getConstant( - IntTy, - F->getParent()->getDataLayout().getStructLayout(STy)->getElementOffset( - FieldNo)); + IntTy, getDataLayout().getStructLayout(STy)->getElementOffset(FieldNo)); } const SCEV *ScalarEvolution::getUnknown(Value *V) { @@ -3255,7 +3257,7 @@ bool ScalarEvolution::isSCEVable(Type *Ty) const { /// for which isSCEVable must return true. uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); - return F->getParent()->getDataLayout().getTypeSizeInBits(Ty); + return getDataLayout().getTypeSizeInBits(Ty); } /// getEffectiveSCEVType - Return a type with the same bitwidth as @@ -3265,20 +3267,20 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); - if (Ty->isIntegerTy()) { + if (Ty->isIntegerTy()) return Ty; - } // The only other supported type is pointer. 
assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!"); - return F->getParent()->getDataLayout().getIntPtrType(Ty); + return getDataLayout().getIntPtrType(Ty); } const SCEV *ScalarEvolution::getCouldNotCompute() { - return &CouldNotCompute; + return CouldNotCompute.get(); } -namespace { + +bool ScalarEvolution::checkValidity(const SCEV *S) const { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne // is set iff we find such a SCEVUnknown. @@ -3300,9 +3302,7 @@ namespace { } bool isDone() const { return FindOne; } }; -} -bool ScalarEvolution::checkValidity(const SCEV *S) const { FindInvalidSCEVUnknown F; SCEVTraversal<FindInvalidSCEVUnknown> ST(F); ST.visitAll(S); @@ -3315,35 +3315,39 @@ bool ScalarEvolution::checkValidity(const SCEV *S) const { const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); + const SCEV *S = getExistingSCEV(V); + if (S == nullptr) { + S = createSCEV(V); + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S)); + } + return S; +} + +const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { + assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); + ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; if (checkValidity(S)) return S; - else - ValueExprMap.erase(I); + ValueExprMap.erase(I); } - const SCEV *S = createSCEV(V); - - // The process of creating a SCEV for V may have caused other SCEVs - // to have been created, so it's necessary to insert the new entry - // from scratch, rather than trying to remember the insert position - // above. - ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S)); - return S; + return nullptr; } /// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V /// -const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V) { +const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V, + SCEV::NoWrapFlags Flags) { if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V)) return getConstant( cast<ConstantInt>(ConstantExpr::getNeg(VC->getValue()))); Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); - return getMulExpr(V, - getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty)))); + return getMulExpr( + V, getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))), Flags); } /// getNotSCEV - Return a SCEV corresponding to ~V = -1-V @@ -3362,15 +3366,40 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) { /// getMinusSCEV - Return LHS-RHS. Minus is represented in SCEV as A+B*-1. const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags) { - assert(!maskFlags(Flags, SCEV::FlagNUW) && "subtraction does not have NUW"); - // Fast path: X - X --> 0. if (LHS == RHS) - return getConstant(LHS->getType(), 0); + return getZero(LHS->getType()); + + // We represent LHS - RHS as LHS + (-1)*RHS. This transformation + // makes it so that we cannot make much use of NUW. + auto AddFlags = SCEV::FlagAnyWrap; + const bool RHSIsNotMinSigned = + !getSignedRange(RHS).getSignedMin().isMinSignedValue(); + if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) { + // Let M be the minimum representable signed value. Then (-1)*RHS + // signed-wraps if and only if RHS is M. That can happen even for + // a NSW subtraction because e.g. (-1)*M signed-wraps even though + // -1 - M does not. 
So to transfer NSW from LHS - RHS to LHS + + // (-1)*RHS, we need to prove that RHS != M. + // + // If LHS is non-negative and we know that LHS - RHS does not + // signed-wrap, then RHS cannot be M. So we can rule out signed-wrap + // either by proving that RHS > M or that LHS >= 0. + if (RHSIsNotMinSigned || isKnownNonNegative(LHS)) { + AddFlags = SCEV::FlagNSW; + } + } + + // FIXME: Find a correct way to transfer NSW to (-1)*M when LHS - + // RHS is NSW and LHS >= 0. + // + // The difficulty here is that the NSW flag may have been proven + // relative to a loop that is to be found in a recurrence in LHS and + // not in RHS. Applying NSW to (-1)*M may then let the NSW have a + // larger scope than intended. + auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap; - // X - Y --> X + -Y. - // X -(nsw || nuw) Y --> X + -Y. - return getAddExpr(LHS, getNegativeSCEV(RHS)); + return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags); } /// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the @@ -3513,16 +3542,14 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) { return getPointerBase(Cast->getOperand()); - } - else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) { + } else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) { const SCEV *PtrOp = nullptr; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - if ((*I)->getType()->isPointerTy()) { + for (const SCEV *NAryOp : NAry->operands()) { + if (NAryOp->getType()->isPointerTy()) { // Cannot find the base of an expression with multiple pointer operands. if (PtrOp) return V; - PtrOp = *I; + PtrOp = NAryOp; } } if (!PtrOp) @@ -3558,8 +3585,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { if (!Visited.insert(I).second) continue; - ValueExprMapType::iterator It = - ValueExprMap.find_as(static_cast<Value *>(I)); + auto It = ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -3587,165 +3613,476 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { } } -/// createNodeForPHI - PHI nodes have two cases. Either the PHI node exists in -/// a loop header, making it a potential recurrence, or it doesn't. -/// -const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { - if (const Loop *L = LI->getLoopFor(PN->getParent())) - if (L->getHeader() == PN->getParent()) { - // The loop may have multiple entrances or multiple exits; we can analyze - // this phi as an addrec if it has a unique entry value and a unique - // backedge value. - Value *BEValueV = nullptr, *StartValueV = nullptr; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *V = PN->getIncomingValue(i); - if (L->contains(PN->getIncomingBlock(i))) { - if (!BEValueV) { - BEValueV = V; - } else if (BEValueV != V) { - BEValueV = nullptr; - break; - } - } else if (!StartValueV) { - StartValueV = V; - } else if (StartValueV != V) { - StartValueV = nullptr; - break; - } - } - if (BEValueV && StartValueV) { - // While we are analyzing this PHI node, handle its value symbolically. - const SCEV *SymbolicName = getUnknown(PN); - assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && - "PHI node already processed?"); - ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); - - // Using this symbolic name for the PHI, analyze the value coming around - // the back-edge. 
- const SCEV *BEValue = getSCEV(BEValueV); - - // NOTE: If BEValue is loop invariant, we know that the PHI node just - // has a special value for the first iteration of the loop. - - // If the value coming around the backedge is an add with the symbolic - // value we just inserted, then we found a simple induction variable! - if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) { - // If there is a single occurrence of the symbolic value, replace it - // with a recurrence. - unsigned FoundIndex = Add->getNumOperands(); - for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) - if (Add->getOperand(i) == SymbolicName) - if (FoundIndex == e) { - FoundIndex = i; - break; - } +namespace { +class SCEVInitRewriter : public SCEVRewriteVisitor<SCEVInitRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, const Loop *L, + ScalarEvolution &SE) { + SCEVInitRewriter Rewriter(L, SE); + const SCEV *Result = Rewriter.visit(Scev); + return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); + } - if (FoundIndex != Add->getNumOperands()) { - // Create an add with everything but the specified operand. - SmallVector<const SCEV *, 8> Ops; - for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) - if (i != FoundIndex) - Ops.push_back(Add->getOperand(i)); - const SCEV *Accum = getAddExpr(Ops); - - // This is not a valid addrec if the step amount is varying each - // loop iteration, but is not itself an addrec in this loop. - if (isLoopInvariant(Accum, L) || - (isa<SCEVAddRecExpr>(Accum) && - cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) { - SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; - - // If the increment doesn't overflow, then neither the addrec nor - // the post-increment will overflow. - if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) { - if (OBO->getOperand(0) == PN) { - if (OBO->hasNoUnsignedWrap()) - Flags = setFlags(Flags, SCEV::FlagNUW); - if (OBO->hasNoSignedWrap()) - Flags = setFlags(Flags, SCEV::FlagNSW); - } - } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) { - // If the increment is an inbounds GEP, then we know the address - // space cannot be wrapped around. We cannot make any guarantee - // about signed or unsigned overflow because pointers are - // unsigned but we may have a negative index from the base - // pointer. We can guarantee that no unsigned wrap occurs if the - // indices form a positive value. - if (GEP->isInBounds() && GEP->getOperand(0) == PN) { - Flags = setFlags(Flags, SCEV::FlagNW); - - const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); - if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) - Flags = setFlags(Flags, SCEV::FlagNUW); - } + SCEVInitRewriter(const Loop *L, ScalarEvolution &SE) + : SCEVRewriteVisitor(SE), L(L), Valid(true) {} - // We cannot transfer nuw and nsw flags from subtraction - // operations -- sub nuw X, Y is not the same as add nuw X, -Y - // for instance. - } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) + Valid = false; + return Expr; + } - const SCEV *StartVal = getSCEV(StartValueV); - const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); - - // Since the no-wrap flags are on the increment, they apply to the - // post-incremented value as well. - if (isLoopInvariant(Accum, L)) - (void)getAddRecExpr(getAddExpr(StartVal, Accum), - Accum, L, Flags); - - // Okay, for the entire analysis of this edge we assumed the PHI - // to be symbolic. 
We now need to go back and purge all of the - // entries for the scalars that use the symbolic expression. - ForgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; - return PHISCEV; - } + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + // Only allow AddRecExprs for this loop. + if (Expr->getLoop() == L) + return Expr->getStart(); + Valid = false; + return Expr; + } + + bool isValid() { return Valid; } + +private: + const Loop *L; + bool Valid; +}; + +class SCEVShiftRewriter : public SCEVRewriteVisitor<SCEVShiftRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, const Loop *L, + ScalarEvolution &SE) { + SCEVShiftRewriter Rewriter(L, SE); + const SCEV *Result = Rewriter.visit(Scev); + return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); + } + + SCEVShiftRewriter(const Loop *L, ScalarEvolution &SE) + : SCEVRewriteVisitor(SE), L(L), Valid(true) {} + + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + // Only allow AddRecExprs for this loop. + if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) + Valid = false; + return Expr; + } + + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + if (Expr->getLoop() == L && Expr->isAffine()) + return SE.getMinusSCEV(Expr, Expr->getStepRecurrence(SE)); + Valid = false; + return Expr; + } + bool isValid() { return Valid; } + +private: + const Loop *L; + bool Valid; +}; +} // end anonymous namespace + +const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { + const Loop *L = LI.getLoopFor(PN->getParent()); + if (!L || L->getHeader() != PN->getParent()) + return nullptr; + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi as an addrec if it has a unique entry value and a unique + // backedge value. + Value *BEValueV = nullptr, *StartValueV = nullptr; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = PN->getIncomingValue(i); + if (L->contains(PN->getIncomingBlock(i))) { + if (!BEValueV) { + BEValueV = V; + } else if (BEValueV != V) { + BEValueV = nullptr; + break; + } + } else if (!StartValueV) { + StartValueV = V; + } else if (StartValueV != V) { + StartValueV = nullptr; + break; + } + } + if (BEValueV && StartValueV) { + // While we are analyzing this PHI node, handle its value symbolically. + const SCEV *SymbolicName = getUnknown(PN); + assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && + "PHI node already processed?"); + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); + + // Using this symbolic name for the PHI, analyze the value coming around + // the back-edge. + const SCEV *BEValue = getSCEV(BEValueV); + + // NOTE: If BEValue is loop invariant, we know that the PHI node just + // has a special value for the first iteration of the loop. + + // If the value coming around the backedge is an add with the symbolic + // value we just inserted, then we found a simple induction variable! + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) { + // If there is a single occurrence of the symbolic value, replace it + // with a recurrence. + unsigned FoundIndex = Add->getNumOperands(); + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (Add->getOperand(i) == SymbolicName) + if (FoundIndex == e) { + FoundIndex = i; + break; } - } else if (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(BEValue)) { - // Otherwise, this could be a loop like this: - // i = 0; for (j = 1; ..; ++j) { .... 
i = j; } - // In this case, j = {1,+,1} and BEValue is j. - // Because the other in-value of i (0) fits the evolution of BEValue - // i really is an addrec evolution. - if (AddRec->getLoop() == L && AddRec->isAffine()) { - const SCEV *StartVal = getSCEV(StartValueV); - - // If StartVal = j.start - j.stride, we can use StartVal as the - // initial step of the addrec evolution. - if (StartVal == getMinusSCEV(AddRec->getOperand(0), - AddRec->getOperand(1))) { - // FIXME: For constant StartVal, we should be able to infer - // no-wrap flags. - const SCEV *PHISCEV = - getAddRecExpr(StartVal, AddRec->getOperand(1), L, - SCEV::FlagAnyWrap); - - // Okay, for the entire analysis of this edge we assumed the PHI - // to be symbolic. We now need to go back and purge all of the - // entries for the scalars that use the symbolic expression. - ForgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; - return PHISCEV; + + if (FoundIndex != Add->getNumOperands()) { + // Create an add with everything but the specified operand. + SmallVector<const SCEV *, 8> Ops; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (i != FoundIndex) + Ops.push_back(Add->getOperand(i)); + const SCEV *Accum = getAddExpr(Ops); + + // This is not a valid addrec if the step amount is varying each + // loop iteration, but is not itself an addrec in this loop. + if (isLoopInvariant(Accum, L) || + (isa<SCEVAddRecExpr>(Accum) && + cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) { + SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; + + // If the increment doesn't overflow, then neither the addrec nor + // the post-increment will overflow. + if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) { + if (OBO->getOperand(0) == PN) { + if (OBO->hasNoUnsignedWrap()) + Flags = setFlags(Flags, SCEV::FlagNUW); + if (OBO->hasNoSignedWrap()) + Flags = setFlags(Flags, SCEV::FlagNSW); + } + } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) { + // If the increment is an inbounds GEP, then we know the address + // space cannot be wrapped around. We cannot make any guarantee + // about signed or unsigned overflow because pointers are + // unsigned but we may have a negative index from the base + // pointer. We can guarantee that no unsigned wrap occurs if the + // indices form a positive value. + if (GEP->isInBounds() && GEP->getOperand(0) == PN) { + Flags = setFlags(Flags, SCEV::FlagNW); + + const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); + if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) + Flags = setFlags(Flags, SCEV::FlagNUW); } + + // We cannot transfer nuw and nsw flags from subtraction + // operations -- sub nuw X, Y is not the same as add nuw X, -Y + // for instance. } + + const SCEV *StartVal = getSCEV(StartValueV); + const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); + + // Since the no-wrap flags are on the increment, they apply to the + // post-incremented value as well. + if (isLoopInvariant(Accum, L)) + (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); + + // Okay, for the entire analysis of this edge we assumed the PHI + // to be symbolic. We now need to go back and purge all of the + // entries for the scalars that use the symbolic expression. + ForgetSymbolicName(PN, SymbolicName); + ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + return PHISCEV; + } + } + } else { + // Otherwise, this could be a loop like this: + // i = 0; for (j = 1; ..; ++j) { .... i = j; } + // In this case, j = {1,+,1} and BEValue is j. 
+ // Because the other in-value of i (0) fits the evolution of BEValue + // i really is an addrec evolution. + // + // We can generalize this by saying that i is the shifted value of BEValue + // by one iteration: + // PHI(f(0), f({1,+,1})) --> f({0,+,1}) + const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this); + const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this); + if (Shifted != getCouldNotCompute() && + Start != getCouldNotCompute()) { + const SCEV *StartVal = getSCEV(StartValueV); + if (Start == StartVal) { + // Okay, for the entire analysis of this edge we assumed the PHI + // to be symbolic. We now need to go back and purge all of the + // entries for the scalars that use the symbolic expression. + ForgetSymbolicName(PN, SymbolicName); + ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted; + return Shifted; } } } + } + + return nullptr; +} + +// Checks if the SCEV S is available at BB. S is considered available at BB +// if S can be materialized at BB without introducing a fault. +static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S, + BasicBlock *BB) { + struct CheckAvailable { + bool TraversalDone = false; + bool Available = true; + + const Loop *L = nullptr; // The loop BB is in (can be nullptr) + BasicBlock *BB = nullptr; + DominatorTree &DT; + + CheckAvailable(const Loop *L, BasicBlock *BB, DominatorTree &DT) + : L(L), BB(BB), DT(DT) {} + + bool setUnavailable() { + TraversalDone = true; + Available = false; + return false; + } + + bool follow(const SCEV *S) { + switch (S->getSCEVType()) { + case scConstant: case scTruncate: case scZeroExtend: case scSignExtend: + case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: + // These expressions are available if their operand(s) is/are. + return true; + + case scAddRecExpr: { + // We allow add recurrences that are on the loop BB is in, or some + // outer loop. This guarantees availability because the value of the + // add recurrence at BB is simply the "current" value of the induction + // variable. We can relax this in the future; for instance an add + // recurrence on a sibling dominating loop is also available at BB. + const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop(); + if (L && (ARLoop == L || ARLoop->contains(L))) + return true; + + return setUnavailable(); + } + + case scUnknown: { + // For SCEVUnknown, we check for simple dominance. + const auto *SU = cast<SCEVUnknown>(S); + Value *V = SU->getValue(); + + if (isa<Argument>(V)) + return false; + + if (isa<Instruction>(V) && DT.dominates(cast<Instruction>(V), BB)) + return false; + + return setUnavailable(); + } + + case scUDivExpr: + case scCouldNotCompute: + // We do not try to be smart about these at all. + return setUnavailable(); + } + llvm_unreachable("switch should be fully covered!"); + } + + bool isDone() { return TraversalDone; } + }; + + CheckAvailable CA(L, BB, DT); + SCEVTraversal<CheckAvailable> ST(CA); + + ST.visitAll(S); + return CA.Available; +} + +// Try to match a control flow sequence that branches out at BI and merges back +// at Merge into a "C ? LHS : RHS" select pattern. Return true on a successful +// match. 
+static bool BrPHIToSelect(DominatorTree &DT, BranchInst *BI, PHINode *Merge, + Value *&C, Value *&LHS, Value *&RHS) { + C = BI->getCondition(); + + BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0)); + BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1)); + + if (!LeftEdge.isSingleEdge()) + return false; + + assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()"); + + Use &LeftUse = Merge->getOperandUse(0); + Use &RightUse = Merge->getOperandUse(1); + + if (DT.dominates(LeftEdge, LeftUse) && DT.dominates(RightEdge, RightUse)) { + LHS = LeftUse; + RHS = RightUse; + return true; + } + + if (DT.dominates(LeftEdge, RightUse) && DT.dominates(RightEdge, LeftUse)) { + LHS = RightUse; + RHS = LeftUse; + return true; + } + + return false; +} + +const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) { + if (PN->getNumIncomingValues() == 2) { + const Loop *L = LI.getLoopFor(PN->getParent()); + + // We don't want to break LCSSA, even in a SCEV expression tree. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (LI.getLoopFor(PN->getIncomingBlock(i)) != L) + return nullptr; + + // Try to match + // + // br %cond, label %left, label %right + // left: + // br label %merge + // right: + // br label %merge + // merge: + // V = phi [ %x, %left ], [ %y, %right ] + // + // as "select %cond, %x, %y" + + BasicBlock *IDom = DT[PN->getParent()]->getIDom()->getBlock(); + assert(IDom && "At least the entry block should dominate PN"); + + auto *BI = dyn_cast<BranchInst>(IDom->getTerminator()); + Value *Cond = nullptr, *LHS = nullptr, *RHS = nullptr; + + if (BI && BI->isConditional() && + BrPHIToSelect(DT, BI, PN, Cond, LHS, RHS) && + IsAvailableOnEntry(L, DT, getSCEV(LHS), PN->getParent()) && + IsAvailableOnEntry(L, DT, getSCEV(RHS), PN->getParent())) + return createNodeForSelectOrPHI(PN, Cond, LHS, RHS); + } + + return nullptr; +} + +const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { + if (const SCEV *S = createAddRecFromPHI(PN)) + return S; + + if (const SCEV *S = createNodeFromSelectLikePHI(PN)) + return S; // If the PHI has a single incoming value, follow that value, unless the // PHI's incoming blocks are in a different loop, in which case doing so // risks breaking LCSSA form. Instcombine would normally zap these, but // it doesn't have DominatorTree information, so it may miss cases. - if (Value *V = - SimplifyInstruction(PN, F->getParent()->getDataLayout(), TLI, DT, AC)) - if (LI->replacementPreservesLCSSAForm(PN, V)) + if (Value *V = SimplifyInstruction(PN, getDataLayout(), &TLI, &DT, &AC)) + if (LI.replacementPreservesLCSSAForm(PN, V)) return getSCEV(V); // If it's not a loop phi, we can't handle it yet. return getUnknown(PN); } +const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, + Value *Cond, + Value *TrueVal, + Value *FalseVal) { + // Handle "constant" branch or select. This can occur for instance when a + // loop pass transforms an inner loop and moves on to process the outer loop. + if (auto *CI = dyn_cast<ConstantInt>(Cond)) + return getSCEV(CI->isOne() ? TrueVal : FalseVal); + + // Try to match some simple smax or umax patterns. + auto *ICI = dyn_cast<ICmpInst>(Cond); + if (!ICI) + return getUnknown(I); + + Value *LHS = ICI->getOperand(0); + Value *RHS = ICI->getOperand(1); + + switch (ICI->getPredicate()) { + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: + std::swap(LHS, RHS); + // fall through + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + // a >s b ? 
a+x : b+x -> smax(a, b)+x + // a >s b ? b+x : a+x -> smin(a, b)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { + const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), I->getType()); + const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, RS); + if (LDiff == RDiff) + return getAddExpr(getSMaxExpr(LS, RS), LDiff); + LDiff = getMinusSCEV(LA, RS); + RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getSMinExpr(LS, RS), LDiff); + } + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + std::swap(LHS, RHS); + // fall through + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + // a >u b ? a+x : b+x -> umax(a, b)+x + // a >u b ? b+x : a+x -> umin(a, b)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, RS); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(LS, RS), LDiff); + LDiff = getMinusSCEV(LA, RS); + RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getUMinExpr(LS, RS), LDiff); + } + break; + case ICmpInst::ICMP_NE: + // n != 0 ? n+x : 1+x -> umax(n, 1)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && + isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { + const SCEV *One = getOne(I->getType()); + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, One); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(One, LS), LDiff); + } + break; + case ICmpInst::ICMP_EQ: + // n == 0 ? 1+x : n+x -> umax(n, 1)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && + isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { + const SCEV *One = getOne(I->getType()); + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, One); + const SCEV *RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(One, LS), LDiff); + } + break; + default: + break; + } + + return getUnknown(I); +} + /// createNodeForGEP - Expand GEP instructions into add and multiply /// operations. This allows them to be analyzed by regular SCEV code. /// @@ -3769,7 +4106,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) - return C->getValue()->getValue().countTrailingZeros(); + return C->getAPInt().countTrailingZeros(); if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(S)) return std::min(GetMinTrailingZeros(T->getOperand()), @@ -3834,8 +4171,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { // For a SCEVUnknown, ask ValueTracking. 
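(Editorial sketch of the fact recovered here: a value known to be a multiple of 2^k has at least k trailing zero bits, which the known-bits query below establishes without knowing the value itself.)

#include <bit>
#include <cstdint>
// 40 == 5 * 8 is a multiple of 2^3, so it has at least 3 trailing zeros.
static_assert(std::countr_zero(std::uint32_t{40}) >= 3);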
unsigned BitWidth = getTypeSizeInBits(U->getType()); APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, - F->getParent()->getDataLayout(), 0, AC, nullptr, DT); + computeKnownBits(U->getValue(), Zeros, Ones, getDataLayout(), 0, &AC, + nullptr, &DT); return Zeros.countTrailingOnes(); } @@ -3846,26 +4183,9 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { /// GetRangeFromMetadata - Helper method to assign a range to V from /// metadata present in the IR. static Optional<ConstantRange> GetRangeFromMetadata(Value *V) { - if (Instruction *I = dyn_cast<Instruction>(V)) { - if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) { - ConstantRange TotalRange( - cast<IntegerType>(I->getType())->getBitWidth(), false); - - unsigned NumRanges = MD->getNumOperands() / 2; - assert(NumRanges >= 1); - - for (unsigned i = 0; i < NumRanges; ++i) { - ConstantInt *Lower = - mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 0)); - ConstantInt *Upper = - mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 1)); - ConstantRange Range(Lower->getValue(), Upper->getValue()); - TotalRange = TotalRange.unionWith(Range); - } - - return TotalRange; - } - } + if (Instruction *I = dyn_cast<Instruction>(V)) + if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) + return getConstantRangeFromMetadata(*MD); return None; } @@ -3887,7 +4207,7 @@ ScalarEvolution::getRange(const SCEV *S, return I->second; if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) - return setRange(C, SignHint, ConstantRange(C->getValue()->getValue())); + return setRange(C, SignHint, ConstantRange(C->getAPInt())); unsigned BitWidth = getTypeSizeInBits(S->getType()); ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true); @@ -3965,9 +4285,8 @@ ScalarEvolution::getRange(const SCEV *S, if (AddRec->getNoWrapFlags(SCEV::FlagNUW)) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart())) if (!C->getValue()->isZero()) - ConservativeResult = - ConservativeResult.intersectWith( - ConstantRange(C->getValue()->getValue(), APInt(BitWidth, 0))); + ConservativeResult = ConservativeResult.intersectWith( + ConstantRange(C->getAPInt(), APInt(BitWidth, 0))); // If there's no signed wrap, and all the operands have the same sign or // zero, the value won't ever change sign. @@ -4065,18 +4384,18 @@ ScalarEvolution::getRange(const SCEV *S, // Split here to avoid paying the compile-time cost of calling both // computeKnownBits and ComputeNumSignBits. This restriction can be lifted // if needed. - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. 
APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AC, nullptr, DT); + computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, &AC, nullptr, &DT); if (Ones != ~Zeros + 1) ConservativeResult = ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1)); } else { assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED && "generalize as needed!"); - unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, AC, nullptr, DT); + unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, &AC, nullptr, &DT); if (NS > 1) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1), @@ -4089,8 +4408,64 @@ ScalarEvolution::getRange(const SCEV *S, return setRange(S, SignHint, ConservativeResult); } -/// createSCEV - We know that there is no SCEV for the specified value. -/// Analyze the expression. +SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) { + if (isa<ConstantExpr>(V)) return SCEV::FlagAnyWrap; + const BinaryOperator *BinOp = cast<BinaryOperator>(V); + + // Return early if there are no flags to propagate to the SCEV. + SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; + if (BinOp->hasNoUnsignedWrap()) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); + if (BinOp->hasNoSignedWrap()) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); + if (Flags == SCEV::FlagAnyWrap) { + return SCEV::FlagAnyWrap; + } + + // Here we check that BinOp is in the header of the innermost loop + // containing BinOp, since we only deal with instructions in the loop + // header. The actual loop we need to check later will come from an add + // recurrence, but getting that requires computing the SCEV of the operands, + // which can be expensive. This check we can do cheaply to rule out some + // cases early. + Loop *innermostContainingLoop = LI.getLoopFor(BinOp->getParent()); + if (innermostContainingLoop == nullptr || + innermostContainingLoop->getHeader() != BinOp->getParent()) + return SCEV::FlagAnyWrap; + + // Only proceed if we can prove that BinOp does not yield poison. + if (!isKnownNotFullPoison(BinOp)) return SCEV::FlagAnyWrap; + + // At this point we know that if V is executed, then it does not wrap + // according to at least one of NSW or NUW. If V is not executed, then we do + // not know if the calculation that V represents would wrap. Multiple + // instructions can map to the same SCEV. If we apply NSW or NUW from V to + // the SCEV, we must guarantee no wrapping for that SCEV also when it is + // derived from other instructions that map to the same SCEV. We cannot make + // that guarantee for cases where V is not executed. So we need to find the + // loop that V is considered in relation to and prove that V is executed for + // every iteration of that loop. That implies that the value that V + // calculates does not wrap anywhere in the loop, so then we can apply the + // flags to the SCEV. + // + // We check isLoopInvariant to disambiguate in case we are adding two + // recurrences from different loops, so that we know which loop to prove + // that V is executed in. 
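(An editorial sketch, in source-level terms, of the guarantee being established: if the flagged operation executes on every iteration of the relevant loop, a wrapping execution would be undefined behavior on every path, so the whole recurrence can be assumed not to wrap. Names below are illustrative.)

int sumOnes(int n) {
  int acc = 0;
  for (int i = 0; i < n; ++i)
    acc += 1; // executes on every iteration; signed wrap here would be UB
  return acc; // so acc may be assumed to reach max(n, 0) without wrapping
}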
+ for (int OpIndex = 0; OpIndex < 2; ++OpIndex) { + const SCEV *Op = getSCEV(BinOp->getOperand(OpIndex)); + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) { + const int OtherOpIndex = 1 - OpIndex; + const SCEV *OtherOp = getSCEV(BinOp->getOperand(OtherOpIndex)); + if (isLoopInvariant(OtherOp, AddRec->getLoop()) && + isGuaranteedToExecuteForEveryIteration(BinOp, AddRec->getLoop())) + return Flags; + } + } + return SCEV::FlagAnyWrap; +} + +/// createSCEV - We know that there is no SCEV for the specified value. Analyze +/// the expression. /// const SCEV *ScalarEvolution::createSCEV(Value *V) { if (!isSCEVable(V->getType())) @@ -4104,14 +4479,14 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // reachable. Such instructions don't matter, and they aren't required // to obey basic rules for definitions dominating uses which this // analysis depends on. - if (!DT->isReachableFromEntry(I->getParent())) + if (!DT.isReachableFromEntry(I->getParent())) return getUnknown(V); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) Opcode = CE->getOpcode(); else if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) return getConstant(CI); else if (isa<ConstantPointerNull>(V)) - return getConstant(V->getType(), 0); + return getZero(V->getType()); else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) return GA->mayBeOverridden() ? getUnknown(V) : getSCEV(GA->getAliasee()); else @@ -4126,47 +4501,79 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // because it leads to N-1 getAddExpr calls for N ultimate operands. // Instead, gather up all the operands and make a single getAddExpr call. // LLVM IR canonical form means we need only traverse the left operands. - // - // Don't apply this instruction's NSW or NUW flags to the new - // expression. The instruction may be guarded by control flow that the - // no-wrap behavior depends on. Non-control-equivalent instructions can be - // mapped to the same SCEV expression, and it would be incorrect to transfer - // NSW/NUW semantics to those operations. SmallVector<const SCEV *, 4> AddOps; - AddOps.push_back(getSCEV(U->getOperand(1))); - for (Value *Op = U->getOperand(0); ; Op = U->getOperand(0)) { - unsigned Opcode = Op->getValueID() - Value::InstructionVal; - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) + for (Value *Op = U;; Op = U->getOperand(0)) { + U = dyn_cast<Operator>(Op); + unsigned Opcode = U ? U->getOpcode() : 0; + if (!U || (Opcode != Instruction::Add && Opcode != Instruction::Sub)) { + assert(Op != V && "V should be an add"); + AddOps.push_back(getSCEV(Op)); + break; + } + + if (auto *OpSCEV = getExistingSCEV(U)) { + AddOps.push_back(OpSCEV); + break; + } + + // If a NUW or NSW flag can be applied to the SCEV for this + // addition, then compute the SCEV for this addition by itself + // with a separate call to getAddExpr. We need to do that + // instead of pushing the operands of the addition onto AddOps, + // since the flags are only known to apply to this particular + // addition - they may not apply to other additions that can be + // formed with operands from AddOps. 
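(Hedged sketch of the shape being built, written against the SCEV API this patch itself uses; SE, A, B and C are hypothetical stand-ins.) For (a +nsw b) + c, the inner add keeps its flag as a single SCEV instead of having its operands spilled into the outer list:

const SCEV *Inner = SE.getAddExpr(SE.getSCEV(A), SE.getSCEV(B), SCEV::FlagNSW);
const SCEV *Whole = SE.getAddExpr(Inner, SE.getSCEV(C)); // no flags out here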
+ const SCEV *RHS = getSCEV(U->getOperand(1)); + SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U); + if (Flags != SCEV::FlagAnyWrap) { + const SCEV *LHS = getSCEV(U->getOperand(0)); + if (Opcode == Instruction::Sub) + AddOps.push_back(getMinusSCEV(LHS, RHS, Flags)); + else + AddOps.push_back(getAddExpr(LHS, RHS, Flags)); break; - U = cast<Operator>(Op); - const SCEV *Op1 = getSCEV(U->getOperand(1)); + } + if (Opcode == Instruction::Sub) - AddOps.push_back(getNegativeSCEV(Op1)); + AddOps.push_back(getNegativeSCEV(RHS)); else - AddOps.push_back(Op1); + AddOps.push_back(RHS); } - AddOps.push_back(getSCEV(U->getOperand(0))); return getAddExpr(AddOps); } + case Instruction::Mul: { - // Don't transfer NSW/NUW for the same reason as AddExpr. SmallVector<const SCEV *, 4> MulOps; - MulOps.push_back(getSCEV(U->getOperand(1))); - for (Value *Op = U->getOperand(0); - Op->getValueID() == Instruction::Mul + Value::InstructionVal; - Op = U->getOperand(0)) { - U = cast<Operator>(Op); + for (Value *Op = U;; Op = U->getOperand(0)) { + U = dyn_cast<Operator>(Op); + if (!U || U->getOpcode() != Instruction::Mul) { + assert(Op != V && "V should be a mul"); + MulOps.push_back(getSCEV(Op)); + break; + } + + if (auto *OpSCEV = getExistingSCEV(U)) { + MulOps.push_back(OpSCEV); + break; + } + + SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U); + if (Flags != SCEV::FlagAnyWrap) { + MulOps.push_back(getMulExpr(getSCEV(U->getOperand(0)), + getSCEV(U->getOperand(1)), Flags)); + break; + } + MulOps.push_back(getSCEV(U->getOperand(1))); } - MulOps.push_back(getSCEV(U->getOperand(0))); return getMulExpr(MulOps); } case Instruction::UDiv: return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); case Instruction::Sub: - return getMinusSCEV(getSCEV(U->getOperand(0)), - getSCEV(U->getOperand(1))); + return getMinusSCEV(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1)), + getNoWrapFlagsFromUB(U)); case Instruction::And: // For an expression like x&255 that merely masks off the high bits, // use zext(trunc(x)) as the SCEV expression. @@ -4185,8 +4592,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { unsigned TZ = A.countTrailingZeros(); unsigned BitWidth = A.getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(U->getOperand(0), KnownZero, KnownOne, - F->getParent()->getDataLayout(), 0, AC, nullptr, DT); + computeKnownBits(U->getOperand(0), KnownZero, KnownOne, getDataLayout(), + 0, &AC, nullptr, &DT); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ); @@ -4286,9 +4693,18 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { if (SA->getValue().uge(BitWidth)) break; + // It is currently not resolved how to interpret NSW for left + // shift by BitWidth - 1, so we avoid applying flags in that + // case. Remove this check (or this comment) once the situation + // is resolved. See + // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html + // and http://reviews.llvm.org/D8890 . + auto Flags = SCEV::FlagAnyWrap; + if (SA->getValue().ult(BitWidth - 1)) Flags = getNoWrapFlagsFromUB(U); + Constant *X = ConstantInt::get(getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); - return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X)); + return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X), Flags); } break; @@ -4363,94 +4779,13 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { return createNodeForPHI(cast<PHINode>(U)); case Instruction::Select: - // This could be a smax or umax that was lowered earlier. 
- // Try to recover it. - if (ICmpInst *ICI = dyn_cast<ICmpInst>(U->getOperand(0))) { - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); - switch (ICI->getPredicate()) { - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - std::swap(LHS, RHS); - // fall through - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - // a >s b ? a+x : b+x -> smax(a, b)+x - // a >s b ? b+x : a+x -> smin(a, b)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType())) { - const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), U->getType()); - const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, RS); - if (LDiff == RDiff) - return getAddExpr(getSMaxExpr(LS, RS), LDiff); - LDiff = getMinusSCEV(LA, RS); - RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getSMinExpr(LS, RS), LDiff); - } - break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: - std::swap(LHS, RHS); - // fall through - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: - // a >u b ? a+x : b+x -> umax(a, b)+x - // a >u b ? b+x : a+x -> umin(a, b)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType())) { - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, RS); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(LS, RS), LDiff); - LDiff = getMinusSCEV(LA, RS); - RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getUMinExpr(LS, RS), LDiff); - } - break; - case ICmpInst::ICMP_NE: - // n != 0 ? n+x : 1+x -> umax(n, 1)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType()) && - isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { - const SCEV *One = getConstant(U->getType(), 1); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, One); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); - } - break; - case ICmpInst::ICMP_EQ: - // n == 0 ? 1+x : n+x -> umax(n, 1)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType()) && - isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { - const SCEV *One = getConstant(U->getType(), 1); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, One); - const SCEV *RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); - } - break; - default: - break; - } - } + // U can also be a select constant expr, which we let fall through. Since + // createNodeForSelect only works for a condition that is an `ICmpInst`, and + // constant expressions cannot have instructions as operands, we'd have + // returned getUnknown for a select constant expression anyway. 
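(Editorial example of the select patterns createNodeForSelectOrPHI recognizes, reached just below; plain C++, names illustrative.)

// "a >s b ? a+x : b+x" is smax(a, b) + x; the umax/umin cases are analogous.
int smaxPlus(int a, int b, int x) {
  return a > b ? a + x : b + x; // == std::max(a, b) + x
}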
+ if (isa<Instruction>(U)) + return createNodeForSelectOrPHI(cast<Instruction>(U), U->getOperand(0), + U->getOperand(1), U->getOperand(2)); default: // We cannot analyze this expression. break; @@ -4534,8 +4869,7 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L, return 1; // Get the trip count from the BE count by adding 1. - const SCEV *TCMul = getAddExpr(ExitCount, - getConstant(ExitCount->getType(), 1)); + const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType())); // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt // to factor simple cases. if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul)) @@ -4610,10 +4944,10 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { if (!Pair.second) return Pair.first->second; - // ComputeBackedgeTakenCount may allocate memory for its result. Inserting it + // computeBackedgeTakenCount may allocate memory for its result. Inserting it // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result // must be cleared in this scope. - BackedgeTakenInfo Result = ComputeBackedgeTakenCount(L); + BackedgeTakenInfo Result = computeBackedgeTakenCount(L); if (Result.getExact(this) != getCouldNotCompute()) { assert(isLoopInvariant(Result.getExact(this), L) && @@ -4666,7 +5000,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { } // Re-lookup the insert position, since the call to - // ComputeBackedgeTakenCount above could result in a + // computeBackedgeTakenCount above could result in a // recursive call to getBackedgeTakenInfo (on a different // loop), which would invalidate the iterator computed // earlier. @@ -4744,12 +5078,12 @@ void ScalarEvolution::forgetValue(Value *V) { } /// getExact - Get the exact loop backedge taken count considering all loop -/// exits. A computable result can only be return for loops with a single exit. -/// Returning the minimum taken count among all exits is incorrect because one -/// of the loop's exit limit's may have been skipped. HowFarToZero assumes that -/// the limit of each loop test is never skipped. This is a valid assumption as -/// long as the loop exits via that test. For precise results, it is the -/// caller's responsibility to specify the relevant loop exit using +/// exits. A computable result can only be returned for loops with a single +/// exit. Returning the minimum taken count among all exits is incorrect +/// because one of the loop's exit limits may have been skipped. HowFarToZero +/// assumes that the limit of each loop test is never skipped. This is a valid +/// assumption as long as the loop exits via that test. For precise results, it +/// is the caller's responsibility to specify the relevant loop exit using /// getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const { @@ -4847,10 +5181,10 @@ void ScalarEvolution::BackedgeTakenInfo::clear() { delete[] ExitNotTaken.getNextExit(); } -/// ComputeBackedgeTakenCount - Compute the number of times the backedge +/// computeBackedgeTakenCount - Compute the number of times the backedge /// of the specified loop will execute. ScalarEvolution::BackedgeTakenInfo -ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { +ScalarEvolution::computeBackedgeTakenCount(const Loop *L) { SmallVector<BasicBlock *, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -4864,7 +5198,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { // and compute maxBECount. 
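(Editorial illustration for the per-exit walk below: each exiting block gets its own limit, and the conservative maximum is taken over the computable ones. Names illustrative.)

// Exit 1 is taken after max(n, 0) iterations; exit 2 after at most 101.
void twoExits(int n) {
  for (int i = 0;; ++i) {
    if (i >= n) break;   // per-exit limit: max(n, 0)
    if (i == 100) break; // per-exit limit: at most 101
  }
}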
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitBB = ExitingBlocks[i]; - ExitLimit EL = ComputeExitLimit(L, ExitBB); + ExitLimit EL = computeExitLimit(L, ExitBB); // 1. For each exit that can be computed, add an entry to ExitCounts. // CouldComputeBECount is true only if all exits can be computed. @@ -4885,7 +5219,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { // MaxBECount is conservatively the maximum EL.Max, where CouldNotCompute is // considered greater than any computable EL.Max. if (EL.Max != getCouldNotCompute() && Latch && - DT->dominates(ExitBB, Latch)) { + DT.dominates(ExitBB, Latch)) { if (!MustExitMaxBECount) MustExitMaxBECount = EL.Max; else { @@ -4906,13 +5240,11 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount); } -/// ComputeExitLimit - Compute the number of times the backedge of the specified -/// loop will execute if it exits via the specified block. ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { +ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { - // Okay, we've chosen an exiting block. See what condition causes us to - // exit at this block and remember the exit block and whether all other targets + // Okay, we've chosen an exiting block. See what condition causes us to exit + // at this block and remember the exit block and whether all other targets // lead to the loop header. bool MustExecuteLoopHeader = true; BasicBlock *Exit = nullptr; @@ -4952,8 +5284,7 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { if (!Pred) return getCouldNotCompute(); TerminatorInst *PredTerm = Pred->getTerminator(); - for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) { - BasicBlock *PredSucc = PredTerm->getSuccessor(i); + for (const BasicBlock *PredSucc : PredTerm->successors()) { if (PredSucc == BB) continue; // If the predecessor has a successor that isn't BB and isn't @@ -4976,19 +5307,19 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { if (BranchInst *BI = dyn_cast<BranchInst>(Term)) { assert(BI->isConditional() && "If unconditional, it can't be in loop!"); // Proceed to the next level to examine the exit condition expression. - return ComputeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0), + return computeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1), /*ControlsExit=*/IsOnlyExit); } if (SwitchInst *SI = dyn_cast<SwitchInst>(Term)) - return ComputeExitLimitFromSingleExitSwitch(L, SI, Exit, + return computeExitLimitFromSingleExitSwitch(L, SI, Exit, /*ControlsExit=*/IsOnlyExit); return getCouldNotCompute(); } -/// ComputeExitLimitFromCond - Compute the number of times the +/// computeExitLimitFromCond - Compute the number of times the /// backedge of the specified loop will execute if its exit condition /// were a conditional branch of ExitCond, TBB, and FBB. /// @@ -4997,7 +5328,7 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { /// condition is true and can infer that failing to meet the condition prior to /// integer wraparound results in undefined behavior. 
ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, +ScalarEvolution::computeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB, @@ -5007,9 +5338,9 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. bool EitherMayExit = L->contains(TBB); - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); @@ -5042,9 +5373,9 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. bool EitherMayExit = L->contains(FBB); - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); @@ -5079,7 +5410,7 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) - return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit); + return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit); // Check for a constant condition. These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to @@ -5091,18 +5422,15 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, return getCouldNotCompute(); else // The backedge is never taken. - return getConstant(CI->getType(), 0); + return getZero(CI->getType()); } // If it's not an integer or pointer comparison then compute it the hard way. - return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); + return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); } -/// ComputeExitLimitFromICmp - Compute the number of times the -/// backedge of the specified loop will execute if its exit condition -/// were a conditional branch of the ICmpInst ExitCond, TBB, and FBB. 
ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, +ScalarEvolution::computeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, BasicBlock *FBB, @@ -5119,11 +5447,16 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0))) if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) { ExitLimit ItCnt = - ComputeLoadConstantCompareExitLimit(LI, RHS, L, Cond); + computeLoadConstantCompareExitLimit(LI, RHS, L, Cond); if (ItCnt.hasAnyInfo()) return ItCnt; } + ExitLimit ShiftEL = computeShiftCompareExitLimit( + ExitCond->getOperand(0), ExitCond->getOperand(1), L, Cond); + if (ShiftEL.hasAnyInfo()) + return ShiftEL; + const SCEV *LHS = getSCEV(ExitCond->getOperand(0)); const SCEV *RHS = getSCEV(ExitCond->getOperand(1)); @@ -5149,7 +5482,7 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, if (AddRec->getLoop() == L) { // Form the constant range. ConstantRange CompRange( - ICmpInst::makeConstantRange(Cond, RHSC->getValue()->getValue())); + ICmpInst::makeConstantRange(Cond, RHSC->getAPInt())); const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this); if (!isa<SCEVCouldNotCompute>(Ret)) return Ret; @@ -5183,21 +5516,13 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, break; } default: -#if 0 - dbgs() << "ComputeBackedgeTakenCount "; - if (ExitCond->getOperand(0)->getType()->isUnsigned()) - dbgs() << "[unsigned] "; - dbgs() << *LHS << " " - << Instruction::getOpcodeName(Instruction::ICmp) - << " " << *RHS << "\n"; -#endif break; } - return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); + return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); } ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromSingleExitSwitch(const Loop *L, +ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L, SwitchInst *Switch, BasicBlock *ExitingBlock, bool ControlsExit) { @@ -5230,11 +5555,11 @@ EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C, return cast<SCEVConstant>(Val)->getValue(); } -/// ComputeLoadConstantCompareExitLimit - Given an exit condition of +/// computeLoadConstantCompareExitLimit - Given an exit condition of /// 'icmp op load X, cst', try to see if we can compute the backedge /// execution count. ScalarEvolution::ExitLimit -ScalarEvolution::ComputeLoadConstantCompareExitLimit( +ScalarEvolution::computeLoadConstantCompareExitLimit( LoadInst *LI, Constant *RHS, const Loop *L, @@ -5303,11 +5628,6 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit( Result = ConstantExpr::getICmp(predicate, Result, RHS); if (!isa<ConstantInt>(Result)) break; // Couldn't decide for sure if (cast<ConstantInt>(Result)->getValue().isMinValue()) { -#if 0 - dbgs() << "\n***\n*** Computed loop count " << *ItCst - << "\n*** From global " << *GV << "*** BB: " << *L->getHeader() - << "***\n"; -#endif ++NumArrayLenItCounts; return getConstant(ItCst); // Found terminating iteration! 
} @@ -5315,6 +5635,149 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit( return getCouldNotCompute(); } +ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( + Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) { + ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV); + if (!RHS) + return getCouldNotCompute(); + + const BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return getCouldNotCompute(); + + const BasicBlock *Predecessor = L->getLoopPredecessor(); + if (!Predecessor) + return getCouldNotCompute(); + + // Return true if V is of the form "LHS `shift_op` <positive constant>". + // Return LHS in OutLHS and shift_op in OutOpCode. + auto MatchPositiveShift = + [](Value *V, Value *&OutLHS, Instruction::BinaryOps &OutOpCode) { + + using namespace PatternMatch; + + ConstantInt *ShiftAmt; + if (match(V, m_LShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::LShr; + else if (match(V, m_AShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::AShr; + else if (match(V, m_Shl(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::Shl; + else + return false; + + return ShiftAmt->getValue().isStrictlyPositive(); + }; + + // Recognize a "shift recurrence" either of the form %iv or of %iv.shifted in + // + // loop: + // %iv = phi i32 [ %iv.shifted, %loop ], [ %val, %preheader ] + // %iv.shifted = lshr i32 %iv, <positive constant> + // + // Return true on a successful match. Return the corresponding PHI node (%iv + // above) in PNOut and the opcode of the shift operation in OpCodeOut. + auto MatchShiftRecurrence = + [&](Value *V, PHINode *&PNOut, Instruction::BinaryOps &OpCodeOut) { + Optional<Instruction::BinaryOps> PostShiftOpCode; + + { + Instruction::BinaryOps OpC; + Value *V; + + // If we encounter a shift instruction, "peel off" the shift operation, + // and remember that we did so. Later when we inspect %iv's backedge + // value, we will make sure that the backedge value uses the same + // operation. + // + // Note: the peeled shift operation does not have to be the same + // instruction as the one feeding into the PHI's backedge value. We only + // really care about it being the same *kind* of shift instruction -- + // that's all that is required for our later inferences to hold. + if (MatchPositiveShift(LHS, V, OpC)) { + PostShiftOpCode = OpC; + LHS = V; + } + } + + PNOut = dyn_cast<PHINode>(LHS); + if (!PNOut || PNOut->getParent() != L->getHeader()) + return false; + + Value *BEValue = PNOut->getIncomingValueForBlock(Latch); + Value *OpLHS; + + return + // The backedge value for the PHI node must be a shift by a positive + // amount + MatchPositiveShift(BEValue, OpLHS, OpCodeOut) && + + // of the PHI node itself + OpLHS == PNOut && + + // and the kind of shift should match the kind of shift we peeled + // off, if any. + (!PostShiftOpCode.hasValue() || *PostShiftOpCode == OpCodeOut); + }; + + PHINode *PN; + Instruction::BinaryOps OpCode; + if (!MatchShiftRecurrence(LHS, PN, OpCode)) + return getCouldNotCompute(); + + const DataLayout &DL = getDataLayout(); + + // The key rationale for this optimization is that for some kinds of shift + // recurrences, the value of the recurrence "stabilizes" to either 0 or -1 + // within a finite number of iterations. 
If the condition guarding the + // backedge (in the sense that the backedge is taken if the condition is true) + // is false for the value the shift recurrence stabilizes to, then we know + // that the backedge is taken only a finite number of times. + + ConstantInt *StableValue = nullptr; + switch (OpCode) { + default: + llvm_unreachable("Impossible case!"); + + case Instruction::AShr: { + // {K,ashr,<positive-constant>} stabilizes to signum(K) in at most + // bitwidth(K) iterations. + Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); + bool KnownZero, KnownOne; + ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr, + Predecessor->getTerminator(), &DT); + auto *Ty = cast<IntegerType>(RHS->getType()); + if (KnownZero) + StableValue = ConstantInt::get(Ty, 0); + else if (KnownOne) + StableValue = ConstantInt::get(Ty, -1, true); + else + return getCouldNotCompute(); + + break; + } + case Instruction::LShr: + case Instruction::Shl: + // Both {K,lshr,<positive-constant>} and {K,shl,<positive-constant>} + // stabilize to 0 in at most bitwidth(K) iterations. + StableValue = ConstantInt::get(cast<IntegerType>(RHS->getType()), 0); + break; + } + + auto *Result = + ConstantFoldCompareInstOperands(Pred, StableValue, RHS, DL, &TLI); + assert(Result->getType()->isIntegerTy(1) && + "Otherwise cannot be an operand to a branch instruction"); + + if (Result->isZeroValue()) { + unsigned BitWidth = getTypeSizeInBits(RHS->getType()); + const SCEV *UpperBound = + getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth); + return ExitLimit(getCouldNotCompute(), UpperBound); + } + + return getCouldNotCompute(); +} /// CanConstantFold - Return true if we can constant fold an instruction of the /// specified type, assuming that all operands were constants. @@ -5356,12 +5819,10 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L, // Otherwise, we can evaluate this instruction if all of its operands are // constant or derived from a PHI node themselves. PHINode *PHI = nullptr; - for (Instruction::op_iterator OpI = UseInst->op_begin(), - OpE = UseInst->op_end(); OpI != OpE; ++OpI) { - - if (isa<Constant>(*OpI)) continue; + for (Value *Op : UseInst->operands()) { + if (isa<Constant>(Op)) continue; - Instruction *OpInst = dyn_cast<Instruction>(*OpI); + Instruction *OpInst = dyn_cast<Instruction>(Op); if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr; PHINode *P = dyn_cast<PHINode>(OpInst); @@ -5395,9 +5856,8 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) { Instruction *I = dyn_cast<Instruction>(V); if (!I || !canConstantEvolve(I, L)) return nullptr; - if (PHINode *PN = dyn_cast<PHINode>(I)) { + if (PHINode *PN = dyn_cast<PHINode>(I)) return PN; - } // Record non-constant instructions contained by the loop. DenseMap<Instruction *, PHINode *> PHIMap; @@ -5454,6 +5914,30 @@ static Constant *EvaluateExpression(Value *V, const Loop *L, TLI); } + +// If every incoming value to PN except the one for BB is a specific Constant, +// return that, else return nullptr. 
+static Constant *getOtherIncomingValue(PHINode *PN, BasicBlock *BB) { + Constant *IncomingVal = nullptr; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + if (PN->getIncomingBlock(i) == BB) + continue; + + auto *CurrentVal = dyn_cast<Constant>(PN->getIncomingValue(i)); + if (!CurrentVal) + return nullptr; + + if (IncomingVal != CurrentVal) { + if (IncomingVal) + return nullptr; + IncomingVal = CurrentVal; + } + } + + return IncomingVal; +} + /// getConstantEvolutionLoopExitValue - If we know that the specified Phi is /// in the header of its containing loop, we know the loop executes a /// constant number of times, and the PHI node is just a recurrence @@ -5462,8 +5946,7 @@ Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { - DenseMap<PHINode*, Constant*>::const_iterator I = - ConstantEvolutionLoopExitValue.find(PN); + auto I = ConstantEvolutionLoopExitValue.find(PN); if (I != ConstantEvolutionLoopExitValue.end()) return I->second; @@ -5476,22 +5959,21 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); - // Since the loop is canonicalized, the PHI node must have two entries. One - // entry must be a constant (coming in from outside of the loop), and the - // second must be derived from the same PHI. - bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1)); - PHINode *PHI = nullptr; - for (BasicBlock::iterator I = Header->begin(); - (PHI = dyn_cast<PHINode>(I)); ++I) { - Constant *StartCST = - dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge)); + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return nullptr; + + for (auto &I : *Header) { + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) break; + auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return RetVal = nullptr; - Value *BEValue = PN->getIncomingValue(SecondIsBackedge); + Value *BEValue = PN->getIncomingValueForBlock(Latch); // Execute the loop symbolically to determine the exit value. if (BEs.getActiveBits() >= 32) @@ -5499,7 +5981,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, unsigned NumIterations = BEs.getZExtValue(); // must be in range unsigned IterationNum = 0; - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); for (; ; ++IterationNum) { if (IterationNum == NumIterations) return RetVal = CurrentIterVals[PN]; // Got exit value! @@ -5508,7 +5990,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, // EvaluateExpression adds non-phi values to the CurrentIterVals map. DenseMap<Instruction *, Constant *> NextIterVals; Constant *NextPHI = - EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); if (!NextPHI) return nullptr; // Couldn't evaluate! NextIterVals[PN] = NextPHI; @@ -5519,23 +6001,21 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, // cease to be able to evaluate one of them or if they stop evolving, // because that doesn't necessarily prevent us from computing PN. 
SmallVector<std::pair<PHINode *, Constant *>, 8> PHIsToCompute; - for (DenseMap<Instruction *, Constant *>::const_iterator - I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){ - PHINode *PHI = dyn_cast<PHINode>(I->first); + for (const auto &I : CurrentIterVals) { + PHINode *PHI = dyn_cast<PHINode>(I.first); if (!PHI || PHI == PN || PHI->getParent() != Header) continue; - PHIsToCompute.push_back(std::make_pair(PHI, I->second)); + PHIsToCompute.emplace_back(PHI, I.second); } // We use two distinct loops because EvaluateExpression may invalidate any // iterators into CurrentIterVals. - for (SmallVectorImpl<std::pair<PHINode *, Constant*> >::const_iterator - I = PHIsToCompute.begin(), E = PHIsToCompute.end(); I != E; ++I) { - PHINode *PHI = I->first; + for (const auto &I : PHIsToCompute) { + PHINode *PHI = I.first; Constant *&NextPHI = NextIterVals[PHI]; if (!NextPHI) { // Not already computed. - Value *BEValue = PHI->getIncomingValue(SecondIsBackedge); - NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + Value *BEValue = PHI->getIncomingValueForBlock(Latch); + NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } - if (NextPHI != I->second) + if (NextPHI != I.second) StoppedEvolving = false; } @@ -5548,12 +6028,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, } } -/// ComputeExitCountExhaustively - If the loop is known to execute a -/// constant number of times (the condition evolves only from constants), -/// try to evaluate a few iterations of the loop until we get the exit -/// condition gets a value of ExitWhen (true or false). If we cannot -/// evaluate the trip count of the loop, return getCouldNotCompute(). -const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, +const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) { PHINode *PN = getConstantEvolvingPHI(Cond, L); @@ -5567,14 +6042,14 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); - // One entry must be a constant (coming in from outside of the loop), and the - // second must be derived from the same PHI. - bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1)); - PHINode *PHI = nullptr; - for (BasicBlock::iterator I = Header->begin(); - (PHI = dyn_cast<PHINode>(I)); ++I) { - Constant *StartCST = - dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge)); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "Should follow from NumIncomingValues == 2!"); + + for (auto &I : *Header) { + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } @@ -5585,10 +6060,10 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, // the loop symbolically to determine when the condition gets a value of // "ExitWhen". unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){ - ConstantInt *CondVal = dyn_cast_or_null<ConstantInt>( - EvaluateExpression(Cond, L, CurrentIterVals, DL, TLI)); + auto *CondVal = dyn_cast_or_null<ConstantInt>( + EvaluateExpression(Cond, L, CurrentIterVals, DL, &TLI)); // Couldn't symbolically evaluate. 
if (!CondVal) return getCouldNotCompute(); @@ -5605,20 +6080,17 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, // calling EvaluateExpression on them because that may invalidate iterators // into CurrentIterVals. SmallVector<PHINode *, 8> PHIsToCompute; - for (DenseMap<Instruction *, Constant *>::const_iterator - I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){ - PHINode *PHI = dyn_cast<PHINode>(I->first); + for (const auto &I : CurrentIterVals) { + PHINode *PHI = dyn_cast<PHINode>(I.first); if (!PHI || PHI->getParent() != Header) continue; PHIsToCompute.push_back(PHI); } - for (SmallVectorImpl<PHINode *>::const_iterator I = PHIsToCompute.begin(), - E = PHIsToCompute.end(); I != E; ++I) { - PHINode *PHI = *I; + for (PHINode *PHI : PHIsToCompute) { Constant *&NextPHI = NextIterVals[PHI]; if (NextPHI) continue; // Already computed! - Value *BEValue = PHI->getIncomingValue(SecondIsBackedge); - NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + Value *BEValue = PHI->getIncomingValueForBlock(Latch); + NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } CurrentIterVals.swap(NextIterVals); } @@ -5638,22 +6110,22 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, /// In the case that a relevant loop exit value cannot be computed, the /// original value V is returned. const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { + SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = + ValuesAtScopes[V]; // Check to see if we've folded this expression at this loop before. - SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = ValuesAtScopes[V]; - for (unsigned u = 0; u < Values.size(); u++) { - if (Values[u].first == L) - return Values[u].second ? Values[u].second : V; - } - Values.push_back(std::make_pair(L, static_cast<const SCEV *>(nullptr))); + for (auto &LS : Values) + if (LS.first == L) + return LS.second ? LS.second : V; + + Values.emplace_back(L, nullptr); + // Otherwise compute it. const SCEV *C = computeSCEVAtScope(V, L); - SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 = ValuesAtScopes[V]; - for (unsigned u = Values2.size(); u > 0; u--) { - if (Values2[u - 1].first == L) { - Values2[u - 1].second = C; + for (auto &LS : reverse(ValuesAtScopes[V])) + if (LS.first == L) { + LS.second = C; break; } - } return C; } @@ -5763,7 +6235,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // exit value from the loop without using SCEVs. if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) { if (Instruction *I = dyn_cast<Instruction>(SU->getValue())) { - const Loop *LI = (*this->LI)[I->getParent()]; + const Loop *LI = this->LI[I->getParent()]; if (LI && LI->getParentLoop() == L) // Looking for loop exit value. if (PHINode *PN = dyn_cast<PHINode>(I)) if (PN->getParent() == LI->getHeader()) { @@ -5777,9 +6249,8 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at // the specified iteration number. 
- Constant *RV = getConstantEvolutionLoopExitValue(PN, - BTCC->getValue()->getValue(), - LI); + Constant *RV = + getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), LI); if (RV) return getSCEV(RV); } } @@ -5791,8 +6262,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (CanConstantFold(I)) { SmallVector<Constant *, 4> Operands; bool MadeImprovement = false; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { if (Constant *C = dyn_cast<Constant>(Op)) { Operands.push_back(C); continue; @@ -5821,16 +6291,16 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // Check to see if getSCEVAtScope actually made an improvement. if (MadeImprovement) { Constant *C = nullptr; - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); if (const CmpInst *CI = dyn_cast<CmpInst>(I)) C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], - Operands[1], DL, TLI); + Operands[1], DL, &TLI); else if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { if (!LI->isVolatile()) C = ConstantFoldLoadFromConstPtr(Operands[0], DL); } else C = ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands, - DL, TLI); + DL, &TLI); if (!C) return V; return getSCEV(C); } @@ -6021,10 +6491,10 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { return std::make_pair(CNC, CNC); } - uint32_t BitWidth = LC->getValue()->getValue().getBitWidth(); - const APInt &L = LC->getValue()->getValue(); - const APInt &M = MC->getValue()->getValue(); - const APInt &N = NC->getValue()->getValue(); + uint32_t BitWidth = LC->getAPInt().getBitWidth(); + const APInt &L = LC->getAPInt(); + const APInt &M = MC->getAPInt(); + const APInt &N = NC->getAPInt(); APInt Two(BitWidth, 2); APInt Four(BitWidth, 4); @@ -6103,10 +6573,6 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first); const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second); if (R1 && R2) { -#if 0 - dbgs() << "HFTZ: " << *V << " - sol#1: " << *R1 - << " sol#2: " << *R2 << "\n"; -#endif // Pick the smallest positive root value. if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(CmpInst::ICMP_ULT, @@ -6160,7 +6626,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // For negative steps (counting down to zero): // N = Start/-Step // First compute the unsigned distance from zero in the direction of Step. - bool CountDown = StepC->getValue()->getValue().isNegative(); + bool CountDown = StepC->getAPInt().isNegative(); const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start); // Handle unitary steps, which cannot wraparound. @@ -6185,13 +6651,53 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // done by counting and comparing the number of trailing zeros of Step and // Distance. if (!CountDown) { - const APInt &StepV = StepC->getValue()->getValue(); + const APInt &StepV = StepC->getAPInt(); // StepV.isPowerOf2() returns true if StepV is a positive power of two. It // also returns true if StepV is maximally negative (eg, INT_MIN), but that // case is not handled as this code is guarded by !CountDown. 
if (StepV.isPowerOf2() && - GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) - return getUDivExactExpr(Distance, Step); + GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) { + // Here we've constrained the equation to be of the form + // + // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0) + // + // where we're operating on a W bit wide integer domain and k is + // non-negative. The smallest unsigned solution for X is the trip count. + // + // (0) is equivalent to: + // + // 2^(N + k) * Distance' - 2^N * X = L * 2^W + // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N + // <=> 2^k * Distance' - X = L * 2^(W - N) + // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1) + // + // The smallest X satisfying (1) is the unsigned remainder of dividing the + // LHS by 2^(W - N). + // + // <=> X = 2^k * Distance' URem 2^(W - N) ... (2) + // + // E.g. say we're solving + // + // 2 * Val = 2 * X (in i8) ... (3) + // + // then from (2), we get X = Val URem i8 128 (k = 0 in this case). + // + // Note: It is tempting to solve (3) by setting X = Val, but Val is not + // necessarily the smallest unsigned value of X that satisfies (3). + // E.g. if Val is i8 -127 then the smallest value of X that satisfies (3) + // is i8 1, not i8 -127. + + const auto *ModuloResult = getUDivExactExpr(Distance, Step); + + // Since SCEV does not have a URem node, we construct one using a truncate + // and a zero extend. + + unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros(); + auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth); + auto *WideTy = Distance->getType(); + + return getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy); + } } // If the condition controls loop exit (the loop exits only if the expression // is true), use the number of times the loop exits. @@ -6207,8 +6713,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) - return SolveLinEquationWithOverflow(StepC->getValue()->getValue(), - -StartC->getValue()->getValue(), + return SolveLinEquationWithOverflow(StepC->getAPInt(), -StartC->getAPInt(), *this); return getCouldNotCompute(); } @@ -6226,7 +6731,7 @@ ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) { // already. If so, the backedge will execute zero times. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) { if (!C->getValue()->isNullValue()) - return getConstant(C->getType(), 0); + return getZero(C->getType()); return getCouldNotCompute(); // Otherwise it will loop infinitely. } @@ -6251,7 +6756,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) { // A loop's header is defined to be a block that dominates the loop. // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. - if (Loop *L = LI->getLoopFor(BB)) + if (Loop *L = LI.getLoopFor(BB)) return std::make_pair(L->getLoopPredecessor(), L->getHeader()); return std::pair<BasicBlock *, BasicBlock *>(); } @@ -6267,13 +6772,20 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) { // Quick check to see if they are the same SCEV. if (A == B) return true; + auto ComputesEqualValues = [](const Instruction *A, const Instruction *B) { + // Not all instructions that are "identical" compute the same value. 
For + // instance, two distinct alloca instructions allocating the same type are + // identical and do not read memory, but compute distinct values. + return A->isIdenticalTo(B) && (isa<BinaryOperator>(A) || isa<GetElementPtrInst>(A)); + }; + // Otherwise, if they're both SCEVUnknown, it's possible that they hold // two different instructions with the same value. Check for this case. if (const SCEVUnknown *AU = dyn_cast<SCEVUnknown>(A)) if (const SCEVUnknown *BU = dyn_cast<SCEVUnknown>(B)) if (const Instruction *AI = dyn_cast<Instruction>(AU->getValue())) if (const Instruction *BI = dyn_cast<Instruction>(BU->getValue())) - if (AI->isIdenticalTo(BI) && !AI->mayReadFromMemory()) + if (ComputesEqualValues(AI, BI)) return true; // Otherwise assume they may have a different value. @@ -6324,7 +6836,7 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, // If there's a constant operand, canonicalize comparisons with boundary // cases, and canonicalize *-or-equal comparisons to regular comparisons. if (const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS)) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); switch (Pred) { default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: @@ -6515,16 +7027,14 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, Pred = ICmpInst::ICMP_ULT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) { - LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS, - SCEV::FlagNUW); + LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS); Pred = ICmpInst::ICMP_ULT; Changed = true; } break; case ICmpInst::ICMP_UGE: if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) { - RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS, - SCEV::FlagNUW); + RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) { @@ -6612,10 +7122,140 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, if (LeftGuarded && RightGuarded) return true; + if (isKnownPredicateViaSplitting(Pred, LHS, RHS)) + return true; + // Otherwise see what can be done with known constant ranges. return isKnownPredicateWithRanges(Pred, LHS, RHS); } +bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred, + bool &Increasing) { + bool Result = isMonotonicPredicateImpl(LHS, Pred, Increasing); + +#ifndef NDEBUG + // Verify an invariant: inverting the predicate should turn a monotonically + // increasing change into a monotonically decreasing one, and vice versa. + bool IncreasingSwapped; + bool ResultSwapped = isMonotonicPredicateImpl( + LHS, ICmpInst::getSwappedPredicate(Pred), IncreasingSwapped); + + assert(Result == ResultSwapped && "should be able to analyze both!"); + if (ResultSwapped) + assert(Increasing == !IncreasingSwapped && + "monotonicity should flip as we flip the predicate"); +#endif + + return Result; +} + +bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred, + bool &Increasing) { + + // A zero step value for LHS means the induction variable is essentially a + // loop invariant value. 
We don't really depend on the predicate actually + // flipping from false to true (for increasing predicates, and the other way + // around for decreasing predicates), all we care about is that *if* the + // predicate changes then it only changes from false to true. + // + // A zero step value in itself is not very useful, but there may be places + // where SCEV can prove X >= 0 but not prove X > 0, so it is helpful to be + // as general as possible. + + switch (Pred) { + default: + return false; // Conservative answer + + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + if (!LHS->getNoWrapFlags(SCEV::FlagNUW)) + return false; + + Increasing = Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE; + return true; + + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: { + if (!LHS->getNoWrapFlags(SCEV::FlagNSW)) + return false; + + const SCEV *Step = LHS->getStepRecurrence(*this); + + if (isKnownNonNegative(Step)) { + Increasing = Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE; + return true; + } + + if (isKnownNonPositive(Step)) { + Increasing = Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE; + return true; + } + + return false; + } + + } + + llvm_unreachable("switch has default clause!"); +} + +bool ScalarEvolution::isLoopInvariantPredicate( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, + ICmpInst::Predicate &InvariantPred, const SCEV *&InvariantLHS, + const SCEV *&InvariantRHS) { + + // If there is a loop-invariant, force it into the RHS, otherwise bail out. + if (!isLoopInvariant(RHS, L)) { + if (!isLoopInvariant(LHS, L)) + return false; + + std::swap(LHS, RHS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + const SCEVAddRecExpr *ArLHS = dyn_cast<SCEVAddRecExpr>(LHS); + if (!ArLHS || ArLHS->getLoop() != L) + return false; + + bool Increasing; + if (!isMonotonicPredicate(ArLHS, Pred, Increasing)) + return false; + + // If the predicate "ArLHS `Pred` RHS" monotonically increases from false to + // true as the loop iterates, and the backedge is control dependent on + // "ArLHS `Pred` RHS" == true then we can reason as follows: + // + // * if the predicate was false in the first iteration then the predicate + // is never evaluated again, since the loop exits without taking the + // backedge. + // * if the predicate was true in the first iteration then it will + // continue to be true for all future iterations since it is + // monotonically increasing. + // + // For both the above possibilities, we can replace the loop varying + // predicate with its value on the first iteration of the loop (which is + // loop invariant). + // + // A similar reasoning applies for a monotonically decreasing predicate, by + // replacing true with false and false with true in the above two bullets. + + auto P = Increasing ? 
Pred : ICmpInst::getInversePredicate(Pred); + + if (!isLoopBackedgeGuardedByCond(L, P, LHS, RHS)) + return false; + + InvariantPred = Pred; + InvariantLHS = ArLHS->getStart(); + InvariantRHS = RHS; + return true; +} + bool ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { @@ -6690,6 +7330,84 @@ ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred, return false; } +bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS) { + + // Match Result to (X + Y)<ExpectedFlags> where Y is a constant integer. + // Return Y via OutY. + auto MatchBinaryAddToConst = + [this](const SCEV *Result, const SCEV *X, APInt &OutY, + SCEV::NoWrapFlags ExpectedFlags) { + const SCEV *NonConstOp, *ConstOp; + SCEV::NoWrapFlags FlagsPresent; + + if (!splitBinaryAdd(Result, ConstOp, NonConstOp, FlagsPresent) || + !isa<SCEVConstant>(ConstOp) || NonConstOp != X) + return false; + + OutY = cast<SCEVConstant>(ConstOp)->getAPInt(); + return (FlagsPresent & ExpectedFlags) == ExpectedFlags; + }; + + APInt C; + + switch (Pred) { + default: + break; + + case ICmpInst::ICMP_SGE: + std::swap(LHS, RHS); + case ICmpInst::ICMP_SLE: + // X s<= (X + C)<nsw> if C >= 0 + if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative()) + return true; + + // (X + C)<nsw> s<= X if C <= 0 + if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && + !C.isStrictlyPositive()) + return true; + break; + + case ICmpInst::ICMP_SGT: + std::swap(LHS, RHS); + case ICmpInst::ICMP_SLT: + // X s< (X + C)<nsw> if C > 0 + if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && + C.isStrictlyPositive()) + return true; + + // (X + C)<nsw> s< X if C < 0 + if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && C.isNegative()) + return true; + break; + } + + return false; +} + +bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS) { + if (Pred != ICmpInst::ICMP_ULT || ProvingSplitPredicate) + return false; + + // Allowing an arbitrary number of activations of isKnownPredicateViaSplitting + // on the stack can result in exponential time complexity. + SaveAndRestore<bool> Restore(ProvingSplitPredicate, true); + + // If L >= 0 then I `ult` L <=> I >= 0 && I `slt` L + // + // To prove L >= 0 we use isKnownNonNegative whereas to prove I >= 0 we use + // isKnownPredicate. isKnownPredicate is more powerful, but also more + // expensive; and using isKnownNonNegative(RHS) is sufficient for most of the + // interesting cases seen in practice. We can consider "upgrading" L >= 0 to + // use isKnownPredicate later if needed. + return isKnownNonNegative(RHS) && + isKnownPredicate(CmpInst::ICMP_SGE, LHS, getZero(LHS->getType())) && + isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS); +} + /// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is /// protected by a conditional between LHS and RHS. This is used /// to eliminate casts. @@ -6715,46 +7433,49 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, LoopContinuePredicate->getSuccessor(0) != L->getHeader())) return true; + // We don't want more than one activation of the following loops on the stack + // -- that can lead to O(n!) time complexity. + if (WalkingBEDominatingConds) + return false; + + SaveAndRestore<bool> ClearOnExit(WalkingBEDominatingConds, true); + + // See if we can exploit a trip count to prove the predicate. 
+ const auto &BETakenInfo = getBackedgeTakenInfo(L); + const SCEV *LatchBECount = BETakenInfo.getExact(Latch, this); + if (LatchBECount != getCouldNotCompute()) { + // We know that Latch branches back to the loop header exactly + // LatchBECount times. This means the backedge condition at Latch is + // equivalent to "{0,+,1} u< LatchBECount". + Type *Ty = LatchBECount->getType(); + auto NoWrapFlags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNW); + const SCEV *LoopCounter = + getAddRecExpr(getZero(Ty), getOne(Ty), L, NoWrapFlags); + if (isImpliedCond(Pred, LHS, RHS, ICmpInst::ICMP_ULT, LoopCounter, + LatchBECount)) + return true; + } + // Check conditions due to any @llvm.assume intrinsics. - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast<CallInst>(AssumeVH); - if (!DT->dominates(CI, Latch->getTerminator())) + if (!DT.dominates(CI, Latch->getTerminator())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) return true; } - struct ClearWalkingBEDominatingCondsOnExit { - ScalarEvolution &SE; - - explicit ClearWalkingBEDominatingCondsOnExit(ScalarEvolution &SE) - : SE(SE){}; - - ~ClearWalkingBEDominatingCondsOnExit() { - SE.WalkingBEDominatingConds = false; - } - }; - - // We don't want more than one activation of the following loop on the stack - // -- that can lead to O(n!) time complexity. - if (WalkingBEDominatingConds) - return false; - - WalkingBEDominatingConds = true; - ClearWalkingBEDominatingCondsOnExit ClearOnExit(*this); - // If the loop is not reachable from the entry block, we risk running into an // infinite loop as we walk up into the dom tree. These loops do not matter // anyway, so we just return a conservative answer when we see them. - if (!DT->isReachableFromEntry(L->getHeader())) + if (!DT.isReachableFromEntry(L->getHeader())) return false; - for (DomTreeNode *DTN = (*DT)[Latch], *HeaderDTN = (*DT)[L->getHeader()]; - DTN != HeaderDTN; - DTN = DTN->getIDom()) { + for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()]; + DTN != HeaderDTN; DTN = DTN->getIDom()) { assert(DTN && "should reach the loop header before reaching the root!"); @@ -6778,7 +7499,7 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, // We're constructively (and conservatively) enumerating edges within the // loop body that dominate the latch. The dominator tree better agree // with us on this: - assert(DT->dominates(DominatingEdge, Latch) && "should be!"); + assert(DT.dominates(DominatingEdge, Latch) && "should be!"); if (isImpliedCond(Pred, LHS, RHS, Condition, BB != ContinuePredicate->getSuccessor(0))) @@ -6823,11 +7544,11 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, } // Check conditions due to any @llvm.assume intrinsics. - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast<CallInst>(AssumeVH); - if (!DT->dominates(CI, L->getHeader())) + if (!DT.dominates(CI, L->getHeader())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) @@ -6837,6 +7558,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, return false; } +namespace { /// RAII wrapper to prevent recursive application of isImpliedCond. /// ScalarEvolution's PendingLoopPredicates set must be empty unless we are /// currently evaluating isImpliedCond. 
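To make the trip-count reasoning in the hunk above concrete, here is a minimal sketch (a hypothetical C++ input loop, not part of the patch). Once an exact backedge-taken count is known for the latch, the canonical counter {0,+,1} is u< that count on every iteration that takes the backedge, which is exactly the query isLoopBackedgeGuardedByCond answers -- no dominating conditional branch needs to be found:

    // Hypothetical input. SCEV computes an exact backedge-taken count
    // from the latch condition "i != n", so any predicate implied by
    // "{0,+,1} u< backedge-taken-count" (for example, that i stays
    // below n whenever the backedge is taken) is now provable directly.
    void walk(long *a, unsigned n) {
      for (unsigned i = 0; i != n; ++i)
        a[i] += 1;
    }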
@@ -6854,6 +7576,7 @@ struct MarkPendingLoopPredicate { LoopPreds.erase(Cond); } }; +} // end anonymous namespace /// isImpliedCond - Test whether the condition described by Pred, LHS, /// and RHS is true whenever the given Cond value evaluates to true. @@ -6892,6 +7615,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = getSCEV(ICI->getOperand(1)); + return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS); +} + +bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, + ICmpInst::Predicate FoundPred, + const SCEV *FoundLHS, + const SCEV *FoundRHS) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { @@ -6947,6 +7678,13 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, RHS, LHS, FoundLHS, FoundRHS); } + // Unsigned comparison is the same as signed comparison when both the operands + // are non-negative. + if (CmpInst::isUnsigned(FoundPred) && + CmpInst::getSignedPredicate(FoundPred) == Pred && + isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + // Check if we can make progress by sharpening ranges. if (FoundPred == ICmpInst::ICMP_NE && (isa<SCEVConstant>(FoundLHS) || isa<SCEVConstant>(FoundRHS))) { @@ -6970,7 +7708,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, APInt Min = ICmpInst::isSigned(Pred) ? getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin(); - if (Min == C->getValue()->getValue()) { + if (Min == C->getAPInt()) { // Given (V >= Min && V != Min) we conclude V >= (Min + 1). // This is true even if (Min + 1) wraps around -- in case of // wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)). @@ -7021,6 +7759,149 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, return false; } +bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr, + const SCEV *&L, const SCEV *&R, + SCEV::NoWrapFlags &Flags) { + const auto *AE = dyn_cast<SCEVAddExpr>(Expr); + if (!AE || AE->getNumOperands() != 2) + return false; + + L = AE->getOperand(0); + R = AE->getOperand(1); + Flags = AE->getNoWrapFlags(); + return true; +} + +bool ScalarEvolution::computeConstantDifference(const SCEV *Less, + const SCEV *More, + APInt &C) { + // We avoid subtracting expressions here because this function is usually + // fairly deep in the call stack (i.e. is called many times). + + if (isa<SCEVAddRecExpr>(Less) && isa<SCEVAddRecExpr>(More)) { + const auto *LAR = cast<SCEVAddRecExpr>(Less); + const auto *MAR = cast<SCEVAddRecExpr>(More); + + if (LAR->getLoop() != MAR->getLoop()) + return false; + + // We look at affine expressions only; not for correctness but to keep + // getStepRecurrence cheap. 
+ if (!LAR->isAffine() || !MAR->isAffine()) + return false; + + if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this)) + return false; + + Less = LAR->getStart(); + More = MAR->getStart(); + + // fall through + } + + if (isa<SCEVConstant>(Less) && isa<SCEVConstant>(More)) { + const auto &M = cast<SCEVConstant>(More)->getAPInt(); + const auto &L = cast<SCEVConstant>(Less)->getAPInt(); + C = M - L; + return true; + } + + const SCEV *L, *R; + SCEV::NoWrapFlags Flags; + if (splitBinaryAdd(Less, L, R, Flags)) + if (const auto *LC = dyn_cast<SCEVConstant>(L)) + if (R == More) { + C = -(LC->getAPInt()); + return true; + } + + if (splitBinaryAdd(More, L, R, Flags)) + if (const auto *LC = dyn_cast<SCEVConstant>(L)) + if (R == Less) { + C = LC->getAPInt(); + return true; + } + + return false; +} + +bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, const SCEV *FoundRHS) { + if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_ULT) + return false; + + const auto *AddRecLHS = dyn_cast<SCEVAddRecExpr>(LHS); + if (!AddRecLHS) + return false; + + const auto *AddRecFoundLHS = dyn_cast<SCEVAddRecExpr>(FoundLHS); + if (!AddRecFoundLHS) + return false; + + // We'd like to let SCEV reason about control dependencies, so we constrain + // both the inequalities to be about add recurrences on the same loop. This + // way we can use isLoopEntryGuardedByCond later. + + const Loop *L = AddRecFoundLHS->getLoop(); + if (L != AddRecLHS->getLoop()) + return false; + + // FoundLHS u< FoundRHS u< -C => (FoundLHS + C) u< (FoundRHS + C) ... (1) + // + // FoundLHS s< FoundRHS s< INT_MIN - C => (FoundLHS + C) s< (FoundRHS + C) + // ... (2) + // + // Informal proof for (2), assuming (1) [*]: + // + // We'll also assume (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)) ... (3)[**] + // + // Then + // + // FoundLHS s< FoundRHS s< INT_MIN - C + // <=> (FoundLHS + INT_MIN) u< (FoundRHS + INT_MIN) u< -C [ using (3) ] + // <=> (FoundLHS + INT_MIN + C) u< (FoundRHS + INT_MIN + C) [ using (1) ] + // <=> (FoundLHS + INT_MIN + C + INT_MIN) s< + // (FoundRHS + INT_MIN + C + INT_MIN) [ using (3) ] + // <=> FoundLHS + C s< FoundRHS + C + // + // [*]: (1) can be proved by ruling out overflow. + // + // [**]: This can be proved by analyzing all the four possibilities: + // (A s< 0, B s< 0), (A s< 0, B s>= 0), (A s>= 0, B s< 0) and + // (A s>= 0, B s>= 0). + // + // Note: + // Despite (2), "FoundRHS s< INT_MIN - C" does not mean that "FoundRHS + C" + // will not sign underflow. For instance, say FoundLHS = (i8 -128), FoundRHS + // = (i8 -127) and C = (i8 -100). Then INT_MIN - C = (i8 -28), and FoundRHS + // s< (INT_MIN - C). Lack of sign overflow / underflow in "FoundRHS + C" is + // neither necessary nor sufficient to prove "(FoundLHS + C) s< (FoundRHS + + // C)". + + APInt LDiff, RDiff; + if (!computeConstantDifference(FoundLHS, LHS, LDiff) || + !computeConstantDifference(FoundRHS, RHS, RDiff) || + LDiff != RDiff) + return false; + + if (LDiff == 0) + return true; + + APInt FoundRHSLimit; + + if (Pred == CmpInst::ICMP_ULT) { + FoundRHSLimit = -RDiff; + } else { + assert(Pred == CmpInst::ICMP_SLT && "Checked above!"); + FoundRHSLimit = APInt::getSignedMinValue(getTypeSizeInBits(RHS->getType())) - RDiff; + } + + // Try to prove (1) or (2), as needed. 
+ return isLoopEntryGuardedByCond(L, Pred, FoundRHS, + getConstant(FoundRHSLimit)); +} + /// isImpliedCondOperands - Test whether the condition described by Pred, /// LHS, and RHS is true whenever the condition described by Pred, FoundLHS, /// and FoundRHS is true. @@ -7031,6 +7912,9 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; + if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) + return true; + return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS) || // ~x < ~y --> x > y @@ -7043,17 +7927,13 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, /// If Expr computes ~A, return A else return nullptr static const SCEV *MatchNotExpr(const SCEV *Expr) { const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr); - if (!Add || Add->getNumOperands() != 2) return nullptr; - - const SCEVConstant *AddLHS = dyn_cast<SCEVConstant>(Add->getOperand(0)); - if (!(AddLHS && AddLHS->getValue()->getValue().isAllOnesValue())) + if (!Add || Add->getNumOperands() != 2 || + !Add->getOperand(0)->isAllOnesValue()) return nullptr; const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1)); - if (!AddRHS || AddRHS->getNumOperands() != 2) return nullptr; - - const SCEVConstant *MulLHS = dyn_cast<SCEVConstant>(AddRHS->getOperand(0)); - if (!(MulLHS && MulLHS->getValue()->getValue().isAllOnesValue())) + if (!AddRHS || AddRHS->getNumOperands() != 2 || + !AddRHS->getOperand(0)->isAllOnesValue()) return nullptr; return AddRHS->getOperand(1); @@ -7067,8 +7947,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const MaxExprType *MaxExpr = dyn_cast<MaxExprType>(MaybeMaxExpr); if (!MaxExpr) return false; - auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate); - return It != MaxExpr->op_end(); + return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } @@ -7084,6 +7963,38 @@ static bool IsMinConsistingOf(ScalarEvolution &SE, return IsMaxConsistingOf<MaxExprType>(MaybeMaxExpr, SE.getNotSCEV(Candidate)); } +static bool IsKnownPredicateViaAddRecStart(ScalarEvolution &SE, + ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) { + + // If both sides are affine addrecs for the same loop, with equal + // steps, and we know the recurrences don't wrap, then we only + // need to check the predicate on the starting values. + + if (!ICmpInst::isRelational(Pred)) + return false; + + const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS); + if (!LAR) + return false; + const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS); + if (!RAR) + return false; + if (LAR->getLoop() != RAR->getLoop()) + return false; + if (!LAR->isAffine() || !RAR->isAffine()) + return false; + + if (LAR->getStepRecurrence(SE) != RAR->getStepRecurrence(SE)) + return false; + + SCEV::NoWrapFlags NW = ICmpInst::isSigned(Pred) ? + SCEV::FlagNSW : SCEV::FlagNUW; + if (!LAR->getNoWrapFlags(NW) || !RAR->getNoWrapFlags(NW)) + return false; + + return SE.isKnownPredicate(Pred, LAR->getStart(), RAR->getStart()); +} /// Is LHS `Pred` RHS true on the virtue of LHS or RHS being a Min or Max /// expression? 
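The addrec-start rule above has a simple arithmetic reading: two affine recurrences in the same loop with equal steps and matching no-wrap flags keep a loop-invariant difference, so the predicate on the starting values decides the predicate on every iteration. A sketch of the corresponding source pattern (hypothetical example, for illustration only):

    #include <cassert>

    // Both counters advance by the same step (4) and neither overflows,
    // so comparing {2,+,4}<nsw> s< {10,+,4}<nsw> reduces to comparing
    // the starts: 2 s< 10. The difference q - p is loop-invariant.
    void addrec_start_demo() {
      for (int p = 2, q = 10; p < 1000; p += 4, q += 4)
        assert(p < q); // provable from the starting values alone
    }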
@@ -7129,7 +8040,9 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred, auto IsKnownPredicateFull = [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { return isKnownPredicateWithRanges(Pred, LHS, RHS) || - IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS); + IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) || + IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) || + isKnownPredicateViaNoOverflow(Pred, LHS, RHS); }; switch (Pred) { @@ -7185,7 +8098,7 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, !isa<SCEVConstant>(AddLHS->getOperand(0))) return false; - APInt ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getValue()->getValue(); + APInt ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getAPInt(); // `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the // antecedent "`FoundLHS` `Pred` `FoundRHS`". @@ -7194,13 +8107,12 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, // Since `LHS` is `FoundLHS` + `AddLHS->getOperand(0)`, we can compute a range // for `LHS`: - APInt Addend = - cast<SCEVConstant>(AddLHS->getOperand(0))->getValue()->getValue(); + APInt Addend = cast<SCEVConstant>(AddLHS->getOperand(0))->getAPInt(); ConstantRange LHSRange = FoundLHSRange.add(ConstantRange(Addend)); // We can also compute the range of values for `LHS` that satisfy the // consequent, "`LHS` `Pred` `RHS`": - APInt ConstRHS = cast<SCEVConstant>(RHS)->getValue()->getValue(); + APInt ConstRHS = cast<SCEVConstant>(RHS)->getAPInt(); ConstantRange SatisfyingLHSRange = ConstantRange::makeSatisfyingICmpRegion(Pred, ConstRHS); @@ -7217,7 +8129,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); - const SCEV *One = getConstant(Stride->getType(), 1); + const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MaxRHS = getSignedRange(RHS).getSignedMax(); @@ -7246,7 +8158,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); - const SCEV *One = getConstant(Stride->getType(), 1); + const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MinRHS = getSignedRange(RHS).getSignedMin(); @@ -7271,7 +8183,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, // stride and presence of the equality in the comparison. const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, bool Equality) { - const SCEV *One = getConstant(Step->getType(), 1); + const SCEV *One = getOne(Step->getType()); Delta = Equality ? 
getAddExpr(Delta, Step) : getAddExpr(Delta, getMinusSCEV(Step, One)); return getUDivExpr(Delta, Step); @@ -7324,7 +8236,7 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, // overflow, in which case if RHS - Start is a constant, we don't need to // do a max operation since we can just figure it out statically if (NoWrap && isa<SCEVConstant>(Diff)) { - APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue(); + APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt(); if (D.isNegative()) End = Start; } else @@ -7405,7 +8317,7 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS, // overflow, in which case if RHS - Start is a constant, we don't need to // do a max operation since we can just figure it out statically if (NoWrap && isa<SCEVConstant>(Diff)) { - APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue(); + APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt(); if (!D.isNegative()) End = Start; } else @@ -7460,23 +8372,20 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart())) if (!SC->getValue()->isZero()) { SmallVector<const SCEV *, 4> Operands(op_begin(), op_end()); - Operands[0] = SE.getConstant(SC->getType(), 0); + Operands[0] = SE.getZero(SC->getType()); const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(), getNoWrapFlags(FlagNW)); - if (const SCEVAddRecExpr *ShiftedAddRec = - dyn_cast<SCEVAddRecExpr>(Shifted)) + if (const auto *ShiftedAddRec = dyn_cast<SCEVAddRecExpr>(Shifted)) return ShiftedAddRec->getNumIterationsInRange( - Range.subtract(SC->getValue()->getValue()), SE); + Range.subtract(SC->getAPInt()), SE); // This is strange and shouldn't happen. return SE.getCouldNotCompute(); } // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (!isa<SCEVConstant>(getOperand(i))) - return SE.getCouldNotCompute(); - + if (any_of(operands(), [](const SCEV *Op) { return !isa<SCEVConstant>(Op); })) + return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and // that the start element is zero. @@ -7485,7 +8394,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // iteration exits. unsigned BitWidth = SE.getTypeSizeInBits(getType()); if (!Range.contains(APInt(BitWidth, 0))) - return SE.getConstant(getType(), 0); + return SE.getZero(getType()); if (isAffine()) { // If this is an affine expression then we have this situation: @@ -7496,7 +8405,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // If A is negative then the lower of the range is the last possible loop // value. Also note that we already checked for a full range. APInt One(BitWidth,1); - APInt A = cast<SCEVConstant>(getOperand(1))->getValue()->getValue(); + APInt A = cast<SCEVConstant>(getOperand(1))->getAPInt(); APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower(); // The exit value should be (End+A)/A. 
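computeBECount, shown a few hunks above, boils down to one unsigned division: without the equality adjustment it computes a ceiling division by pre-adding Step - 1, and with it a floor division plus one by pre-adding Step. A minimal sketch of the arithmetic (plain C++, assuming Delta + Step does not overflow and Step is non-zero):

#include <cassert>
#include <cstdint>

// Mirrors computeBECount(Delta, Step, Equality):
//   Equality == false:  ceil(Delta / Step)      == (Delta + Step - 1) / Step
//   Equality == true :  floor(Delta / Step) + 1 == (Delta + Step) / Step
static uint64_t beCount(uint64_t Delta, uint64_t Step, bool Equality) {
  Delta += Equality ? Step : Step - 1;
  return Delta / Step;
}

int main() {
  assert(beCount(10, 3, false) == 4); // ceil(10/3)
  assert(beCount(9, 3, false) == 3);  // exact multiple: no rounding up
  assert(beCount(9, 3, true) == 4);   // inclusive bound: 9/3 + 1
  return 0;
}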
@@ -7528,15 +8437,13 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
                                              FlagAnyWrap);
 
     // Next, solve the constructed addrec
-    std::pair<const SCEV *,const SCEV *> Roots =
-      SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
+    auto Roots = SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
     const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
     const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
     if (R1) {
       // Pick the smallest positive root value.
-      if (ConstantInt *CB =
-          dyn_cast<ConstantInt>(ConstantExpr::getICmp(ICmpInst::ICMP_ULT,
-                                                      R1->getValue(), R2->getValue()))) {
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
+              ICmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
         if (!CB->getZExtValue())
           std::swap(R1, R2); // R1 is the minimum root now.
@@ -7549,7 +8456,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
       if (Range.contains(R1Val->getValue())) {
         // The next iteration must be out of the range...
         ConstantInt *NextVal =
-            ConstantInt::get(SE.getContext(), R1->getValue()->getValue()+1);
+            ConstantInt::get(SE.getContext(), R1->getAPInt() + 1);
 
         R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
         if (!Range.contains(R1Val->getValue()))
@@ -7560,7 +8467,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
       // If R1 was not in the range, then it is a good return value.  Make
       // sure that R1-1 WAS in the range though, just in case.
       ConstantInt *NextVal =
-          ConstantInt::get(SE.getContext(), R1->getValue()->getValue()-1);
+          ConstantInt::get(SE.getContext(), R1->getAPInt() - 1);
       R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
       if (Range.contains(R1Val->getValue()))
         return R1;
@@ -7644,9 +8551,84 @@ struct SCEVCollectTerms {
   }
   bool isDone() const { return false; }
 };
+
+// Check if a SCEV contains an AddRecExpr.
+struct SCEVHasAddRec {
+  bool &ContainsAddRec;
+
+  SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) {
+    ContainsAddRec = false;
+  }
+
+  bool follow(const SCEV *S) {
+    if (isa<SCEVAddRecExpr>(S)) {
+      ContainsAddRec = true;
+
+      // Stop recursion: we found an AddRec, so there is no need to walk its
+      // operands.
+      return false;
+    }
+
+    // Keep looking.
+    return true;
+  }
+  bool isDone() const { return false; }
+};
+
+// Find factors that are multiplied with an expression that (possibly as a
+// subexpression) contains an AddRecExpr. In the expression:
+//
+//   8 * (100 + %p * %q * (%a + {0, +, 1}_loop))
+//
+// "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)"
+// that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size
+// parameters as they form a product with an induction variable.
+//
+// This collector expects all array size parameters to be in the same MulExpr.
+// It might be necessary to later add support for collecting parameters that
+// are spread over different nested MulExprs.
+struct SCEVCollectAddRecMultiplies {
+  SmallVectorImpl<const SCEV *> &Terms;
+  ScalarEvolution &SE;
+
+  SCEVCollectAddRecMultiplies(SmallVectorImpl<const SCEV *> &T, ScalarEvolution &SE)
+      : Terms(T), SE(SE) {}
+
+  bool follow(const SCEV *S) {
+    if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
+      bool HasAddRec = false;
+      SmallVector<const SCEV *, 0> Operands;
+      for (auto Op : Mul->operands()) {
+        if (isa<SCEVUnknown>(Op)) {
+          Operands.push_back(Op);
+        } else {
+          bool ContainsAddRec;
+          SCEVHasAddRec HasAddRecVisitor(ContainsAddRec);
+          visitAll(Op, HasAddRecVisitor);
+          HasAddRec |= ContainsAddRec;
+        }
+      }
+      if (Operands.size() == 0)
+        return true;
+
+      if (!HasAddRec)
+        return false;
+
+      Terms.push_back(SE.getMulExpr(Operands));
+      // Stop recursion: once we collected a term, do not walk its operands.
+      return false;
+    }
+
+    // Keep looking.
+    return true;
+  }
+  bool isDone() const { return false; }
+};
 }
 
-/// Find parametric terms in this SCEVAddRecExpr.
+/// Find parametric terms in this SCEVAddRecExpr. We look for parameters in
+/// two places:
+///   1) The strides of AddRec expressions.
+///   2) Unknowns that are multiplied with AddRec expressions.
 void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
                                             SmallVectorImpl<const SCEV *> &Terms) {
   SmallVector<const SCEV *, 4> Strides;
@@ -7669,6 +8651,9 @@ void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
       for (const SCEV *T : Terms)
         dbgs() << *T << "\n";
     });
+
+  SCEVCollectAddRecMultiplies MulCollector(Terms, *this);
+  visitAll(Expr, MulCollector);
 }
 
 static bool findArrayDimensionsRec(ScalarEvolution &SE,
@@ -7718,30 +8703,28 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE,
   return true;
 }
 
-namespace {
-struct FindParameter {
-  bool FoundParameter;
-  FindParameter() : FoundParameter(false) {}
-
-  bool follow(const SCEV *S) {
-    if (isa<SCEVUnknown>(S)) {
-      FoundParameter = true;
-      // Stop recursion: we found a parameter.
-      return false;
-    }
-    // Keep looking.
-    return true;
-  }
-  bool isDone() const {
-    // Stop recursion if we have found a parameter.
-    return FoundParameter;
-  }
-};
-}
-
 // Returns true when S contains at least a SCEVUnknown parameter.
 static inline bool containsParameters(const SCEV *S) {
+  struct FindParameter {
+    bool FoundParameter;
+    FindParameter() : FoundParameter(false) {}
+
+    bool follow(const SCEV *S) {
+      if (isa<SCEVUnknown>(S)) {
+        FoundParameter = true;
+        // Stop recursion: we found a parameter.
+        return false;
+      }
+      // Keep looking.
+      return true;
+    }
+    bool isDone() const {
+      // Stop recursion if we have found a parameter.
+      return FoundParameter;
+    }
+  };
+
   FindParameter F;
   SCEVTraversal<FindParameter> ST(F);
   ST.visitAll(S);
@@ -7829,11 +8812,13 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
 
   ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
 
-  // Divide all terms by the element size.
+  // Try to divide all terms by the element size. If a term is not divisible
+  // by the element size, proceed with the original term.
for (const SCEV *&Term : Terms) { const SCEV *Q, *R; SCEVDivision::divide(SE, Term, ElementSize, &Q, &R); - Term = Q; + if (!Q->isZero()) + Term = Q; } SmallVector<const SCEV *, 4> NewTerms; @@ -7875,7 +8860,7 @@ void ScalarEvolution::computeAccessFunctions( if (Sizes.empty()) return; - if (auto AR = dyn_cast<SCEVAddRecExpr>(Expr)) + if (auto *AR = dyn_cast<SCEVAddRecExpr>(Expr)) if (!AR->isAffine()) return; @@ -8059,58 +9044,55 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) // ScalarEvolution Class Implementation //===----------------------------------------------------------------------===// -ScalarEvolution::ScalarEvolution() - : FunctionPass(ID), WalkingBEDominatingConds(false), ValuesAtScopes(64), - LoopDispositions(64), BlockDispositions(64), FirstUnknown(nullptr) { - initializeScalarEvolutionPass(*PassRegistry::getPassRegistry()); -} - -bool ScalarEvolution::runOnFunction(Function &F) { - this->F = &F; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - return false; -} - -void ScalarEvolution::releaseMemory() { +ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, + AssumptionCache &AC, DominatorTree &DT, + LoopInfo &LI) + : F(F), TLI(TLI), AC(AC), DT(DT), LI(LI), + CouldNotCompute(new SCEVCouldNotCompute()), + WalkingBEDominatingConds(false), ProvingSplitPredicate(false), + ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), + FirstUnknown(nullptr) {} + +ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) + : F(Arg.F), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT), LI(Arg.LI), + CouldNotCompute(std::move(Arg.CouldNotCompute)), + ValueExprMap(std::move(Arg.ValueExprMap)), + WalkingBEDominatingConds(false), ProvingSplitPredicate(false), + BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), + ConstantEvolutionLoopExitValue( + std::move(Arg.ConstantEvolutionLoopExitValue)), + ValuesAtScopes(std::move(Arg.ValuesAtScopes)), + LoopDispositions(std::move(Arg.LoopDispositions)), + BlockDispositions(std::move(Arg.BlockDispositions)), + UnsignedRanges(std::move(Arg.UnsignedRanges)), + SignedRanges(std::move(Arg.SignedRanges)), + UniqueSCEVs(std::move(Arg.UniqueSCEVs)), + UniquePreds(std::move(Arg.UniquePreds)), + SCEVAllocator(std::move(Arg.SCEVAllocator)), + FirstUnknown(Arg.FirstUnknown) { + Arg.FirstUnknown = nullptr; +} + +ScalarEvolution::~ScalarEvolution() { // Iterate through all the SCEVUnknown instances and call their // destructors, so that they release their references to their values. - for (SCEVUnknown *U = FirstUnknown; U; U = U->Next) - U->~SCEVUnknown(); + for (SCEVUnknown *U = FirstUnknown; U;) { + SCEVUnknown *Tmp = U; + U = U->Next; + Tmp->~SCEVUnknown(); + } FirstUnknown = nullptr; ValueExprMap.clear(); // Free any extra memory created for ExitNotTakenInfo in the unlikely event // that a loop had multiple computable exits. 
- for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I = - BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end(); - I != E; ++I) { - I->second.clear(); - } + for (auto &BTCI : BackedgeTakenCounts) + BTCI.second.clear(); assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!"); - - BackedgeTakenCounts.clear(); - ConstantEvolutionLoopExitValue.clear(); - ValuesAtScopes.clear(); - LoopDispositions.clear(); - BlockDispositions.clear(); - UnsignedRanges.clear(); - SignedRanges.clear(); - UniqueSCEVs.clear(); - SCEVAllocator.Reset(); -} - -void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredTransitive<LoopInfoWrapperPass>(); - AU.addRequiredTransitive<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); + assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!"); } bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) { @@ -8152,7 +9134,7 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, OS << "\n"; } -void ScalarEvolution::print(raw_ostream &OS, const Module *) const { +void ScalarEvolution::print(raw_ostream &OS) const { // ScalarEvolution's implementation of the print method is to print // out SCEV values of all instructions that are interesting. Doing // this potentially causes it to create new SCEV objects though, @@ -8162,13 +9144,13 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this); OS << "Classifying expressions for: "; - F->printAsOperand(OS, /*PrintType=*/false); + F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (isSCEVable(I->getType()) && !isa<CmpInst>(*I)) { - OS << *I << '\n'; + for (Instruction &I : instructions(F)) + if (isSCEVable(I.getType()) && !isa<CmpInst>(I)) { + OS << I << '\n'; OS << " --> "; - const SCEV *SV = SE.getSCEV(&*I); + const SCEV *SV = SE.getSCEV(&I); SV->print(OS); if (!isa<SCEVCouldNotCompute>(SV)) { OS << " U: "; @@ -8177,7 +9159,7 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { SE.getSignedRange(SV).print(OS); } - const Loop *L = LI->getLoopFor((*I).getParent()); + const Loop *L = LI.getLoopFor(I.getParent()); const SCEV *AtUse = SE.getSCEVAtScope(SV, L); if (AtUse != SV) { @@ -8205,9 +9187,9 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { } OS << "Determining loop execution counts for: "; - F->printAsOperand(OS, /*PrintType=*/false); + F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + for (LoopInfo::iterator I = LI.begin(), E = LI.end(); I != E; ++I) PrintLoopInfo(OS, &SE, *I); } @@ -8260,9 +9242,8 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { // This recurrence is variant w.r.t. L if any of its operands // are variant. - for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end(); - I != E; ++I) - if (!isLoopInvariant(*I, L)) + for (auto *Op : AR->operands()) + if (!isLoopInvariant(Op, L)) return LoopVariant; // Otherwise it's loop-invariant. 
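One detail of the rewritten destructor above is easy to miss: the replaced loop advanced with U = U->Next in the for-increment, after U->~SCEVUnknown() had already run, so it read a link out of a just-destroyed node; the new form fetches the link first. The same pattern in miniature (plain C++; Node and destroyAll are illustrative names):

// Tear down an intrusive singly-linked list: read the Next pointer before
// destroying the node, as ~ScalarEvolution now does for its SCEVUnknown chain.
struct Node { Node *Next = nullptr; };

static void destroyAll(Node *Head) {
  for (Node *N = Head; N;) {
    Node *Tmp = N;
    N = N->Next; // grab the link while the node is still alive
    delete Tmp;  // only then release the node
  }
}

int main() {
  Node *B = new Node;
  Node *A = new Node;
  A->Next = B;
  destroyAll(A);
  return 0;
}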
@@ -8272,11 +9253,9 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { case scMulExpr: case scUMaxExpr: case scSMaxExpr: { - const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); bool HasVarying = false; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - LoopDisposition D = getLoopDisposition(*I, L); + for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) { + LoopDisposition D = getLoopDisposition(Op, L); if (D == LoopVariant) return LoopVariant; if (D == LoopComputable) @@ -8300,7 +9279,7 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { // invariant if they are not contained in the specified loop. // Instructions are never considered invariant in the function body // (null loop) because they are defined within the "loop". - if (Instruction *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) + if (auto *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) return (L && !L->contains(I)) ? LoopInvariant : LoopVariant; return LoopInvariant; case scCouldNotCompute: @@ -8351,7 +9330,7 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { // produces the addrec's value is a PHI, and a PHI effectively properly // dominates its entire containing block. const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S); - if (!DT->dominates(AR->getLoop()->getHeader(), BB)) + if (!DT.dominates(AR->getLoop()->getHeader(), BB)) return DoesNotDominateBlock; } // FALL THROUGH into SCEVNAryExpr handling. @@ -8361,9 +9340,8 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { case scSMaxExpr: { const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); bool Proper = true; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - BlockDisposition D = getBlockDisposition(*I, BB); + for (const SCEV *NAryOp : NAry->operands()) { + BlockDisposition D = getBlockDisposition(NAryOp, BB); if (D == DoesNotDominateBlock) return DoesNotDominateBlock; if (D == DominatesBlock) @@ -8388,7 +9366,7 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) { if (I->getParent() == BB) return DominatesBlock; - if (DT->properlyDominates(I->getParent(), BB)) + if (DT.properlyDominates(I->getParent(), BB)) return ProperlyDominatesBlock; return DoesNotDominateBlock; } @@ -8407,24 +9385,22 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -namespace { -// Search for a SCEV expression node within an expression tree. -// Implements SCEVTraversal::Visitor. -struct SCEVSearch { - const SCEV *Node; - bool IsFound; +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + // Search for a SCEV expression node within an expression tree. + // Implements SCEVTraversal::Visitor. 
+ struct SCEVSearch { + const SCEV *Node; + bool IsFound; - SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} - bool follow(const SCEV *S) { - IsFound |= (S == Node); - return !IsFound; - } - bool isDone() const { return IsFound; } -}; -} + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; + } + bool isDone() const { return IsFound; } + }; -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { SCEVSearch Search(Op); visitAll(S, Search); return Search.IsFound; @@ -8463,43 +9439,39 @@ static void replaceSubString(std::string &Str, StringRef From, StringRef To) { /// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis. static void getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) { - for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) { - getLoopBackedgeTakenCounts(*I, Map, SE); // recurse. - - std::string &S = Map[L]; - if (S.empty()) { - raw_string_ostream OS(S); - SE.getBackedgeTakenCount(L)->print(OS); + std::string &S = Map[L]; + if (S.empty()) { + raw_string_ostream OS(S); + SE.getBackedgeTakenCount(L)->print(OS); - // false and 0 are semantically equivalent. This can happen in dead loops. - replaceSubString(OS.str(), "false", "0"); - // Remove wrap flags, their use in SCEV is highly fragile. - // FIXME: Remove this when SCEV gets smarter about them. - replaceSubString(OS.str(), "<nw>", ""); - replaceSubString(OS.str(), "<nsw>", ""); - replaceSubString(OS.str(), "<nuw>", ""); - } + // false and 0 are semantically equivalent. This can happen in dead loops. + replaceSubString(OS.str(), "false", "0"); + // Remove wrap flags, their use in SCEV is highly fragile. + // FIXME: Remove this when SCEV gets smarter about them. + replaceSubString(OS.str(), "<nw>", ""); + replaceSubString(OS.str(), "<nsw>", ""); + replaceSubString(OS.str(), "<nuw>", ""); } -} -void ScalarEvolution::verifyAnalysis() const { - if (!VerifySCEV) - return; + for (auto *R : reverse(*L)) + getLoopBackedgeTakenCounts(R, Map, SE); // recurse. +} +void ScalarEvolution::verify() const { ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this); // Gather stringified backedge taken counts for all loops using SCEV's caches. // FIXME: It would be much better to store actual values instead of strings, // but SCEV pointers will change if we drop the caches. VerifyMap BackedgeDumpsOld, BackedgeDumpsNew; - for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) + for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE); - // Gather stringified backedge taken counts for all loops without using - // SCEV's caches. - SE.releaseMemory(); - for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) - getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE); + // Gather stringified backedge taken counts for all loops using a fresh + // ScalarEvolution object. + ScalarEvolution SE2(F, TLI, AC, DT, LI); + for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) + getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE2); // Now compare whether they're the same with and without caches. This allows // verifying that no pass changed the cache. @@ -8532,3 +9504,238 @@ void ScalarEvolution::verifyAnalysis() const { // TODO: Verify more things. 
} + +char ScalarEvolutionAnalysis::PassID; + +ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, + AnalysisManager<Function> *AM) { + return ScalarEvolution(F, AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F), + AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<LoopAnalysis>(F)); +} + +PreservedAnalyses +ScalarEvolutionPrinterPass::run(Function &F, AnalysisManager<Function> *AM) { + AM->getResult<ScalarEvolutionAnalysis>(F).print(OS); + return PreservedAnalyses::all(); +} + +INITIALIZE_PASS_BEGIN(ScalarEvolutionWrapperPass, "scalar-evolution", + "Scalar Evolution Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ScalarEvolutionWrapperPass, "scalar-evolution", + "Scalar Evolution Analysis", false, true) +char ScalarEvolutionWrapperPass::ID = 0; + +ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) { + initializeScalarEvolutionWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) { + SE.reset(new ScalarEvolution( + F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo())); + return false; +} + +void ScalarEvolutionWrapperPass::releaseMemory() { SE.reset(); } + +void ScalarEvolutionWrapperPass::print(raw_ostream &OS, const Module *) const { + SE->print(OS); +} + +void ScalarEvolutionWrapperPass::verifyAnalysis() const { + if (!VerifySCEV) + return; + + SE->verify(); +} + +void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<AssumptionCacheTracker>(); + AU.addRequiredTransitive<LoopInfoWrapperPass>(); + AU.addRequiredTransitive<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>(); +} + +const SCEVPredicate * +ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS, + const SCEVConstant *RHS) { + FoldingSetNodeID ID; + // Unique this node based on the arguments + ID.AddInteger(SCEVPredicate::P_Equal); + ID.AddPointer(LHS); + ID.AddPointer(RHS); + void *IP = nullptr; + if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) + return S; + SCEVEqualPredicate *Eq = new (SCEVAllocator) + SCEVEqualPredicate(ID.Intern(SCEVAllocator), LHS, RHS); + UniquePreds.InsertNode(Eq, IP); + return Eq; +} + +namespace { +class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE, + SCEVUnionPredicate &A) { + SCEVPredicateRewriter Rewriter(SE, A); + return Rewriter.visit(Scev); + } + + SCEVPredicateRewriter(ScalarEvolution &SE, SCEVUnionPredicate &P) + : SCEVRewriteVisitor(SE), P(P) {} + + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + auto ExprPreds = P.getPredicatesForExpr(Expr); + for (auto *Pred : ExprPreds) + if (const auto *IPred = dyn_cast<const SCEVEqualPredicate>(Pred)) + if (IPred->getLHS() == Expr) + return IPred->getRHS(); + + return Expr; + } + +private: + SCEVUnionPredicate &P; +}; +} // end anonymous namespace + +const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *Scev, + SCEVUnionPredicate &Preds) { + return SCEVPredicateRewriter::rewrite(Scev, *this, Preds); +} + 
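getEqualPredicate and rewriteUsingPredicate, added above, amount to a small contract: record "this unknown equals this constant" assumptions, then substitute them wherever the unknown appears in an expression. The substitution step in miniature (plain C++ with strings standing in for SCEVUnknown nodes; a model of the idea, not the LLVM API):

#include <cassert>
#include <map>
#include <string>

// A toy union predicate: a set of symbol == constant assumptions, applied to
// a symbolic leaf the way SCEVPredicateRewriter::visitUnknown replaces a
// SCEVUnknown that has a matching SCEVEqualPredicate.
int main() {
  std::map<std::string, int> Preds{{"n", 42}}; // assume %n == 42
  auto RewriteLeaf = [&](const std::string &Sym, int Unrewritten) {
    auto It = Preds.find(Sym);
    return It == Preds.end() ? Unrewritten : It->second;
  };
  assert(RewriteLeaf("n", -1) == 42); // predicate applies: leaf replaced
  assert(RewriteLeaf("m", -1) == -1); // no predicate: leaf left alone
  return 0;
}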
+/// SCEV predicates
+SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
+                             SCEVPredicateKind Kind)
+    : FastID(ID), Kind(Kind) {}
+
+SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
+                                       const SCEVUnknown *LHS,
+                                       const SCEVConstant *RHS)
+    : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {}
+
+bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
+  const auto *Op = dyn_cast<const SCEVEqualPredicate>(N);
+
+  if (!Op)
+    return false;
+
+  return Op->LHS == LHS && Op->RHS == RHS;
+}
+
+bool SCEVEqualPredicate::isAlwaysTrue() const { return false; }
+
+const SCEV *SCEVEqualPredicate::getExpr() const { return LHS; }
+
+void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const {
+  OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n";
+}
+
+/// Union predicates don't get cached, so create a dummy set ID for them.
+SCEVUnionPredicate::SCEVUnionPredicate()
+    : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {}
+
+bool SCEVUnionPredicate::isAlwaysTrue() const {
+  return all_of(Preds,
+                [](const SCEVPredicate *I) { return I->isAlwaysTrue(); });
+}
+
+ArrayRef<const SCEVPredicate *>
+SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) {
+  auto I = SCEVToPreds.find(Expr);
+  if (I == SCEVToPreds.end())
+    return ArrayRef<const SCEVPredicate *>();
+  return I->second;
+}
+
+bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
+  if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N))
+    return all_of(Set->Preds,
+                  [this](const SCEVPredicate *I) { return this->implies(I); });
+
+  auto ScevPredsIt = SCEVToPreds.find(N->getExpr());
+  if (ScevPredsIt == SCEVToPreds.end())
+    return false;
+  auto &SCEVPreds = ScevPredsIt->second;
+
+  return any_of(SCEVPreds,
+                [N](const SCEVPredicate *I) { return I->implies(N); });
+}
+
+const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; }
+
+void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const {
+  for (auto Pred : Preds)
+    Pred->print(OS, Depth);
+}
+
+void SCEVUnionPredicate::add(const SCEVPredicate *N) {
+  if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N)) {
+    for (auto Pred : Set->Preds)
+      add(Pred);
+    return;
+  }
+
+  if (implies(N))
+    return;
+
+  const SCEV *Key = N->getExpr();
+  assert(Key && "Only SCEVUnionPredicate doesn't have an "
+                "associated expression!");
+
+  SCEVToPreds[Key].push_back(N);
+  Preds.push_back(N);
+}
+
+PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE)
+    : SE(SE), Generation(0) {}
+
+const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
+  const SCEV *Expr = SE.getSCEV(V);
+  RewriteEntry &Entry = RewriteMap[Expr];
+
+  // If we already have an entry and the version matches, return it.
+  if (Entry.second && Generation == Entry.first)
+    return Entry.second;
+
+  // We found an entry but it's stale. Rewrite the stale entry
+  // according to the current predicate.
+  if (Entry.second)
+    Expr = Entry.second;
+
+  const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, Preds);
+  Entry = {Generation, NewSCEV};
+
+  return NewSCEV;
+}
+
+void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
+  if (Preds.implies(&Pred))
+    return;
+  Preds.add(&Pred);
+  updateGeneration();
+}
+
+const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const {
+  return Preds;
+}
+
+void PredicatedScalarEvolution::updateGeneration() {
+  // If the generation number wrapped, recompute everything.
+ if (++Generation == 0) { + for (auto &II : RewriteMap) { + const SCEV *Rewritten = II.second.second; + II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, Preds)}; + } + } +} diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index 6bc0d85..2e50c80 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -19,125 +19,42 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; -namespace { - /// ScalarEvolutionAliasAnalysis - This is a simple alias analysis - /// implementation that uses ScalarEvolution to answer queries. - class ScalarEvolutionAliasAnalysis : public FunctionPass, - public AliasAnalysis { - ScalarEvolution *SE; - - public: - static char ID; // Class identification, replacement for typeinfo - ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(nullptr) { - initializeScalarEvolutionAliasAnalysisPass( - *PassRegistry::getPassRegistry()); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnFunction(Function &F) override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - - Value *GetBaseValue(const SCEV *S); - }; -} // End of anonymous namespace - -// Register this pass... -char ScalarEvolutionAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS_BEGIN(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa", - "ScalarEvolution-based Alias Analysis", false, true, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_AG_PASS_END(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa", - "ScalarEvolution-based Alias Analysis", false, true, false) - -FunctionPass *llvm::createScalarEvolutionAliasAnalysisPass() { - return new ScalarEvolutionAliasAnalysis(); -} - -void -ScalarEvolutionAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScalarEvolution>(); - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -bool -ScalarEvolutionAliasAnalysis::runOnFunction(Function &F) { - InitializeAliasAnalysis(this, &F.getParent()->getDataLayout()); - SE = &getAnalysis<ScalarEvolution>(); - return false; -} - -/// GetBaseValue - Given an expression, try to find a -/// base value. Return null is none was found. -Value * -ScalarEvolutionAliasAnalysis::GetBaseValue(const SCEV *S) { - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - // In an addrec, assume that the base will be in the start, rather - // than the step. - return GetBaseValue(AR->getStart()); - } else if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) { - // If there's a pointer operand, it'll be sorted at the end of the list. 
- const SCEV *Last = A->getOperand(A->getNumOperands()-1); - if (Last->getType()->isPointerTy()) - return GetBaseValue(Last); - } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { - // This is a leaf node. - return U->getValue(); - } - // No Identified object found. - return nullptr; -} - -AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult SCEVAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { // If either of the memory references is empty, it doesn't matter what the // pointer values are. This allows the code below to ignore this special // case. if (LocA.Size == 0 || LocB.Size == 0) return NoAlias; - // This is ScalarEvolutionAliasAnalysis. Get the SCEVs! - const SCEV *AS = SE->getSCEV(const_cast<Value *>(LocA.Ptr)); - const SCEV *BS = SE->getSCEV(const_cast<Value *>(LocB.Ptr)); + // This is SCEVAAResult. Get the SCEVs! + const SCEV *AS = SE.getSCEV(const_cast<Value *>(LocA.Ptr)); + const SCEV *BS = SE.getSCEV(const_cast<Value *>(LocB.Ptr)); // If they evaluate to the same expression, it's a MustAlias. - if (AS == BS) return MustAlias; + if (AS == BS) + return MustAlias; // If something is known about the difference between the two addresses, // see if it's enough to prove a NoAlias. - if (SE->getEffectiveSCEVType(AS->getType()) == - SE->getEffectiveSCEVType(BS->getType())) { - unsigned BitWidth = SE->getTypeSizeInBits(AS->getType()); + if (SE.getEffectiveSCEVType(AS->getType()) == + SE.getEffectiveSCEVType(BS->getType())) { + unsigned BitWidth = SE.getTypeSizeInBits(AS->getType()); APInt ASizeInt(BitWidth, LocA.Size); APInt BSizeInt(BitWidth, LocB.Size); // Compute the difference between the two pointers. - const SCEV *BA = SE->getMinusSCEV(BS, AS); + const SCEV *BA = SE.getMinusSCEV(BS, AS); // Test whether the difference is known to be great enough that memory of // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt // are non-zero, which is special-cased above. - if (ASizeInt.ule(SE->getUnsignedRange(BA).getUnsignedMin()) && - (-BSizeInt).uge(SE->getUnsignedRange(BA).getUnsignedMax())) + if (ASizeInt.ule(SE.getUnsignedRange(BA).getUnsignedMin()) && + (-BSizeInt).uge(SE.getUnsignedRange(BA).getUnsignedMax())) return NoAlias; // Folding the subtraction while preserving range information can be tricky @@ -145,13 +62,13 @@ AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, // and try again to see if things fold better that way. // Compute the difference between the two pointers. - const SCEV *AB = SE->getMinusSCEV(AS, BS); + const SCEV *AB = SE.getMinusSCEV(AS, BS); // Test whether the difference is known to be great enough that memory of // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt // are non-zero, which is special-cased above. - if (BSizeInt.ule(SE->getUnsignedRange(AB).getUnsignedMin()) && - (-ASizeInt).uge(SE->getUnsignedRange(AB).getUnsignedMax())) + if (BSizeInt.ule(SE.getUnsignedRange(AB).getUnsignedMin()) && + (-ASizeInt).uge(SE.getUnsignedRange(AB).getUnsignedMax())) return NoAlias; } @@ -170,5 +87,62 @@ AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, return NoAlias; // Forward the query to the next analysis. - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); +} + +/// Given an expression, try to find a base value. +/// +/// Returns null if none was found. 
+Value *SCEVAAResult::GetBaseValue(const SCEV *S) { + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // In an addrec, assume that the base will be in the start, rather + // than the step. + return GetBaseValue(AR->getStart()); + } else if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) { + // If there's a pointer operand, it'll be sorted at the end of the list. + const SCEV *Last = A->getOperand(A->getNumOperands() - 1); + if (Last->getType()->isPointerTy()) + return GetBaseValue(Last); + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + // This is a leaf node. + return U->getValue(); + } + // No Identified object found. + return nullptr; +} + +SCEVAAResult SCEVAA::run(Function &F, AnalysisManager<Function> *AM) { + return SCEVAAResult(AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<ScalarEvolutionAnalysis>(F)); +} + +char SCEVAA::PassID; + +char SCEVAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(SCEVAAWrapperPass, "scev-aa", + "ScalarEvolution-based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(SCEVAAWrapperPass, "scev-aa", + "ScalarEvolution-based Alias Analysis", false, true) + +FunctionPass *llvm::createSCEVAAWrapperPass() { + return new SCEVAAWrapperPass(); +} + +SCEVAAWrapperPass::SCEVAAWrapperPass() : FunctionPass(ID) { + initializeSCEVAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool SCEVAAWrapperPass::runOnFunction(Function &F) { + Result.reset( + new SCEVAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<ScalarEvolutionWrapperPass>().getSE())); + return false; +} + +void SCEVAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp index fee2a2d..921403d 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp @@ -63,7 +63,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // Create a new cast, and leave the old cast in place in case // it is being used as an insert point. Clear its operand // so that it doesn't hold anything live. - Ret = CastInst::Create(Op, V, Ty, "", IP); + Ret = CastInst::Create(Op, V, Ty, "", &*IP); Ret->takeName(CI); CI->replaceAllUsesWith(Ret); CI->setOperand(0, UndefValue::get(V->getType())); @@ -75,17 +75,39 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // Create a new cast. if (!Ret) - Ret = CastInst::Create(Op, V, Ty, V->getName(), IP); + Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP); // We assert at the end of the function since IP might point to an // instruction with different dominance properties than a cast // (an invoke for example) and not dominate BIP (but the cast does). 
- assert(SE.DT->dominates(Ret, BIP)); + assert(SE.DT.dominates(Ret, &*BIP)); rememberInstruction(Ret); return Ret; } +static BasicBlock::iterator findInsertPointAfter(Instruction *I, + BasicBlock *MustDominate) { + BasicBlock::iterator IP = ++I->getIterator(); + if (auto *II = dyn_cast<InvokeInst>(I)) + IP = II->getNormalDest()->begin(); + + while (isa<PHINode>(IP)) + ++IP; + + while (IP->isEHPad()) { + if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) { + ++IP; + } else if (isa<CatchSwitchInst>(IP)) { + IP = MustDominate->getFirstInsertionPt(); + } else { + llvm_unreachable("unexpected eh pad!"); + } + } + + return IP; +} + /// InsertNoopCastOfTo - Insert a cast of V to the specified type, /// which must be possible with a noop cast, doing what we can to share /// the casts. @@ -135,19 +157,14 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) { while ((isa<BitCastInst>(IP) && isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) && cast<BitCastInst>(IP)->getOperand(0) != A) || - isa<DbgInfoIntrinsic>(IP) || - isa<LandingPadInst>(IP)) + isa<DbgInfoIntrinsic>(IP)) ++IP; return ReuseOrCreateCast(A, Ty, Op, IP); } // Cast the instruction immediately after the instruction. Instruction *I = cast<Instruction>(V); - BasicBlock::iterator IP = I; ++IP; - if (InvokeInst *II = dyn_cast<InvokeInst>(I)) - IP = II->getNormalDest()->begin(); - while (isa<PHINode>(IP) || isa<LandingPadInst>(IP)) - ++IP; + BasicBlock::iterator IP = findInsertPointAfter(I, Builder.GetInsertBlock()); return ReuseOrCreateCast(I, Ty, Op, IP); } @@ -174,7 +191,7 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, ScanLimit++; if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS && IP->getOperand(1) == RHS) - return IP; + return &*IP; if (IP == BlockBegin) break; } } @@ -184,13 +201,13 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, BuilderType::InsertPointGuard Guard(Builder); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) break; // Ok, move up a level. - Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // If we haven't found this binop, insert it. @@ -229,19 +246,15 @@ static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, // Check for divisibility. if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) { ConstantInt *CI = - ConstantInt::get(SE.getContext(), - C->getValue()->getValue().sdiv( - FC->getValue()->getValue())); + ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt())); // If the quotient is zero and the remainder is non-zero, reject // the value at this scale. It will be considered for subsequent // smaller scales. if (!CI->isZero()) { const SCEV *Div = SE.getConstant(CI); S = Div; - Remainder = - SE.getAddExpr(Remainder, - SE.getConstant(C->getValue()->getValue().srem( - FC->getValue()->getValue()))); + Remainder = SE.getAddExpr( + Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt()))); return true; } } @@ -254,10 +267,9 @@ static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, // of the given factor. If so, we can factor it. 
const SCEVConstant *FC = cast<SCEVConstant>(Factor); if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0))) - if (!C->getValue()->getValue().srem(FC->getValue()->getValue())) { + if (!C->getAPInt().srem(FC->getAPInt())) { SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end()); - NewMulOps[0] = SE.getConstant( - C->getValue()->getValue().sdiv(FC->getValue()->getValue())); + NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt())); S = SE.getMulExpr(NewMulOps); return true; } @@ -402,8 +414,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, const SCEV *ElSize = SE.getSizeOfExpr(IntPtrTy, ElTy); if (!ElSize->isZero()) { SmallVector<const SCEV *, 8> NewOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - const SCEV *Op = Ops[i]; + for (const SCEV *Op : Ops) { const SCEV *Remainder = SE.getConstant(Ty, 0); if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) { // Op now has ElSize factored out. @@ -414,7 +425,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, } else { // The operand was not divisible, so add it to the list of operands // we'll scan next iteration. - NewOps.push_back(Ops[i]); + NewOps.push_back(Op); } } // If we made any changes, update Ops. @@ -483,7 +494,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace())); assert(!isa<Instruction>(V) || - SE.DT->dominates(cast<Instruction>(V), Builder.GetInsertPoint())); + SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint())); // Expand the operands for a plain byte offset. Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty); @@ -508,7 +519,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, ScanLimit++; if (IP->getOpcode() == Instruction::GetElementPtr && IP->getOperand(0) == V && IP->getOperand(1) == Idx) - return IP; + return &*IP; if (IP == BlockBegin) break; } } @@ -517,13 +528,13 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, BuilderType::InsertPointGuard Guard(Builder); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) break; // Ok, move up a level. - Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // Emit a GEP. @@ -537,16 +548,13 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, BuilderType::InsertPoint SaveInsertPt = Builder.saveIP(); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(V)) break; - bool AnyIndexNotLoopInvariant = false; - for (SmallVectorImpl<Value *>::const_iterator I = GepIndices.begin(), - E = GepIndices.end(); I != E; ++I) - if (!L->isLoopInvariant(*I)) { - AnyIndexNotLoopInvariant = true; - break; - } + bool AnyIndexNotLoopInvariant = + std::any_of(GepIndices.begin(), GepIndices.end(), + [L](Value *Op) { return !L->isLoopInvariant(Op); }); + if (AnyIndexNotLoopInvariant) break; @@ -554,7 +562,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, if (!Preheader) break; // Ok, move up a level. 
- Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // Insert a pretty getelementptr. Note that this GEP is not marked inbounds, @@ -563,9 +571,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Value *Casted = V; if (V->getType() != PTy) Casted = InsertNoopCastOfTo(Casted, PTy); - Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, - GepIndices, - "scevgep"); + Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep"); Ops.push_back(SE.getUnknown(GEP)); rememberInstruction(GEP); @@ -593,8 +599,7 @@ static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B, /// expression, according to PickMostRelevantLoop. const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { // Test whether we've already computed the most relevant loop for this SCEV. - std::pair<DenseMap<const SCEV *, const Loop *>::iterator, bool> Pair = - RelevantLoops.insert(std::make_pair(S, nullptr)); + auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr)); if (!Pair.second) return Pair.first->second; @@ -603,7 +608,7 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { return nullptr; if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (const Instruction *I = dyn_cast<Instruction>(U->getValue())) - return Pair.first->second = SE.LI->getLoopFor(I->getParent()); + return Pair.first->second = SE.LI.getLoopFor(I->getParent()); // A non-instruction has no relevant loops. return nullptr; } @@ -611,9 +616,8 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { const Loop *L = nullptr; if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) L = AR->getLoop(); - for (SCEVNAryExpr::op_iterator I = N->op_begin(), E = N->op_end(); - I != E; ++I) - L = PickMostRelevantLoop(L, getRelevantLoop(*I), *SE.DT); + for (const SCEV *Op : N->operands()) + L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT); return RelevantLoops[N] = L; } if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) { @@ -621,10 +625,8 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { return RelevantLoops[C] = Result; } if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { - const Loop *Result = - PickMostRelevantLoop(getRelevantLoop(D->getLHS()), - getRelevantLoop(D->getRHS()), - *SE.DT); + const Loop *Result = PickMostRelevantLoop( + getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT); return RelevantLoops[D] = Result; } llvm_unreachable("Unexpected SCEV type!"); @@ -679,13 +681,12 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Sort by loop. Use a stable sort so that constants follow non-constants and // pointer operands precede non-pointer operands. - std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT)); + std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(SE.DT)); // Emit instructions to add all the operands. Hoist as much as possible // out of loops, and form meaningful getelementptrs where possible. Value *Sum = nullptr; - for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator - I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) { + for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) { const Loop *CurLoop = I->first; const SCEV *Op = I->second; if (!Sum) { @@ -747,14 +748,13 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I)); // Sort by loop. Use a stable sort so that constants follow non-constants. 
- std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT)); + std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(SE.DT)); // Emit instructions to mul all the operands. Hoist as much as possible // out of loops. Value *Prod = nullptr; - for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator - I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ++I) { - const SCEV *Op = I->second; + for (const auto &I : OpsAndLoops) { + const SCEV *Op = I.second; if (!Prod) { // This is the first operand. Just expand it. Prod = expand(Op); @@ -788,7 +788,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { Value *LHS = expandCodeFor(S->getLHS(), Ty); if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) { - const APInt &RHS = SC->getValue()->getValue(); + const APInt &RHS = SC->getAPInt(); if (RHS.isPowerOf2()) return InsertBinop(Instruction::LShr, LHS, ConstantInt::get(Ty, RHS.logBase2())); @@ -834,7 +834,7 @@ bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV, for (User::op_iterator OI = IncV->op_begin()+1, OE = IncV->op_end(); OI != OE; ++OI) if (Instruction *OInst = dyn_cast<Instruction>(OI)) - if (!SE.DT->dominates(OInst, IVIncInsertPos)) + if (!SE.DT.dominates(OInst, IVIncInsertPos)) return false; } // Advance to the next instruction. @@ -873,19 +873,18 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, case Instruction::Add: case Instruction::Sub: { Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1)); - if (!OInst || SE.DT->dominates(OInst, InsertPos)) + if (!OInst || SE.DT.dominates(OInst, InsertPos)) return dyn_cast<Instruction>(IncV->getOperand(0)); return nullptr; } case Instruction::BitCast: return dyn_cast<Instruction>(IncV->getOperand(0)); case Instruction::GetElementPtr: - for (Instruction::op_iterator I = IncV->op_begin()+1, E = IncV->op_end(); - I != E; ++I) { + for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) { if (isa<Constant>(*I)) continue; if (Instruction *OInst = dyn_cast<Instruction>(*I)) { - if (!SE.DT->dominates(OInst, InsertPos)) + if (!SE.DT.dominates(OInst, InsertPos)) return nullptr; } if (allowScale) { @@ -912,13 +911,16 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, /// it available to other uses in this loop. Recursively hoist any operands, /// until we reach a value that dominates InsertPos. bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { - if (SE.DT->dominates(IncV, InsertPos)) + if (SE.DT.dominates(IncV, InsertPos)) return true; // InsertPos must itself dominate IncV so that IncV's new position satisfies // its existing users. - if (isa<PHINode>(InsertPos) - || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) + if (isa<PHINode>(InsertPos) || + !SE.DT.dominates(InsertPos->getParent(), IncV->getParent())) + return false; + + if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos)) return false; // Check that the chain of IV operands leading back to Phi can be hoisted. @@ -930,11 +932,10 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { // IncV is safe to hoist. 
IVIncs.push_back(IncV); IncV = Oper; - if (SE.DT->dominates(IncV, InsertPos)) + if (SE.DT.dominates(IncV, InsertPos)) break; } - for (SmallVectorImpl<Instruction*>::reverse_iterator I = IVIncs.rbegin(), - E = IVIncs.rend(); I != E; ++I) { + for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { (*I)->moveBefore(InsertPos); } return true; @@ -1002,7 +1003,7 @@ static void hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist, } /// \brief Check whether we can cheaply express the requested SCEV in terms of -/// the available PHI SCEV by truncation and/or invertion of the step. +/// the available PHI SCEV by truncation and/or inversion of the step. static bool canBeCheaplyTransformed(ScalarEvolution &SE, const SCEVAddRecExpr *Phi, const SCEVAddRecExpr *Requested, @@ -1084,12 +1085,13 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Only try partially matching scevs that need truncation and/or // step-inversion if we know this loop is outside the current loop. - bool TryNonMatchingSCEV = IVIncInsertLoop && - SE.DT->properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); + bool TryNonMatchingSCEV = + IVIncInsertLoop && + SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); - for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) { - if (!SE.isSCEVable(PN->getType())) + for (auto &I : *L->getHeader()) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN || !SE.isSCEVable(PN->getType())) continue; const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PN)); @@ -1142,7 +1144,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Potentially, move the increment. We have made sure in // isExpandedAddRecExprPHI or hoistIVInc that this is possible. if (L == IVIncInsertLoop) - hoistBeforePos(SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch); + hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch); // Ok, the add recurrence looks usable. // Remember this PHI, even in post-inc mode. @@ -1167,13 +1169,13 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, PostIncLoops.clear(); // Expand code for the start value. - Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy, - L->getHeader()->begin()); + Value *StartV = + expandCodeFor(Normalized->getStart(), ExpandTy, &L->getHeader()->front()); // StartV must be hoisted into L's preheader to dominate the new phi. assert(!isa<Instruction>(StartV) || - SE.DT->properlyDominates(cast<Instruction>(StartV)->getParent(), - L->getHeader())); + SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(), + L->getHeader())); // Expand code for the step value. Do this before creating the PHI so that PHI // reuse code doesn't see an incomplete PHI. @@ -1185,7 +1187,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, if (useSubtract) Step = SE.getNegativeSCEV(Step); // Expand the step somewhere that dominates the loop header. - Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin()); + Value *StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if // we actually do emit an addition. 
It does not apply if we emit a @@ -1249,9 +1251,8 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { if (PostIncLoops.count(L)) { PostIncLoopSet Loops; Loops.insert(L); - Normalized = - cast<SCEVAddRecExpr>(TransformForPostIncUse(Normalize, S, nullptr, - nullptr, Loops, SE, *SE.DT)); + Normalized = cast<SCEVAddRecExpr>(TransformForPostIncUse( + Normalize, S, nullptr, nullptr, Loops, SE, SE.DT)); } // Strip off any non-loop-dominating component from the addrec start. @@ -1301,9 +1302,9 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // For an expansion to use the postinc form, the client must call // expandCodeFor with an InsertPoint that is either outside the PostIncLoop // or dominated by IVIncInsertPos. - if (isa<Instruction>(Result) - && !SE.DT->dominates(cast<Instruction>(Result), - Builder.GetInsertPoint())) { + if (isa<Instruction>(Result) && + !SE.DT.dominates(cast<Instruction>(Result), + &*Builder.GetInsertPoint())) { // The induction variable's postinc expansion does not dominate this use. // IVUsers tries to prevent this case, so it is rare. However, it can // happen when an IVUser outside the loop is not dominated by the latch @@ -1321,7 +1322,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { { // Expand the step somewhere that dominates the loop header. BuilderType::InsertPointGuard Guard(Builder); - StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin()); + StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); } Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract); } @@ -1395,13 +1396,9 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(), S->getNoWrapFlags(SCEV::FlagNW))); BasicBlock::iterator NewInsertPt = - std::next(BasicBlock::iterator(cast<Instruction>(V))); - BuilderType::InsertPointGuard Guard(Builder); - while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) || - isa<LandingPadInst>(NewInsertPt)) - ++NewInsertPt; + findInsertPointAfter(cast<Instruction>(V), Builder.GetInsertBlock()); V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, - NewInsertPt); + &*NewInsertPt); return V; } @@ -1442,7 +1439,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { BasicBlock *Header = L->getHeader(); pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header); CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar", - Header->begin()); + &Header->front()); rememberInstruction(CanonicalIV); SmallSet<BasicBlock *, 4> PredSeen; @@ -1587,7 +1584,8 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - Builder.SetInsertPoint(IP->getParent(), IP); + assert(IP); + Builder.SetInsertPoint(IP); return expandCodeFor(SH, Ty); } @@ -1605,8 +1603,8 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { Value *SCEVExpander::expand(const SCEV *S) { // Compute an insertion point for this SCEV object. Hoist the instructions // as far out in the loop nest as possible. 
- Instruction *InsertPt = Builder.GetInsertPoint(); - for (Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock()); ; + Instruction *InsertPt = &*Builder.GetInsertPoint(); + for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());; L = L->getParentLoop()) if (SE.isLoopInvariant(S, L)) { if (!L) break; @@ -1616,30 +1614,29 @@ Value *SCEVExpander::expand(const SCEV *S) { // LSR sets the insertion point for AddRec start/step values to the // block start to simplify value reuse, even though it's an invalid // position. SCEVExpander must correct for this in all cases. - InsertPt = L->getHeader()->getFirstInsertionPt(); + InsertPt = &*L->getHeader()->getFirstInsertionPt(); } } else { // If the SCEV is computable at this level, insert it into the header // after the PHIs (and after any other instructions that we've inserted // there) so that it is guaranteed to dominate any user inside the loop. if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L)) - InsertPt = L->getHeader()->getFirstInsertionPt(); + InsertPt = &*L->getHeader()->getFirstInsertionPt(); while (InsertPt != Builder.GetInsertPoint() && (isInsertedInstruction(InsertPt) || isa<DbgInfoIntrinsic>(InsertPt))) { - InsertPt = std::next(BasicBlock::iterator(InsertPt)); + InsertPt = &*std::next(InsertPt->getIterator()); } break; } // Check to see if we already expanded this here. - std::map<std::pair<const SCEV *, Instruction *>, TrackingVH<Value> >::iterator - I = InsertedExpressions.find(std::make_pair(S, InsertPt)); + auto I = InsertedExpressions.find(std::make_pair(S, InsertPt)); if (I != InsertedExpressions.end()) return I->second; BuilderType::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(InsertPt->getParent(), InsertPt); + Builder.SetInsertPoint(InsertPt); // Expand the expression into instructions. Value *V = visit(S); @@ -1677,8 +1674,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, // Emit code for it. BuilderType::InsertPointGuard Guard(Builder); - PHINode *V = cast<PHINode>(expandCodeFor(H, nullptr, - L->getHeader()->begin())); + PHINode *V = + cast<PHINode>(expandCodeFor(H, nullptr, &L->getHeader()->front())); return V; } @@ -1694,10 +1691,13 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, const TargetTransformInfo *TTI) { // Find integer phis in order of increasing width. SmallVector<PHINode*, 8> Phis; - for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *Phi = dyn_cast<PHINode>(I); ++I) { - Phis.push_back(Phi); + for (auto &I : *L->getHeader()) { + if (auto *PN = dyn_cast<PHINode>(&I)) + Phis.push_back(PN); + else + break; } + if (TTI) std::sort(Phis.begin(), Phis.end(), [](Value *LHS, Value *RHS) { // Put pointers at the back and make sure pointer < pointer = false. @@ -1711,13 +1711,23 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, DenseMap<const SCEV *, PHINode *> ExprToIVMap; // Process phis from wide to narrow. Map wide phis to their truncation // so narrow phis can reuse them. - for (SmallVectorImpl<PHINode*>::const_iterator PIter = Phis.begin(), - PEnd = Phis.end(); PIter != PEnd; ++PIter) { - PHINode *Phi = *PIter; + for (PHINode *Phi : Phis) { + auto SimplifyPHINode = [&](PHINode *PN) -> Value * { + if (Value *V = SimplifyInstruction(PN, DL, &SE.TLI, &SE.DT, &SE.AC)) + return V; + if (!SE.isSCEVable(PN->getType())) + return nullptr; + auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN)); + if (!Const) + return nullptr; + return Const->getValue(); + }; // Fold constant phis. 
They may be congruent to other constant phis and
    // would confuse the logic below that expects proper IVs.
-    if (Value *V = SimplifyInstruction(Phi, DL, SE.TLI, SE.DT, SE.AC)) {
+    if (Value *V = SimplifyPHINode(Phi)) {
+      if (V->getType() != Phi->getType())
+        continue;
       Phi->replaceAllUsesWith(V);
       DeadInsts.emplace_back(Phi);
       ++NumElim;
@@ -1784,7 +1794,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
         if (OrigInc->getType() != IsomorphicInc->getType()) {
           Instruction *IP = nullptr;
           if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
-            IP = PN->getParent()->getFirstInsertionPt();
+            IP = &*PN->getParent()->getFirstInsertionPt();
           else
             IP = OrigInc->getNextNode();
@@ -1802,7 +1812,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
       ++NumElim;
       Value *NewIV = OrigPhiRef;
       if (OrigPhiRef->getType() != Phi->getType()) {
-        IRBuilder<> Builder(L->getHeader()->getFirstInsertionPt());
+        IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
         Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
         NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
       }
@@ -1812,8 +1822,46 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
   return NumElim;
 }
 
+Value *SCEVExpander::findExistingExpansion(const SCEV *S,
+                                           const Instruction *At, Loop *L) {
+  using namespace llvm::PatternMatch;
+
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // Look for a suitable value in simple conditions at the loop exits.
+  for (BasicBlock *BB : ExitingBlocks) {
+    ICmpInst::Predicate Pred;
+    Instruction *LHS, *RHS;
+    BasicBlock *TrueBB, *FalseBB;
+
+    if (!match(BB->getTerminator(),
+               m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+                    TrueBB, FalseBB)))
+      continue;
+
+    if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
+      return LHS;
+
+    if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
+      return RHS;
+  }
+
+  // There is potential to make this significantly smarter, but this simple
+  // heuristic already gets some interesting cases.
+
+  // Cannot find a suitable value.
+  return nullptr;
+}
+
 bool SCEVExpander::isHighCostExpansionHelper(
-    const SCEV *S, Loop *L, SmallPtrSetImpl<const SCEV *> &Processed) {
+    const SCEV *S, Loop *L, const Instruction *At,
+    SmallPtrSetImpl<const SCEV *> &Processed) {
+
+  // If we can find an existing value for this SCEV available at the point "At"
+  // then consider the expression cheap.
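// Illustrative sketch, not part of this change: the match above fires on
// exiting blocks whose terminator is a conditional branch on an integer
// compare of two instructions. A caller holding an insertion point can then
// reuse a compare operand instead of emitting new IR (InsertPt is a
// hypothetical name):
//
//   if (Value *V = findExistingExpansion(S, InsertPt, L))
//     return V; // S is already computed by one of the loop's exit tests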
+  if (At && findExistingExpansion(S, At, L) != nullptr)
+    return false;
 
   // Zero/One operand expressions
   switch (S->getSCEVType()) {
@@ -1821,14 +1869,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
   case scConstant:
     return false;
   case scTruncate:
-    return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(), L,
-                                     Processed);
+    return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(),
+                                     L, At, Processed);
   case scZeroExtend:
     return isHighCostExpansionHelper(cast<SCEVZeroExtendExpr>(S)->getOperand(),
-                                     L, Processed);
+                                     L, At, Processed);
   case scSignExtend:
     return isHighCostExpansionHelper(cast<SCEVSignExtendExpr>(S)->getOperand(),
-                                     L, Processed);
+                                     L, At, Processed);
   }
 
   if (!Processed.insert(S).second)
@@ -1836,10 +1884,10 @@ bool SCEVExpander::isHighCostExpansionHelper(
 
   if (auto *UDivExpr = dyn_cast<SCEVUDivExpr>(S)) {
     // If the divisor is a power of two and the SCEV type fits in a native
-    // integer, consider the divison cheap irrespective of whether it occurs in
+    // integer, consider the division cheap irrespective of whether it occurs in
     // the user code since it can be lowered into a right shift.
     if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS()))
-      if (SC->getValue()->getValue().isPowerOf2()) {
+      if (SC->getAPInt().isPowerOf2()) {
         const DataLayout &DL =
             L->getHeader()->getParent()->getParent()->getDataLayout();
         unsigned Width = cast<IntegerType>(UDivExpr->getType())->getBitWidth();
@@ -1855,22 +1903,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
     if (!ExitingBB)
       return true;
 
-    BranchInst *ExitingBI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
-    if (!ExitingBI || !ExitingBI->isConditional())
+    // At the beginning of this function we already tried to find an existing
+    // value for plain 'S'. Now try to lookup 'S + 1' since it is a common
+    // pattern involving division. This is just a simple search heuristic.
+    if (!At)
+      At = &ExitingBB->back();
+    if (!findExistingExpansion(
+            SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), At, L))
       return true;
-
-    ICmpInst *OrigCond = dyn_cast<ICmpInst>(ExitingBI->getCondition());
-    if (!OrigCond)
-      return true;
-
-    const SCEV *RHS = SE.getSCEV(OrigCond->getOperand(1));
-    RHS = SE.getMinusSCEV(RHS, SE.getConstant(RHS->getType(), 1));
-    if (RHS != S) {
-      const SCEV *LHS = SE.getSCEV(OrigCond->getOperand(0));
-      LHS = SE.getMinusSCEV(LHS, SE.getConstant(LHS->getType(), 1));
-      if (LHS != S)
-        return true;
-    }
   }
 
   // HowManyLessThans uses a Max expression whenever the loop is not guarded by
@@ -1882,11 +1922,9 @@ bool SCEVExpander::isHighCostExpansionHelper(
   // BackedgeTakenCount. They may already exist in program code, and if not,
   // they are not too expensive to rematerialize.
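// Illustrative note, not part of this change: the 'S + 1' probe above
// reflects how exit tests are usually written. For a loop
//
//   for (i = 0; i != n; ++i)
//
// the backedge-taken count S is n - 1, while the exiting block compares
// against n itself, so looking up S + 1 finds the existing value n even
// though S never appears verbatim in the IR.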
if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(S)) {
-    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
-         I != E; ++I) {
-      if (isHighCostExpansionHelper(*I, L, Processed))
+    for (auto *Op : NAry->operands())
+      if (isHighCostExpansionHelper(Op, L, At, Processed))
         return true;
-    }
   }
 
   // If we haven't recognized an expensive SCEV pattern, assume it's an
@@ -1894,6 +1932,43 @@ bool SCEVExpander::isHighCostExpansionHelper(
   return false;
 }
 
+Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
+                                            Instruction *IP) {
+  assert(IP);
+  switch (Pred->getKind()) {
+  case SCEVPredicate::P_Union:
+    return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
+  case SCEVPredicate::P_Equal:
+    return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
+  }
+  llvm_unreachable("Unknown SCEV predicate type");
+}
+
+Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
+                                          Instruction *IP) {
+  Value *Expr0 = expandCodeFor(Pred->getLHS(), Pred->getLHS()->getType(), IP);
+  Value *Expr1 = expandCodeFor(Pred->getRHS(), Pred->getRHS()->getType(), IP);
+
+  Builder.SetInsertPoint(IP);
+  auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
+  return I;
+}
+
+Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
+                                          Instruction *IP) {
+  auto *BoolType = IntegerType::get(IP->getContext(), 1);
+  Value *Check = ConstantInt::getNullValue(BoolType);
+
+  // Loop over all checks in this set.
+  for (auto Pred : Union->getPredicates()) {
+    auto *NextCheck = expandCodeForPredicate(Pred, IP);
+    Builder.SetInsertPoint(IP);
+    Check = Builder.CreateOr(Check, NextCheck);
+  }
+
+  return Check;
+}
+
 namespace {
 // Search for a SCEV subexpression that is not safe to expand.  Any expression
 // that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
index b238fe4..b7fd5d5 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -109,7 +109,7 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
       SmallVector<const SCEV *, 8> Operands;
       const Loop *L = AR->getLoop();
       // The addrec conceptually uses its operands at loop entry.
-      Instruction *LUser = L->getHeader()->begin();
+      Instruction *LUser = &L->getHeader()->front();
       // Transform each operand.
       for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
            I != E; ++I) {
diff --git a/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
index a5fca3e..486f3a5 100644
--- a/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
+++ b/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
@@ -26,28 +26,29 @@
 // ... = load %ptr2, !alias.scope !{ !scope1, !scope2 }, !noalias !{ !scope1 }
 //
 // When evaluating an aliasing query, if one of the instructions is associated
-// has a set of noalias scopes in some domain that is superset of the alias
+// with a set of noalias scopes in some domain that is a superset of the alias
 // scopes in that domain of some other instruction, then the two memory
 // accesses are assumed not to alias.
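// Illustrative sketch, not part of this change (PtrA/PtrB and the tag
// bundles are hypothetical): a client consults this result through the usual
// alias-analysis query interface,
//
//   ScopedNoAliasAAResult AAR(TLI);
//   AliasResult AR = AAR.alias(MemoryLocation(PtrA, SizeA, TagsA),
//                              MemoryLocation(PtrB, SizeB, TagsB));
//
// and receives NoAlias whenever the scope check described above succeeds in
// either direction.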
// //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" + using namespace llvm; // A handy option for disabling scoped no-alias functionality. The same effect // can also be achieved by stripping the associated metadata tags from IR, but // this option is sometimes more convenient. -static cl::opt<bool> -EnableScopedNoAlias("enable-scoped-noalias", cl::init(true)); +static cl::opt<bool> EnableScopedNoAlias("enable-scoped-noalias", + cl::init(true)); namespace { /// AliasScopeNode - This is a simple wrapper around an MDNode which provides @@ -57,7 +58,7 @@ class AliasScopeNode { const MDNode *Node; public: - AliasScopeNode() : Node(0) {} + AliasScopeNode() : Node(nullptr) {} explicit AliasScopeNode(const MDNode *N) : Node(N) {} /// getNode - Get the MDNode for this AliasScopeNode. @@ -70,79 +71,74 @@ public: return dyn_cast_or_null<MDNode>(Node->getOperand(1)); } }; +} // end of anonymous namespace -/// ScopedNoAliasAA - This is a simple alias analysis -/// implementation that uses scoped-noalias metadata to answer queries. -class ScopedNoAliasAA : public ImmutablePass, public AliasAnalysis { -public: - static char ID; // Class identification, replacement for typeinfo - ScopedNoAliasAA() : ImmutablePass(ID) { - initializeScopedNoAliasAAPass(*PassRegistry::getPassRegistry()); - } +AliasResult ScopedNoAliasAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + if (!EnableScopedNoAlias) + return AAResultBase::alias(LocA, LocB); - bool doInitialization(Module &M) override; + // Get the attached MDNodes. + const MDNode *AScopes = LocA.AATags.Scope, *BScopes = LocB.AATags.Scope; - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } + const MDNode *ANoAlias = LocA.AATags.NoAlias, *BNoAlias = LocB.AATags.NoAlias; -protected: - bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const; - void collectMDInDomain(const MDNode *List, const MDNode *Domain, - SmallPtrSetImpl<const MDNode *> &Nodes) const; - -private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; -}; -} // End of anonymous namespace + if (!mayAliasInScopes(AScopes, BNoAlias)) + return NoAlias; -// Register this pass... 
-char ScopedNoAliasAA::ID = 0; -INITIALIZE_AG_PASS(ScopedNoAliasAA, AliasAnalysis, "scoped-noalias", - "Scoped NoAlias Alias Analysis", false, true, false) + if (!mayAliasInScopes(BScopes, ANoAlias)) + return NoAlias; -ImmutablePass *llvm::createScopedNoAliasAAPass() { - return new ScopedNoAliasAA(); + // If they may alias, chain to the next AliasAnalysis. + return AAResultBase::alias(LocA, LocB); } -bool ScopedNoAliasAA::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +ModRefInfo ScopedNoAliasAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + if (!EnableScopedNoAlias) + return AAResultBase::getModRefInfo(CS, Loc); + + if (!mayAliasInScopes(Loc.AATags.Scope, CS.getInstruction()->getMetadata( + LLVMContext::MD_noalias))) + return MRI_NoModRef; + + if (!mayAliasInScopes( + CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + Loc.AATags.NoAlias)) + return MRI_NoModRef; + + return AAResultBase::getModRefInfo(CS, Loc); } -void -ScopedNoAliasAA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); +ModRefInfo ScopedNoAliasAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + if (!EnableScopedNoAlias) + return AAResultBase::getModRefInfo(CS1, CS2); + + if (!mayAliasInScopes( + CS1.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + CS2.getInstruction()->getMetadata(LLVMContext::MD_noalias))) + return MRI_NoModRef; + + if (!mayAliasInScopes( + CS2.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + CS1.getInstruction()->getMetadata(LLVMContext::MD_noalias))) + return MRI_NoModRef; + + return AAResultBase::getModRefInfo(CS1, CS2); } -void -ScopedNoAliasAA::collectMDInDomain(const MDNode *List, const MDNode *Domain, - SmallPtrSetImpl<const MDNode *> &Nodes) const { +void ScopedNoAliasAAResult::collectMDInDomain( + const MDNode *List, const MDNode *Domain, + SmallPtrSetImpl<const MDNode *> &Nodes) const { for (unsigned i = 0, ie = List->getNumOperands(); i != ie; ++i) if (const MDNode *MD = dyn_cast<MDNode>(List->getOperand(i))) if (AliasScopeNode(MD).getDomain() == Domain) Nodes.insert(MD); } -bool -ScopedNoAliasAA::mayAliasInScopes(const MDNode *Scopes, - const MDNode *NoAlias) const { +bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, + const MDNode *NoAlias) const { if (!Scopes || !NoAlias) return true; @@ -177,76 +173,40 @@ ScopedNoAliasAA::mayAliasInScopes(const MDNode *Scopes, return true; } -AliasResult ScopedNoAliasAA::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - if (!EnableScopedNoAlias) - return AliasAnalysis::alias(LocA, LocB); - - // Get the attached MDNodes. 
- const MDNode *AScopes = LocA.AATags.Scope, - *BScopes = LocB.AATags.Scope; +ScopedNoAliasAAResult ScopedNoAliasAA::run(Function &F, + AnalysisManager<Function> *AM) { + return ScopedNoAliasAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} - const MDNode *ANoAlias = LocA.AATags.NoAlias, - *BNoAlias = LocB.AATags.NoAlias; +char ScopedNoAliasAA::PassID; - if (!mayAliasInScopes(AScopes, BNoAlias)) - return NoAlias; - - if (!mayAliasInScopes(BScopes, ANoAlias)) - return NoAlias; +char ScopedNoAliasAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(ScopedNoAliasAAWrapperPass, "scoped-noalias", + "Scoped NoAlias Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ScopedNoAliasAAWrapperPass, "scoped-noalias", + "Scoped NoAlias Alias Analysis", false, true) - // If they may alias, chain to the next AliasAnalysis. - return AliasAnalysis::alias(LocA, LocB); +ImmutablePass *llvm::createScopedNoAliasAAWrapperPass() { + return new ScopedNoAliasAAWrapperPass(); } -bool ScopedNoAliasAA::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); +ScopedNoAliasAAWrapperPass::ScopedNoAliasAAWrapperPass() : ImmutablePass(ID) { + initializeScopedNoAliasAAWrapperPassPass(*PassRegistry::getPassRegistry()); } -AliasAnalysis::ModRefBehavior -ScopedNoAliasAA::getModRefBehavior(ImmutableCallSite CS) { - return AliasAnalysis::getModRefBehavior(CS); +bool ScopedNoAliasAAWrapperPass::doInitialization(Module &M) { + Result.reset(new ScopedNoAliasAAResult( + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; } -AliasAnalysis::ModRefBehavior -ScopedNoAliasAA::getModRefBehavior(const Function *F) { - return AliasAnalysis::getModRefBehavior(F); +bool ScopedNoAliasAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; } -AliasAnalysis::ModRefResult -ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - if (!EnableScopedNoAlias) - return AliasAnalysis::getModRefInfo(CS, Loc); - - if (!mayAliasInScopes(Loc.AATags.Scope, CS.getInstruction()->getMetadata( - LLVMContext::MD_noalias))) - return NoModRef; - - if (!mayAliasInScopes( - CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - Loc.AATags.NoAlias)) - return NoModRef; - - return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { - if (!EnableScopedNoAlias) - return AliasAnalysis::getModRefInfo(CS1, CS2); - - if (!mayAliasInScopes( - CS1.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - CS2.getInstruction()->getMetadata(LLVMContext::MD_noalias))) - return NoModRef; - - if (!mayAliasInScopes( - CS2.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - CS1.getInstruction()->getMetadata(LLVMContext::MD_noalias))) - return NoModRef; - - return AliasAnalysis::getModRefInfo(CS1, CS2); +void ScopedNoAliasAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } - diff --git a/contrib/llvm/lib/Analysis/SparsePropagation.cpp b/contrib/llvm/lib/Analysis/SparsePropagation.cpp index edd82f5..f5a927b 100644 --- a/contrib/llvm/lib/Analysis/SparsePropagation.cpp +++ b/contrib/llvm/lib/Analysis/SparsePropagation.cpp @@ -328,17 +328,17 @@ void SparseSolver::Solve(Function &F) { void SparseSolver::Print(Function &F, raw_ostream &OS) const { OS << "\nFUNCTION: " << F.getName() << "\n"; - 
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    if (!BBExecutable.count(BB))
+  for (auto &BB : F) {
+    if (!BBExecutable.count(&BB))
       OS << "INFEASIBLE: ";
     OS << "\t";
-    if (BB->hasName())
-      OS << BB->getName() << ":\n";
+    if (BB.hasName())
+      OS << BB.getName() << ":\n";
     else
       OS << "; anon bb\n";
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-      LatticeFunc->PrintValue(getLatticeState(I), OS);
-      OS << *I << "\n";
+    for (auto &I : BB) {
+      LatticeFunc->PrintValue(getLatticeState(&I), OS);
+      OS << I << "\n";
     }
 
     OS << "\n";
diff --git a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 635c50c..ce38819 100644
--- a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -52,19 +52,27 @@ static bool hasSinCosPiStret(const Triple &T) {
 /// specified target triple. This should be carefully written so that a missing
 /// target triple gets a sane set of defaults.
 static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
-                       const char *const *StandardNames) {
-#ifndef NDEBUG
+                       ArrayRef<const char *> StandardNames) {
   // Verify that the StandardNames array is in alphabetical order.
-  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
-    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
-      llvm_unreachable("TargetLibraryInfoImpl function names must be sorted");
+  assert(std::is_sorted(StandardNames.begin(), StandardNames.end(),
+                        [](const char *LHS, const char *RHS) {
+                          return strcmp(LHS, RHS) < 0;
+                        }) &&
+         "TargetLibraryInfoImpl function names must be sorted");
+
+  if (T.getArch() == Triple::r600 ||
+      T.getArch() == Triple::amdgcn) {
+    TLI.setUnavailable(LibFunc::ldexp);
+    TLI.setUnavailable(LibFunc::ldexpf);
+    TLI.setUnavailable(LibFunc::ldexpl);
   }
-#endif // !NDEBUG
 
-  // There are no library implementations of mempcy and memset for AMD gpus and
+  // There are no library implementations of memcpy and memset for AMD GPUs and
   // these can be difficult to lower in the backend.
   if (T.getArch() == Triple::r600 ||
-      T.getArch() == Triple::amdgcn) {
+      T.getArch() == Triple::amdgcn ||
+      T.getArch() == Triple::wasm32 ||
+      T.getArch() == Triple::wasm64) {
     TLI.setUnavailable(LibFunc::memcpy);
     TLI.setUnavailable(LibFunc::memset);
     TLI.setUnavailable(LibFunc::memset_pattern16);
@@ -72,13 +80,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
   }
 
   // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
+  // All versions of watchOS support it.
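// Illustrative sketch, not part of this change: transforms consume these
// availability bits through TargetLibraryInfo::has(), e.g.
//
//   if (TLI.has(LibFunc::memset_pattern16))
//     ;  // only then is it safe to form a memset_pattern16 call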
if (T.isMacOSX()) { if (T.isMacOSXVersionLT(10, 5)) TLI.setUnavailable(LibFunc::memset_pattern16); } else if (T.isiOS()) { if (T.isOSVersionLT(3, 0)) TLI.setUnavailable(LibFunc::memset_pattern16); - } else { + } else if (!T.isWatchOS()) { TLI.setUnavailable(LibFunc::memset_pattern16); } @@ -286,8 +295,13 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, } break; case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: TLI.setUnavailable(LibFunc::exp10l); - if (T.isOSVersionLT(7, 0)) { + if (!T.isWatchOS() && (T.isOSVersionLT(7, 0) || + (T.isOSVersionLT(9, 0) && + (T.getArch() == Triple::x86 || + T.getArch() == Triple::x86_64)))) { TLI.setUnavailable(LibFunc::exp10); TLI.setUnavailable(LibFunc::exp10f); } else { @@ -311,12 +325,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and // Linux (GLIBC): // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html - // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c + // http://svn.freebsd.org/base/head/lib/libc/string/ffsl.c // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html switch (T.getOS()) { case Triple::Darwin: case Triple::MacOSX: case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: case Triple::FreeBSD: case Triple::Linux: break; @@ -325,9 +341,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, } // ffsll is available on at least FreeBSD and Linux (GLIBC): - // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c + // http://svn.freebsd.org/base/head/lib/libc/string/ffsll.c // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html switch (T.getOS()) { + case Triple::Darwin: + case Triple::MacOSX: + case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: case Triple::FreeBSD: case Triple::Linux: break; @@ -335,6 +356,16 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc::ffsll); } + // The following functions are available on at least FreeBSD: + // http://svn.freebsd.org/base/head/lib/libc/string/fls.c + // http://svn.freebsd.org/base/head/lib/libc/string/flsl.c + // http://svn.freebsd.org/base/head/lib/libc/string/flsll.c + if (!T.isOSFreeBSD()) { + TLI.setUnavailable(LibFunc::fls); + TLI.setUnavailable(LibFunc::flsl); + TLI.setUnavailable(LibFunc::flsll); + } + // The following functions are available on at least Linux: if (!T.isOSLinux()) { TLI.setUnavailable(LibFunc::dunder_strdup); diff --git a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp index 7d1c3fb..9c1d3fd 100644 --- a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -46,30 +46,37 @@ TargetTransformInfo &TargetTransformInfo::operator=(TargetTransformInfo &&RHS) { return *this; } -unsigned TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, - Type *OpTy) const { - return TTIImpl->getOperationCost(Opcode, Ty, OpTy); +int TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, + Type *OpTy) const { + int Cost = TTIImpl->getOperationCost(Opcode, Ty, OpTy); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCallCost(FunctionType *FTy, - int NumArgs) const { - return TTIImpl->getCallCost(FTy, NumArgs); +int TargetTransformInfo::getCallCost(FunctionType *FTy, int NumArgs) const { + int 
Cost = TTIImpl->getCallCost(FTy, NumArgs);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned
-TargetTransformInfo::getCallCost(const Function *F,
-                                 ArrayRef<const Value *> Arguments) const {
-  return TTIImpl->getCallCost(F, Arguments);
+int TargetTransformInfo::getCallCost(const Function *F,
+                                     ArrayRef<const Value *> Arguments) const {
+  int Cost = TTIImpl->getCallCost(F, Arguments);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned
-TargetTransformInfo::getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<const Value *> Arguments) const {
-  return TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
+int TargetTransformInfo::getIntrinsicCost(
+    Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
+  int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned TargetTransformInfo::getUserCost(const User *U) const {
-  return TTIImpl->getUserCost(U);
+int TargetTransformInfo::getUserCost(const User *U) const {
+  int Cost = TTIImpl->getUserCost(U);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
 bool TargetTransformInfo::hasBranchDivergence() const {
@@ -106,14 +113,20 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                         Scale, AddrSpace);
 }
 
-bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
-                                             int Consecutive) const {
-  return TTIImpl->isLegalMaskedStore(DataType, Consecutive);
+bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
+  return TTIImpl->isLegalMaskedStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
+  return TTIImpl->isLegalMaskedLoad(DataType);
 }
 
-bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
-                                            int Consecutive) const {
-  return TTIImpl->isLegalMaskedLoad(DataType, Consecutive);
+bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
+  return TTIImpl->isLegalMaskedGather(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
+  return TTIImpl->isLegalMaskedScatter(DataType);
 }
 
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
@@ -121,8 +134,10 @@ int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               bool HasBaseReg,
                                               int64_t Scale,
                                               unsigned AddrSpace) const {
-  return TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
-                                       Scale, AddrSpace);
+  int Cost = TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
+                                           Scale, AddrSpace);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
@@ -153,6 +168,10 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
+  return TTIImpl->enableInterleavedAccessVectorization();
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);
@@ -162,22 +181,30 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
   return TTIImpl->haveFastSqrt(Ty);
 }
 
-unsigned TargetTransformInfo::getFPOpCost(Type *Ty) const {
-  return TTIImpl->getFPOpCost(Ty);
+int TargetTransformInfo::getFPOpCost(Type *Ty) const {
+  int Cost =
TTIImpl->getFPOpCost(Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(Imm, Ty); +int TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(Opcode, Idx, Imm, Ty); +int TargetTransformInfo::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(Opcode, Idx, Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(IID, Idx, Imm, Ty); +int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(IID, Idx, Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { @@ -192,81 +219,122 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } -unsigned TargetTransformInfo::getArithmeticInstrCost( +int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo) const { - return TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, - int Index, Type *SubTp) const { - return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); +int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index, + Type *SubTp) const { + int Cost = TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - return TTIImpl->getCastInstrCost(Opcode, Dst, Src); +int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCFInstrCost(unsigned Opcode) const { - return TTIImpl->getCFInstrCost(Opcode); +int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const { + int Cost = TTIImpl->getCFInstrCost(Opcode); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - return TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); +int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return 
Cost; } -unsigned TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - return TTIImpl->getVectorInstrCost(Opcode, Val, Index); +int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + int Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - return TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned -TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + int Cost = + TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + +int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) const { + int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getInterleavedMemoryOpCost( +int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace) const { - return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned -TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys) const { - return TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Type *> Tys) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, - ArrayRef<Type *> Tys) const { - return TTIImpl->getCallInstrCost(F, RetTy, Tys); +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Value *> Args) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + +int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, + ArrayRef<Type *> Tys) const { + int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return TTIImpl->getNumberOfParts(Tp); } -unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp, - bool IsComplex) const { - 
return TTIImpl->getAddressComputationCost(Tp, IsComplex); +int TargetTransformInfo::getAddressComputationCost(Type *Tp, + bool IsComplex) const { + int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm) const { - return TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm); +int TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwiseForm) const { + int Cost = TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned @@ -284,9 +352,9 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic( return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); } -bool TargetTransformInfo::hasCompatibleFunctionAttributes( - const Function *Caller, const Function *Callee) const { - return TTIImpl->hasCompatibleFunctionAttributes(Caller, Callee); +bool TargetTransformInfo::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + return TTIImpl->areInlineCompatible(Caller, Callee); } TargetTransformInfo::Concept::~Concept() {} @@ -294,16 +362,16 @@ TargetTransformInfo::Concept::~Concept() {} TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} TargetIRAnalysis::TargetIRAnalysis( - std::function<Result(Function &)> TTICallback) + std::function<Result(const Function &)> TTICallback) : TTICallback(TTICallback) {} -TargetIRAnalysis::Result TargetIRAnalysis::run(Function &F) { +TargetIRAnalysis::Result TargetIRAnalysis::run(const Function &F) { return TTICallback(F); } char TargetIRAnalysis::PassID; -TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(Function &F) { +TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(const Function &F) { return Result(F.getParent()->getDataLayout()); } @@ -327,7 +395,7 @@ TargetTransformInfoWrapperPass::TargetTransformInfoWrapperPass( *PassRegistry::getPassRegistry()); } -TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(Function &F) { +TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(const Function &F) { TTI = TIRA.run(F); return *TTI; } diff --git a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index 4e9c6f6..9f92391 100644 --- a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -70,7 +70,7 @@ // A a; // } B; // -// For an acess to B.a.s, we attach !5 (a path tag node) to the load/store +// For an access to B.a.s, we attach !5 (a path tag node) to the load/store // instruction. The base type is !4 (struct B), the access type is !2 (scalar // type short) and the offset is 4. // @@ -121,15 +121,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/ADT/SetVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/SetVector.h" using namespace llvm; // A handy option for disabling TBAA functionality. 
The same effect can also be @@ -138,199 +136,138 @@ using namespace llvm; static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true)); namespace { - /// TBAANode - This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAANode { - const MDNode *Node; - - public: - TBAANode() : Node(nullptr) {} - explicit TBAANode(const MDNode *N) : Node(N) {} - - /// getNode - Get the MDNode for this TBAANode. - const MDNode *getNode() const { return Node; } - - /// getParent - Get this TBAANode's Alias tree parent. - TBAANode getParent() const { - if (Node->getNumOperands() < 2) - return TBAANode(); - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); - if (!P) - return TBAANode(); - // Ok, this node has a valid parent. Return it. - return TBAANode(P); - } - - /// TypeIsImmutable - Test if this TBAANode represents a type for objects - /// which are not modified (by any means) in the context where this - /// AliasAnalysis is relevant. - bool TypeIsImmutable() const { - if (Node->getNumOperands() < 3) - return false; - ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(2)); - if (!CI) - return false; - return CI->getValue()[0]; - } - }; - - /// This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAAStructTagNode { - /// This node should be created with createTBAAStructTagNode. - const MDNode *Node; +/// TBAANode - This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAANode { + const MDNode *Node; + +public: + TBAANode() : Node(nullptr) {} + explicit TBAANode(const MDNode *N) : Node(N) {} + + /// getNode - Get the MDNode for this TBAANode. + const MDNode *getNode() const { return Node; } + + /// getParent - Get this TBAANode's Alias tree parent. + TBAANode getParent() const { + if (Node->getNumOperands() < 2) + return TBAANode(); + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); + if (!P) + return TBAANode(); + // Ok, this node has a valid parent. Return it. + return TBAANode(P); + } - public: - explicit TBAAStructTagNode(const MDNode *N) : Node(N) {} + /// TypeIsImmutable - Test if this TBAANode represents a type for objects + /// which are not modified (by any means) in the context where this + /// AliasAnalysis is relevant. + bool TypeIsImmutable() const { + if (Node->getNumOperands() < 3) + return false; + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(2)); + if (!CI) + return false; + return CI->getValue()[0]; + } +}; - /// Get the MDNode for this TBAAStructTagNode. - const MDNode *getNode() const { return Node; } +/// This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAAStructTagNode { + /// This node should be created with createTBAAStructTagNode. 
+ const MDNode *Node; - const MDNode *getBaseType() const { - return dyn_cast_or_null<MDNode>(Node->getOperand(0)); - } - const MDNode *getAccessType() const { - return dyn_cast_or_null<MDNode>(Node->getOperand(1)); - } - uint64_t getOffset() const { - return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue(); - } - /// TypeIsImmutable - Test if this TBAAStructTagNode represents a type for - /// objects which are not modified (by any means) in the context where this - /// AliasAnalysis is relevant. - bool TypeIsImmutable() const { - if (Node->getNumOperands() < 4) - return false; - ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3)); - if (!CI) - return false; - return CI->getValue()[0]; - } - }; - - /// This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAAStructTypeNode { - /// This node should be created with createTBAAStructTypeNode. - const MDNode *Node; - - public: - TBAAStructTypeNode() : Node(nullptr) {} - explicit TBAAStructTypeNode(const MDNode *N) : Node(N) {} - - /// Get the MDNode for this TBAAStructTypeNode. - const MDNode *getNode() const { return Node; } - - /// Get this TBAAStructTypeNode's field in the type DAG with - /// given offset. Update the offset to be relative to the field type. - TBAAStructTypeNode getParent(uint64_t &Offset) const { - // Parent can be omitted for the root node. - if (Node->getNumOperands() < 2) - return TBAAStructTypeNode(); +public: + explicit TBAAStructTagNode(const MDNode *N) : Node(N) {} - // Fast path for a scalar type node and a struct type node with a single - // field. - if (Node->getNumOperands() <= 3) { - uint64_t Cur = Node->getNumOperands() == 2 - ? 0 - : mdconst::extract<ConstantInt>(Node->getOperand(2)) - ->getZExtValue(); - Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); - if (!P) - return TBAAStructTypeNode(); - return TBAAStructTypeNode(P); - } + /// Get the MDNode for this TBAAStructTagNode. + const MDNode *getNode() const { return Node; } - // Assume the offsets are in order. We return the previous field if - // the current offset is bigger than the given offset. - unsigned TheIdx = 0; - for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) { - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1)) - ->getZExtValue(); - if (Cur > Offset) { - assert(Idx >= 3 && - "TBAAStructTypeNode::getParent should have an offset match!"); - TheIdx = Idx - 2; - break; - } - } - // Move along the last field. - if (TheIdx == 0) - TheIdx = Node->getNumOperands() - 2; - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1)) - ->getZExtValue(); + const MDNode *getBaseType() const { + return dyn_cast_or_null<MDNode>(Node->getOperand(0)); + } + const MDNode *getAccessType() const { + return dyn_cast_or_null<MDNode>(Node->getOperand(1)); + } + uint64_t getOffset() const { + return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue(); + } + /// TypeIsImmutable - Test if this TBAAStructTagNode represents a type for + /// objects which are not modified (by any means) in the context where this + /// AliasAnalysis is relevant. 
+ bool TypeIsImmutable() const { + if (Node->getNumOperands() < 4) + return false; + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3)); + if (!CI) + return false; + return CI->getValue()[0]; + } +}; + +/// This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAAStructTypeNode { + /// This node should be created with createTBAAStructTypeNode. + const MDNode *Node; + +public: + TBAAStructTypeNode() : Node(nullptr) {} + explicit TBAAStructTypeNode(const MDNode *N) : Node(N) {} + + /// Get the MDNode for this TBAAStructTypeNode. + const MDNode *getNode() const { return Node; } + + /// Get this TBAAStructTypeNode's field in the type DAG with + /// given offset. Update the offset to be relative to the field type. + TBAAStructTypeNode getParent(uint64_t &Offset) const { + // Parent can be omitted for the root node. + if (Node->getNumOperands() < 2) + return TBAAStructTypeNode(); + + // Fast path for a scalar type node and a struct type node with a single + // field. + if (Node->getNumOperands() <= 3) { + uint64_t Cur = Node->getNumOperands() == 2 + ? 0 + : mdconst::extract<ConstantInt>(Node->getOperand(2)) + ->getZExtValue(); Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx)); + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); if (!P) return TBAAStructTypeNode(); return TBAAStructTypeNode(P); } - }; -} - -namespace { - /// TypeBasedAliasAnalysis - This is a simple alias analysis - /// implementation that uses TypeBased to answer queries. - class TypeBasedAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - TypeBasedAliasAnalysis() : ImmutablePass(ID) { - initializeTypeBasedAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - bool doInitialization(Module &M) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; + // Assume the offsets are in order. We return the previous field if + // the current offset is bigger than the given offset. 
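// Illustrative example, not part of this change (the node is hypothetical):
// for a struct type node with two fields,
//
//   !{ !"S", !int, i64 0, !double, i64 4 }
//
// a query with Offset == 6 finds no field starting beyond 6, falls through
// to the last field at offset 4, and rewrites Offset to 6 - 4 == 2, i.e.
// the offset within that field's type.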
+ unsigned TheIdx = 0; + for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) { + uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1)) + ->getZExtValue(); + if (Cur > Offset) { + assert(Idx >= 3 && + "TBAAStructTypeNode::getParent should have an offset match!"); + TheIdx = Idx - 2; + break; + } } - - bool Aliases(const MDNode *A, const MDNode *B) const; - bool PathAliases(const MDNode *A, const MDNode *B) const; - - private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - }; -} // End of anonymous namespace - -// Register this pass... -char TypeBasedAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(TypeBasedAliasAnalysis, AliasAnalysis, "tbaa", - "Type-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createTypeBasedAliasAnalysisPass() { - return new TypeBasedAliasAnalysis(); -} - -bool TypeBasedAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; -} - -void -TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); + // Move along the last field. + if (TheIdx == 0) + TheIdx = Node->getNumOperands() - 2; + uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1)) + ->getZExtValue(); + Offset -= Cur; + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx)); + if (!P) + return TBAAStructTypeNode(); + return TBAAStructTypeNode(P); + } +}; } /// Check the first operand of the tbaa tag node, if it is a MDNode, we treat @@ -342,145 +279,36 @@ static bool isStructPathTBAA(const MDNode *MD) { return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3; } -/// Aliases - Test whether the type represented by A may alias the -/// type represented by B. -bool -TypeBasedAliasAnalysis::Aliases(const MDNode *A, - const MDNode *B) const { - // Make sure that both MDNodes are struct-path aware. - if (isStructPathTBAA(A) && isStructPathTBAA(B)) - return PathAliases(A, B); - - // Keep track of the root node for A and B. - TBAANode RootA, RootB; - - // Climb the tree from A to see if we reach B. - for (TBAANode T(A); ; ) { - if (T.getNode() == B) - // B is an ancestor of A. - return true; - - RootA = T; - T = T.getParent(); - if (!T.getNode()) - break; - } - - // Climb the tree from B to see if we reach A. - for (TBAANode T(B); ; ) { - if (T.getNode() == A) - // A is an ancestor of B. - return true; - - RootB = T; - T = T.getParent(); - if (!T.getNode()) - break; - } - - // Neither node is an ancestor of the other. - - // If they have different roots, they're part of different potentially - // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) - return true; - - // If they have the same root, then we've proved there's no alias. - return false; -} - -/// Test whether the struct-path tag represented by A may alias the -/// struct-path tag represented by B. 
-bool -TypeBasedAliasAnalysis::PathAliases(const MDNode *A, - const MDNode *B) const { - // Verify that both input nodes are struct-path aware. - assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); - assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); - - // Keep track of the root node for A and B. - TBAAStructTypeNode RootA, RootB; - TBAAStructTagNode TagA(A), TagB(B); - - // TODO: We need to check if AccessType of TagA encloses AccessType of - // TagB to support aggregate AccessType. If yes, return true. - - // Start from the base type of A, follow the edge with the correct offset in - // the type DAG and adjust the offset until we reach the base type of B or - // until we reach the Root node. - // Compare the adjusted offset once we have the same base. - - // Climb the type DAG from base type of A to see if we reach base type of B. - const MDNode *BaseA = TagA.getBaseType(); - const MDNode *BaseB = TagB.getBaseType(); - uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); - for (TBAAStructTypeNode T(BaseA); ; ) { - if (T.getNode() == BaseB) - // Base type of A encloses base type of B, check if the offsets match. - return OffsetA == OffsetB; - - RootA = T; - // Follow the edge with the correct offset, OffsetA will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetA); - if (!T.getNode()) - break; - } - - // Reset OffsetA and climb the type DAG from base type of B to see if we reach - // base type of A. - OffsetA = TagA.getOffset(); - for (TBAAStructTypeNode T(BaseB); ; ) { - if (T.getNode() == BaseA) - // Base type of B encloses base type of A, check if the offsets match. - return OffsetA == OffsetB; - - RootB = T; - // Follow the edge with the correct offset, OffsetB will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetB); - if (!T.getNode()) - break; - } - - // Neither node is an ancestor of the other. - - // If they have different roots, they're part of different potentially - // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) - return true; - - // If they have the same root, then we've proved there's no alias. - return false; -} - -AliasResult TypeBasedAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { if (!EnableTBAA) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must // be conservative. const MDNode *AM = LocA.AATags.TBAA; - if (!AM) return AliasAnalysis::alias(LocA, LocB); + if (!AM) + return AAResultBase::alias(LocA, LocB); const MDNode *BM = LocB.AATags.TBAA; - if (!BM) return AliasAnalysis::alias(LocA, LocB); + if (!BM) + return AAResultBase::alias(LocA, LocB); // If they may alias, chain to the next AliasAnalysis. if (Aliases(AM, BM)) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // Otherwise return a definitive result. 
return NoAlias; } -bool TypeBasedAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +bool TypeBasedAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { if (!EnableTBAA) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); const MDNode *M = Loc.AATags.TBAA; - if (!M) return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + if (!M) + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); // If this is an "immutable" type, we can assume the pointer is pointing // to constant memory. @@ -488,80 +316,82 @@ bool TypeBasedAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable())) return true; - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } -AliasAnalysis::ModRefBehavior -TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { +FunctionModRefBehavior +TypeBasedAAResult::getModRefBehavior(ImmutableCallSite CS) { if (!EnableTBAA) - return AliasAnalysis::getModRefBehavior(CS); + return AAResultBase::getModRefBehavior(CS); - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If this is an "immutable" type, we can assume the call doesn't write // to memory. if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) || (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable())) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min); } -AliasAnalysis::ModRefBehavior -TypeBasedAliasAnalysis::getModRefBehavior(const Function *F) { +FunctionModRefBehavior TypeBasedAAResult::getModRefBehavior(const Function *F) { // Functions don't have metadata. Just chain to the next implementation. 
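Note how getModRefBehavior above folds its own finding into the chained result with a plain bitwise AND: FunctionModRefBehavior is a lattice encoded as flag bits, so intersecting two conservative answers can only remove capabilities, never add them. A toy encoding of the idea (the enumerator names and values here are illustrative, not LLVM's):

#include <cassert>

// Toy mod/ref lattice encoded as bit flags, in the spirit of
// FunctionModRefBehavior (values illustrative, not LLVM's).
enum Behavior : unsigned {
  DoesNotAccessMemory = 0,
  ReadsMemory = 1,  // may read
  WritesMemory = 2, // may write
  UnknownBehavior = ReadsMemory | WritesMemory,
};

// Intersecting two conservative answers with '&' keeps only the effects
// both sources still allow, so extra knowledge can only tighten the result.
static Behavior intersect(Behavior A, Behavior B) { return Behavior(A & B); }

int main() {
  // The base analysis knows nothing; TBAA proves the accessed type is
  // immutable, i.e. the call cannot write through it ("Min" in the diff).
  Behavior Base = UnknownBehavior;
  Behavior Min = ReadsMemory; // plays the role of FMRB_OnlyReadsMemory
  assert(intersect(Base, Min) == ReadsMemory);
}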
- return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); } -AliasAnalysis::ModRefResult -TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +ModRefInfo TypeBasedAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { if (!EnableTBAA) - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); if (const MDNode *L = Loc.AATags.TBAA) if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (!Aliases(L, M)) - return NoModRef; + return MRI_NoModRef; - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); } -AliasAnalysis::ModRefResult -TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { +ModRefInfo TypeBasedAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { if (!EnableTBAA) - return AliasAnalysis::getModRefInfo(CS1, CS2); + return AAResultBase::getModRefInfo(CS1, CS2); if (const MDNode *M1 = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (const MDNode *M2 = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (!Aliases(M1, M2)) - return NoModRef; + return MRI_NoModRef; - return AliasAnalysis::getModRefInfo(CS1, CS2); + return AAResultBase::getModRefInfo(CS1, CS2); } bool MDNode::isTBAAVtableAccess() const { if (!isStructPathTBAA(this)) { - if (getNumOperands() < 1) return false; + if (getNumOperands() < 1) + return false; if (MDString *Tag1 = dyn_cast<MDString>(getOperand(0))) { - if (Tag1->getString() == "vtable pointer") return true; + if (Tag1->getString() == "vtable pointer") + return true; } return false; } // For struct-path aware TBAA, we use the access type of the tag. - if (getNumOperands() < 2) return false; + if (getNumOperands() < 2) + return false; MDNode *Tag = cast_or_null<MDNode>(getOperand(1)); - if (!Tag) return false; + if (!Tag) + return false; if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { - if (Tag1->getString() == "vtable pointer") return true; + if (Tag1->getString() == "vtable pointer") + return true; } - return false; + return false; } MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { @@ -575,9 +405,11 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { bool StructPath = isStructPathTBAA(A) && isStructPathTBAA(B); if (StructPath) { A = cast_or_null<MDNode>(A->getOperand(1)); - if (!A) return nullptr; + if (!A) + return nullptr; B = cast_or_null<MDNode>(B->getOperand(1)); - if (!B) return nullptr; + if (!B) + return nullptr; } SmallSetVector<MDNode *, 4> PathA; @@ -604,7 +436,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { int IB = PathB.size() - 1; MDNode *Ret = nullptr; - while (IA >= 0 && IB >=0) { + while (IA >= 0 && IB >= 0) { if (PathA[IA] == PathB[IB]) Ret = PathA[IA]; else @@ -644,3 +476,147 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { N.NoAlias = getMetadata(LLVMContext::MD_noalias); } +/// Aliases - Test whether the type represented by A may alias the +/// type represented by B. +bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { + // Make sure that both MDNodes are struct-path aware. + if (isStructPathTBAA(A) && isStructPathTBAA(B)) + return PathAliases(A, B); + + // Keep track of the root node for A and B. + TBAANode RootA, RootB; + + // Climb the tree from A to see if we reach B. + for (TBAANode T(A);;) { + if (T.getNode() == B) + // B is an ancestor of A. 
+ return true; + + RootA = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Climb the tree from B to see if we reach A. + for (TBAANode T(B);;) { + if (T.getNode() == A) + // A is an ancestor of B. + return true; + + RootB = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Neither node is an ancestor of the other. + + // If they have different roots, they're part of different potentially + // unrelated type systems, so we must be conservative. + if (RootA.getNode() != RootB.getNode()) + return true; + + // If they have the same root, then we've proved there's no alias. + return false; +} + +/// Test whether the struct-path tag represented by A may alias the +/// struct-path tag represented by B. +bool TypeBasedAAResult::PathAliases(const MDNode *A, const MDNode *B) const { + // Verify that both input nodes are struct-path aware. + assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); + assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); + + // Keep track of the root node for A and B. + TBAAStructTypeNode RootA, RootB; + TBAAStructTagNode TagA(A), TagB(B); + + // TODO: We need to check if AccessType of TagA encloses AccessType of + // TagB to support aggregate AccessType. If yes, return true. + + // Start from the base type of A, follow the edge with the correct offset in + // the type DAG and adjust the offset until we reach the base type of B or + // until we reach the Root node. + // Compare the adjusted offset once we have the same base. + + // Climb the type DAG from base type of A to see if we reach base type of B. + const MDNode *BaseA = TagA.getBaseType(); + const MDNode *BaseB = TagB.getBaseType(); + uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); + for (TBAAStructTypeNode T(BaseA);;) { + if (T.getNode() == BaseB) + // Base type of A encloses base type of B, check if the offsets match. + return OffsetA == OffsetB; + + RootA = T; + // Follow the edge with the correct offset, OffsetA will be adjusted to + // be relative to the field type. + T = T.getParent(OffsetA); + if (!T.getNode()) + break; + } + + // Reset OffsetA and climb the type DAG from base type of B to see if we reach + // base type of A. + OffsetA = TagA.getOffset(); + for (TBAAStructTypeNode T(BaseB);;) { + if (T.getNode() == BaseA) + // Base type of B encloses base type of A, check if the offsets match. + return OffsetA == OffsetB; + + RootB = T; + // Follow the edge with the correct offset, OffsetB will be adjusted to + // be relative to the field type. + T = T.getParent(OffsetB); + if (!T.getNode()) + break; + } + + // Neither node is an ancestor of the other. + + // If they have different roots, they're part of different potentially + // unrelated type systems, so we must be conservative. + if (RootA.getNode() != RootB.getNode()) + return true; + + // If they have the same root, then we've proved there's no alias. 
+ return false; +} + +TypeBasedAAResult TypeBasedAA::run(Function &F, AnalysisManager<Function> *AM) { + return TypeBasedAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} + +char TypeBasedAA::PassID; + +char TypeBasedAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(TypeBasedAAWrapperPass, "tbaa", + "Type-Based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(TypeBasedAAWrapperPass, "tbaa", "Type-Based Alias Analysis", + false, true) + +ImmutablePass *llvm::createTypeBasedAAWrapperPass() { + return new TypeBasedAAWrapperPass(); +} + +TypeBasedAAWrapperPass::TypeBasedAAWrapperPass() : ImmutablePass(ID) { + initializeTypeBasedAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool TypeBasedAAWrapperPass::doInitialization(Module &M) { + Result.reset(new TypeBasedAAResult( + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool TypeBasedAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void TypeBasedAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} diff --git a/contrib/llvm/lib/Analysis/ValueTracking.cpp b/contrib/llvm/lib/Analysis/ValueTracking.cpp index fa0d779..a83e207 100644 --- a/contrib/llvm/lib/Analysis/ValueTracking.cpp +++ b/contrib/llvm/lib/Analysis/ValueTracking.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/ValueTracking.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -43,7 +44,7 @@ const unsigned MaxDepth = 6; /// Enable an experimental feature to leverage information about dominating /// conditions to compute known bits. The individual options below control how -/// hard we search. The defaults are choosen to be fairly aggressive. If you +/// hard we search. The defaults are chosen to be fairly aggressive. If you /// run into compile time problems when testing, scale them back and report /// your findings. static cl::opt<bool> EnableDomConditions("value-tracking-dom-conditions", @@ -58,12 +59,12 @@ static cl::opt<unsigned> DomConditionsMaxDepth("dom-conditions-max-depth", /// conditions? static cl::opt<unsigned> DomConditionsMaxDomBlocks("dom-conditions-dom-blocks", cl::Hidden, - cl::init(20000)); + cl::init(20)); // Controls the number of uses of the value searched for possible // dominating comparisons. static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses", - cl::Hidden, cl::init(2000)); + cl::Hidden, cl::init(20)); // If true, don't consider only compares whose only use is a branch. 
static cl::opt<bool> DomConditionsSingleCmpUse("dom-conditions-single-cmp-use", @@ -185,6 +186,25 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, return ::isKnownNonZero(V, DL, Depth, Query(AC, safeCxtI(V, CxtI), DT)); } +bool llvm::isKnownNonNegative(Value *V, const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + bool NonNegative, Negative; + ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); + return NonNegative; +} + +static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q); + +bool llvm::isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + return ::isKnownNonEqual(V1, V2, DL, Query(AC, + safeCxtI(V1, safeCxtI(V2, CxtI)), + DT)); +} + static bool MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth, const Query &Q); @@ -320,7 +340,7 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, } // If low bits are zero in either operand, output low known-0 bits. - // Also compute a conserative estimate for high known-0 bits. + // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. KnownOne.clearAllBits(); @@ -347,26 +367,30 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, - APInt &KnownZero) { + APInt &KnownZero, + APInt &KnownOne) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); - // Use the high end of the ranges to find leading zeros. - unsigned MinLeadingZeros = BitWidth; + KnownZero.setAllBits(); + KnownOne.setAllBits(); + for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); - if (Range.isWrappedSet()) - MinLeadingZeros = 0; // -1 has no zeros - unsigned LeadingZeros = (Upper->getValue() - 1).countLeadingZeros(); - MinLeadingZeros = std::min(LeadingZeros, MinLeadingZeros); - } - KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros); + // The first CommonPrefixBits of all values in Range are equal. + unsigned CommonPrefixBits = + (Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros(); + + APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits); + KnownOne &= Range.getUnsignedMax() & Mask; + KnownZero &= ~Range.getUnsignedMax() & Mask; + } } static bool isEphemeralValueOf(Instruction *I, const Value *E) { @@ -374,20 +398,20 @@ static bool isEphemeralValueOf(Instruction *I, const Value *E) { SmallPtrSet<const Value *, 32> Visited; SmallPtrSet<const Value *, 16> EphValues; + // The instruction defining an assumption's condition itself is always + // considered ephemeral to that assumption (even if it has other + // non-ephemeral users). See r246696's test case for an example. + if (std::find(I->op_begin(), I->op_end(), E) != I->op_end()) + return true; + while (!WorkSet.empty()) { const Value *V = WorkSet.pop_back_val(); if (!Visited.insert(V).second) continue; // If all uses of this value are ephemeral, then so is this value. 
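The rewritten computeKnownBitsFromRangeMetadata above rests on one observation: every value in a non-wrapping unsigned range shares exactly the leading bits on which the range's minimum and maximum agree, and xor-ing the endpoints and counting leading zeros measures that common prefix. A self-contained 8-bit illustration for a single range (plain integers standing in for APInt):

#include <cassert>
#include <cstdint>

// Leading zeros of an 8-bit value (8 for zero).
static unsigned clz8(uint8_t V) {
  unsigned N = 0;
  for (int Bit = 7; Bit >= 0 && !(V >> Bit & 1); --Bit)
    ++N;
  return N;
}

// Known bits implied by a non-wrapping unsigned range [Lo, Hi]: all
// values agree on the leading bits where Lo and Hi agree.
static void knownBitsFromRange(uint8_t Lo, uint8_t Hi, uint8_t &KnownZero,
                               uint8_t &KnownOne) {
  unsigned CommonPrefixBits = clz8(Lo ^ Hi);
  uint8_t Mask = CommonPrefixBits >= 8
                     ? 0xFF
                     : uint8_t(0xFF << (8 - CommonPrefixBits));
  KnownOne = Hi & Mask;            // prefix bits set in every value
  KnownZero = uint8_t(~Hi) & Mask; // prefix bits clear in every value
}

int main() {
  uint8_t KZ, KO;
  knownBitsFromRange(0x40, 0x47, KZ, KO); // values are 01000xxx
  assert(KO == 0x40 && KZ == 0xB8);
}

For multiple ranges the diff starts both sets at all-ones and intersects each range's contribution; the sketch covers only one range.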
- bool FoundNEUse = false;
- for (const User *I : V->users())
- if (!EphValues.count(I)) {
- FoundNEUse = true;
- break;
- }
-
- if (!FoundNEUse) {
+ if (std::all_of(V->user_begin(), V->user_end(),
+ [&](const User *U) { return EphValues.count(U); })) {
 if (V == E)
 return true;
@@ -447,7 +471,7 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
 for (BasicBlock::const_iterator I =
 std::next(BasicBlock::const_iterator(Q.CxtI)),
 IE(Inv); I != IE; ++I)
- if (!isSafeToSpeculativelyExecute(I) && !isAssumeLikeIntrinsic(I))
+ if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
 return false;
 return !isEphemeralValueOf(Inv, Q.CxtI);
@@ -464,14 +488,14 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
 // of the block); the common case is that the assume will come first.
 for (BasicBlock::iterator I = std::next(BasicBlock::iterator(Inv)),
 IE = Inv->getParent()->end(); I != IE; ++I)
- if (I == Q.CxtI)
+ if (&*I == Q.CxtI)
 return true;
 // The context must come first...
 for (BasicBlock::const_iterator I =
 std::next(BasicBlock::const_iterator(Q.CxtI)),
 IE(Inv); I != IE; ++I)
- if (!isSafeToSpeculativelyExecute(I) && !isAssumeLikeIntrinsic(I))
+ if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
 return false;
 return !isEphemeralValueOf(Inv, Q.CxtI);
@@ -601,6 +625,11 @@ static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero,
 if (!Q.DT || !Q.CxtI)
 return;
 Instruction *Cxt = const_cast<Instruction *>(Q.CxtI);
+ // The context instruction might be in a statically unreachable block. If
+ // so, asking dominator queries may yield surprising results. (e.g. the block
+ // may not have a dom tree node)
+ if (!Q.DT->isReachableFromEntry(Cxt->getParent()))
+ return;
 // Avoid useless work
 if (auto VI = dyn_cast<Instruction>(V))
@@ -647,7 +676,9 @@ static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero,
 // instruction. Finding a condition where one path dominates the context
 // isn't enough because both the true and false cases could merge before
 // the context instruction we're actually interested in. Instead, we need
- // to ensure that the taken *edge* dominates the context instruction.
+ // to ensure that the taken *edge* dominates the context instruction. We
+ // know that the edge must be reachable since we started from a reachable
+ // block.
 BasicBlock *BB0 = BI->getSuccessor(0);
 BasicBlockEdge Edge(BI->getParent(), BB0);
 if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent()))
@@ -941,6 +972,90 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
 }
 }
+// Compute known bits from a shift operator, including those with a
+// non-constant shift amount. KnownZero and KnownOne are the outputs of this
+// function. KnownZero2 and KnownOne2 are pre-allocated temporaries with the
+// same bit width as KnownZero and KnownOne. KZF and KOF are operator-specific
+// functors that, given the known-zero or known-one bits respectively, and a
+// shift amount, compute the implied known-zero or known-one bits of the shift
+// operator's result respectively for that shift amount. The results from calling
+// KZF and KOF are conservatively combined for all permitted shift amounts.
+template <typename KZFunctor, typename KOFunctor> +static void computeKnownBitsFromShiftOperator(Operator *I, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const DataLayout &DL, unsigned Depth, const Query &Q, + KZFunctor KZF, KOFunctor KOF) { + unsigned BitWidth = KnownZero.getBitWidth(); + + if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); + + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); + KnownZero = KZF(KnownZero, ShiftAmt); + KnownOne = KOF(KnownOne, ShiftAmt); + return; + } + + computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q); + + // Note: We cannot use KnownZero.getLimitedValue() here, because if + // BitWidth > 64 and any upper bits are known, we'll end up returning the + // limit value (which implies all bits are known). + uint64_t ShiftAmtKZ = KnownZero.zextOrTrunc(64).getZExtValue(); + uint64_t ShiftAmtKO = KnownOne.zextOrTrunc(64).getZExtValue(); + + // It would be more-clearly correct to use the two temporaries for this + // calculation. Reusing the APInts here to prevent unnecessary allocations. + KnownZero.clearAllBits(), KnownOne.clearAllBits(); + + // If we know the shifter operand is nonzero, we can sometimes infer more + // known bits. However this is expensive to compute, so be lazy about it and + // only compute it when absolutely necessary. + Optional<bool> ShifterOperandIsNonZero; + + // Early exit if we can't constrain any well-defined shift amount. + if (!(ShiftAmtKZ & (BitWidth - 1)) && !(ShiftAmtKO & (BitWidth - 1))) { + ShifterOperandIsNonZero = + isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q); + if (!*ShifterOperandIsNonZero) + return; + } + + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q); + + KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { + // Combine the shifted known input bits only for those shift amounts + // compatible with its known constraints. + if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt) + continue; + if ((ShiftAmt | ShiftAmtKO) != ShiftAmt) + continue; + // If we know the shifter is nonzero, we may be able to infer more known + // bits. This check is sunk down as far as possible to avoid the expensive + // call to isKnownNonZero if the cheaper checks above fail. + if (ShiftAmt == 0) { + if (!ShifterOperandIsNonZero.hasValue()) + ShifterOperandIsNonZero = + isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q); + if (*ShifterOperandIsNonZero) + continue; + } + + KnownZero &= KZF(KnownZero2, ShiftAmt); + KnownOne &= KOF(KnownOne2, ShiftAmt); + } + + // If there are no compatible shift amounts, then we've proven that the shift + // amount must be >= the BitWidth, and the result is undefined. We could + // return anything we'd like, but we need to make sure the sets of known bits + // stay disjoint (it should be better for some other code to actually + // propagate the undef than to pick a value here using known bits). 
+ if ((KnownZero & KnownOne) != 0) + KnownZero.clearAllBits(), KnownOne.clearAllBits(); +} + static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, APInt &KnownOne, const DataLayout &DL, unsigned Depth, const Query &Q) { @@ -951,7 +1066,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, default: break; case Instruction::Load: if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero); + computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. @@ -962,6 +1077,22 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; + + // and(x, add (x, -1)) is a common idiom that always clears the low bit; + // here we handle the more general case of adding any odd number by + // matching the form add(x, add(x, y)) where y is odd. + // TODO: This could be generalized to clearing any bit set in y where the + // following bit is known to be unset in y. + Value *Y = nullptr; + if (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)), + m_Value(Y))) || + match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)), + m_Value(Y)))) { + APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0); + computeKnownBits(Y, KnownZero3, KnownOne3, DL, Depth + 1, Q); + if (KnownOne3.countTrailingOnes() > 0) + KnownZero |= APInt::getLowBitsSet(BitWidth, 1); + } break; } case Instruction::Or: { @@ -1050,7 +1181,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); - if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && + if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy() || + SrcTy->isFloatingPointTy()) && // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { @@ -1077,48 +1209,54 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); break; } - case Instruction::Shl: + case Instruction::Shl: { // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero <<= ShiftAmt; - KnownOne <<= ShiftAmt; - KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0 - } + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return (KnownZero << ShiftAmt) | + APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0. + }; + + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return KnownOne << ShiftAmt; + }; + + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; - case Instruction::LShr: + } + case Instruction::LShr: { // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - // Compute the new bits that are at the top now. - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - - // Unsigned shift right. 
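computeKnownBitsFromShiftOperator, completed above, handles non-constant shift amounts by enumerating every amount compatible with the shift operand's known bits and intersecting the per-amount results produced by the KZF/KOF functors. A compact 8-bit model of that enumeration for shl (toy code, not the LLVM API):

#include <cassert>
#include <cstdint>

// Toy 8-bit model of the enumeration for shl. AmtKZ/AmtKO are the
// known-zero/known-one bits of the shift amount; a concrete amount S is
// feasible iff it sets no bit known zero and every bit known one.
static void knownBitsShl(uint8_t ValKZ, uint8_t ValKO, uint8_t AmtKZ,
                         uint8_t AmtKO, uint8_t &KZ, uint8_t &KO) {
  KZ = 0xFF;
  KO = 0xFF; // start from "all known", intersect per feasible amount
  for (unsigned S = 0; S < 8; ++S) {
    if (S & AmtKZ)
      continue; // S has a bit set that is known zero
    if ((S & AmtKO) != AmtKO)
      continue; // S misses a bit that is known one
    // The operator-specific KZF/KOF step for shl:
    KZ &= uint8_t((ValKZ << S) | ((1u << S) - 1)); // low S bits become zero
    KO &= uint8_t(ValKO << S);
  }
  // Like the diff, clear both sets if no amount was feasible, since KZ
  // and KO would otherwise overlap.
  if (KZ & KO)
    KZ = KO = 0;
}

int main() {
  uint8_t KZ, KO;
  // Value is exactly 1; the shift amount is known odd and < 4, so 1 or 3.
  knownBitsShl(/*ValKZ=*/0xFE, /*ValKO=*/0x01,
               /*AmtKZ=*/0xFC, /*AmtKO=*/0x01, KZ, KO);
  assert(KZ == 0xF5 && KO == 0x00); // the result is 2 or 8
}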
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); - KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); - // high bits known zero. - KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return APIntOps::lshr(KnownZero, ShiftAmt) | + // High bits known zero. + APInt::getHighBitsSet(BitWidth, ShiftAmt); + }; + + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return APIntOps::lshr(KnownOne, ShiftAmt); + }; + + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; - case Instruction::AShr: + } + case Instruction::AShr: { // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - // Compute the new bits that are at the top now. - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return APIntOps::ashr(KnownZero, ShiftAmt); + }; - // Signed shift right. - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); - KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return APIntOps::ashr(KnownOne, ShiftAmt); + }; - APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); - if (KnownZero[BitWidth-ShiftAmt-1]) // New bits are known zero. - KnownZero |= HighBits; - else if (KnownOne[BitWidth-ShiftAmt-1]) // New bits are known one. - KnownOne |= HighBits; - } + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; + } case Instruction::Sub: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, @@ -1336,13 +1474,19 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, case Instruction::Call: case Instruction::Invoke: if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero); + computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); // If a range metadata is attached to this IntrinsicInst, intersect the // explicit range specified by the metadata and the implicit range of // the intrinsic. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::bswap: + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, + Depth + 1, Q); + KnownZero |= KnownZero2.byteSwap(); + KnownOne |= KnownOne2.byteSwap(); + break; case Intrinsic::ctlz: case Intrinsic::cttz: { unsigned LowBits = Log2_32(BitWidth)+1; @@ -1353,8 +1497,24 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, break; } case Intrinsic::ctpop: { - unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, + Depth + 1, Q); + // We can bound the space the count needs. Also, bits known to be zero + // can't contribute to the population. 
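The ctpop case that follows turns a cardinality bound into known bits: if at most N of the input bits can possibly be set, the population count is at most N, so every result bit above the leading bit of N is known zero. A small check of that arithmetic at an assumed 32-bit width:

#include <cassert>
#include <cstdint>

// Leading zeros of a 32-bit value (32 for zero).
static unsigned clz32(uint32_t V) {
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && !(V >> Bit & 1); --Bit)
    ++N;
  return N;
}

int main() {
  const unsigned BitWidth = 32;
  // Suppose known-zero bits already rule out 12 of the 32 input bits.
  unsigned BitsPossiblySet = BitWidth - 12; // 20
  // Same bound as the diff: popcount <= 20, and 20 needs only 5 bits,
  // so the top clz32(20) = 27 bits of the result are known zero.
  unsigned LeadingZeros = clz32(BitsPossiblySet);
  assert(LeadingZeros == 27);
  assert((uint64_t(1) << (BitWidth - LeadingZeros)) - 1 >= BitsPossiblySet);
}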
+ unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation(); + unsigned LeadingZeros = + APInt(BitWidth, BitsPossiblySet).countLeadingZeros(); + assert(LeadingZeros <= BitWidth); + KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros); + KnownOne &= ~KnownZero; + // TODO: we could bound KnownOne using the lower bound on the number + // of bits which might be set provided by popcnt KnownOne2. + break; + } + case Intrinsic::fabs: { + Type *Ty = II->getType(); + APInt SignBit = APInt::getSignBit(Ty->getScalarSizeInBits()); + KnownZero |= APInt::getSplat(Ty->getPrimitiveSizeInBits(), SignBit); break; } case Intrinsic::x86_sse42_crc32_64_64: @@ -1394,6 +1554,46 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, } } +static unsigned getAlignment(const Value *V, const DataLayout &DL) { + unsigned Align = 0; + if (auto *GO = dyn_cast<GlobalObject>(V)) { + Align = GO->getAlignment(); + if (Align == 0) { + if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { + Type *ObjectType = GVar->getType()->getElementType(); + if (ObjectType->isSized()) { + // If the object is defined in the current Module, we'll be giving + // it the preferred alignment. Otherwise, we have to assume that it + // may only have the minimum ABI alignment. + if (GVar->isStrongDefinitionForLinker()) + Align = DL.getPreferredAlignment(GVar); + else + Align = DL.getABITypeAlignment(ObjectType); + } + } + } + } else if (const Argument *A = dyn_cast<Argument>(V)) { + Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0; + + if (!Align && A->hasStructRetAttr()) { + // An sret parameter has at least the ABI alignment of the return type. + Type *EltTy = cast<PointerType>(A->getType())->getElementType(); + if (EltTy->isSized()) + Align = DL.getABITypeAlignment(EltTy); + } + } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + Align = AI->getAlignment(); + else if (auto CS = ImmutableCallSite(V)) + Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex); + else if (const LoadInst *LI = dyn_cast<LoadInst>(V)) + if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) { + ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0)); + Align = CI->getLimitedValue(); + } + + return Align; +} + /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// @@ -1416,8 +1616,9 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned BitWidth = KnownZero.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy() || + V->getType()->isFPOrFPVectorTy() || V->getType()->getScalarType()->isPointerTy()) && - "Not integer or pointer type!"); + "Not integer, floating point, or pointer type!"); assert((DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) && (!V->getType()->isIntOrIntVectorTy() || V->getType()->getScalarSizeInBits() == BitWidth) && @@ -1454,59 +1655,6 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, return; } - // The address of an aligned GlobalValue has trailing zeros. - if (auto *GO = dyn_cast<GlobalObject>(V)) { - unsigned Align = GO->getAlignment(); - if (Align == 0) { - if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { - Type *ObjectType = GVar->getType()->getElementType(); - if (ObjectType->isSized()) { - // If the object is defined in the current Module, we'll be giving - // it the preferred alignment. Otherwise, we have to assume that it - // may only have the minimum ABI alignment. 
- if (GVar->isStrongDefinitionForLinker()) - Align = DL.getPreferredAlignment(GVar); - else - Align = DL.getABITypeAlignment(ObjectType); - } - } - } - if (Align > 0) - KnownZero = APInt::getLowBitsSet(BitWidth, - countTrailingZeros(Align)); - else - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); - return; - } - - if (Argument *A = dyn_cast<Argument>(V)) { - unsigned Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0; - - if (!Align && A->hasStructRetAttr()) { - // An sret parameter has at least the ABI alignment of the return type. - Type *EltTy = cast<PointerType>(A->getType())->getElementType(); - if (EltTy->isSized()) - Align = DL.getABITypeAlignment(EltTy); - } - - if (Align) - KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); - else - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); - - // Don't give up yet... there might be an assumption that provides more - // information... - computeKnownBitsFromAssume(V, KnownZero, KnownOne, DL, Depth, Q); - - // Or a dominating condition for that matter - if (EnableDomConditions && Depth <= DomConditionsMaxDepth) - computeKnownBitsFromDominatingCondition(V, KnownZero, KnownOne, DL, - Depth, Q); - return; - } - // Start out not knowing anything. KnownZero.clearAllBits(); KnownOne.clearAllBits(); @@ -1525,6 +1673,14 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, if (Operator *I = dyn_cast<Operator>(V)) computeKnownBitsFromOperator(I, KnownZero, KnownOne, DL, Depth, Q); + + // Aligned pointers have trailing zeros - refine KnownZero set + if (V->getType()->isPointerTy()) { + unsigned Align = getAlignment(V, DL); + if (Align) + KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); + } + // computeKnownBitsFromAssume and computeKnownBitsFromDominatingCondition // strictly refines KnownZero and KnownOne. Therefore, we run them after // computeKnownBitsFromOperator. @@ -1587,9 +1743,10 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth, return false; Value *X = nullptr, *Y = nullptr; - // A shift of a power of two is a power of two or zero. + // A shift left or a logical shift right of a power of two is a power of two + // or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || - match(V, m_Shr(m_Value(X), m_Value())))) + match(V, m_LShr(m_Value(X), m_Value())))) return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL); if (ZExtInst *ZI = dyn_cast<ZExtInst>(V)) @@ -1812,6 +1969,23 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, ComputeSignBit(X, XKnownNonNegative, XKnownNegative, DL, Depth, Q); if (XKnownNegative) return true; + + // If the shifter operand is a constant, and all of the bits shifted + // out are known to be zero, and X is known non-zero then at least one + // non-zero bit must remain. + if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) { + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(X, KnownZero, KnownOne, DL, Depth, Q); + + auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); + // Is there a known one in the portion not shifted out? + if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal) + return true; + // Are all the bits to be shifted out known zero? + if (KnownZero.countTrailingOnes() >= ShiftVal) + return isKnownNonZero(X, DL, Depth, Q); + } } // div exact can only produce a zero if the dividend is zero. 
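The alignment refinement above is worth seeing in isolation: a pointer known to be Align-byte aligned is a multiple of Align, so its low log2(Align) bits are zero, exactly the APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)) bits the diff ORs into KnownZero. A plain-integer sketch:

#include <cassert>
#include <cstdint>

static unsigned countTrailingZeros32(uint32_t V) {
  unsigned N = 0;
  while (N < 32 && !(V >> N & 1))
    ++N;
  return N;
}

int main() {
  uint32_t KnownZero = 0; // whatever earlier analysis produced
  unsigned Align = 16;    // pointer known 16-byte aligned
  // Same refinement as the diff: the low log2(Align) bits must be zero.
  KnownZero |= (1u << countTrailingZeros32(Align)) - 1;
  assert(KnownZero == 0xF);
  uintptr_t P = 0x1230; // any multiple of 16 satisfies the claim
  assert((P & KnownZero) == 0);
}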
else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { @@ -1871,6 +2045,26 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, isKnownNonZero(SI->getFalseValue(), DL, Depth, Q)) return true; } + // PHI + else if (PHINode *PN = dyn_cast<PHINode>(V)) { + // Try and detect a recurrence that monotonically increases from a + // starting value, as these are common as induction variables. + if (PN->getNumIncomingValues() == 2) { + Value *Start = PN->getIncomingValue(0); + Value *Induction = PN->getIncomingValue(1); + if (isa<ConstantInt>(Induction) && !isa<ConstantInt>(Start)) + std::swap(Start, Induction); + if (ConstantInt *C = dyn_cast<ConstantInt>(Start)) { + if (!C->isZero() && !C->isNegative()) { + ConstantInt *X; + if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) || + match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) && + !X->isNegative()) + return true; + } + } + } + } if (!BitWidth) return false; APInt KnownZero(BitWidth, 0); @@ -1879,6 +2073,51 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, return KnownOne != 0; } +/// Return true if V2 == V1 + X, where X is known non-zero. +static bool isAddOfNonZero(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(V1); + if (!BO || BO->getOpcode() != Instruction::Add) + return false; + Value *Op = nullptr; + if (V2 == BO->getOperand(0)) + Op = BO->getOperand(1); + else if (V2 == BO->getOperand(1)) + Op = BO->getOperand(0); + else + return false; + return isKnownNonZero(Op, DL, 0, Q); +} + +/// Return true if it is known that V1 != V2. +static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q) { + if (V1->getType()->isVectorTy() || V1 == V2) + return false; + if (V1->getType() != V2->getType()) + // We can't look through casts yet. + return false; + if (isAddOfNonZero(V1, V2, DL, Q) || isAddOfNonZero(V2, V1, DL, Q)) + return true; + + if (IntegerType *Ty = dyn_cast<IntegerType>(V1->getType())) { + // Are any known bits in V1 contradictory to known bits in V2? If V1 + // has a known zero where V2 has a known one, they must not be equal. + auto BitWidth = Ty->getBitWidth(); + APInt KnownZero1(BitWidth, 0); + APInt KnownOne1(BitWidth, 0); + computeKnownBits(V1, KnownZero1, KnownOne1, DL, 0, Q); + APInt KnownZero2(BitWidth, 0); + APInt KnownOne2(BitWidth, 0); + computeKnownBits(V2, KnownZero2, KnownOne2, DL, 0, Q); + + auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1); + if (OppositeBits.getBoolValue()) + return true; + } + return false; +} + /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. Mask is known to be zero for bits that V /// cannot have. @@ -2317,6 +2556,9 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { switch (I->getOpcode()) { default: break; + // Unsigned integers are always nonnegative. + case Instruction::UIToFP: + return true; case Instruction::FMul: // x*x is always non-negative or a NaN. 
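The bit-level half of isKnownNonEqual above is the simplest possible disequality proof: if one value has a known zero in a position where the other has a known one, the two values must differ in that bit. Modeled on 8-bit known-bits pairs (toy types, not LLVM's):

#include <cassert>
#include <cstdint>

// Known bits for an 8-bit value: KZ bits are provably 0, KO provably 1.
struct Known {
  uint8_t KZ, KO;
};

// Two values cannot be equal if one is known 0 where the other is known 1.
static bool knownNonEqual(Known A, Known B) {
  uint8_t OppositeBits = (A.KZ & B.KO) | (B.KZ & A.KO);
  return OppositeBits != 0;
}

int main() {
  Known Even = {0x01, 0x00}; // low bit known zero (e.g. x << 1)
  Known Odd = {0x00, 0x01};  // low bit known one (e.g. y | 1)
  assert(knownNonEqual(Even, Odd));       // they differ in bit 0
  Known Unknown = {0x00, 0x00};
  assert(!knownNonEqual(Even, Unknown));  // no contradiction: may be equal
}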
if (I->getOperand(0) == I->getOperand(1)) @@ -2327,6 +2569,9 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { case Instruction::FRem: return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); + case Instruction::Select: + return CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1) && + CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. @@ -2335,6 +2580,12 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) switch (II->getIntrinsicID()) { default: break; + case Intrinsic::maxnum: + return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) || + CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); + case Intrinsic::minnum: + return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && + CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: @@ -2545,7 +2796,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, } // This insert value inserts something else than what we are looking for. - // See if the (aggregrate) value inserted into has the value we are + // See if the (aggregate) value inserted into has the value we are // looking for, then. if (*req_idx != *i) return FindInsertedValue(I->getAggregateOperand(), idx_range, @@ -2560,7 +2811,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, } if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) { - // If we're extracting a value from an aggregrate that was extracted from + // If we're extracting a value from an aggregate that was extracted from // something else, we can extract from that something else directly instead. // However, we will need to chain I's indices with the requested indices. @@ -2591,7 +2842,12 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL) { unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType()); APInt ByteOffset(BitWidth, 0); - while (1) { + + // We walk up the defs but use a visited set to handle unreachable code. In + // that case, we stop after accumulating the cycle once (not that it + // matters). + SmallPtrSet<Value *, 16> Visited; + while (Visited.insert(Ptr).second) { if (Ptr->getType()->isVectorTy()) break; @@ -2935,20 +3191,42 @@ static bool isDereferenceableFromAttribute(const Value *V, const DataLayout &DL, return isDereferenceableFromAttribute(V, Offset, Ty, DL, CtxI, DT, TLI); } -/// Return true if Value is always a dereferenceable pointer. 
-/// +static bool isAligned(const Value *Base, APInt Offset, unsigned Align, + const DataLayout &DL) { + APInt BaseAlign(Offset.getBitWidth(), getAlignment(Base, DL)); + + if (!BaseAlign) { + Type *Ty = Base->getType()->getPointerElementType(); + if (!Ty->isSized()) + return false; + BaseAlign = DL.getABITypeAlignment(Ty); + } + + APInt Alignment(Offset.getBitWidth(), Align); + + assert(Alignment.isPowerOf2() && "must be a power of 2!"); + return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1)); +} + +static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) { + Type *Ty = Base->getType(); + assert(Ty->isSized() && "must be sized"); + APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0); + return isAligned(Base, Offset, Align, DL); +} + /// Test if V is always a pointer to allocated and suitably aligned memory for /// a simple load or store. -static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, - const Instruction *CtxI, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - SmallPtrSetImpl<const Value *> &Visited) { +static bool isDereferenceableAndAlignedPointer( + const Value *V, unsigned Align, const DataLayout &DL, + const Instruction *CtxI, const DominatorTree *DT, + const TargetLibraryInfo *TLI, SmallPtrSetImpl<const Value *> &Visited) { // Note that it is not safe to speculate into a malloc'd region because // malloc may return null. - // These are obviously ok. - if (isa<AllocaInst>(V)) return true; + // These are obviously ok if aligned. + if (isa<AllocaInst>(V)) + return isAligned(V, Align, DL); // It's not always safe to follow a bitcast, for example: // bitcast i8* (alloca i8) to i32* @@ -2963,21 +3241,22 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, if (STy->isSized() && DTy->isSized() && (DL.getTypeStoreSize(STy) >= DL.getTypeStoreSize(DTy)) && (DL.getABITypeAlignment(STy) >= DL.getABITypeAlignment(DTy))) - return isDereferenceablePointer(BC->getOperand(0), DL, CtxI, - DT, TLI, Visited); + return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, DL, + CtxI, DT, TLI, Visited); } // Global variables which can't collapse to null are ok. if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - return !GV->hasExternalWeakLinkage(); + if (!GV->hasExternalWeakLinkage()) + return isAligned(V, Align, DL); // byval arguments are okay. if (const Argument *A = dyn_cast<Argument>(V)) if (A->hasByValAttr()) - return true; - + return isAligned(V, Align, DL); + if (isDereferenceableFromAttribute(V, DL, CtxI, DT, TLI)) - return true; + return isAligned(V, Align, DL); // For GEPs, determine if the indexing lands within the allocated object. if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { @@ -2985,61 +3264,76 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, Type *Ty = VTy->getPointerElementType(); const Value *Base = GEP->getPointerOperand(); - // Conservatively require that the base pointer be fully dereferenceable. + // Conservatively require that the base pointer be fully dereferenceable + // and aligned. if (!Visited.insert(Base).second) return false; - if (!isDereferenceablePointer(Base, DL, CtxI, - DT, TLI, Visited)) + if (!isDereferenceableAndAlignedPointer(Base, Align, DL, CtxI, DT, TLI, + Visited)) return false; - + APInt Offset(DL.getPointerTypeSizeInBits(VTy), 0); if (!GEP->accumulateConstantOffset(DL, Offset)) return false; - - // Check if the load is within the bounds of the underlying object. 
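The new isAligned helper above boils down to two integer facts for a power-of-two Align: the base pointer's own alignment must cover the requested one, and the constant offset must be a multiple of it, tested as Offset & (Align - 1). A direct plain-integer translation:

#include <cassert>
#include <cstdint>

// Mirrors isAligned(Base, Offset, Align) for power-of-two Align: the base
// alignment covers Align, and the offset keeps the low bits clear.
static bool isAligned(uint64_t BaseAlign, uint64_t Offset, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "must be a power of 2!");
  return BaseAlign >= Align && (Offset & (Align - 1)) == 0;
}

int main() {
  assert(isAligned(/*BaseAlign=*/16, /*Offset=*/8, /*Align=*/8));
  assert(!isAligned(16, 4, 8)); // offset 4 breaks 8-byte alignment
  assert(!isAligned(4, 0, 8));  // base itself not aligned enough
}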
+ + // Check if the load is within the bounds of the underlying object + // and offset is aligned. uint64_t LoadSize = DL.getTypeStoreSize(Ty); Type *BaseType = Base->getType()->getPointerElementType(); - return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)); + assert(isPowerOf2_32(Align) && "must be a power of 2!"); + return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)) && + !(Offset & APInt(Offset.getBitWidth(), Align-1)); } // For gc.relocate, look through relocations - if (const IntrinsicInst *I = dyn_cast<IntrinsicInst>(V)) - if (I->getIntrinsicID() == Intrinsic::experimental_gc_relocate) { - GCRelocateOperands RelocateInst(I); - return isDereferenceablePointer(RelocateInst.getDerivedPtr(), DL, CtxI, - DT, TLI, Visited); - } + if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V)) + return isDereferenceableAndAlignedPointer( + RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited); if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V)) - return isDereferenceablePointer(ASC->getOperand(0), DL, CtxI, - DT, TLI, Visited); + return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL, + CtxI, DT, TLI, Visited); // If we don't know, assume the worst. return false; } -bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL, - const Instruction *CtxI, - const DominatorTree *DT, - const TargetLibraryInfo *TLI) { +bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, + const DataLayout &DL, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { // When dereferenceability information is provided by a dereferenceable // attribute, we know exactly how many bytes are dereferenceable. If we can // determine the exact offset to the attributed variable, we can use that // information here. Type *VTy = V->getType(); Type *Ty = VTy->getPointerElementType(); + + // Require ABI alignment for loads without alignment specification + if (Align == 0) + Align = DL.getABITypeAlignment(Ty); + if (Ty->isSized()) { APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0); const Value *BV = V->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); - + if (Offset.isNonNegative()) - if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, - CtxI, DT, TLI)) + if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, CtxI, DT, TLI) && + isAligned(BV, Offset, Align, DL)) return true; } SmallPtrSet<const Value *, 32> Visited; - return ::isDereferenceablePointer(V, DL, CtxI, DT, TLI, Visited); + return ::isDereferenceableAndAlignedPointer(V, Align, DL, CtxI, DT, TLI, + Visited); +} + +bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { + return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT, TLI); } bool llvm::isSafeToSpeculativelyExecute(const Value *V, @@ -3089,10 +3383,15 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, const LoadInst *LI = cast<LoadInst>(Inst); if (!LI->isUnordered() || // Speculative load may create a race that did not exist in the source. - LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread)) + LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeThread) || + // Speculative load may load data from dirty regions. 
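For the GEP case just above, in-bounds dereferenceability is pure offset arithmetic: the access's last byte must not pass the end of the underlying object, and the accumulated offset must respect the requested alignment. In plain integers (the diff does this in APInt, which also sidesteps overflow):

#include <cassert>
#include <cstdint>

// Mirrors the GEP check: the access [Offset, Offset + LoadSize) stays
// inside an object of AllocSize bytes, at the requested alignment.
// Note: the real code's APInt ule cannot wrap; with plain integers
// Offset + LoadSize could overflow for adversarial inputs.
static bool gepAccessOk(uint64_t Offset, uint64_t LoadSize,
                        uint64_t AllocSize, uint64_t Align) {
  return Offset + LoadSize <= AllocSize && (Offset & (Align - 1)) == 0;
}

int main() {
  // struct { int a[4]; }: 16 bytes, loading a 4-byte int.
  assert(gepAccessOk(/*Offset=*/12, /*LoadSize=*/4, /*AllocSize=*/16, 4));
  assert(!gepAccessOk(16, 4, 16, 4)); // one element past the end
  assert(!gepAccessOk(2, 4, 16, 4));  // misaligned offset
}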
+ LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeAddress)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); - return isDereferenceablePointer(LI->getPointerOperand(), DL, CtxI, DT, TLI); + return isDereferenceableAndAlignedPointer( + LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT, TLI); } case Instruction::Call: { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -3147,16 +3446,27 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, case Instruction::Switch: case Instruction::Unreachable: case Instruction::Fence: - case Instruction::LandingPad: case Instruction::AtomicRMW: case Instruction::AtomicCmpXchg: + case Instruction::LandingPad: case Instruction::Resume: + case Instruction::CatchSwitch: + case Instruction::CatchPad: + case Instruction::CatchRet: + case Instruction::CleanupPad: + case Instruction::CleanupRet: return false; // Misc instructions which have effects } } +bool llvm::mayBeMemoryDependent(const Instruction &I) { + return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I); +} + /// Return true if we know that the specified value is never null. bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { + assert(V->getType()->isPointerTy() && "V must be pointer type"); + // Alloca never returns null, malloc might. if (isa<AllocaInst>(V)) return true; @@ -3164,9 +3474,12 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { if (const Argument *A = dyn_cast<Argument>(V)) return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr(); - // Global values are not null unless extern weak. + // A global variable in address space 0 is non null unless extern weak. + // Other address spaces may have null as a valid address for a global, + // so we can't assume anything. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return !GV->hasExternalWeakLinkage(); + return !GV->hasExternalWeakLinkage() && + GV->getType()->getAddressSpace() == 0; // A Load tagged w/nonnull metadata is never null. if (const LoadInst *LI = dyn_cast<LoadInst>(V)) @@ -3176,16 +3489,14 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { if (CS.isReturnNonNull()) return true; - // operator new never returns null. 
- if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
- return true;
-
 return false;
 }
 static bool isKnownNonNullFromDominatingCondition(const Value *V,
 const Instruction *CtxI,
 const DominatorTree *DT) {
+ assert(V->getType()->isPointerTy() && "V must be pointer type");
+
 unsigned NumUsesExplored = 0;
 for (auto U : V->users()) {
 // Avoid massive lists
@@ -3316,40 +3627,339 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
 return OverflowResult::MayOverflow;
 }
-static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred,
+static OverflowResult computeOverflowForSignedAdd(
+ Value *LHS, Value *RHS, AddOperator *Add, const DataLayout &DL,
+ AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) {
+ if (Add && Add->hasNoSignedWrap()) {
+ return OverflowResult::NeverOverflows;
+ }
+
+ bool LHSKnownNonNegative, LHSKnownNegative;
+ bool RHSKnownNonNegative, RHSKnownNegative;
+ ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
+ AC, CxtI, DT);
+ ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
+ AC, CxtI, DT);
+
+ if ((LHSKnownNonNegative && RHSKnownNegative) ||
+ (LHSKnownNegative && RHSKnownNonNegative)) {
+ // The sign bits are opposite: this CANNOT overflow.
+ return OverflowResult::NeverOverflows;
+ }
+
+ // The remaining code needs Add to be available. Return early if it is not.
+ if (!Add)
+ return OverflowResult::MayOverflow;
+
+ // If the sign of Add is the same as at least one of the operands, this add
+ // CANNOT overflow. This is particularly useful when the sum is
+ // @llvm.assume'ed non-negative rather than proved so from analyzing its
+ // operands.
+ bool LHSOrRHSKnownNonNegative =
+ (LHSKnownNonNegative || RHSKnownNonNegative);
+ bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
+ bool AddKnownNonNegative, AddKnownNegative;
+ ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
+ /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
+ (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ return OverflowResult::NeverOverflows;
+ }
+ }
+
+ return OverflowResult::MayOverflow;
+}
+
+OverflowResult llvm::computeOverflowForSignedAdd(AddOperator *Add,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1),
+ Add, DL, AC, CxtI, DT);
+}
+
+OverflowResult llvm::computeOverflowForSignedAdd(Value *LHS, Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT);
+}
+
+bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
+ // FIXME: This conservative implementation can be relaxed. E.g. most
+ // atomic operations are guaranteed to terminate on most platforms
+ // and most functions terminate.
+
+ return !I->isAtomic() && // atomics may never succeed on some platforms
+ !isa<CallInst>(I) && // could throw and might not terminate
+ !isa<InvokeInst>(I) && // might not terminate and could throw to
+ // non-successor (see bug 24185 for details).
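The opposite-sign rule in computeOverflowForSignedAdd above is the classic two's-complement fact that adding a non-negative and a negative operand always stays in range, because the exact sum lies strictly between the two operands. It can be checked exhaustively at 8 bits:

#include <cassert>

int main() {
  // If the sign bits are opposite, signed addition cannot overflow:
  // the true sum lies between the operands, hence in [INT8_MIN, INT8_MAX].
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B)
      if ((A >= 0) != (B >= 0)) {
        int Sum = A + B; // exact in 'int'
        assert(Sum >= -128 && Sum <= 127);
      }
  return 0;
}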
+ !isa<ResumeInst>(I) && // has no successors + !isa<ReturnInst>(I); // has no successors +} + +bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I, + const Loop *L) { + // The loop header is guaranteed to be executed for every iteration. + // + // FIXME: Relax this constraint to cover all basic blocks that are + // guaranteed to be executed at every iteration. + if (I->getParent() != L->getHeader()) return false; + + for (const Instruction &LI : *L->getHeader()) { + if (&LI == I) return true; + if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false; + } + llvm_unreachable("Instruction not contained in its own parent basic block."); +} + +bool llvm::propagatesFullPoison(const Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Xor: + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + // These operations all propagate poison unconditionally. Note that poison + // is not any particular value, so xor or subtraction of poison with + // itself still yields poison, not zero. + return true; + + case Instruction::AShr: + case Instruction::SExt: + // For these operations, one bit of the input is replicated across + // multiple output bits. A replicated poison bit is still poison. + return true; + + case Instruction::Shl: { + // Left shift *by* a poison value is poison. The number of + // positions to shift is unsigned, so no negative values are + // possible there. Left shift by zero places preserves poison. So + // it only remains to consider left shift of poison by a positive + // number of places. + // + // A left shift by a positive number of places leaves the lowest order bit + // non-poisoned. However, if such a shift has a no-wrap flag, then we can + // make the poison operand violate that flag, yielding a fresh full-poison + // value. + auto *OBO = cast<OverflowingBinaryOperator>(I); + return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap(); + } + + case Instruction::Mul: { + // A multiplication by zero yields a non-poison zero result, so we need to + // rule out zero as an operand. Conservatively, multiplication by a + // non-zero constant is not multiplication by zero. + // + // Multiplication by a non-zero constant can leave some bits + // non-poisoned. For example, a multiplication by 2 leaves the lowest + // order bit unpoisoned. So we need to consider that. + // + // Multiplication by 1 preserves poison. If the multiplication has a + // no-wrap flag, then we can make the poison operand violate that flag + // when multiplied by any integer other than 0 and 1. + auto *OBO = cast<OverflowingBinaryOperator>(I); + if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) { + for (Value *V : OBO->operands()) { + if (auto *CI = dyn_cast<ConstantInt>(V)) { + // A ConstantInt cannot yield poison, so we can assume that it is + // the other operand that is poison. + return !CI->isZero(); + } + } + } + return false; + } + + case Instruction::GetElementPtr: + // A GEP implicitly represents a sequence of additions, subtractions, + // truncations, sign extensions and multiplications. The multiplications + // are by the non-zero sizes of some set of types, so we do not have to be + // concerned with multiplication by zero. If the GEP is in-bounds, then + // these operations are implicitly no-signed-wrap so poison is propagated + // by the arguments above for Add, Sub, Trunc, SExt and Mul. 
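isGuaranteedToExecuteForEveryIteration above combines two facts: the loop header runs on every iteration, and execution certainly reaches I if every earlier instruction in the header is guaranteed to transfer control to its successor. The same scan over a toy instruction list (types here are illustrative):

#include <cassert>
#include <vector>

// Toy instruction: 'Transfers' plays the role of
// isGuaranteedToTransferExecutionToSuccessor.
struct Inst {
  bool Transfers;
};

// I executes on every iteration iff everything before it in the header
// is guaranteed to fall through to its successor.
static bool executesEveryIteration(const std::vector<Inst> &Header,
                                   unsigned I) {
  for (unsigned J = 0; J < I; ++J)
    if (!Header[J].Transfers)
      return false; // an earlier call/atomic may throw or never terminate
  return true;
}

int main() {
  std::vector<Inst> Header = {{true}, {false /* e.g. a call */}, {true}};
  assert(executesEveryIteration(Header, 1));  // before the call: fine
  assert(!executesEveryIteration(Header, 2)); // after the call: no guarantee
}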
+ return cast<GEPOperator>(I)->isInBounds(); + + default: + return false; + } +} + +const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Store: + return cast<StoreInst>(I)->getPointerOperand(); + + case Instruction::Load: + return cast<LoadInst>(I)->getPointerOperand(); + + case Instruction::AtomicCmpXchg: + return cast<AtomicCmpXchgInst>(I)->getPointerOperand(); + + case Instruction::AtomicRMW: + return cast<AtomicRMWInst>(I)->getPointerOperand(); + + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return I->getOperand(1); + + default: + return nullptr; + } +} + +bool llvm::isKnownNotFullPoison(const Instruction *PoisonI) { + // We currently only look for uses of poison values within the same basic + // block, as that makes it easier to guarantee that the uses will be + // executed given that PoisonI is executed. + // + // FIXME: Expand this to consider uses beyond the same basic block. To do + // this, look out for the distinction between post-dominance and strong + // post-dominance. + const BasicBlock *BB = PoisonI->getParent(); + + // Set of instructions that we have proved will yield poison if PoisonI + // does. + SmallSet<const Value *, 16> YieldsPoison; + YieldsPoison.insert(PoisonI); + + for (BasicBlock::const_iterator I = PoisonI->getIterator(), E = BB->end(); + I != E; ++I) { + if (&*I != PoisonI) { + const Value *NotPoison = getGuaranteedNonFullPoisonOp(&*I); + if (NotPoison != nullptr && YieldsPoison.count(NotPoison)) return true; + if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) + return false; + } + + // Mark poison that propagates from I through uses of I. + if (YieldsPoison.count(&*I)) { + for (const User *User : I->users()) { + const Instruction *UserI = cast<Instruction>(User); + if (UserI->getParent() == BB && propagatesFullPoison(UserI)) + YieldsPoison.insert(User); + } + } + } + return false; +} + +static bool isKnownNonNaN(Value *V, FastMathFlags FMF) { + if (FMF.noNaNs()) + return true; + + if (auto *C = dyn_cast<ConstantFP>(V)) + return !C->isNaN(); + return false; +} + +static bool isKnownNonZero(Value *V) { + if (auto *C = dyn_cast<ConstantFP>(V)) + return !C->isZero(); + return false; +} + +static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred, + FastMathFlags FMF, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS) { LHS = CmpLHS; RHS = CmpRHS; - // (icmp X, Y) ? X : Y - if (TrueVal == CmpLHS && FalseVal == CmpRHS) { - switch (Pred) { - default: return SPF_UNKNOWN; // Equality. - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMAX; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMAX; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMIN; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMIN; + // If the predicate is an "or-equal" (FP) predicate, then signed zeroes may + // return inconsistent results between implementations. + // (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0 + // minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1) + // Therefore we behave conservatively and only proceed if at least one of the + // operands is known to not be zero, or if we don't care about signed zeroes. 
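The signed-zero guard above can be reproduced with concrete IEEE values: since 0.0 == -0.0, an or-equal compare-and-select pins down which zero is returned, while an fmin-style operation may return either zero, so rewriting one form into the other can flip the sign of zero unless nsz holds or one operand is proven nonzero. A small demonstration:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double A = 0.0, B = -0.0;
  assert(A == B); // IEEE comparison treats the two zeros as equal...
  // ...so the or-equal select always takes the true arm and returns +0.0:
  double SelectMin = (A <= B) ? A : B;
  // fmin may return either zero (IEEE 754-2008 5.3.1), so turning the
  // select into a min operation could change the sign of the result.
  double LibMin = std::fmin(A, B);
  std::printf("select: sign %d, fmin: sign %d\n",
              (int)std::signbit(SelectMin), (int)std::signbit(LibMin));
}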
+ switch (Pred) { + default: break; + case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE: + case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE: + if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) && + !isKnownNonZero(CmpRHS)) + return {SPF_UNKNOWN, SPNB_NA, false}; + } + + SelectPatternNaNBehavior NaNBehavior = SPNB_NA; + bool Ordered = false; + + // When given one NaN and one non-NaN input: + // - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input. + // - A simple C99 (a < b ? a : b) construction will return 'b' (as the + // ordered comparison fails), which could be NaN or non-NaN. + // so here we discover exactly what NaN behavior is required/accepted. + if (CmpInst::isFPPredicate(Pred)) { + bool LHSSafe = isKnownNonNaN(CmpLHS, FMF); + bool RHSSafe = isKnownNonNaN(CmpRHS, FMF); + + if (LHSSafe && RHSSafe) { + // Both operands are known non-NaN. + NaNBehavior = SPNB_RETURNS_ANY; + } else if (CmpInst::isOrdered(Pred)) { + // An ordered comparison will return false when given a NaN, so it + // returns the RHS. + Ordered = true; + if (LHSSafe) + // LHS is non-NaN, so if RHS is NaN then NaN will be returned. + NaNBehavior = SPNB_RETURNS_NAN; + else if (RHSSafe) + NaNBehavior = SPNB_RETURNS_OTHER; + else + // Completely unsafe. + return {SPF_UNKNOWN, SPNB_NA, false}; + } else { + Ordered = false; + // An unordered comparison will return true when given a NaN, so it + // returns the LHS. + if (LHSSafe) + // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned. + NaNBehavior = SPNB_RETURNS_OTHER; + else if (RHSSafe) + NaNBehavior = SPNB_RETURNS_NAN; + else + // Completely unsafe. + return {SPF_UNKNOWN, SPNB_NA, false}; } } - // (icmp X, Y) ? Y : X if (TrueVal == CmpRHS && FalseVal == CmpLHS) { + std::swap(CmpLHS, CmpRHS); + Pred = CmpInst::getSwappedPredicate(Pred); + if (NaNBehavior == SPNB_RETURNS_NAN) + NaNBehavior = SPNB_RETURNS_OTHER; + else if (NaNBehavior == SPNB_RETURNS_OTHER) + NaNBehavior = SPNB_RETURNS_NAN; + Ordered = !Ordered; + } + + // ([if]cmp X, Y) ? X : Y + if (TrueVal == CmpLHS && FalseVal == CmpRHS) { switch (Pred) { - default: return SPF_UNKNOWN; // Equality. + default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality. case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMIN; + case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false}; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMIN; + case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false}; case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMAX; + case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false}; case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMAX; + case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false}; + case FCmpInst::FCMP_UGT: + case FCmpInst::FCMP_UGE: + case FCmpInst::FCMP_OGT: + case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered}; + case FCmpInst::FCMP_ULT: + case FCmpInst::FCMP_ULE: + case FCmpInst::FCMP_OLT: + case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered}; } } @@ -3360,13 +3970,13 @@ static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred, // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { - return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS; + return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? 
X : -X if (Pred == ICmpInst::ICMP_SLT && (C1->isZero() || C1->isOne())) { - return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS; + return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } } @@ -3377,24 +3987,36 @@ static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred, match(CmpLHS, m_Not(m_Specific(TrueVal))))) { LHS = TrueVal; RHS = FalseVal; - return SPF_SMIN; + return {SPF_SMIN, SPNB_NA, false}; } } } // TODO: (X > 4) ? X : 5 --> (X >= 5) ? X : 5 --> MAX(X, 5) - return SPF_UNKNOWN; + return {SPF_UNKNOWN, SPNB_NA, false}; } -static Constant *lookThroughCast(ICmpInst *CmpI, Value *V1, Value *V2, - Instruction::CastOps *CastOp) { +static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2, + Instruction::CastOps *CastOp) { CastInst *CI = dyn_cast<CastInst>(V1); Constant *C = dyn_cast<Constant>(V2); - if (!CI || !C) + CastInst *CI2 = dyn_cast<CastInst>(V2); + if (!CI) return nullptr; *CastOp = CI->getOpcode(); + if (CI2) { + // If V1 and V2 are both the same cast from the same type, we can look + // through V1. + if (CI2->getOpcode() == CI->getOpcode() && + CI2->getSrcTy() == CI->getSrcTy()) + return CI2->getOperand(0); + return nullptr; + } else if (!C) { + return nullptr; + } + if (isa<SExtInst>(CI) && CmpI->isSigned()) { Constant *T = ConstantExpr::getTrunc(C, CI->getSrcTy()); // This is only valid if the truncated value can be sign-extended @@ -3409,39 +4031,200 @@ static Constant *lookThroughCast(ICmpInst *CmpI, Value *V1, Value *V2, if (isa<TruncInst>(CI)) return ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned()); + if (isa<FPToUIInst>(CI)) + return ConstantExpr::getUIToFP(C, CI->getSrcTy(), true); + + if (isa<FPToSIInst>(CI)) + return ConstantExpr::getSIToFP(C, CI->getSrcTy(), true); + + if (isa<UIToFPInst>(CI)) + return ConstantExpr::getFPToUI(C, CI->getSrcTy(), true); + + if (isa<SIToFPInst>(CI)) + return ConstantExpr::getFPToSI(C, CI->getSrcTy(), true); + + if (isa<FPTruncInst>(CI)) + return ConstantExpr::getFPExtend(C, CI->getSrcTy(), true); + + if (isa<FPExtInst>(CI)) + return ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true); + return nullptr; } -SelectPatternFlavor llvm::matchSelectPattern(Value *V, +SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp) { SelectInst *SI = dyn_cast<SelectInst>(V); - if (!SI) return SPF_UNKNOWN; + if (!SI) return {SPF_UNKNOWN, SPNB_NA, false}; - ICmpInst *CmpI = dyn_cast<ICmpInst>(SI->getCondition()); - if (!CmpI) return SPF_UNKNOWN; + CmpInst *CmpI = dyn_cast<CmpInst>(SI->getCondition()); + if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false}; - ICmpInst::Predicate Pred = CmpI->getPredicate(); + CmpInst::Predicate Pred = CmpI->getPredicate(); Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); + FastMathFlags FMF; + if (isa<FPMathOperator>(CmpI)) + FMF = CmpI->getFastMathFlags(); // Bail out early. if (CmpI->isEquality()) - return SPF_UNKNOWN; + return {SPF_UNKNOWN, SPNB_NA, false}; // Deal with type mismatches. 
if (CastOp && CmpLHS->getType() != TrueVal->getType()) { - if (Constant *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, + if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, cast<CastInst>(TrueVal)->getOperand(0), C, LHS, RHS); - if (Constant *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, + if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, C, cast<CastInst>(FalseVal)->getOperand(0), LHS, RHS); } - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS); } + +ConstantRange llvm::getConstantRangeFromMetadata(MDNode &Ranges) { + const unsigned NumRanges = Ranges.getNumOperands() / 2; + assert(NumRanges >= 1 && "Must have at least one range!"); + assert(Ranges.getNumOperands() % 2 == 0 && "Must be a sequence of pairs"); + + auto *FirstLow = mdconst::extract<ConstantInt>(Ranges.getOperand(0)); + auto *FirstHigh = mdconst::extract<ConstantInt>(Ranges.getOperand(1)); + + ConstantRange CR(FirstLow->getValue(), FirstHigh->getValue()); + + for (unsigned i = 1; i < NumRanges; ++i) { + auto *Low = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0)); + auto *High = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1)); + + // Note: unionWith will potentially create a range that contains values not + // contained in any of the original N ranges. + CR = CR.unionWith(ConstantRange(Low->getValue(), High->getValue())); + } + + return CR; +} + +/// Return true if "icmp Pred LHS RHS" is always true. +static bool isTruePredicate(CmpInst::Predicate Pred, Value *LHS, Value *RHS, + const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!"); + if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS) + return true; + + switch (Pred) { + default: + return false; + + case CmpInst::ICMP_SLE: { + const APInt *C; + + // LHS s<= LHS +_{nsw} C if C >= 0 + if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C)))) + return !C->isNegative(); + return false; + } + + case CmpInst::ICMP_ULE: { + const APInt *C; + + // LHS u<= LHS +_{nuw} C for any C + if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C)))) + return true; + + // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) + auto MatchNUWAddsToSameValue = [&](Value *A, Value *B, Value *&X, + const APInt *&CA, const APInt *&CB) { + if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) && + match(B, m_NUWAdd(m_Specific(X), m_APInt(CB)))) + return true; + + // If X & C == 0 then (X | C) == X +_{nuw} C + if (match(A, m_Or(m_Value(X), m_APInt(CA))) && + match(B, m_Or(m_Specific(X), m_APInt(CB)))) { + unsigned BitWidth = CA->getBitWidth(); + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + computeKnownBits(X, KnownZero, KnownOne, DL, Depth + 1, AC, CxtI, DT); + + if ((KnownZero & *CA) == *CA && (KnownZero & *CB) == *CB) + return true; + } + + return false; + }; + + Value *X; + const APInt *CLHS, *CRHS; + if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS)) + return CLHS->ule(*CRHS); + + return false; + } + } +} + +/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred +/// ALHS ARHS" is true. 
+static bool isImpliedCondOperands(CmpInst::Predicate Pred, Value *ALHS, + Value *ARHS, Value *BLHS, Value *BRHS, + const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + switch (Pred) { + default: + return false; + + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI, + DT) && + isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, + DT); + + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI, + DT) && + isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, + DT); + } +} + +bool llvm::isImpliedCondition(Value *LHS, Value *RHS, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, + const DominatorTree *DT) { + assert(LHS->getType() == RHS->getType() && "mismatched type"); + Type *OpTy = LHS->getType(); + assert(OpTy->getScalarType()->isIntegerTy(1)); + + // LHS ==> RHS by definition + if (LHS == RHS) return true; + + if (OpTy->isVectorTy()) + // TODO: extending the code below to handle vectors + return false; + assert(OpTy->isIntegerTy(1) && "implied by above"); + + ICmpInst::Predicate APred, BPred; + Value *ALHS, *ARHS; + Value *BLHS, *BRHS; + + if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS))) || + !match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS)))) + return false; + + if (APred == BPred) + return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC, + CxtI, DT); + + return false; +} diff --git a/contrib/llvm/lib/Analysis/VectorUtils.cpp b/contrib/llvm/lib/Analysis/VectorUtils.cpp index 8c671ef..4b244ec 100644 --- a/contrib/llvm/lib/Analysis/VectorUtils.cpp +++ b/contrib/llvm/lib/Analysis/VectorUtils.cpp @@ -11,13 +11,20 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; +using namespace llvm::PatternMatch; /// \brief Identify if the intrinsic is trivially vectorizable. /// This method returns true if the intrinsic's argument types are all @@ -79,7 +86,7 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, /// d) call should only reads memory. /// If all these condition is met then return ValidIntrinsicID /// else return not_intrinsic. -llvm::Intrinsic::ID +Intrinsic::ID llvm::checkUnaryFloatSignature(const CallInst &I, Intrinsic::ID ValidIntrinsicID) { if (I.getNumArgOperands() != 1 || @@ -98,7 +105,7 @@ llvm::checkUnaryFloatSignature(const CallInst &I, /// d) call should only reads memory. /// If all these condition is met then return ValidIntrinsicID /// else return not_intrinsic. -llvm::Intrinsic::ID +Intrinsic::ID llvm::checkBinaryFloatSignature(const CallInst &I, Intrinsic::ID ValidIntrinsicID) { if (I.getNumArgOperands() != 2 || @@ -114,8 +121,8 @@ llvm::checkBinaryFloatSignature(const CallInst &I, /// \brief Returns intrinsic ID for call. /// For the input call instruction it finds mapping intrinsic and returns /// its ID, in case it does not found it return not_intrinsic. 
-llvm::Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI, - const TargetLibraryInfo *TLI) { +Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI, + const TargetLibraryInfo *TLI) { // If we have an intrinsic call, check if it is trivially vectorizable. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { Intrinsic::ID ID = II->getIntrinsicID(); @@ -228,8 +235,7 @@ unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) { cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && - match(Gep->getOperand(LastOperand), llvm::PatternMatch::m_Zero())) { + while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { // Find the type we're currently indexing into. gep_type_iterator GEPTI = gep_type_begin(Gep); std::advance(GEPTI, LastOperand - 1); @@ -247,8 +253,7 @@ unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) { /// \brief If the argument is a GEP, then returns the operand identified by /// getGEPInductionOperand. However, if there is some other non-loop-invariant /// operand, it returns that instead. -llvm::Value *llvm::stripGetElementPtr(llvm::Value *Ptr, ScalarEvolution *SE, - Loop *Lp) { +Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); if (!GEP) return Ptr; @@ -265,8 +270,8 @@ llvm::Value *llvm::stripGetElementPtr(llvm::Value *Ptr, ScalarEvolution *SE, } /// \brief If a value has only one user that is a CastInst, return it. -llvm::Value *llvm::getUniqueCastUse(llvm::Value *Ptr, Loop *Lp, Type *Ty) { - llvm::Value *UniqueCast = nullptr; +Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { + Value *UniqueCast = nullptr; for (User *U : Ptr->users()) { CastInst *CI = dyn_cast<CastInst>(U); if (CI && CI->getType() == Ty) { @@ -281,16 +286,15 @@ llvm::Value *llvm::getUniqueCastUse(llvm::Value *Ptr, Loop *Lp, Type *Ty) { /// \brief Get the stride of a pointer access in a loop. Looks for symbolic /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. -llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, - Loop *Lp) { - const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); +Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { + auto *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the // pointer, otherwise, we are analyzing the index. - llvm::Value *OrigPtr = Ptr; + Value *OrigPtr = Ptr; // The size of the pointer access. int64_t PtrAccessSize = 1; @@ -320,8 +324,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, if (M->getOperand(0)->getSCEVType() != scConstant) return nullptr; - const APInt &APStepVal = - cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); + const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt(); // Huge step value - give up. 
if (APStepVal.getBitWidth() > 64) @@ -346,7 +349,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, if (!U) return nullptr; - llvm::Value *Stride = U->getValue(); + Value *Stride = U->getValue(); if (!Lp->isLoopInvariant(Stride)) return nullptr; @@ -361,7 +364,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, /// \brief Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. -llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { +Value *llvm::findScalarElement(Value *V, unsigned EltNo) { assert(V->getType()->isVectorTy() && "Not looking at a vector?"); VectorType *VTy = cast<VectorType>(V->getType()); unsigned Width = VTy->getNumElements(); @@ -399,14 +402,166 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { // Extract a value from a vector add operation with a constant zero. Value *Val = nullptr; Constant *Con = nullptr; - if (match(V, - llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val), - llvm::PatternMatch::m_Constant(Con)))) { + if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) if (Constant *Elt = Con->getAggregateElement(EltNo)) if (Elt->isNullValue()) return findScalarElement(Val, EltNo); - } // Otherwise, we don't know. return nullptr; } + +/// \brief Get splat value if the input is a splat vector or return nullptr. +/// This function is not fully general. It checks only 2 cases: +/// the input value is (1) a splat constants vector or (2) a sequence +/// of instructions that broadcast a single value into a vector. +/// +const llvm::Value *llvm::getSplatValue(const Value *V) { + + if (auto *C = dyn_cast<Constant>(V)) + if (isa<VectorType>(V->getType())) + return C->getSplatValue(); + + auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V); + if (!ShuffleInst) + return nullptr; + // All-zero (or undef) shuffle mask elements. + for (int MaskElt : ShuffleInst->getShuffleMask()) + if (MaskElt != 0 && MaskElt != -1) + return nullptr; + // The first shuffle source is 'insertelement' with index 0. + auto *InsertEltInst = + dyn_cast<InsertElementInst>(ShuffleInst->getOperand(0)); + if (!InsertEltInst || !isa<ConstantInt>(InsertEltInst->getOperand(2)) || + !cast<ConstantInt>(InsertEltInst->getOperand(2))->isNullValue()) + return nullptr; + + return InsertEltInst->getOperand(1); +} + +MapVector<Instruction *, uint64_t> +llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB, + const TargetTransformInfo *TTI) { + + // DemandedBits will give us every value's live-out bits. But we want + // to ensure no extra casts would need to be inserted, so every DAG + // of connected values must have the same minimum bitwidth. + EquivalenceClasses<Value *> ECs; + SmallVector<Value *, 16> Worklist; + SmallPtrSet<Value *, 4> Roots; + SmallPtrSet<Value *, 16> Visited; + DenseMap<Value *, uint64_t> DBits; + SmallPtrSet<Instruction *, 4> InstructionSet; + MapVector<Instruction *, uint64_t> MinBWs; + + // Determine the roots. We work bottom-up, from truncs or icmps. + bool SeenExtFromIllegalType = false; + for (auto *BB : Blocks) + for (auto &I : *BB) { + InstructionSet.insert(&I); + + if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) && + !TTI->isTypeLegal(I.getOperand(0)->getType())) + SeenExtFromIllegalType = true; + + // Only deal with non-vector integers up to 64-bits wide. 
+ if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) && + !I.getType()->isVectorTy() && + I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) { + // Don't make work for ourselves. If we know the loaded type is legal, + // don't add it to the worklist. + if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType())) + continue; + + Worklist.push_back(&I); + Roots.insert(&I); + } + } + // Early exit. + if (Worklist.empty() || (TTI && !SeenExtFromIllegalType)) + return MinBWs; + + // Now proceed breadth-first, unioning values together. + while (!Worklist.empty()) { + Value *Val = Worklist.pop_back_val(); + Value *Leader = ECs.getOrInsertLeaderValue(Val); + + if (Visited.count(Val)) + continue; + Visited.insert(Val); + + // Non-instructions terminate a chain successfully. + if (!isa<Instruction>(Val)) + continue; + Instruction *I = cast<Instruction>(Val); + + // If we encounter a type that is larger than 64 bits, we can't represent + // it so bail out. + if (DB.getDemandedBits(I).getBitWidth() > 64) + return MapVector<Instruction *, uint64_t>(); + + uint64_t V = DB.getDemandedBits(I).getZExtValue(); + DBits[Leader] |= V; + + // Casts, loads and instructions outside of our range terminate a chain + // successfully. + if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) || + !InstructionSet.count(I)) + continue; + + // Unsafe casts terminate a chain unsuccessfully. We can't do anything + // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to + // transform anything that relies on them. + if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) || + !I->getType()->isIntegerTy()) { + DBits[Leader] |= ~0ULL; + continue; + } + + // We don't modify the types of PHIs. Reductions will already have been + // truncated if possible, and inductions' sizes will have been chosen by + // indvars. + if (isa<PHINode>(I)) + continue; + + if (DBits[Leader] == ~0ULL) + // All bits demanded, no point continuing. + continue; + + for (Value *O : cast<User>(I)->operands()) { + ECs.unionSets(Leader, O); + Worklist.push_back(O); + } + } + + // Now we've discovered all values, walk them to see if there are + // any users we didn't see. If there are, we can't optimize that + // chain. 
+ for (auto &I : DBits) + for (auto *U : I.first->users()) + if (U->getType()->isIntegerTy() && DBits.count(U) == 0) + DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL; + + for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) { + uint64_t LeaderDemandedBits = 0; + for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) + LeaderDemandedBits |= DBits[*MI]; + + uint64_t MinBW = (sizeof(LeaderDemandedBits) * 8) - + llvm::countLeadingZeros(LeaderDemandedBits); + // Round up to a power of 2 + if (!isPowerOf2_64((uint64_t)MinBW)) + MinBW = NextPowerOf2(MinBW); + for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) { + if (!isa<Instruction>(*MI)) + continue; + Type *Ty = (*MI)->getType(); + if (Roots.count(*MI)) + Ty = cast<Instruction>(*MI)->getOperand(0)->getType(); + if (MinBW < Ty->getScalarSizeInBits()) + MinBWs[cast<Instruction>(*MI)] = MinBW; + } + } + + return MinBWs; +} diff --git a/contrib/llvm/lib/AsmParser/LLLexer.cpp b/contrib/llvm/lib/AsmParser/LLLexer.cpp index 5c4bab7..26eca23 100644 --- a/contrib/llvm/lib/AsmParser/LLLexer.cpp +++ b/contrib/llvm/lib/AsmParser/LLLexer.cpp @@ -105,7 +105,7 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End, Pair[1] += hexDigitValue(*Buffer); } Pair[0] = 0; - for (int i=0; i<16; i++, Buffer++) { + for (int i = 0; i < 16 && Buffer != End; i++, Buffer++) { Pair[0] *= 16; Pair[0] += hexDigitValue(*Buffer); } @@ -523,9 +523,14 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(zeroinitializer); KEYWORD(undef); KEYWORD(null); + KEYWORD(none); KEYWORD(to); + KEYWORD(caller); + KEYWORD(within); + KEYWORD(from); KEYWORD(tail); KEYWORD(musttail); + KEYWORD(notail); KEYWORD(target); KEYWORD(triple); KEYWORD(unwind); @@ -586,6 +591,10 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(preserve_mostcc); KEYWORD(preserve_allcc); KEYWORD(ghccc); + KEYWORD(x86_intrcc); + KEYWORD(hhvmcc); + KEYWORD(hhvm_ccc); + KEYWORD(cxx_fast_tlscc); KEYWORD(cc); KEYWORD(c); @@ -601,6 +610,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(convergent); KEYWORD(dereferenceable); KEYWORD(dereferenceable_or_null); + KEYWORD(inaccessiblememonly); + KEYWORD(inaccessiblemem_or_argmemonly); KEYWORD(inlinehint); KEYWORD(inreg); KEYWORD(jumptable); @@ -613,6 +624,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noduplicate); KEYWORD(noimplicitfloat); KEYWORD(noinline); + KEYWORD(norecurse); KEYWORD(nonlazybind); KEYWORD(nonnull); KEYWORD(noredzone); @@ -690,6 +702,7 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); + TYPEKEYWORD("token", Type::getTokenTy(Context)); #undef TYPEKEYWORD // Keywords for instructions. 
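The "round up to a power of 2" step in computeMinimumValueSizes, in the VectorUtils hunk above, is easy to check by hand. Here is a minimal standalone sketch of that arithmetic; the helpers are plain-C++ stand-ins for llvm::countLeadingZeros and llvm::NextPowerOf2 (so the snippet compiles without LLVM headers), and the demanded-bits masks are made up for illustration.

    // Standalone model of the MinBW computation in computeMinimumValueSizes.
    #include <cassert>
    #include <cstdint>

    // Index of the highest demanded bit, plus one. Equivalent to
    // 64 - countLeadingZeros(Demanded) for a 64-bit mask.
    static unsigned bitsNeeded(uint64_t Demanded) {
      unsigned MinBW = 0;
      while (Demanded) {
        Demanded >>= 1;
        ++MinBW;
      }
      return MinBW;
    }

    // Round up to a power of two; widths that are already powers of two
    // are left alone, matching the isPowerOf2_64 guard in the pass.
    static unsigned roundUpToPow2(unsigned MinBW) {
      unsigned BW = 1;
      while (BW < MinBW)
        BW <<= 1;
      return BW;
    }

    int main() {
      // A chain whose members demand only the low five bits (mask 0x1F)
      // is recorded as needing an 8-bit type, not a 5-bit one.
      assert(bitsNeeded(0x1F) == 5);
      assert(roundUpToPow2(bitsNeeded(0x1F)) == 8);
      // A leader marked ~0ULL (all bits demanded) cannot be narrowed.
      assert(roundUpToPow2(bitsNeeded(~0ULL)) == 64);
      return 0;
    }

The net effect is the same as the pass's MinBW/NextPowerOf2 sequence: non-power-of-two widths round up, exact powers of two pass through unchanged.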
@@ -749,6 +762,11 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(extractvalue, ExtractValue); INSTKEYWORD(insertvalue, InsertValue); INSTKEYWORD(landingpad, LandingPad); + INSTKEYWORD(cleanupret, CleanupRet); + INSTKEYWORD(catchret, CatchRet); + INSTKEYWORD(catchswitch, CatchSwitch); + INSTKEYWORD(catchpad, CatchPad); + INSTKEYWORD(cleanuppad, CleanupPad); #undef INSTKEYWORD #define DWKEYWORD(TYPE, TOKEN) \ @@ -763,6 +781,7 @@ lltok::Kind LLLexer::LexIdentifier() { DWKEYWORD(VIRTUALITY, DwarfVirtuality); DWKEYWORD(LANG, DwarfLang); DWKEYWORD(OP, DwarfOp); + DWKEYWORD(MACINFO, DwarfMacinfo); #undef DWKEYWORD if (Keyword.startswith("DIFlag")) { diff --git a/contrib/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm/lib/AsmParser/LLParser.cpp index 1c6e7bd..3471a2d 100644 --- a/contrib/llvm/lib/AsmParser/LLParser.cpp +++ b/contrib/llvm/lib/AsmParser/LLParser.cpp @@ -13,6 +13,7 @@ #include "LLParser.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/AsmParser/SlotMapping.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/CallingConv.h" @@ -48,6 +49,32 @@ bool LLParser::Run() { ValidateEndOfModule(); } +bool LLParser::parseStandaloneConstantValue(Constant *&C, + const SlotMapping *Slots) { + restoreParsingState(Slots); + Lex.Lex(); + + Type *Ty = nullptr; + if (ParseType(Ty) || parseConstantValue(Ty, C)) + return true; + if (Lex.getKind() != lltok::Eof) + return Error(Lex.getLoc(), "expected end of string"); + return false; +} + +void LLParser::restoreParsingState(const SlotMapping *Slots) { + if (!Slots) + return; + NumberedVals = Slots->GlobalValues; + NumberedMetadata = Slots->MetadataNodes; + for (const auto &I : Slots->NamedTypes) + NamedTypes.insert( + std::make_pair(I.getKey(), std::make_pair(I.second, LocTy()))); + for (const auto &I : Slots->Types) + NumberedTypes.insert( + std::make_pair(I.first, std::make_pair(I.second, LocTy()))); +} + /// ValidateEndOfModule - Do final validity and sanity checks at the end of the /// module. bool LLParser::ValidateEndOfModule() { @@ -158,7 +185,7 @@ bool LLParser::ValidateEndOfModule() { // Look for intrinsic functions and CallInst that need to be upgraded for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) - UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove + UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove UpgradeDebugInfo(*M); @@ -169,6 +196,10 @@ bool LLParser::ValidateEndOfModule() { // the mapping from LLParser as it doesn't need it anymore. 
Slots->GlobalValues = std::move(NumberedVals); Slots->MetadataNodes = std::move(NumberedMetadata); + for (const auto &I : NamedTypes) + Slots->NamedTypes.insert(std::make_pair(I.getKey(), I.second.first)); + for (const auto &I : NumberedTypes) + Slots->Types.insert(std::make_pair(I.first, I.second.first)); return false; } @@ -647,6 +678,12 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, return Error(NameLoc, "symbol with local linkage must have default visibility"); + Type *Ty; + LocTy ExplicitTypeLoc = Lex.getLoc(); + if (ParseType(Ty) || + ParseToken(lltok::comma, "expected comma after alias's type")) + return true; + Constant *Aliasee; LocTy AliaseeLoc = Lex.getLoc(); if (Lex.getKind() != lltok::kw_bitcast && @@ -669,11 +706,35 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, auto *PTy = dyn_cast<PointerType>(AliaseeType); if (!PTy) return Error(AliaseeLoc, "An alias must have pointer type"); + unsigned AddrSpace = PTy->getAddressSpace(); + + if (Ty != PTy->getElementType()) + return Error( + ExplicitTypeLoc, + "explicit pointee type doesn't match operand's pointee type"); + + GlobalValue *GVal = nullptr; + + // See if the alias was forward referenced, if so, prepare to replace the + // forward reference. + if (!Name.empty()) { + GVal = M->getNamedValue(Name); + if (GVal) { + if (!ForwardRefVals.erase(Name)) + return Error(NameLoc, "redefinition of global '@" + Name + "'"); + } + } else { + auto I = ForwardRefValIDs.find(NumberedVals.size()); + if (I != ForwardRefValIDs.end()) { + GVal = I->second.first; + ForwardRefValIDs.erase(I); + } + } // Okay, create the alias but do not insert it into the module yet. std::unique_ptr<GlobalAlias> GA( - GlobalAlias::create(PTy, (GlobalValue::LinkageTypes)Linkage, Name, - Aliasee, /*Parent*/ nullptr)); + GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage, + Name, Aliasee, /*Parent*/ nullptr)); GA->setThreadLocalMode(TLM); GA->setVisibility((GlobalValue::VisibilityTypes)Visibility); GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass); @@ -682,27 +743,17 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, if (Name.empty()) NumberedVals.push_back(GA.get()); - // See if this value already exists in the symbol table. If so, it is either - // a redefinition or a definition of a forward reference. - if (GlobalValue *Val = M->getNamedValue(Name)) { - // See if this was a redefinition. If so, there is no entry in - // ForwardRefVals. - std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefVals.find(Name); - if (I == ForwardRefVals.end()) - return Error(NameLoc, "redefinition of global named '@" + Name + "'"); - - // Otherwise, this was a definition of forward ref. Verify that types - // agree. - if (Val->getType() != GA->getType()) - return Error(NameLoc, - "forward reference and definition of alias have different types"); + if (GVal) { + // Verify that types agree. + if (GVal->getType() != GA->getType()) + return Error( + ExplicitTypeLoc, + "forward reference and definition of alias have different types"); // If they agree, just RAUW the old value with the alias and remove the // forward ref info. - Val->replaceAllUsesWith(GA.get()); - Val->eraseFromParent(); - ForwardRefVals.erase(I); + GVal->replaceAllUsesWith(GA.get()); + GVal->eraseFromParent(); } // Insert into the module, we know its name won't collide now. 
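ParseAlias now reads an explicit pointee type and hands it, together with the aliasee's address space, straight to GlobalAlias::create. A minimal sketch of that creation path follows, using the same overload the parser calls in the hunk above; the module contents and the names "g" and "g_alias" are hypothetical.

    // Creating an alias with an explicit value type and address space,
    // mirroring the GlobalAlias::create call in ParseAlias above.
    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    static GlobalAlias *makeAlias(Module &M) {
      Type *Int32Ty = Type::getInt32Ty(M.getContext());

      // The aliasee: an ordinary i32 global in address space 0.
      auto *G = new GlobalVariable(M, Int32Ty, /*isConstant=*/false,
                                   GlobalValue::ExternalLinkage,
                                   /*Initializer=*/nullptr, "g");

      // As in the parser: Ty must equal the aliasee's pointee type, and
      // the address space is taken from the aliasee's pointer type.
      return GlobalAlias::create(Int32Ty, /*AddrSpace=*/0,
                                 GlobalValue::ExternalLinkage, "g_alias",
                                 G, &M);
    }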
@@ -767,12 +818,11 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (!Name.empty()) { GVal = M->getNamedValue(Name); if (GVal) { - if (!ForwardRefVals.erase(Name) || !isa<GlobalValue>(GVal)) + if (!ForwardRefVals.erase(Name)) return Error(NameLoc, "redefinition of global '@" + Name + "'"); } } else { - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefValIDs.find(NumberedVals.size()); + auto I = ForwardRefValIDs.find(NumberedVals.size()); if (I != ForwardRefValIDs.end()) { GVal = I->second.first; ForwardRefValIDs.erase(I); @@ -903,14 +953,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, } // Target-dependent attributes: case lltok::StringConstant: { - std::string Attr = Lex.getStrVal(); - Lex.Lex(); - std::string Val; - if (EatIfPresent(lltok::equal) && - ParseStringConstant(Val)) + if (ParseStringAttribute(B)) return true; - - B.addAttribute(Attr, Val); continue; } @@ -951,6 +995,10 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break; case lltok::kw_cold: B.addAttribute(Attribute::Cold); break; case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break; + case lltok::kw_inaccessiblememonly: + B.addAttribute(Attribute::InaccessibleMemOnly); break; + case lltok::kw_inaccessiblemem_or_argmemonly: + B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); break; case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break; case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; @@ -963,6 +1011,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break; case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break; case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break; + case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break; case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break; case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break; case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break; @@ -1015,6 +1064,17 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, // GlobalValue Reference/Resolution Routines. //===----------------------------------------------------------------------===// +static inline GlobalValue *createGlobalFwdRef(Module *M, PointerType *PTy, + const std::string &Name) { + if (auto *FT = dyn_cast<FunctionType>(PTy->getElementType())) + return Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M); + else + return new GlobalVariable(*M, PTy->getElementType(), false, + GlobalValue::ExternalWeakLinkage, nullptr, Name, + nullptr, GlobalVariable::NotThreadLocal, + PTy->getAddressSpace()); +} + /// GetGlobalVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. @@ -1033,8 +1093,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, // If this is a forward reference for the value, see if we already created a // forward ref record. 
if (!Val) { - std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefVals.find(Name); + auto I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) Val = I->second.first; } @@ -1048,15 +1107,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, } // Otherwise, create a new forward reference for this value and remember it. - GlobalValue *FwdVal; - if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) - FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M); - else - FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, nullptr, Name, - nullptr, GlobalVariable::NotThreadLocal, - PTy->getAddressSpace()); - + GlobalValue *FwdVal = createGlobalFwdRef(M, PTy, Name); ForwardRefVals[Name] = std::make_pair(FwdVal, Loc); return FwdVal; } @@ -1073,8 +1124,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { // If this is a forward reference for the value, see if we already created a // forward ref record. if (!Val) { - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefValIDs.find(ID); + auto I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) Val = I->second.first; } @@ -1088,13 +1138,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { } // Otherwise, create a new forward reference for this value and remember it. - GlobalValue *FwdVal; - if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) - FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M); - else - FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, nullptr, ""); - + GlobalValue *FwdVal = createGlobalFwdRef(M, PTy, ""); ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc); return FwdVal; } @@ -1217,6 +1261,19 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) { ParseToken(lltok::rparen, "expected ')' in address space"); } +/// ParseStringAttribute +/// := StringConstant +/// := StringConstant '=' StringConstant +bool LLParser::ParseStringAttribute(AttrBuilder &B) { + std::string Attr = Lex.getStrVal(); + Lex.Lex(); + std::string Val; + if (EatIfPresent(lltok::equal) && ParseStringConstant(Val)) + return true; + B.addAttribute(Attr, Val); + return false; +} + /// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes. bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { bool HaveError = false; @@ -1228,6 +1285,11 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { switch (Token) { default: // End of attributes. return HaveError; + case lltok::StringConstant: { + if (ParseStringAttribute(B)) + return true; + continue; + } case lltok::kw_align: { unsigned Alignment; if (ParseOptionalAlignment(Alignment)) @@ -1309,6 +1371,11 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { switch (Token) { default: // End of attributes. 
return HaveError; + case lltok::StringConstant: { + if (ParseStringAttribute(B)) + return true; + continue; + } case lltok::kw_dereferenceable: { uint64_t Bytes; if (ParseOptionalDerefAttrBytes(lltok::kw_dereferenceable, Bytes)) @@ -1323,6 +1390,13 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { B.addDereferenceableOrNullAttr(Bytes); continue; } + case lltok::kw_align: { + unsigned Alignment; + if (ParseOptionalAlignment(Alignment)) + return true; + B.addAlignmentAttr(Alignment); + continue; + } case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; @@ -1330,7 +1404,6 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; // Error handling. - case lltok::kw_align: case lltok::kw_byval: case lltok::kw_inalloca: case lltok::kw_nest: @@ -1473,6 +1546,10 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'preserve_mostcc' /// ::= 'preserve_allcc' /// ::= 'ghccc' +/// ::= 'x86_intrcc' +/// ::= 'hhvmcc' +/// ::= 'hhvm_ccc' +/// ::= 'cxx_fast_tlscc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -1501,6 +1578,10 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break; case lltok::kw_ghccc: CC = CallingConv::GHC; break; + case lltok::kw_x86_intrcc: CC = CallingConv::X86_INTR; break; + case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; + case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; + case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); @@ -1883,7 +1964,59 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, return false; } +/// ParseOptionalOperandBundles +/// ::= /*empty*/ +/// ::= '[' OperandBundle [, OperandBundle ]* ']' +/// +/// OperandBundle +/// ::= bundle-tag '(' ')' +/// ::= bundle-tag '(' Type Value [, Type Value ]* ')' +/// +/// bundle-tag ::= String Constant +bool LLParser::ParseOptionalOperandBundles( + SmallVectorImpl<OperandBundleDef> &BundleList, PerFunctionState &PFS) { + LocTy BeginLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::lsquare)) + return false; + + while (Lex.getKind() != lltok::rsquare) { + // If this isn't the first operand bundle, we need a comma. + if (!BundleList.empty() && + ParseToken(lltok::comma, "expected ',' in input list")) + return true; + std::string Tag; + if (ParseStringConstant(Tag)) + return true; + + if (ParseToken(lltok::lparen, "expected '(' in operand bundle")) + return true; + + std::vector<Value *> Inputs; + while (Lex.getKind() != lltok::rparen) { + // If this isn't the first input, we need a comma. + if (!Inputs.empty() && + ParseToken(lltok::comma, "expected ',' in input list")) + return true; + + Type *Ty = nullptr; + Value *Input = nullptr; + if (ParseType(Ty) || ParseValue(Ty, Input, PFS)) + return true; + Inputs.push_back(Input); + } + + BundleList.emplace_back(std::move(Tag), std::move(Inputs)); + + Lex.Lex(); // Lex the ')'. + } + + if (BundleList.empty()) + return Error(BeginLoc, "operand bundle set must not be empty"); + + Lex.Lex(); // Lex the ']'. + return false; +} /// ParseArgumentList - Parse the argument list for a function type or function /// prototype. 
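The operand-bundle grammar above corresponds to call-site IR such as: call void @f(i32 %x) [ "deopt"(i32 %x) ]. A rough sketch of the C++ side follows: each bracketed group becomes one OperandBundleDef, built the same way as the parser's emplace_back(Tag, Inputs), and is passed to the call-site constructor. Treat the CallInst::Create overload taking a bundle list as an assumption; it is inferred from the InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList) call that appears later in this diff. Callee and Arg are placeholder values.

    // Building a call site that carries one operand bundle, mirroring
    // what ParseOptionalOperandBundles produces.
    #include <string>
    #include <vector>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/InstrTypes.h"   // OperandBundleDef
    #include "llvm/IR/Instructions.h" // CallInst

    using namespace llvm;

    static CallInst *callWithBundle(Value *Callee, Value *Arg) {
      // One bundle: tag "deopt", single input Arg; the same shape the
      // parser builds with BundleList.emplace_back(Tag, Inputs).
      std::vector<Value *> Inputs;
      Inputs.push_back(Arg);
      SmallVector<OperandBundleDef, 1> Bundles;
      Bundles.emplace_back(std::string("deopt"), std::move(Inputs));

      // Assumed overload (see note above): Create(Callee, Args, Bundles).
      return CallInst::Create(Callee, {Arg}, Bundles);
    }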
@@ -2146,31 +2279,29 @@ LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f, : P(p), F(f), FunctionNumber(functionNumber) { // Insert unnamed arguments into the NumberedVals list. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); - AI != E; ++AI) - if (!AI->hasName()) - NumberedVals.push_back(AI); + for (Argument &A : F.args()) + if (!A.hasName()) + NumberedVals.push_back(&A); } LLParser::PerFunctionState::~PerFunctionState() { // If there were any forward referenced non-basicblock values, delete them. - for (std::map<std::string, std::pair<Value*, LocTy> >::iterator - I = ForwardRefVals.begin(), E = ForwardRefVals.end(); I != E; ++I) - if (!isa<BasicBlock>(I->second.first)) { - I->second.first->replaceAllUsesWith( - UndefValue::get(I->second.first->getType())); - delete I->second.first; - I->second.first = nullptr; - } - for (std::map<unsigned, std::pair<Value*, LocTy> >::iterator - I = ForwardRefValIDs.begin(), E = ForwardRefValIDs.end(); I != E; ++I) - if (!isa<BasicBlock>(I->second.first)) { - I->second.first->replaceAllUsesWith( - UndefValue::get(I->second.first->getType())); - delete I->second.first; - I->second.first = nullptr; - } + for (const auto &P : ForwardRefVals) { + if (isa<BasicBlock>(P.second.first)) + continue; + P.second.first->replaceAllUsesWith( + UndefValue::get(P.second.first->getType())); + delete P.second.first; + } + + for (const auto &P : ForwardRefValIDs) { + if (isa<BasicBlock>(P.second.first)) + continue; + P.second.first->replaceAllUsesWith( + UndefValue::get(P.second.first->getType())); + delete P.second.first; + } } bool LLParser::PerFunctionState::FinishFunction() { @@ -2189,16 +2320,15 @@ bool LLParser::PerFunctionState::FinishFunction() { /// GetVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. -Value *LLParser::PerFunctionState::GetVal(const std::string &Name, - Type *Ty, LocTy Loc) { +Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Type *Ty, + LocTy Loc) { // Look this name up in the normal function symbol table. Value *Val = F.getValueSymbolTable().lookup(Name); // If this is a forward reference for the value, see if we already created a // forward ref record. if (!Val) { - std::map<std::string, std::pair<Value*, LocTy> >::iterator - I = ForwardRefVals.find(Name); + auto I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) Val = I->second.first; } @@ -2222,25 +2352,24 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, // Otherwise, create a new forward reference for this value and remember it. Value *FwdVal; - if (Ty->isLabelTy()) + if (Ty->isLabelTy()) { FwdVal = BasicBlock::Create(F.getContext(), Name, &F); - else + } else { FwdVal = new Argument(Ty, Name); + } ForwardRefVals[Name] = std::make_pair(FwdVal, Loc); return FwdVal; } -Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, - LocTy Loc) { +Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc) { // Look this name up in the normal function symbol table. Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr; // If this is a forward reference for the value, see if we already created a // forward ref record. 
if (!Val) { - std::map<unsigned, std::pair<Value*, LocTy> >::iterator - I = ForwardRefValIDs.find(ID); + auto I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) Val = I->second.first; } @@ -2263,10 +2392,11 @@ Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, // Otherwise, create a new forward reference for this value and remember it. Value *FwdVal; - if (Ty->isLabelTy()) + if (Ty->isLabelTy()) { FwdVal = BasicBlock::Create(F.getContext(), "", &F); - else + } else { FwdVal = new Argument(Ty); + } ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc); return FwdVal; @@ -2295,14 +2425,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, return P.Error(NameLoc, "instruction expected to be numbered '%" + Twine(NumberedVals.size()) + "'"); - std::map<unsigned, std::pair<Value*, LocTy> >::iterator FI = - ForwardRefValIDs.find(NameID); + auto FI = ForwardRefValIDs.find(NameID); if (FI != ForwardRefValIDs.end()) { - if (FI->second.first->getType() != Inst->getType()) + Value *Sentinel = FI->second.first; + if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + getTypeString(FI->second.first->getType()) + "'"); - FI->second.first->replaceAllUsesWith(Inst); - delete FI->second.first; + + Sentinel->replaceAllUsesWith(Inst); + delete Sentinel; ForwardRefValIDs.erase(FI); } @@ -2311,14 +2442,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, } // Otherwise, the instruction had a name. Resolve forward refs and set it. - std::map<std::string, std::pair<Value*, LocTy> >::iterator - FI = ForwardRefVals.find(NameStr); + auto FI = ForwardRefVals.find(NameStr); if (FI != ForwardRefVals.end()) { - if (FI->second.first->getType() != Inst->getType()) + Value *Sentinel = FI->second.first; + if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + getTypeString(FI->second.first->getType()) + "'"); - FI->second.first->replaceAllUsesWith(Inst); - delete FI->second.first; + + Sentinel->replaceAllUsesWith(Inst); + delete Sentinel; ForwardRefVals.erase(FI); } @@ -2421,6 +2553,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { case lltok::kw_null: ID.Kind = ValID::t_Null; break; case lltok::kw_undef: ID.Kind = ValID::t_Undef; break; case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break; + case lltok::kw_none: ID.Kind = ValID::t_None; break; case lltok::lbrace: { // ValID ::= '{' ConstVector '}' @@ -2430,9 +2563,10 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ParseToken(lltok::rbrace, "expected end of struct constant")) return true; - ID.ConstantStructElts = new Constant*[Elts.size()]; + ID.ConstantStructElts = make_unique<Constant *[]>(Elts.size()); ID.UIntVal = Elts.size(); - memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0])); + memcpy(ID.ConstantStructElts.get(), Elts.data(), + Elts.size() * sizeof(Elts[0])); ID.Kind = ValID::t_ConstantStruct; return false; } @@ -2451,8 +2585,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return true; if (isPackedStruct) { - ID.ConstantStructElts = new Constant*[Elts.size()]; - memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0])); + ID.ConstantStructElts = make_unique<Constant *[]>(Elts.size()); + memcpy(ID.ConstantStructElts.get(), Elts.data(), + Elts.size() * sizeof(Elts[0])); ID.UIntVal = Elts.size(); ID.Kind = ValID::t_PackedConstantStruct; return false; @@ -2891,7 +3026,7 @@ bool LLParser::ParseValID(ValID &ID, 
PerFunctionState *PFS) { } } - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(ID.Loc, "base element of getelementptr must be sized"); @@ -3066,6 +3201,11 @@ struct DwarfTagField : public MDUnsignedField { DwarfTagField(dwarf::Tag DefaultTag) : MDUnsignedField(DefaultTag, dwarf::DW_TAG_hi_user) {} }; +struct DwarfMacinfoTypeField : public MDUnsignedField { + DwarfMacinfoTypeField() : MDUnsignedField(0, dwarf::DW_MACINFO_vendor_ext) {} + DwarfMacinfoTypeField(dwarf::MacinfoRecordType DefaultType) + : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {} +}; struct DwarfAttEncodingField : public MDUnsignedField { DwarfAttEncodingField() : MDUnsignedField(0, dwarf::DW_ATE_hi_user) {} }; @@ -3159,6 +3299,26 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfTagField &Result) { template <> bool LLParser::ParseMDField(LocTy Loc, StringRef Name, + DwarfMacinfoTypeField &Result) { + if (Lex.getKind() == lltok::APSInt) + return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); + + if (Lex.getKind() != lltok::DwarfMacinfo) + return TokError("expected DWARF macinfo type"); + + unsigned Macinfo = dwarf::getMacinfo(Lex.getStrVal()); + if (Macinfo == dwarf::DW_MACINFO_invalid) + return TokError( + "invalid DWARF macinfo type" + Twine(" '") + Lex.getStrVal() + "'"); + assert(Macinfo <= Result.Max && "Expected valid DWARF macinfo type"); + + Result.assign(Macinfo); + Lex.Lex(); + return false; +} + +template <> +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfVirtualityField &Result) { if (Lex.getKind() == lltok::APSInt) return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); @@ -3569,8 +3729,11 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) { /// isOptimized: true, flags: "-O2", runtimeVersion: 1, /// splitDebugFilename: "abc.debug", emissionKind: 1, /// enums: !1, retainedTypes: !2, subprograms: !3, -/// globals: !4, imports: !5, dwoId: 0x0abcd) +/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd) bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { + if (!IsDistinct) + return Lex.Error("missing 'distinct', required for !DICompileUnit"); + #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ REQUIRED(language, DwarfLangField, ); \ REQUIRED(file, MDField, (/* AllowNull */ false)); \ @@ -3585,16 +3748,16 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { OPTIONAL(subprograms, MDField, ); \ OPTIONAL(globals, MDField, ); \ OPTIONAL(imports, MDField, ); \ + OPTIONAL(macros, MDField, ); \ OPTIONAL(dwoId, MDUnsignedField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DICompileUnit, - (Context, language.Val, file.Val, producer.Val, - isOptimized.Val, flags.Val, runtimeVersion.Val, - splitDebugFilename.Val, emissionKind.Val, enums.Val, - retainedTypes.Val, subprograms.Val, globals.Val, - imports.Val, dwoId.Val)); + Result = DICompileUnit::getDistinct( + Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val, + runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val, + retainedTypes.Val, subprograms.Val, globals.Val, imports.Val, macros.Val, + dwoId.Val); return false; } @@ -3604,9 +3767,10 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { /// isDefinition: true, scopeLine: 8, containingType: !3, /// virtuality: DW_VIRTUALTIY_pure_virtual, /// virtualIndex: 10, flags: 11, -/// isOptimized: false, function: void ()* @_Z3foov, -/// 
templateParams: !4, declaration: !5, variables: !6) +/// isOptimized: false, templateParams: !4, declaration: !5, +/// variables: !6) bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { + auto Loc = Lex.getLoc(); #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ OPTIONAL(scope, MDField, ); \ OPTIONAL(name, MDStringField, ); \ @@ -3622,19 +3786,23 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ - OPTIONAL(function, MDConstant, ); \ OPTIONAL(templateParams, MDField, ); \ OPTIONAL(declaration, MDField, ); \ OPTIONAL(variables, MDField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS + if (isDefinition.Val && !IsDistinct) + return Lex.Error( + Loc, + "missing 'distinct', required for !DISubprogram when 'isDefinition'"); + Result = GET_OR_DISTINCT( - DISubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val, - line.Val, type.Val, isLocal.Val, isDefinition.Val, - scopeLine.Val, containingType.Val, virtuality.Val, - virtualIndex.Val, flags.Val, isOptimized.Val, function.Val, - templateParams.Val, declaration.Val, variables.Val)); + DISubprogram, + (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val, + type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val, + containingType.Val, virtuality.Val, virtualIndex.Val, flags.Val, + isOptimized.Val, templateParams.Val, declaration.Val, variables.Val)); return false; } @@ -3685,6 +3853,39 @@ bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) { return false; } +/// ParseDIMacro: +/// ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: "SomeValue") +bool LLParser::ParseDIMacro(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + REQUIRED(type, DwarfMacinfoTypeField, ); \ + REQUIRED(line, LineField, ); \ + REQUIRED(name, MDStringField, ); \ + OPTIONAL(value, MDStringField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacro, + (Context, type.Val, line.Val, name.Val, value.Val)); + return false; +} + +/// ParseDIMacroFile: +/// ::= !DIMacroFile(line: 9, file: !2, nodes: !3) +bool LLParser::ParseDIMacroFile(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + OPTIONAL(type, DwarfMacinfoTypeField, (dwarf::DW_MACINFO_start_file)); \ + REQUIRED(line, LineField, ); \ + REQUIRED(file, MDField, ); \ + OPTIONAL(nodes, MDField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacroFile, + (Context, type.Val, line.Val, file.Val, nodes.Val)); + return false; +} + + /// ParseDIModule: /// ::= !DIModule(scope: !0, name: "SomeModule", configMacros: "-DNDEBUG", /// includePath: "/usr/include", isysroot: "/") @@ -3762,24 +3963,25 @@ bool LLParser::ParseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { } /// ParseDILocalVariable: -/// ::= !DILocalVariable(tag: DW_TAG_arg_variable, scope: !0, name: "foo", +/// ::= !DILocalVariable(arg: 7, scope: !0, name: "foo", +/// file: !1, line: 7, type: !2, arg: 2, flags: 7) +/// ::= !DILocalVariable(scope: !0, name: "foo", /// file: !1, line: 7, type: !2, arg: 2, flags: 7) bool LLParser::ParseDILocalVariable(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ - REQUIRED(tag, DwarfTagField, ); \ REQUIRED(scope, MDField, (/* AllowNull */ false)); \ OPTIONAL(name, MDStringField, ); \ + OPTIONAL(arg, MDUnsignedField, (0, UINT16_MAX)); \ OPTIONAL(file, MDField, ); \ 
OPTIONAL(line, LineField, ); \ OPTIONAL(type, MDField, ); \ - OPTIONAL(arg, MDUnsignedField, (0, UINT16_MAX)); \ OPTIONAL(flags, DIFlagField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS Result = GET_OR_DISTINCT(DILocalVariable, - (Context, tag.Val, scope.Val, name.Val, file.Val, - line.Val, type.Val, arg.Val, flags.Val)); + (Context, scope.Val, name.Val, file.Val, line.Val, + type.Val, arg.Val, flags.Val)); return false; } @@ -3969,13 +4171,11 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, V = PFS->GetVal(ID.StrVal, Ty, ID.Loc); return V == nullptr; case ValID::t_InlineAsm: { - PointerType *PTy = dyn_cast<PointerType>(Ty); - FunctionType *FTy = - PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : nullptr; - if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2)) + if (!ID.FTy || !InlineAsm::Verify(ID.FTy, ID.StrVal2)) return Error(ID.Loc, "invalid type for inline asm constraint string"); - V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1, - (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2))); + V = InlineAsm::get(ID.FTy, ID.StrVal, ID.StrVal2, ID.UIntVal & 1, + (ID.UIntVal >> 1) & 1, + (InlineAsm::AsmDialect(ID.UIntVal >> 2))); return false; } case ValID::t_GlobalName: @@ -4035,6 +4235,11 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, return Error(ID.Loc, "invalid type for null constant"); V = Constant::getNullValue(Ty); return false; + case ValID::t_None: + if (!Ty->isTokenTy()) + return Error(ID.Loc, "invalid type for none constant"); + V = Constant::getNullValue(Ty); + return false; case ValID::t_Constant: if (ID.ConstantVal->getType() != Ty) return Error(ID.Loc, "constant expression type mismatch"); @@ -4056,8 +4261,8 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, return Error(ID.Loc, "element " + Twine(i) + " of struct initializer doesn't match struct element type"); - V = ConstantStruct::get(ST, makeArrayRef(ID.ConstantStructElts, - ID.UIntVal)); + V = ConstantStruct::get( + ST, makeArrayRef(ID.ConstantStructElts.get(), ID.UIntVal)); } else return Error(ID.Loc, "constant expression type mismatch"); return false; @@ -4065,11 +4270,35 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, llvm_unreachable("Invalid ValID"); } +bool LLParser::parseConstantValue(Type *Ty, Constant *&C) { + C = nullptr; + ValID ID; + auto Loc = Lex.getLoc(); + if (ParseValID(ID, /*PFS=*/nullptr)) + return true; + switch (ID.Kind) { + case ValID::t_APSInt: + case ValID::t_APFloat: + case ValID::t_Undef: + case ValID::t_Constant: + case ValID::t_ConstantStruct: + case ValID::t_PackedConstantStruct: { + Value *V; + if (ConvertValIDToValue(Ty, ID, V, /*PFS=*/nullptr)) + return true; + assert(isa<Constant>(V) && "Expected a constant value"); + C = cast<Constant>(V); + return false; + } + default: + return Error(Loc, "expected a constant value"); + } +} + bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) { V = nullptr; ValID ID; - return ParseValID(ID, PFS) || - ConvertValIDToValue(Ty, ID, V, PFS); + return ParseValID(ID, PFS) || ConvertValIDToValue(Ty, ID, V, PFS); } bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) { @@ -4242,8 +4471,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (!FunctionName.empty()) { // If this was a definition of a forward reference, remove the definition // from the forward reference table and fill in the forward ref. 
- std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator FRVI = - ForwardRefVals.find(FunctionName); + auto FRVI = ForwardRefVals.find(FunctionName); if (FRVI != ForwardRefVals.end()) { Fn = M->getFunction(FunctionName); if (!Fn) @@ -4265,8 +4493,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { } else { // If this is a definition of a forward referenced function, make sure the // types agree. - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator I - = ForwardRefValIDs.find(NumberedVals.size()); + auto I = ForwardRefValIDs.find(NumberedVals.size()); if (I != ForwardRefValIDs.end()) { Fn = cast<Function>(I->second.first); if (Fn->getType() != PFT) @@ -4498,6 +4725,11 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_indirectbr: return ParseIndirectBr(Inst, PFS); case lltok::kw_invoke: return ParseInvoke(Inst, PFS); case lltok::kw_resume: return ParseResume(Inst, PFS); + case lltok::kw_cleanupret: return ParseCleanupRet(Inst, PFS); + case lltok::kw_catchret: return ParseCatchRet(Inst, PFS); + case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS); + case lltok::kw_catchpad: return ParseCatchPad(Inst, PFS); + case lltok::kw_cleanuppad: return ParseCleanupPad(Inst, PFS); // Binary Operators. case lltok::kw_add: case lltok::kw_sub: @@ -4580,6 +4812,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_call: return ParseCall(Inst, PFS, CallInst::TCK_None); case lltok::kw_tail: return ParseCall(Inst, PFS, CallInst::TCK_Tail); case lltok::kw_musttail: return ParseCall(Inst, PFS, CallInst::TCK_MustTail); + case lltok::kw_notail: return ParseCall(Inst, PFS, CallInst::TCK_NoTail); // Memory. case lltok::kw_alloca: return ParseAlloc(Inst, PFS); case lltok::kw_load: return ParseLoad(Inst, PFS); @@ -4798,15 +5031,15 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { LocTy RetTypeLoc; ValID CalleeID; SmallVector<ParamInfo, 16> ArgList; + SmallVector<OperandBundleDef, 2> BundleList; BasicBlock *NormalBB, *UnwindBB; - if (ParseOptionalCallingConv(CC) || - ParseOptionalReturnAttrs(RetAttrs) || + if (ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) || ParseType(RetType, RetTypeLoc, true /*void allowed*/) || - ParseValID(CalleeID) || - ParseParameterList(ArgList, PFS) || + ParseValID(CalleeID) || ParseParameterList(ArgList, PFS) || ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, NoBuiltinLoc) || + ParseOptionalOperandBundles(BundleList, PFS) || ParseToken(lltok::kw_to, "expected 'to' in invoke") || ParseTypeAndBasicBlock(NormalBB, PFS) || ParseToken(lltok::kw_unwind, "expected 'unwind' in invoke") || @@ -4829,6 +5062,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { Ty = FunctionType::get(RetType, ParamTypes, false); } + CalleeID.FTy = Ty; + // Look up the callee. 
Value *Callee; if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS)) @@ -4880,7 +5115,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); - InvokeInst *II = InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args); + InvokeInst *II = + InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList); II->setCallingConv(CC); II->setAttributes(PAL); ForwardRefAttrGroups[II] = FwdRefAttrGrps; @@ -4900,6 +5136,183 @@ bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) { return false; } +bool LLParser::ParseExceptionArgs(SmallVectorImpl<Value *> &Args, + PerFunctionState &PFS) { + if (ParseToken(lltok::lsquare, "expected '[' in catchpad/cleanuppad")) + return true; + + while (Lex.getKind() != lltok::rsquare) { + // If this isn't the first argument, we need a comma. + if (!Args.empty() && + ParseToken(lltok::comma, "expected ',' in argument list")) + return true; + + // Parse the argument. + LocTy ArgLoc; + Type *ArgTy = nullptr; + if (ParseType(ArgTy, ArgLoc)) + return true; + + Value *V; + if (ArgTy->isMetadataTy()) { + if (ParseMetadataAsValue(V, PFS)) + return true; + } else { + if (ParseValue(ArgTy, V, PFS)) + return true; + } + Args.push_back(V); + } + + Lex.Lex(); // Lex the ']'. + return false; +} + +/// ParseCleanupRet +/// ::= 'cleanupret' from Value unwind ('to' 'caller' | TypeAndValue) +bool LLParser::ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS) { + Value *CleanupPad = nullptr; + + if (ParseToken(lltok::kw_from, "expected 'from' after cleanupret")) + return true; + + if (ParseValue(Type::getTokenTy(Context), CleanupPad, PFS)) + return true; + + if (ParseToken(lltok::kw_unwind, "expected 'unwind' in cleanupret")) + return true; + + BasicBlock *UnwindBB = nullptr; + if (Lex.getKind() == lltok::kw_to) { + Lex.Lex(); + if (ParseToken(lltok::kw_caller, "expected 'caller' in cleanupret")) + return true; + } else { + if (ParseTypeAndBasicBlock(UnwindBB, PFS)) { + return true; + } + } + + Inst = CleanupReturnInst::Create(CleanupPad, UnwindBB); + return false; +} + +/// ParseCatchRet +/// ::= 'catchret' from Parent Value 'to' TypeAndValue +bool LLParser::ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS) { + Value *CatchPad = nullptr; + + if (ParseToken(lltok::kw_from, "expected 'from' after catchret")) + return true; + + if (ParseValue(Type::getTokenTy(Context), CatchPad, PFS)) + return true; + + BasicBlock *BB; + if (ParseToken(lltok::kw_to, "expected 'to' in catchret") || + ParseTypeAndBasicBlock(BB, PFS)) + return true; + + Inst = CatchReturnInst::Create(CatchPad, BB); + return false; +} + +/// ParseCatchSwitch +/// ::= 'catchswitch' within Parent +bool LLParser::ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS) { + Value *ParentPad; + LocTy BBLoc; + + if (ParseToken(lltok::kw_within, "expected 'within' after catchswitch")) + return true; + + if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar && + Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for catchswitch"); + + if (ParseValue(Type::getTokenTy(Context), ParentPad, PFS)) + return true; + + if (ParseToken(lltok::lsquare, "expected '[' with catchswitch labels")) + return true; + + SmallVector<BasicBlock *, 32> Table; + do { + BasicBlock *DestBB; + if (ParseTypeAndBasicBlock(DestBB, PFS)) + return true; + Table.push_back(DestBB); + } while (EatIfPresent(lltok::comma)); + + if 
(ParseToken(lltok::rsquare, "expected ']' after catchswitch labels")) + return true; + + if (ParseToken(lltok::kw_unwind, + "expected 'unwind' after catchswitch scope")) + return true; + + BasicBlock *UnwindBB = nullptr; + if (EatIfPresent(lltok::kw_to)) { + if (ParseToken(lltok::kw_caller, "expected 'caller' in catchswitch")) + return true; + } else { + if (ParseTypeAndBasicBlock(UnwindBB, PFS)) + return true; + } + + auto *CatchSwitch = + CatchSwitchInst::Create(ParentPad, UnwindBB, Table.size()); + for (BasicBlock *DestBB : Table) + CatchSwitch->addHandler(DestBB); + Inst = CatchSwitch; + return false; +} + +/// ParseCatchPad +/// ::= 'catchpad' ParamList 'to' TypeAndValue 'unwind' TypeAndValue +bool LLParser::ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS) { + Value *CatchSwitch = nullptr; + + if (ParseToken(lltok::kw_within, "expected 'within' after catchpad")) + return true; + + if (Lex.getKind() != lltok::LocalVar && Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for catchpad"); + + if (ParseValue(Type::getTokenTy(Context), CatchSwitch, PFS)) + return true; + + SmallVector<Value *, 8> Args; + if (ParseExceptionArgs(Args, PFS)) + return true; + + Inst = CatchPadInst::Create(CatchSwitch, Args); + return false; +} + +/// ParseCleanupPad +/// ::= 'cleanuppad' within Parent ParamList +bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) { + Value *ParentPad = nullptr; + + if (ParseToken(lltok::kw_within, "expected 'within' after cleanuppad")) + return true; + + if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar && + Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for cleanuppad"); + + if (ParseValue(Type::getTokenTy(Context), ParentPad, PFS)) + return true; + + SmallVector<Value *, 8> Args; + if (ParseExceptionArgs(Args, PFS)) + return true; + + Inst = CleanupPadInst::Create(ParentPad, Args); + return false; +} + //===----------------------------------------------------------------------===// // Binary Operators. 
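For orientation, the textual IR accepted by the five new exception-handling parsers above looks roughly like this; a minimal illustrative sketch, not an excerpt from the patch (the labels, %cs/%cp/%clp, @_ZTIi and %slot are invented here, and the catchpad/cleanuppad operand lists are free-form per ParseExceptionArgs):

    dispatch:
      %cs = catchswitch within none [label %handler] unwind to caller
    handler:
      %cp = catchpad within %cs [i8** @_ZTIi, i32 0, i8** %slot]   ; args are (Type Value)*
      catchret from %cp to label %cont
    cleanup:
      %clp = cleanuppad within none []
      cleanupret from %clp unwind to caller

Each pad produces a token-typed value (Type::getTokenTy above), which is why 'within' must be followed by either 'none' or a plain SSA name.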
//===----------------------------------------------------------------------===// @@ -5196,12 +5609,14 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { } /// ParseCall -/// ::= 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs -/// ::= 'tail' 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs -/// ::= 'musttail' 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs +/// ::= 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'tail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'musttail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'notail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, CallInst::TailCallKind TCK) { AttrBuilder RetAttrs, FnAttrs; @@ -5212,20 +5627,29 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, LocTy RetTypeLoc; ValID CalleeID; SmallVector<ParamInfo, 16> ArgList; + SmallVector<OperandBundleDef, 2> BundleList; LocTy CallLoc = Lex.getLoc(); - if ((TCK != CallInst::TCK_None && - ParseToken(lltok::kw_call, "expected 'tail call'")) || - ParseOptionalCallingConv(CC) || - ParseOptionalReturnAttrs(RetAttrs) || + if (TCK != CallInst::TCK_None && + ParseToken(lltok::kw_call, + "expected 'tail call', 'musttail call', or 'notail call'")) + return true; + + FastMathFlags FMF = EatFastMathFlagsIfPresent(); + + if (ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) || ParseType(RetType, RetTypeLoc, true /*void allowed*/) || ParseValID(CalleeID) || ParseParameterList(ArgList, PFS, TCK == CallInst::TCK_MustTail, PFS.getFunction().isVarArg()) || - ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, - BuiltinLoc)) + ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, BuiltinLoc) || + ParseOptionalOperandBundles(BundleList, PFS)) return true; + if (FMF.any() && !RetType->isFPOrFPVectorTy()) + return Error(CallLoc, "fast-math-flags specified for call without " + "floating-point scalar or vector return type"); + // If RetType is a non-function pointer type, then this is the short syntax // for the call, which means that RetType is just the return type. Infer the // rest of the function argument types from the arguments that are present. @@ -5242,6 +5666,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, Ty = FunctionType::get(RetType, ParamTypes, false); } + CalleeID.FTy = Ty; + // Look up the callee. 
Value *Callee; if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS)) @@ -5293,9 +5719,11 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); - CallInst *CI = CallInst::Create(Ty, Callee, Args); + CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList); CI->setTailCallKind(TCK); CI->setCallingConv(CC); + if (FMF.any()) + CI->setFastMathFlags(FMF); CI->setAttributes(PAL); ForwardRefAttrGroups[CI] = FwdRefAttrGrps; Inst = CI; @@ -5614,7 +6042,7 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { Indices.push_back(Val); } - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(Loc, "base element of getelementptr must be sized"); diff --git a/contrib/llvm/lib/AsmParser/LLParser.h b/contrib/llvm/lib/AsmParser/LLParser.h index 6e57b3e..f61a5e5 100644 --- a/contrib/llvm/lib/AsmParser/LLParser.h +++ b/contrib/llvm/lib/AsmParser/LLParser.h @@ -46,29 +46,32 @@ namespace llvm { /// or a symbolic (%var) reference. This is just a discriminated union. struct ValID { enum { - t_LocalID, t_GlobalID, // ID in UIntVal. - t_LocalName, t_GlobalName, // Name in StrVal. - t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. - t_Null, t_Undef, t_Zero, // No value. - t_EmptyArray, // No value: [] - t_Constant, // Value in ConstantVal. - t_InlineAsm, // Value in StrVal/StrVal2/UIntVal. - t_ConstantStruct, // Value in ConstantStructElts. - t_PackedConstantStruct // Value in ConstantStructElts. - } Kind; + t_LocalID, t_GlobalID, // ID in UIntVal. + t_LocalName, t_GlobalName, // Name in StrVal. + t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. + t_Null, t_Undef, t_Zero, t_None, // No value. + t_EmptyArray, // No value: [] + t_Constant, // Value in ConstantVal. + t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. + t_ConstantStruct, // Value in ConstantStructElts. + t_PackedConstantStruct // Value in ConstantStructElts. + } Kind = t_LocalID; LLLexer::LocTy Loc; unsigned UIntVal; + FunctionType *FTy = nullptr; std::string StrVal, StrVal2; APSInt APSIntVal; - APFloat APFloatVal; + APFloat APFloatVal{0.0}; Constant *ConstantVal; - Constant **ConstantStructElts; - - ValID() : Kind(t_LocalID), APFloatVal(0.0) {} - ~ValID() { - if (Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) - delete [] ConstantStructElts; + std::unique_ptr<Constant *[]> ConstantStructElts; + + ValID() = default; + ValID(const ValID &RHS) + : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy), + StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal), + APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) { + assert(!RHS.ConstantStructElts); } bool operator<(const ValID &RHS) const { @@ -143,6 +146,8 @@ namespace llvm { Slots(Slots), BlockAddressPFS(nullptr) {} bool Run(); + bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots); + LLVMContext &getContext() { return Context; } private: @@ -154,6 +159,10 @@ namespace llvm { return Error(Lex.getLoc(), Msg); } + /// Restore the internal name and slot mappings using the mappings that + /// were created at an earlier parsing stage. + void restoreParsingState(const SlotMapping *Slots); + /// GetGlobalVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. 
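To make the revised call/invoke grammar concrete, here is an illustrative sketch of forms the parser now accepts (@callee, @dot, @g, @h and the bundle operands are invented for the example; 'fast' is only accepted because the return type is floating-point, per the check in ParseCall):

    notail call void @callee()                      ; new TCK_NoTail marker
    %r = call fast float @dot(float %a, float %b)   ; fast-math flags on a call
    call void @g() [ "deopt"(i32 13, i64 %x) ]      ; operand bundle after attributes
    invoke void @h() [ "funclet"(token %cp) ]
            to label %normal unwind label %lpad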
@@ -210,6 +219,8 @@ namespace llvm { return ParseUInt64(Val); } + bool ParseStringAttribute(AttrBuilder &B); + bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); bool parseOptionalUnnamedAddr(bool &UnnamedAddr) { @@ -343,10 +354,12 @@ namespace llvm { bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, PerFunctionState *PFS); + bool parseConstantValue(Type *Ty, Constant *&C); bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS); bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) { return ParseValue(Ty, V, &PFS); } + bool ParseValue(Type *Ty, Value *&V, LocTy &Loc, PerFunctionState &PFS) { Loc = Lex.getLoc(); @@ -381,6 +394,13 @@ namespace llvm { bool IsMustTailCall = false, bool InVarArgsFunc = false); + bool + ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList, + PerFunctionState &PFS); + + bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args, + PerFunctionState &PFS); + // Constant Parsing. bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr); bool ParseGlobalValue(Type *Ty, Constant *&V); @@ -441,6 +461,11 @@ namespace llvm { bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS); bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS); bool ParseResume(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc, unsigned OperandType); diff --git a/contrib/llvm/lib/AsmParser/LLToken.h b/contrib/llvm/lib/AsmParser/LLToken.h index 691f085..29a7f16 100644 --- a/contrib/llvm/lib/AsmParser/LLToken.h +++ b/contrib/llvm/lib/AsmParser/LLToken.h @@ -49,10 +49,14 @@ namespace lltok { kw_external, kw_thread_local, kw_localdynamic, kw_initialexec, kw_localexec, kw_zeroinitializer, - kw_undef, kw_null, + kw_undef, kw_null, kw_none, kw_to, + kw_caller, + kw_within, + kw_from, kw_tail, kw_musttail, + kw_notail, kw_target, kw_triple, kw_unwind, @@ -96,6 +100,9 @@ namespace lltok { kw_webkit_jscc, kw_anyregcc, kw_preserve_mostcc, kw_preserve_allcc, kw_ghccc, + kw_x86_intrcc, + kw_hhvmcc, kw_hhvm_ccc, + kw_cxx_fast_tlscc, // Attributes: kw_attributes, @@ -109,6 +116,8 @@ namespace lltok { kw_convergent, kw_dereferenceable, kw_dereferenceable_or_null, + kw_inaccessiblememonly, + kw_inaccessiblemem_or_argmemonly, kw_inlinehint, kw_inreg, kw_jumptable, @@ -121,6 +130,7 @@ namespace lltok { kw_noduplicate, kw_noimplicitfloat, kw_noinline, + kw_norecurse, kw_nonlazybind, kw_nonnull, kw_noredzone, @@ -177,7 +187,8 @@ namespace lltok { kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter, kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_resume, - kw_unreachable, + kw_unreachable, kw_cleanupret, kw_catchswitch, kw_catchret, kw_catchpad, + kw_cleanuppad, kw_alloca, kw_load, kw_store, kw_fence, kw_cmpxchg, kw_atomicrmw, kw_getelementptr, @@ -209,6 +220,7 @@ namespace lltok { DwarfLang, // DW_LANG_foo DwarfOp, // DW_OP_foo DIFlag, // DIFlagFoo + DwarfMacinfo, // DW_MACINFO_foo // Type valued tokens (TyVal). 
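The DwarfMacinfo token class added above carries the DW_MACINFO_* constants consumed by the ParseDIMacro/ParseDIMacroFile routines earlier in this diff; in textual IR that corresponds to nodes roughly like the following (the metadata numbers are placeholders):

    !0 = !DIMacro(type: DW_MACINFO_define, line: 9, name: "SomeMacro", value: "SomeValue")
    !1 = !DIMacroFile(line: 9, file: !2, nodes: !3)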
Type, diff --git a/contrib/llvm/lib/AsmParser/Parser.cpp b/contrib/llvm/lib/AsmParser/Parser.cpp index 9145a54..4e55e62 100644 --- a/contrib/llvm/lib/AsmParser/Parser.cpp +++ b/contrib/llvm/lib/AsmParser/Parser.cpp @@ -66,3 +66,15 @@ std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString, MemoryBufferRef F(AsmString, "<string>"); return parseAssembly(F, Err, Context, Slots); } + +Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err, + const Module &M, const SlotMapping *Slots) { + SourceMgr SM; + std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm); + SM.AddNewSourceBuffer(std::move(Buf), SMLoc()); + Constant *C; + if (LLParser(Asm, SM, Err, const_cast<Module *>(&M)) + .parseStandaloneConstantValue(C, Slots)) + return nullptr; + return C; +} diff --git a/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp index 289c76e..385c18a 100644 --- a/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp +++ b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm-c/BitReader.h" +#include "llvm-c/Core.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" @@ -22,12 +23,25 @@ using namespace llvm; /* Builds a module from the bitcode in the specified memory buffer, returning a reference to the module via the OutModule parameter. Returns 0 on success. Optionally returns a human-readable error message via OutMessage. */ -LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf, - LLVMModuleRef *OutModule, char **OutMessage) { +LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutModule, + char **OutMessage) { return LLVMParseBitcodeInContext(wrap(&getGlobalContext()), MemBuf, OutModule, OutMessage); } +LLVMBool LLVMParseBitcode2(LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutModule) { + return LLVMParseBitcodeInContext2(wrap(&getGlobalContext()), MemBuf, + OutModule); +} + +static void diagnosticHandler(const DiagnosticInfo &DI, void *C) { + auto *Message = reinterpret_cast<std::string *>(C); + raw_string_ostream Stream(*Message); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); +} + LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutModule, @@ -35,18 +49,36 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, MemoryBufferRef Buf = unwrap(MemBuf)->getMemBufferRef(); LLVMContext &Ctx = *unwrap(ContextRef); + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); std::string Message; - raw_string_ostream Stream(Message); - DiagnosticPrinterRawOStream DP(Stream); + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile(Buf, Ctx); + + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile( - Buf, Ctx, [&](const DiagnosticInfo &DI) { DI.print(DP); }); if (ModuleOrErr.getError()) { - if (OutMessage) { - Stream.flush(); + if (OutMessage) *OutMessage = strdup(Message.c_str()); - } - *OutModule = wrap((Module*)nullptr); + *OutModule = wrap((Module *)nullptr); + return 1; + } + + *OutModule = wrap(ModuleOrErr.get().release()); + return 0; +} + +LLVMBool LLVMParseBitcodeInContext2(LLVMContextRef ContextRef, + LLVMMemoryBufferRef MemBuf, + 
LLVMModuleRef *OutModule) { + MemoryBufferRef Buf = unwrap(MemBuf)->getMemBufferRef(); + LLVMContext &Ctx = *unwrap(ContextRef); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile(Buf, Ctx); + if (ModuleOrErr.getError()) { + *OutModule = wrap((Module *)nullptr); return 1; } @@ -59,26 +91,50 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, Optionally returns a human-readable error message via OutMessage. */ LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef, LLVMMemoryBufferRef MemBuf, - LLVMModuleRef *OutM, - char **OutMessage) { + LLVMModuleRef *OutM, char **OutMessage) { + LLVMContext &Ctx = *unwrap(ContextRef); + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); + std::string Message; + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); std::unique_ptr<MemoryBuffer> Owner(unwrap(MemBuf)); ErrorOr<std::unique_ptr<Module>> ModuleOrErr = - getLazyBitcodeModule(std::move(Owner), *unwrap(ContextRef)); + getLazyBitcodeModule(std::move(Owner), Ctx); Owner.release(); + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - if (std::error_code EC = ModuleOrErr.getError()) { + if (ModuleOrErr.getError()) { *OutM = wrap((Module *)nullptr); if (OutMessage) - *OutMessage = strdup(EC.message().c_str()); + *OutMessage = strdup(Message.c_str()); return 1; } *OutM = wrap(ModuleOrErr.get().release()); return 0; +} + +LLVMBool LLVMGetBitcodeModuleInContext2(LLVMContextRef ContextRef, + LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutM) { + LLVMContext &Ctx = *unwrap(ContextRef); + std::unique_ptr<MemoryBuffer> Owner(unwrap(MemBuf)); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = + getLazyBitcodeModule(std::move(Owner), Ctx); + Owner.release(); + if (ModuleOrErr.getError()) { + *OutM = wrap((Module *)nullptr); + return 1; + } + + *OutM = wrap(ModuleOrErr.get().release()); + return 0; } LLVMBool LLVMGetBitcodeModule(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM, @@ -87,20 +143,7 @@ LLVMBool LLVMGetBitcodeModule(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM, OutMessage); } -/* Deprecated: Use LLVMGetBitcodeModuleInContext instead. */ -LLVMBool LLVMGetBitcodeModuleProviderInContext(LLVMContextRef ContextRef, - LLVMMemoryBufferRef MemBuf, - LLVMModuleProviderRef *OutMP, - char **OutMessage) { - return LLVMGetBitcodeModuleInContext(ContextRef, MemBuf, - reinterpret_cast<LLVMModuleRef*>(OutMP), - OutMessage); -} - -/* Deprecated: Use LLVMGetBitcodeModule instead. 
*/ -LLVMBool LLVMGetBitcodeModuleProvider(LLVMMemoryBufferRef MemBuf, - LLVMModuleProviderRef *OutMP, - char **OutMessage) { - return LLVMGetBitcodeModuleProviderInContext(LLVMGetGlobalContext(), MemBuf, - OutMP, OutMessage); +LLVMBool LLVMGetBitcodeModule2(LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutM) { + return LLVMGetBitcodeModuleInContext2(LLVMGetGlobalContext(), MemBuf, OutM); } diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index c04e8b9..2ad4b32 100644 --- a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/DataStream.h" #include "llvm/Support/ManagedStatic.h" @@ -93,35 +94,35 @@ public: void resolveConstantForwardRefs(); }; -class BitcodeReaderMDValueList { +class BitcodeReaderMetadataList { unsigned NumFwdRefs; bool AnyFwdRefs; unsigned MinFwdRef; unsigned MaxFwdRef; - std::vector<TrackingMDRef> MDValuePtrs; + std::vector<TrackingMDRef> MetadataPtrs; LLVMContext &Context; public: - BitcodeReaderMDValueList(LLVMContext &C) + BitcodeReaderMetadataList(LLVMContext &C) : NumFwdRefs(0), AnyFwdRefs(false), Context(C) {} // vector compatibility methods - unsigned size() const { return MDValuePtrs.size(); } - void resize(unsigned N) { MDValuePtrs.resize(N); } - void push_back(Metadata *MD) { MDValuePtrs.emplace_back(MD); } - void clear() { MDValuePtrs.clear(); } - Metadata *back() const { return MDValuePtrs.back(); } - void pop_back() { MDValuePtrs.pop_back(); } - bool empty() const { return MDValuePtrs.empty(); } + unsigned size() const { return MetadataPtrs.size(); } + void resize(unsigned N) { MetadataPtrs.resize(N); } + void push_back(Metadata *MD) { MetadataPtrs.emplace_back(MD); } + void clear() { MetadataPtrs.clear(); } + Metadata *back() const { return MetadataPtrs.back(); } + void pop_back() { MetadataPtrs.pop_back(); } + bool empty() const { return MetadataPtrs.empty(); } Metadata *operator[](unsigned i) const { - assert(i < MDValuePtrs.size()); - return MDValuePtrs[i]; + assert(i < MetadataPtrs.size()); + return MetadataPtrs[i]; } void shrinkTo(unsigned N) { assert(N <= size() && "Invalid shrinkTo request!"); - MDValuePtrs.resize(N); + MetadataPtrs.resize(N); } Metadata *getValueFwdRef(unsigned Idx); @@ -131,17 +132,27 @@ public: class BitcodeReader : public GVMaterializer { LLVMContext &Context; - DiagnosticHandlerFunction DiagnosticHandler; Module *TheModule = nullptr; std::unique_ptr<MemoryBuffer> Buffer; std::unique_ptr<BitstreamReader> StreamFile; BitstreamCursor Stream; + // Next offset to start scanning for lazy parsing of function bodies. uint64_t NextUnreadBit = 0; + // Last function offset found in the VST. + uint64_t LastFunctionBlockBit = 0; bool SeenValueSymbolTable = false; + uint64_t VSTOffset = 0; + // Contains an arbitrary and optional string identifying the bitcode producer + std::string ProducerIdentification; + // Number of module level metadata records specified by the + // MODULE_CODE_METADATA_VALUES record. + unsigned NumModuleMDs = 0; + // Support older bitcode without the MODULE_CODE_METADATA_VALUES record. 
+ bool SeenModuleValuesRecord = false; std::vector<Type*> TypeList; BitcodeReaderValueList ValueList; - BitcodeReaderMDValueList MDValueList; + BitcodeReaderMetadataList MetadataList; std::vector<Comdat *> ComdatList; SmallVector<Instruction *, 64> InstructionList; @@ -157,7 +168,7 @@ class BitcodeReader : public GVMaterializer { /// is thus not represented here. As such all indices are off by one. std::vector<AttributeSet> MAttributes; - /// \brief The set of attribute groups. + /// The set of attribute groups. std::map<unsigned, AttributeSet> MAttributeGroups; /// While parsing a function body, this is a list of the basic blocks for the @@ -208,23 +219,24 @@ class BitcodeReader : public GVMaterializer { /// (e.g.) blockaddress forward references. bool WillMaterializeAllForwardRefs = false; - /// Functions that have block addresses taken. This is usually empty. - SmallPtrSet<const Function *, 4> BlockAddressesTaken; - /// True if any Metadata block has been materialized. bool IsMetadataMaterialized = false; bool StripDebugInfo = false; + /// Functions that need to be matched with subprograms when upgrading old + /// metadata. + SmallDenseMap<Function *, DISubprogram *, 16> FunctionsWithSPs; + + std::vector<std::string> BundleTags; + public: std::error_code error(BitcodeError E, const Twine &Message); std::error_code error(BitcodeError E); std::error_code error(const Twine &Message); - BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler); - BitcodeReader(LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler); + BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context); + BitcodeReader(LLVMContext &Context); ~BitcodeReader() override { freeState(); } std::error_code materializeForwardReferencedFunctions(); @@ -233,11 +245,9 @@ public: void releaseBuffer(); - bool isDematerializable(const GlobalValue *GV) const override; std::error_code materialize(GlobalValue *GV) override; - std::error_code materializeModule(Module *M) override; + std::error_code materializeModule() override; std::vector<StructType *> getIdentifiedStructTypes() const override; - void dematerialize(GlobalValue *GV) override; /// \brief Main interface to parsing a bitcode buffer. /// \returns true if an error occurred. @@ -249,6 +259,9 @@ public: /// \returns true if an error occurred. ErrorOr<std::string> parseTriple(); + /// Cheap mechanism to just extract the identification block out of bitcode. + ErrorOr<std::string> parseIdentificationBlock(); + static uint64_t decodeSignRotatedValue(uint64_t V); /// Materialize any deferred Metadata block. @@ -256,7 +269,20 @@ public: void setStripDebugInfo() override; + /// Save the mapping between the metadata values and the corresponding + /// value id that were recorded in the MetadataList during parsing. If + /// OnlyTempMD is true, then only record those entries that are still + /// temporary metadata. This interface is used when metadata linking is + /// performed as a postpass, such as during function importing. + void saveMetadataList(DenseMap<const Metadata *, unsigned> &MetadataToIDs, + bool OnlyTempMD) override; + private: + /// Parse the "IDENTIFICATION_BLOCK_ID" block, populate the + // ProducerIdentification data member, and do some basic enforcement on the + // "epoch" encoded in the bitcode. 
+ std::error_code parseBitcodeVersion(); + std::vector<StructType *> IdentifiedStructTypes; StructType *createIdentifiedStructType(LLVMContext &Context, StringRef Name); StructType *createIdentifiedStructType(LLVMContext &Context); @@ -268,7 +294,7 @@ private: return ValueList.getValueFwdRef(ID, Ty); } Metadata *getFnMetadataByID(unsigned ID) { - return MDValueList.getValueFwdRef(ID); + return MetadataList.getValueFwdRef(ID); } BasicBlock *getBasicBlock(unsigned ID) const { if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID @@ -351,21 +377,28 @@ private: /// a corresponding error code. std::error_code parseAlignmentValue(uint64_t Exponent, unsigned &Alignment); std::error_code parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); - std::error_code parseModule(bool Resume, bool ShouldLazyLoadMetadata = false); + std::error_code parseModule(uint64_t ResumeBit, + bool ShouldLazyLoadMetadata = false); std::error_code parseAttributeBlock(); std::error_code parseAttributeGroupBlock(); std::error_code parseTypeTable(); std::error_code parseTypeTableBody(); + std::error_code parseOperandBundleTags(); - std::error_code parseValueSymbolTable(); + ErrorOr<Value *> recordValue(SmallVectorImpl<uint64_t> &Record, + unsigned NameIndex, Triple &TT); + std::error_code parseValueSymbolTable(uint64_t Offset = 0); std::error_code parseConstants(); + std::error_code rememberAndSkipFunctionBodies(); std::error_code rememberAndSkipFunctionBody(); /// Save the positions of the Metadata blocks and skip parsing the blocks. std::error_code rememberAndSkipMetadata(); std::error_code parseFunctionBody(Function *F); std::error_code globalCleanup(); std::error_code resolveGlobalAndAliasInits(); - std::error_code parseMetadata(); + std::error_code parseMetadata(bool ModuleLevel = false); + std::error_code parseMetadataKinds(); + std::error_code parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record); std::error_code parseMetadataAttachment(Function &F); ErrorOr<std::string> parseModuleTriple(); std::error_code parseUseLists(); @@ -376,6 +409,94 @@ private: Function *F, DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator); }; + +/// Class to manage reading and parsing function summary index bitcode +/// files/sections. +class FunctionIndexBitcodeReader { + DiagnosticHandlerFunction DiagnosticHandler; + + /// Eventually points to the function index built during parsing. + FunctionInfoIndex *TheIndex = nullptr; + + std::unique_ptr<MemoryBuffer> Buffer; + std::unique_ptr<BitstreamReader> StreamFile; + BitstreamCursor Stream; + + /// \brief Used to indicate whether we are doing lazy parsing of summary data. + /// + /// If false, the summary section is fully parsed into the index during + /// the initial parse. Otherwise, if true, the caller is expected to + /// invoke \a readFunctionSummary for each summary needed, and the summary + /// section is thus parsed lazily. + bool IsLazy = false; + + /// Used to indicate whether caller only wants to check for the presence + /// of the function summary bitcode section. All blocks are skipped, + /// but the SeenFuncSummary boolean is set. + bool CheckFuncSummaryPresenceOnly = false; + + /// Indicates whether we have encountered a function summary section + /// yet during parsing, used when checking if file contains function + /// summary section. + bool SeenFuncSummary = false; + + /// \brief Map populated during function summary section parsing, and + /// consumed during ValueSymbolTable parsing. 
+ /// + /// Used to correlate summary records with VST entries. For the per-module + /// index this maps the ValueID to the parsed function summary, and + /// for the combined index this maps the summary record's bitcode + /// offset to the function summary (since in the combined index the + /// VST records do not hold value IDs but rather hold the function + /// summary record offset). + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>> SummaryMap; + + /// Map populated during module path string table parsing, from the + /// module ID to a string reference owned by the index's module + /// path string table, used to correlate with combined index function + /// summary records. + DenseMap<uint64_t, StringRef> ModuleIdMap; + +public: + std::error_code error(BitcodeError E, const Twine &Message); + std::error_code error(BitcodeError E); + std::error_code error(const Twine &Message); + + FunctionIndexBitcodeReader(MemoryBuffer *Buffer, + DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy = false, + bool CheckFuncSummaryPresenceOnly = false); + FunctionIndexBitcodeReader(DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy = false, + bool CheckFuncSummaryPresenceOnly = false); + ~FunctionIndexBitcodeReader() { freeState(); } + + void freeState(); + + void releaseBuffer(); + + /// Check if the parser has encountered a function summary section. + bool foundFuncSummary() { return SeenFuncSummary; } + + /// \brief Main interface to parsing a bitcode buffer. + /// \returns true if an error occurred. + std::error_code parseSummaryIndexInto(std::unique_ptr<DataStreamer> Streamer, + FunctionInfoIndex *I); + + /// \brief Interface for parsing a function summary lazily. + std::error_code parseFunctionSummary(std::unique_ptr<DataStreamer> Streamer, + FunctionInfoIndex *I, + size_t FunctionSummaryOffset); + +private: + std::error_code parseModule(); + std::error_code parseValueSymbolTable(); + std::error_code parseEntireSummary(); + std::error_code parseModuleStringTable(); + std::error_code initStream(std::unique_ptr<DataStreamer> Streamer); + std::error_code initStreamFromBuffer(); + std::error_code initLazyStream(std::unique_ptr<DataStreamer> Streamer); +}; } // namespace BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC, @@ -397,43 +518,51 @@ static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler, return error(DiagnosticHandler, EC, EC.message()); } -static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler, +static std::error_code error(LLVMContext &Context, std::error_code EC, const Twine &Message) { - return error(DiagnosticHandler, - make_error_code(BitcodeError::CorruptedBitcode), Message); + return error([&](const DiagnosticInfo &DI) { Context.diagnose(DI); }, EC, + Message); +} + +static std::error_code error(LLVMContext &Context, std::error_code EC) { + return error(Context, EC, EC.message()); +} + +static std::error_code error(LLVMContext &Context, const Twine &Message) { + return error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message); } std::error_code BitcodeReader::error(BitcodeError E, const Twine &Message) { - return ::error(DiagnosticHandler, make_error_code(E), Message); + if (!ProducerIdentification.empty()) { + return ::error(Context, make_error_code(E), + Message + " (Producer: '" + ProducerIdentification + + "' Reader: 'LLVM " + LLVM_VERSION_STRING "')"); + } + return ::error(Context, make_error_code(E), Message); } std::error_code BitcodeReader::error(const Twine &Message) { - return 
::error(DiagnosticHandler, - make_error_code(BitcodeError::CorruptedBitcode), Message); + if (!ProducerIdentification.empty()) { + return ::error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message + " (Producer: '" + ProducerIdentification + + "' Reader: 'LLVM " + LLVM_VERSION_STRING "')"); + } + return ::error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message); } std::error_code BitcodeReader::error(BitcodeError E) { - return ::error(DiagnosticHandler, make_error_code(E)); -} - -static DiagnosticHandlerFunction getDiagHandler(DiagnosticHandlerFunction F, - LLVMContext &C) { - if (F) - return F; - return [&C](const DiagnosticInfo &DI) { C.diagnose(DI); }; + return ::error(Context, make_error_code(E)); } -BitcodeReader::BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), - DiagnosticHandler(getDiagHandler(DiagnosticHandler, Context)), - Buffer(Buffer), ValueList(Context), MDValueList(Context) {} +BitcodeReader::BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context) + : Context(Context), Buffer(Buffer), ValueList(Context), + MetadataList(Context) {} -BitcodeReader::BitcodeReader(LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), - DiagnosticHandler(getDiagHandler(DiagnosticHandler, Context)), - Buffer(nullptr), ValueList(Context), MDValueList(Context) {} +BitcodeReader::BitcodeReader(LLVMContext &Context) + : Context(Context), Buffer(nullptr), ValueList(Context), + MetadataList(Context) {} std::error_code BitcodeReader::materializeForwardReferencedFunctions() { if (WillMaterializeAllForwardRefs) @@ -472,7 +601,7 @@ void BitcodeReader::freeState() { Buffer = nullptr; std::vector<Type*>().swap(TypeList); ValueList.clear(); - MDValueList.clear(); + MetadataList.clear(); std::vector<Comdat *>().swap(ComdatList); std::vector<AttributeSet>().swap(MAttributes); @@ -779,6 +908,8 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) { OldV->replaceAllUsesWith(V); delete PrevVal; } + + return; } @@ -904,7 +1035,7 @@ void BitcodeReaderValueList::resolveConstantForwardRefs() { } } -void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { +void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) { if (Idx == size()) { push_back(MD); return; @@ -913,7 +1044,7 @@ void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { if (Idx >= size()) resize(Idx+1); - TrackingMDRef &OldMD = MDValuePtrs[Idx]; + TrackingMDRef &OldMD = MetadataPtrs[Idx]; if (!OldMD) { OldMD.reset(MD); return; @@ -925,11 +1056,11 @@ void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { --NumFwdRefs; } -Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { +Metadata *BitcodeReaderMetadataList::getValueFwdRef(unsigned Idx) { if (Idx >= size()) resize(Idx + 1); - if (Metadata *MD = MDValuePtrs[Idx]) + if (Metadata *MD = MetadataPtrs[Idx]) return MD; // Track forward refs to be resolved later. @@ -944,11 +1075,11 @@ Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { // Create and return a placeholder, which will later be RAUW'd. Metadata *MD = MDNode::getTemporary(Context, None).release(); - MDValuePtrs[Idx].reset(MD); + MetadataPtrs[Idx].reset(MD); return MD; } -void BitcodeReaderMDValueList::tryToResolveCycles() { +void BitcodeReaderMetadataList::tryToResolveCycles() { if (!AnyFwdRefs) // Nothing to do. 
return; @@ -959,7 +1090,7 @@ void BitcodeReaderMDValueList::tryToResolveCycles() { // Resolve any cycles. for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) { - auto &MD = MDValuePtrs[I]; + auto &MD = MetadataPtrs[I]; auto *N = dyn_cast_or_null<MDNode>(MD); if (!N) continue; @@ -1102,6 +1233,10 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Cold; case bitc::ATTR_KIND_CONVERGENT: return Attribute::Convergent; + case bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY: + return Attribute::InaccessibleMemOnly; + case bitc::ATTR_KIND_INACCESSIBLEMEM_OR_ARGMEMONLY: + return Attribute::InaccessibleMemOrArgMemOnly; case bitc::ATTR_KIND_INLINE_HINT: return Attribute::InlineHint; case bitc::ATTR_KIND_IN_REG: @@ -1126,6 +1261,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoImplicitFloat; case bitc::ATTR_KIND_NO_INLINE: return Attribute::NoInline; + case bitc::ATTR_KIND_NO_RECURSE: + return Attribute::NoRecurse; case bitc::ATTR_KIND_NON_LAZY_BIND: return Attribute::NonLazyBind; case bitc::ATTR_KIND_NON_NULL: @@ -1360,6 +1497,9 @@ std::error_code BitcodeReader::parseTypeTableBody() { case bitc::TYPE_CODE_X86_MMX: // X86_MMX ResultTy = Type::getX86_MMXTy(Context); break; + case bitc::TYPE_CODE_TOKEN: // TOKEN + ResultTy = Type::getTokenTy(Context); + break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] if (Record.size() < 1) return error("Invalid record"); @@ -1524,7 +1664,107 @@ std::error_code BitcodeReader::parseTypeTableBody() { } } -std::error_code BitcodeReader::parseValueSymbolTable() { +std::error_code BitcodeReader::parseOperandBundleTags() { + if (Stream.EnterSubBlock(bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID)) + return error("Invalid record"); + + if (!BundleTags.empty()) + return error("Invalid multiple blocks"); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Tags are implicitly mapped to integers by their order. + + if (Stream.readRecord(Entry.ID, Record) != bitc::OPERAND_BUNDLE_TAG) + return error("Invalid record"); + + // OPERAND_BUNDLE_TAG: [strchr x N] + BundleTags.emplace_back(); + if (convertToString(Record, 0, BundleTags.back())) + return error("Invalid record"); + Record.clear(); + } +} + +/// Associate a value with its name from the given index in the provided record. 
+ErrorOr<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record, + unsigned NameIndex, Triple &TT) { + SmallString<128> ValueName; + if (convertToString(Record, NameIndex, ValueName)) + return error("Invalid record"); + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size() || !ValueList[ValueID]) + return error("Invalid record"); + Value *V = ValueList[ValueID]; + + StringRef NameStr(ValueName.data(), ValueName.size()); + if (NameStr.find_first_of(0) != StringRef::npos) + return error("Invalid value name"); + V->setName(NameStr); + auto *GO = dyn_cast<GlobalObject>(V); + if (GO) { + if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { + if (TT.isOSBinFormatMachO()) + GO->setComdat(nullptr); + else + GO->setComdat(TheModule->getOrInsertComdat(V->getName())); + } + } + return V; +} + +/// Parse the value symbol table at either the current parsing location or +/// at the given bit offset if provided. +std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) { + uint64_t CurrentBit; + // Pass in the Offset to distinguish between calling for the module-level + // VST (where we want to jump to the VST offset) and the function-level + // VST (where we don't). + if (Offset > 0) { + // Save the current parsing location so we can jump back at the end + // of the VST read. + CurrentBit = Stream.GetCurrentBitNo(); + Stream.JumpToBit(Offset * 32); +#ifndef NDEBUG + // Do some checking if we are in debug mode. + BitstreamEntry Entry = Stream.advance(); + assert(Entry.Kind == BitstreamEntry::SubBlock); + assert(Entry.ID == bitc::VALUE_SYMTAB_BLOCK_ID); +#else + // In NDEBUG mode ignore the output so we don't get an unused variable + // warning. + Stream.advance(); +#endif + } + + // Compute the delta between the bitcode indices in the VST (the word offset + // to the word-aligned ENTER_SUBBLOCK for the function block, and that + // expected by the lazy reader. The reader's EnterSubBlock expects to have + // already read the ENTER_SUBBLOCK code (size getAbbrevIDWidth) and BlockID + // (size BlockIDWidth). Note that we access the stream's AbbrevID width here + // just before entering the VST subblock because: 1) the EnterSubBlock + // changes the AbbrevID width; 2) the VST block is nested within the same + // outer MODULE_BLOCK as the FUNCTION_BLOCKs and therefore have the same + // AbbrevID width before calling EnterSubBlock; and 3) when we want to + // jump to the FUNCTION_BLOCK using this offset later, we don't want + // to rely on the stream's AbbrevID width being that of the MODULE_BLOCK. + unsigned FuncBitcodeOffsetDelta = + Stream.getAbbrevIDWidth() + bitc::BlockIDWidth; + if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) return error("Invalid record"); @@ -1542,6 +1782,8 @@ std::error_code BitcodeReader::parseValueSymbolTable() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: + if (Offset > 0) + Stream.JumpToBit(CurrentBit); return std::error_code(); case BitstreamEntry::Record: // The interesting case. @@ -1554,23 +1796,39 @@ std::error_code BitcodeReader::parseValueSymbolTable() { default: // Default behavior: unknown type. 
break; case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N] - if (convertToString(Record, 1, ValueName)) - return error("Invalid record"); - unsigned ValueID = Record[0]; - if (ValueID >= ValueList.size() || !ValueList[ValueID]) - return error("Invalid record"); - Value *V = ValueList[ValueID]; - - V->setName(StringRef(ValueName.data(), ValueName.size())); - if (auto *GO = dyn_cast<GlobalObject>(V)) { - if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { - if (TT.isOSBinFormatMachO()) - GO->setComdat(nullptr); - else - GO->setComdat(TheModule->getOrInsertComdat(V->getName())); - } + ErrorOr<Value *> ValOrErr = recordValue(Record, 1, TT); + if (std::error_code EC = ValOrErr.getError()) + return EC; + ValOrErr.get(); + break; + } + case bitc::VST_CODE_FNENTRY: { + // VST_FNENTRY: [valueid, offset, namechar x N] + ErrorOr<Value *> ValOrErr = recordValue(Record, 2, TT); + if (std::error_code EC = ValOrErr.getError()) + return EC; + Value *V = ValOrErr.get(); + + auto *GO = dyn_cast<GlobalObject>(V); + if (!GO) { + // If this is an alias, need to get the actual Function object + // it aliases, in order to set up the DeferredFunctionInfo entry below. + auto *GA = dyn_cast<GlobalAlias>(V); + if (GA) + GO = GA->getBaseObject(); + assert(GO); } - ValueName.clear(); + + uint64_t FuncWordOffset = Record[1]; + Function *F = dyn_cast<Function>(GO); + assert(F); + uint64_t FuncBitOffset = FuncWordOffset * 32; + DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta; + // Set the LastFunctionBlockBit to point to the last function block. + // Later when parsing is resumed after function materialization, + // we can simply skip that last function block. + if (FuncBitOffset > LastFunctionBlockBit) + LastFunctionBlockBit = FuncBitOffset; break; } case bitc::VST_CODE_BBENTRY: { @@ -1588,19 +1846,51 @@ std::error_code BitcodeReader::parseValueSymbolTable() { } } +/// Parse a single METADATA_KIND record, inserting result in MDKindMap. +std::error_code +BitcodeReader::parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record) { + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Kind = Record[0]; + SmallString<8> Name(Record.begin() + 1, Record.end()); + + unsigned NewKind = TheModule->getMDKindID(Name.str()); + if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) + return error("Conflicting METADATA_KIND records"); + return std::error_code(); +} + static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; } -std::error_code BitcodeReader::parseMetadata() { +/// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing +/// module level metadata. +std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) { IsMetadataMaterialized = true; - unsigned NextMDValueNo = MDValueList.size(); + unsigned NextMetadataNo = MetadataList.size(); + if (ModuleLevel && SeenModuleValuesRecord) { + // Now that we are parsing the module level metadata, we want to restart + // the numbering of the MD values, and replace temp MD created earlier + // with their real values. If we saw a METADATA_VALUE record then we + // would have set the MetadataList size to the number specified in that + // record, to support parsing function-level metadata first, and we need + // to reset back to 0 to fill the MetadataList in with the parsed module + // The function-level metadata parsing should have reset the MetadataList + // size back to the value reported by the METADATA_VALUE record, saved in + // NumModuleMDs. 
+ assert(NumModuleMDs == MetadataList.size() && + "Expected MetadataList to only contain module level values"); + NextMetadataNo = 0; + } if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return error("Invalid record"); SmallVector<uint64_t, 64> Record; - auto getMD = - [&](unsigned ID) -> Metadata *{ return MDValueList.getValueFwdRef(ID); }; + auto getMD = [&](unsigned ID) -> Metadata * { + return MetadataList.getValueFwdRef(ID); + }; auto getMDOrNull = [&](unsigned ID) -> Metadata *{ if (ID) return getMD(ID - 1); @@ -1624,7 +1914,10 @@ std::error_code BitcodeReader::parseMetadata() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - MDValueList.tryToResolveCycles(); + MetadataList.tryToResolveCycles(); + assert((!(ModuleLevel && SeenModuleValuesRecord) || + NumModuleMDs == MetadataList.size()) && + "Inconsistent bitcode: METADATA_VALUES mismatch"); return std::error_code(); case BitstreamEntry::Record: // The interesting case. @@ -1652,7 +1945,8 @@ std::error_code BitcodeReader::parseMetadata() { unsigned Size = Record.size(); NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name); for (unsigned i = 0; i != Size; ++i) { - MDNode *MD = dyn_cast_or_null<MDNode>(MDValueList.getValueFwdRef(Record[i])); + MDNode *MD = + dyn_cast_or_null<MDNode>(MetadataList.getValueFwdRef(Record[i])); if (!MD) return error("Invalid record"); NMD->addOperand(MD); @@ -1669,7 +1963,7 @@ std::error_code BitcodeReader::parseMetadata() { // If this isn't a LocalAsMetadata record, we're dropping it. This used // to be legal, but there's no upgrade path. auto dropRecord = [&] { - MDValueList.assignValue(MDNode::get(Context, None), NextMDValueNo++); + MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo++); }; if (Record.size() != 2) { dropRecord(); @@ -1682,9 +1976,9 @@ std::error_code BitcodeReader::parseMetadata() { break; } - MDValueList.assignValue( + MetadataList.assignValue( LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_OLD_NODE: { @@ -1699,7 +1993,7 @@ std::error_code BitcodeReader::parseMetadata() { if (!Ty) return error("Invalid record"); if (Ty->isMetadataTy()) - Elts.push_back(MDValueList.getValueFwdRef(Record[i+1])); + Elts.push_back(MetadataList.getValueFwdRef(Record[i + 1])); else if (!Ty->isVoidTy()) { auto *MD = ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty)); @@ -1709,7 +2003,7 @@ std::error_code BitcodeReader::parseMetadata() { } else Elts.push_back(nullptr); } - MDValueList.assignValue(MDNode::get(Context, Elts), NextMDValueNo++); + MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo++); break; } case bitc::METADATA_VALUE: { @@ -1720,9 +2014,9 @@ std::error_code BitcodeReader::parseMetadata() { if (Ty->isMetadataTy() || Ty->isVoidTy()) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_DISTINCT_NODE: @@ -1732,10 +2026,10 @@ std::error_code BitcodeReader::parseMetadata() { SmallVector<Metadata *, 8> Elts; Elts.reserve(Record.size()); for (unsigned ID : Record) - Elts.push_back(ID ? MDValueList.getValueFwdRef(ID - 1) : nullptr); - MDValueList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts) - : MDNode::get(Context, Elts), - NextMDValueNo++); + Elts.push_back(ID ? 
MetadataList.getValueFwdRef(ID - 1) : nullptr); + MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts) + : MDNode::get(Context, Elts), + NextMetadataNo++); break; } case bitc::METADATA_LOCATION: { @@ -1744,13 +2038,13 @@ std::error_code BitcodeReader::parseMetadata() { unsigned Line = Record[1]; unsigned Column = Record[2]; - MDNode *Scope = cast<MDNode>(MDValueList.getValueFwdRef(Record[3])); + MDNode *Scope = cast<MDNode>(MetadataList.getValueFwdRef(Record[3])); Metadata *InlinedAt = - Record[4] ? MDValueList.getValueFwdRef(Record[4] - 1) : nullptr; - MDValueList.assignValue( + Record[4] ? MetadataList.getValueFwdRef(Record[4] - 1) : nullptr; + MetadataList.assignValue( GET_OR_DISTINCT(DILocation, Record[0], (Context, Line, Column, Scope, InlinedAt)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_GENERIC_DEBUG: { @@ -1766,63 +2060,65 @@ std::error_code BitcodeReader::parseMetadata() { auto *Header = getMDString(Record[3]); SmallVector<Metadata *, 8> DwarfOps; for (unsigned I = 4, E = Record.size(); I != E; ++I) - DwarfOps.push_back(Record[I] ? MDValueList.getValueFwdRef(Record[I] - 1) - : nullptr); - MDValueList.assignValue(GET_OR_DISTINCT(GenericDINode, Record[0], - (Context, Tag, Header, DwarfOps)), - NextMDValueNo++); + DwarfOps.push_back( + Record[I] ? MetadataList.getValueFwdRef(Record[I] - 1) : nullptr); + MetadataList.assignValue( + GET_OR_DISTINCT(GenericDINode, Record[0], + (Context, Tag, Header, DwarfOps)), + NextMetadataNo++); break; } case bitc::METADATA_SUBRANGE: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DISubrange, Record[0], (Context, Record[1], unrotateSign(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_ENUMERATOR: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue(GET_OR_DISTINCT(DIEnumerator, Record[0], - (Context, unrotateSign(Record[1]), - getMDString(Record[2]))), - NextMDValueNo++); + MetadataList.assignValue( + GET_OR_DISTINCT( + DIEnumerator, Record[0], + (Context, unrotateSign(Record[1]), getMDString(Record[2]))), + NextMetadataNo++); break; } case bitc::METADATA_BASIC_TYPE: { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIBasicType, Record[0], (Context, Record[1], getMDString(Record[2]), Record[3], Record[4], Record[5])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_DERIVED_TYPE: { if (Record.size() != 12) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIDerivedType, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), Record[4], getMDOrNull(Record[5]), getMDOrNull(Record[6]), Record[7], Record[8], Record[9], Record[10], getMDOrNull(Record[11]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_COMPOSITE_TYPE: { if (Record.size() != 16) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DICompositeType, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), Record[4], @@ -1831,17 +2127,17 @@ std::error_code BitcodeReader::parseMetadata() { getMDOrNull(Record[11]), Record[12], getMDOrNull(Record[13]), getMDOrNull(Record[14]), getMDString(Record[15]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_SUBROUTINE_TYPE: { if (Record.size() != 3) return 
error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DISubroutineType, Record[0], (Context, Record[1], getMDOrNull(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } @@ -1849,12 +2145,12 @@ std::error_code BitcodeReader::parseMetadata() { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIModule, Record[0], (Context, getMDOrNull(Record[1]), - getMDString(Record[2]), getMDString(Record[3]), - getMDString(Record[4]), getMDString(Record[5]))), - NextMDValueNo++); + getMDString(Record[2]), getMDString(Record[3]), + getMDString(Record[4]), getMDString(Record[5]))), + NextMetadataNo++); break; } @@ -1862,185 +2158,260 @@ std::error_code BitcodeReader::parseMetadata() { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIFile, Record[0], (Context, getMDString(Record[1]), getMDString(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_COMPILE_UNIT: { - if (Record.size() < 14 || Record.size() > 15) + if (Record.size() < 14 || Record.size() > 16) return error("Invalid record"); - MDValueList.assignValue( - GET_OR_DISTINCT( - DICompileUnit, Record[0], - (Context, Record[1], getMDOrNull(Record[2]), - getMDString(Record[3]), Record[4], getMDString(Record[5]), - Record[6], getMDString(Record[7]), Record[8], - getMDOrNull(Record[9]), getMDOrNull(Record[10]), - getMDOrNull(Record[11]), getMDOrNull(Record[12]), - getMDOrNull(Record[13]), Record.size() == 14 ? 0 : Record[14])), - NextMDValueNo++); + // Ignore Record[0], which indicates whether this compile unit is + // distinct. It's always distinct. + MetadataList.assignValue( + DICompileUnit::getDistinct( + Context, Record[1], getMDOrNull(Record[2]), + getMDString(Record[3]), Record[4], getMDString(Record[5]), + Record[6], getMDString(Record[7]), Record[8], + getMDOrNull(Record[9]), getMDOrNull(Record[10]), + getMDOrNull(Record[11]), getMDOrNull(Record[12]), + getMDOrNull(Record[13]), + Record.size() <= 15 ? 0 : getMDOrNull(Record[15]), + Record.size() <= 14 ? 0 : Record[14]), + NextMetadataNo++); break; } case bitc::METADATA_SUBPROGRAM: { - if (Record.size() != 19) - return error("Invalid record"); - - MDValueList.assignValue( - GET_OR_DISTINCT( - DISubprogram, Record[0], - (Context, getMDOrNull(Record[1]), getMDString(Record[2]), - getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], - getMDOrNull(Record[6]), Record[7], Record[8], Record[9], - getMDOrNull(Record[10]), Record[11], Record[12], Record[13], - Record[14], getMDOrNull(Record[15]), getMDOrNull(Record[16]), - getMDOrNull(Record[17]), getMDOrNull(Record[18]))), - NextMDValueNo++); + if (Record.size() != 18 && Record.size() != 19) + return error("Invalid record"); + + bool HasFn = Record.size() == 19; + DISubprogram *SP = GET_OR_DISTINCT( + DISubprogram, + Record[0] || Record[8], // All definitions should be distinct. + (Context, getMDOrNull(Record[1]), getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getMDOrNull(Record[6]), Record[7], Record[8], Record[9], + getMDOrNull(Record[10]), Record[11], Record[12], Record[13], + Record[14], getMDOrNull(Record[15 + HasFn]), + getMDOrNull(Record[16 + HasFn]), getMDOrNull(Record[17 + HasFn]))); + MetadataList.assignValue(SP, NextMetadataNo++); + + // Upgrade sp->function mapping to function->sp mapping. 
+ if (HasFn && Record[15]) { + if (auto *CMD = dyn_cast<ConstantAsMetadata>(getMDOrNull(Record[15]))) + if (auto *F = dyn_cast<Function>(CMD->getValue())) { + if (F->isMaterializable()) + // Defer until materialized; unmaterialized functions may not have + // metadata. + FunctionsWithSPs[F] = SP; + else if (!F->empty()) + F->setSubprogram(SP); + } + } break; } case bitc::METADATA_LEXICAL_BLOCK: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DILexicalBlock, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), Record[3], Record[4])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_LEXICAL_BLOCK_FILE: { if (Record.size() != 4) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DILexicalBlockFile, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), Record[3])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_NAMESPACE: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DINamespace, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), getMDString(Record[3]), Record[4])), - NextMDValueNo++); + NextMetadataNo++); + break; + } + case bitc::METADATA_MACRO: { + if (Record.size() != 5) + return error("Invalid record"); + + MetadataList.assignValue( + GET_OR_DISTINCT(DIMacro, Record[0], + (Context, Record[1], Record[2], + getMDString(Record[3]), getMDString(Record[4]))), + NextMetadataNo++); + break; + } + case bitc::METADATA_MACRO_FILE: { + if (Record.size() != 5) + return error("Invalid record"); + + MetadataList.assignValue( + GET_OR_DISTINCT(DIMacroFile, Record[0], + (Context, Record[1], Record[2], + getMDOrNull(Record[3]), getMDOrNull(Record[4]))), + NextMetadataNo++); break; } case bitc::METADATA_TEMPLATE_TYPE: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter, - Record[0], - (Context, getMDString(Record[1]), - getMDOrNull(Record[2]))), - NextMDValueNo++); + MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter, + Record[0], + (Context, getMDString(Record[1]), + getMDOrNull(Record[2]))), + NextMetadataNo++); break; } case bitc::METADATA_TEMPLATE_VALUE: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DITemplateValueParameter, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), getMDOrNull(Record[4]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_GLOBAL_VAR: { if (Record.size() != 11) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIGlobalVariable, Record[0], (Context, getMDOrNull(Record[1]), getMDString(Record[2]), getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], getMDOrNull(Record[6]), Record[7], Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_LOCAL_VAR: { // 10th field is for the obsoleted 'inlinedAt:' field. - if (Record.size() != 9 && Record.size() != 10) + if (Record.size() < 8 || Record.size() > 10) return error("Invalid record"); - MDValueList.assignValue( + // 2nd field used to be an artificial tag, either DW_TAG_auto_variable or + // DW_TAG_arg_variable.
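// Aside: the reader cannot call F->setSubprogram() on a function whose body
// has not been materialized yet, so the upgrade above parks the mapping in
// FunctionsWithSPs and finishes the job in materialize() later in this patch.
// A minimal standalone sketch of that defer-until-ready pattern follows; the
// types are toys, not the LLVM API, and only the FunctionsWithSPs and
// setSubprogram names are borrowed from the patch.

#include <cassert>
#include <map>
#include <string>

// Toy stand-ins for llvm::DISubprogram and llvm::Function.
struct Subprogram { std::string Name; };

struct Function {
  bool Materialized = false;
  const Subprogram *SP = nullptr;
  void setSubprogram(const Subprogram *S) { SP = S; }
};

// Upgrades that could not be applied eagerly.
static std::map<Function *, const Subprogram *> FunctionsWithSPs;

// Called while parsing metadata: attach now if the body is already
// available, otherwise remember the attachment for materialization time.
void recordSubprogram(Function &F, const Subprogram &SP) {
  if (!F.Materialized)
    FunctionsWithSPs[&F] = &SP; // defer: body not parsed yet
  else
    F.setSubprogram(&SP);       // apply immediately
}

// Called after a function body is parsed: finish any deferred attachment.
void finishMaterialize(Function &F) {
  F.Materialized = true;
  auto It = FunctionsWithSPs.find(&F);
  if (It != FunctionsWithSPs.end())
    F.setSubprogram(It->second);
}

int main() {
  Function F;
  Subprogram SP{"main"};
  recordSubprogram(F, SP); // deferred: F is not materialized yet
  assert(!F.SP);
  finishMaterialize(F);
  assert(F.SP == &SP);
}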
+ bool HasTag = Record.size() > 8; + MetadataList.assignValue( GET_OR_DISTINCT(DILocalVariable, Record[0], - (Context, Record[1], getMDOrNull(Record[2]), - getMDString(Record[3]), getMDOrNull(Record[4]), - Record[5], getMDOrNull(Record[6]), Record[7], - Record[8])), - NextMDValueNo++); + (Context, getMDOrNull(Record[1 + HasTag]), + getMDString(Record[2 + HasTag]), + getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag], + getMDOrNull(Record[5 + HasTag]), Record[6 + HasTag], + Record[7 + HasTag])), + NextMetadataNo++); break; } case bitc::METADATA_EXPRESSION: { if (Record.size() < 1) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIExpression, Record[0], (Context, makeArrayRef(Record).slice(1))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_OBJC_PROPERTY: { if (Record.size() != 8) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIObjCProperty, Record[0], (Context, getMDString(Record[1]), getMDOrNull(Record[2]), Record[3], getMDString(Record[4]), getMDString(Record[5]), Record[6], getMDOrNull(Record[7]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_IMPORTED_ENTITY: { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIImportedEntity, Record[0], (Context, Record[1], getMDOrNull(Record[2]), getMDOrNull(Record[3]), Record[4], getMDString(Record[5]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_STRING: { std::string String(Record.begin(), Record.end()); llvm::UpgradeMDStringConstant(String); Metadata *MD = MDString::get(Context, String); - MDValueList.assignValue(MD, NextMDValueNo++); + MetadataList.assignValue(MD, NextMetadataNo++); break; } case bitc::METADATA_KIND: { - if (Record.size() < 2) - return error("Invalid record"); + // Support older bitcode files that had METADATA_KIND records in a + // block with METADATA_BLOCK_ID. + if (std::error_code EC = parseMetadataKindRecord(Record)) + return EC; + break; + } + } + } +#undef GET_OR_DISTINCT +} - unsigned Kind = Record[0]; - SmallString<8> Name(Record.begin()+1, Record.end()); +/// Parse the metadata kinds out of the METADATA_KIND_BLOCK. +std::error_code BitcodeReader::parseMetadataKinds() { + if (Stream.EnterSubBlock(bitc::METADATA_KIND_BLOCK_ID)) + return error("Invalid record"); - unsigned NewKind = TheModule->getMDKindID(Name.str()); - if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) - return error("Conflicting METADATA_KIND records"); + SmallVector<uint64_t, 64> Record; + + // Read all the records. + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + unsigned Code = Stream.readRecord(Entry.ID, Record); + switch (Code) { + default: // Default behavior: ignore. 
+ break; + case bitc::METADATA_KIND: { + if (std::error_code EC = parseMetadataKindRecord(Record)) + return EC; break; } } } -#undef GET_OR_DISTINCT } /// Decode a signed value stored with the sign bit in the LSB for dense VBR @@ -2283,8 +2654,6 @@ std::error_code BitcodeReader::parseConstants() { return error("Invalid record"); Type *EltTy = cast<SequentialType>(CurTy)->getElementType(); - unsigned Size = Record.size(); - if (EltTy->isIntegerTy(8)) { SmallVector<uint8_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) @@ -2309,21 +2678,24 @@ std::error_code BitcodeReader::parseConstants() { V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); + } else if (EltTy->isHalfTy()) { + SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end()); + if (isa<VectorType>(CurTy)) + V = ConstantDataVector::getFP(Context, Elts); + else + V = ConstantDataArray::getFP(Context, Elts); } else if (EltTy->isFloatTy()) { - SmallVector<float, 16> Elts(Size); - std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat); + SmallVector<uint32_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) - V = ConstantDataVector::get(Context, Elts); + V = ConstantDataVector::getFP(Context, Elts); else - V = ConstantDataArray::get(Context, Elts); + V = ConstantDataArray::getFP(Context, Elts); } else if (EltTy->isDoubleTy()) { - SmallVector<double, 16> Elts(Size); - std::transform(Record.begin(), Record.end(), Elts.begin(), - BitsToDouble); + SmallVector<uint64_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) - V = ConstantDataVector::get(Context, Elts); + V = ConstantDataVector::getFP(Context, Elts); else - V = ConstantDataArray::get(Context, Elts); + V = ConstantDataArray::getFP(Context, Elts); } else { return error("Invalid type for value"); } @@ -2410,11 +2782,12 @@ std::error_code BitcodeReader::parseConstants() { Type *SelectorTy = Type::getInt1Ty(Context); - // If CurTy is a vector of length n, then Record[0] must be a <n x i1> - // vector. Otherwise, it must be a single bit. + // The selector might be an i1 or an <n x i1> + // Get the type from the ValueList before getting a forward ref. if (VectorType *VTy = dyn_cast<VectorType>(CurTy)) - SelectorTy = VectorType::get(Type::getInt1Ty(Context), - VTy->getNumElements()); + if (Value *V = ValueList[Record[0]]) + if (SelectorTy != V->getType()) + SelectorTy = VectorType::get(SelectorTy, VTy->getNumElements()); V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0], SelectorTy), @@ -2567,9 +2940,6 @@ std::error_code BitcodeReader::parseConstants() { if (!Fn) return error("Invalid record"); - // Don't let Fn get dematerialized. - BlockAddressesTaken.insert(Fn); - // If the function is already parsed we can insert the block address right // away. BasicBlock *BB; @@ -2584,7 +2954,7 @@ std::error_code BitcodeReader::parseConstants() { return error("Invalid ID"); ++BBI; } - BB = BBI; + BB = &*BBI; } else { // Otherwise insert a placeholder and remember it so it can be inserted // when the function is parsed. 
@@ -2652,7 +3022,7 @@ std::error_code BitcodeReader::parseUseLists() { V = ValueList[ID]; unsigned NumUses = 0; SmallDenseMap<const Use *, unsigned, 16> Order; - for (const Use &U : V->uses()) { + for (const Use &U : V->materialized_uses()) { if (++NumUses > Record.size()) break; Order[&U] = Record[NumUses - 1]; @@ -2688,7 +3058,7 @@ std::error_code BitcodeReader::materializeMetadata() { for (uint64_t BitPos : DeferredMetadataInfo) { // Move the bit stream to the saved position. Stream.JumpToBit(BitPos); - if (std::error_code EC = parseMetadata()) + if (std::error_code EC = parseMetadata(true)) return EC; } DeferredMetadataInfo.clear(); @@ -2697,6 +3067,35 @@ std::error_code BitcodeReader::materializeMetadata() { void BitcodeReader::setStripDebugInfo() { StripDebugInfo = true; } +void BitcodeReader::saveMetadataList( + DenseMap<const Metadata *, unsigned> &MetadataToIDs, bool OnlyTempMD) { + for (unsigned ID = 0; ID < MetadataList.size(); ++ID) { + Metadata *MD = MetadataList[ID]; + auto *N = dyn_cast_or_null<MDNode>(MD); + assert((!N || (N->isResolved() || N->isTemporary())) && + "Found non-resolved non-temp MDNode while saving metadata"); + // Save all values if !OnlyTempMD, otherwise just the temporary metadata. + // Note that in the !OnlyTempMD case we need to save all Metadata, not + // just MDNode, as we may have references to other types of module-level + // metadata (e.g. ValueAsMetadata) from instructions. + if (!OnlyTempMD || (N && N->isTemporary())) { + // Will call this after materializing each function, in order to + // handle remapping of the function's instructions/metadata. + // See if we already have an entry in that case. + if (OnlyTempMD && MetadataToIDs.count(MD)) { + assert(MetadataToIDs[MD] == ID && "Inconsistent metadata value id"); + continue; + } + if (N && N->isTemporary()) + // Ensure that we assert if someone tries to RAUW this temporary + // metadata while it is the key of a map. The flag will be set back + // to true when the saved metadata list is destroyed. + N->setCanReplace(false); + MetadataToIDs[MD] = ID; + } + } +} + /// When we see the block for a function body, remember where it is and then /// skip it. This lets us lazily deserialize the functions. std::error_code BitcodeReader::rememberAndSkipFunctionBody() { @@ -2709,6 +3108,9 @@ std::error_code BitcodeReader::rememberAndSkipFunctionBody() { // Save the current stream state. uint64_t CurBit = Stream.GetCurrentBitNo(); + assert( + (DeferredFunctionInfo[Fn] == 0 || DeferredFunctionInfo[Fn] == CurBit) && + "Mismatch between VST and scanned function offsets"); DeferredFunctionInfo[Fn] = CurBit; // Skip over the function block for now. @@ -2741,10 +3143,91 @@ std::error_code BitcodeReader::globalCleanup() { return std::error_code(); } -std::error_code BitcodeReader::parseModule(bool Resume, +/// Support for lazy parsing of function bodies. This is required if we +/// either have an old bitcode file without a VST forward declaration record, +/// or if we have an anonymous function being materialized, since anonymous +/// functions do not have a name and are therefore not in the VST. +std::error_code BitcodeReader::rememberAndSkipFunctionBodies() { + Stream.JumpToBit(NextUnreadBit); + + if (Stream.AtEndOfStream()) + return error("Could not find function in stream"); + + if (!SeenFirstFunctionBody) + return error("Trying to materialize functions before seeing function blocks"); + + // An old bitcode file with the symbol table at the end would have + // finished the parse greedily. 
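// Aside on saveMetadataList above: it snapshots MetadataList into a
// Metadata*-to-ID map so function importing can remap IDs later; with
// OnlyTempMD it records only temporary (forward-declared) nodes and pins
// them against RAUW while they serve as map keys. A toy version of that
// keyed-snapshot pattern in plain C++, with Node standing in for MDNode:

#include <cassert>
#include <map>
#include <vector>

// Toy node: "temporary" nodes are forward declarations that may later be
// replaced; non-temporary nodes are final.
struct Node {
  bool Temporary;
  bool CanReplace = true;
};

// Snapshot the list into Node*->ID, optionally keeping only temporaries.
// Temporary keys are pinned (CanReplace = false), mirroring the
// setCanReplace(false) guard above.
void saveList(const std::vector<Node *> &List,
              std::map<const Node *, unsigned> &NodeToID, bool OnlyTemp) {
  for (unsigned ID = 0; ID < List.size(); ++ID) {
    Node *N = List[ID];
    if (OnlyTemp && !N->Temporary)
      continue;
    auto It = NodeToID.find(N);
    if (It != NodeToID.end()) { // already saved on an earlier pass
      assert(It->second == ID && "Inconsistent ID");
      continue;
    }
    if (N->Temporary)
      N->CanReplace = false; // pin while used as a key
    NodeToID[N] = ID;
  }
}

int main() {
  Node A{false}, B{true};
  std::vector<Node *> List{&A, &B};
  std::map<const Node *, unsigned> M;
  saveList(List, M, /*OnlyTemp=*/true);
  assert(M.size() == 1 && M.at(&B) == 1 && !B.CanReplace);
}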
+ assert(SeenValueSymbolTable); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advance(); + switch (Entry.Kind) { + default: + return error("Expect SubBlock"); + case BitstreamEntry::SubBlock: + switch (Entry.ID) { + default: + return error("Expect function block"); + case bitc::FUNCTION_BLOCK_ID: + if (std::error_code EC = rememberAndSkipFunctionBody()) + return EC; + NextUnreadBit = Stream.GetCurrentBitNo(); + return std::error_code(); + } + } + } +} + +std::error_code BitcodeReader::parseBitcodeVersion() { + if (Stream.EnterSubBlock(bitc::IDENTIFICATION_BLOCK_ID)) + return error("Invalid record"); + + // Read all the records. + SmallVector<uint64_t, 64> Record; + while (1) { + BitstreamEntry Entry = Stream.advance(); + + switch (Entry.Kind) { + default: + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + unsigned BitCode = Stream.readRecord(Entry.ID, Record); + switch (BitCode) { + default: // Default behavior: reject + return error("Invalid value"); + case bitc::IDENTIFICATION_CODE_STRING: { // IDENTIFICATION: [strchr x + // N] + convertToString(Record, 0, ProducerIdentification); + break; + } + case bitc::IDENTIFICATION_CODE_EPOCH: { // EPOCH: [epoch#] + unsigned epoch = (unsigned)Record[0]; + if (epoch != bitc::BITCODE_CURRENT_EPOCH) { + return error( + Twine("Incompatible epoch: Bitcode '") + Twine(epoch) + + "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) + "'"); + } + } + } + } +} + +std::error_code BitcodeReader::parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata) { - if (Resume) - Stream.JumpToBit(NextUnreadBit); + if (ResumeBit) + Stream.JumpToBit(ResumeBit); else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return error("Invalid record"); @@ -2785,9 +3268,23 @@ std::error_code BitcodeReader::parseModule(bool Resume, return EC; break; case bitc::VALUE_SYMTAB_BLOCK_ID: - if (std::error_code EC = parseValueSymbolTable()) - return EC; - SeenValueSymbolTable = true; + if (!SeenValueSymbolTable) { + // Either this is an old form VST without function index and an + // associated VST forward declaration record (which would have caused + // the VST to be jumped to and parsed before it was encountered + // normally in the stream), or there were no function blocks to + // trigger an earlier parsing of the VST. + assert(VSTOffset == 0 || FunctionsWithBodies.empty()); + if (std::error_code EC = parseValueSymbolTable()) + return EC; + SeenValueSymbolTable = true; + } else { + // We must have had a VST forward declaration record, which caused + // the parser to jump to and parse the VST earlier. 
+ assert(VSTOffset > 0); + if (Stream.SkipBlock()) + return error("Invalid record"); + } break; case bitc::CONSTANTS_BLOCK_ID: if (std::error_code EC = parseConstants()) @@ -2802,7 +3299,11 @@ std::error_code BitcodeReader::parseModule(bool Resume, break; } assert(DeferredMetadataInfo.empty() && "Unexpected deferred metadata"); - if (std::error_code EC = parseMetadata()) + if (std::error_code EC = parseMetadata(true)) + return EC; + break; + case bitc::METADATA_KIND_BLOCK_ID: + if (std::error_code EC = parseMetadataKinds()) return EC; break; case bitc::FUNCTION_BLOCK_ID: @@ -2815,8 +3316,39 @@ std::error_code BitcodeReader::parseModule(bool Resume, SeenFirstFunctionBody = true; } + if (VSTOffset > 0) { + // If we have a VST forward declaration record, make sure we + // parse the VST now if we haven't already. It is needed to + // set up the DeferredFunctionInfo vector for lazy reading. + if (!SeenValueSymbolTable) { + if (std::error_code EC = + BitcodeReader::parseValueSymbolTable(VSTOffset)) + return EC; + SeenValueSymbolTable = true; + // Fall through so that we record the NextUnreadBit below. + // This is necessary in case we have an anonymous function that + // is later materialized. Since it will not have a VST entry we + // need to fall back to the lazy parse to find its offset. + } else { + // If we have a VST forward declaration record, but have already + // parsed the VST (just above, when the first function body was + // encountered here), then we are resuming the parse after + // materializing functions. The ResumeBit points to the + // start of the last function block recorded in the + // DeferredFunctionInfo map. Skip it. + if (Stream.SkipBlock()) + return error("Invalid record"); + continue; + } + } + + // Support older bitcode files that did not have the function + // index in the VST, nor a VST forward declaration record, as + // well as anonymous functions that do not have VST entries. + // Build the DeferredFunctionInfo vector on the fly. if (std::error_code EC = rememberAndSkipFunctionBody()) return EC; + // Suspend parsing when we reach the function bodies. Subsequent // materialization calls will resume it when necessary. If the bitcode // file is old, the symbol table will be at the end instead and will not @@ -2830,6 +3362,10 @@ std::error_code BitcodeReader::parseModule(bool Resume, if (std::error_code EC = parseUseLists()) return EC; break; + case bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID: + if (std::error_code EC = parseOperandBundleTags()) + return EC; + break; } continue; @@ -2840,7 +3376,8 @@ std::error_code BitcodeReader::parseModule(bool Resume, // Read a record. - switch (Stream.readRecord(Entry.ID, Record)) { + auto BitCode = Stream.readRecord(Entry.ID, Record); + switch (BitCode) { default: break; // Default behavior, ignore unknown content. 
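// The VSTOFFSET/DeferredFunctionInfo machinery above reduces to one idea:
// record where each function body starts, skip it, and seek back only when a
// caller asks for that function. A toy index-then-seek reader; apart from the
// Deferred map's role, the record layout and names here are ours, not the
// bitcode format's.

#include <cassert>
#include <map>
#include <string>
#include <vector>

// Toy layout: the "module" is a sequence of (name, body) records; bodies are
// expensive to parse, so the first pass only notes where each one lives.
struct Record { std::string Name, Body; };

struct LazyReader {
  std::vector<Record> Stream;             // stands in for the bitstream
  std::map<std::string, size_t> Deferred; // name -> offset ("bit position")

  // First pass: remember the offset of every body and skip it, as
  // rememberAndSkipFunctionBody does for FUNCTION_BLOCKs.
  void scan() {
    for (size_t I = 0; I < Stream.size(); ++I)
      Deferred[Stream[I].Name] = I;
  }

  // On-demand materialization: jump straight to the saved offset. With a
  // VST forward declaration the offsets arrive up front; without one they
  // are discovered by lazy scanning, but the lookup is the same.
  const std::string &materialize(const std::string &Name) {
    return Stream[Deferred.at(Name)].Body; // JumpToBit analogue
  }
};

int main() {
  LazyReader R;
  R.Stream = {{"f", "body-of-f"}, {"g", "body-of-g"}};
  R.scan();
  assert(R.materialize("g") == "body-of-g");
}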
case bitc::MODULE_CODE_VERSION: { // VERSION: [version#] if (Record.size() < 1) @@ -3012,11 +3549,14 @@ std::error_code BitcodeReader::parseModule(bool Resume, auto *FTy = dyn_cast<FunctionType>(Ty); if (!FTy) return error("Invalid type for value"); + auto CC = static_cast<CallingConv::ID>(Record[1]); + if (CC & ~CallingConv::MaxID) + return error("Invalid calling convention ID"); Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage, "", TheModule); - Func->setCallingConv(static_cast<CallingConv::ID>(Record[1])); + Func->setCallingConv(CC); bool isProto = Record[2]; uint64_t RawLinkage = Record[3]; Func->setLinkage(getDecodedLinkage(RawLinkage)); @@ -3079,35 +3619,51 @@ std::error_code BitcodeReader::parseModule(bool Resume, } break; } - // ALIAS: [alias type, aliasee val#, linkage] - // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass] - case bitc::MODULE_CODE_ALIAS: { - if (Record.size() < 3) + // ALIAS: [alias type, addrspace, aliasee val#, linkage] + // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass] + case bitc::MODULE_CODE_ALIAS: + case bitc::MODULE_CODE_ALIAS_OLD: { + bool NewRecord = BitCode == bitc::MODULE_CODE_ALIAS; + if (Record.size() < (3 + (unsigned)NewRecord)) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned OpNum = 0; + Type *Ty = getTypeByID(Record[OpNum++]); if (!Ty) return error("Invalid record"); - auto *PTy = dyn_cast<PointerType>(Ty); - if (!PTy) - return error("Invalid type for value"); - auto *NewGA = - GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule); + unsigned AddrSpace; + if (!NewRecord) { + auto *PTy = dyn_cast<PointerType>(Ty); + if (!PTy) + return error("Invalid type for value"); + Ty = PTy->getElementType(); + AddrSpace = PTy->getAddressSpace(); + } else { + AddrSpace = Record[OpNum++]; + } + + auto Val = Record[OpNum++]; + auto Linkage = Record[OpNum++]; + auto *NewGA = GlobalAlias::create( + Ty, AddrSpace, getDecodedLinkage(Linkage), "", TheModule); // Old bitcode files didn't have visibility field. // Local linkage must have default visibility. - if (Record.size() > 3 && !NewGA->hasLocalLinkage()) - // FIXME: Change to an error if non-default in 4.0. - NewGA->setVisibility(getDecodedVisibility(Record[3])); - if (Record.size() > 4) - NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[4])); + if (OpNum != Record.size()) { + auto VisInd = OpNum++; + if (!NewGA->hasLocalLinkage()) + // FIXME: Change to an error if non-default in 4.0. 
+ NewGA->setVisibility(getDecodedVisibility(Record[VisInd])); + } + if (OpNum != Record.size()) + NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++])); else - upgradeDLLImportExportLinkage(NewGA, Record[2]); - if (Record.size() > 5) - NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[5])); - if (Record.size() > 6) - NewGA->setUnnamedAddr(Record[6]); + upgradeDLLImportExportLinkage(NewGA, Linkage); + if (OpNum != Record.size()) + NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++])); + if (OpNum != Record.size()) + NewGA->setUnnamedAddr(Record[OpNum++]); ValueList.push_back(NewGA); - AliasInits.push_back(std::make_pair(NewGA, Record[1])); + AliasInits.push_back(std::make_pair(NewGA, Val)); break; } /// MODULE_CODE_PURGEVALS: [numvals] @@ -3117,11 +3673,52 @@ std::error_code BitcodeReader::parseModule(bool Resume, return error("Invalid record"); ValueList.shrinkTo(Record[0]); break; + /// MODULE_CODE_VSTOFFSET: [offset] + case bitc::MODULE_CODE_VSTOFFSET: + if (Record.size() < 1) + return error("Invalid record"); + VSTOffset = Record[0]; + break; + /// MODULE_CODE_METADATA_VALUES: [numvals] + case bitc::MODULE_CODE_METADATA_VALUES: + if (Record.size() < 1) + return error("Invalid record"); + assert(!IsMetadataMaterialized); + // This record contains the number of metadata values in the module-level + // METADATA_BLOCK. It is used to support lazy parsing of metadata as + // a postpass, where we will parse function-level metadata first. + // This is needed because the ids of metadata are assigned implicitly + // based on their ordering in the bitcode, with the function-level + // metadata ids starting after the module-level metadata ids. Otherwise, + // we would have to parse the module-level metadata block to prime the + // MetadataList when we are lazy loading metadata during function + // importing. Initialize the MetadataList size here based on the + // record value, regardless of whether we are doing lazy metadata + // loading, so that we have consistent handling and assertion + // checking in parseMetadata for module-level metadata. + NumModuleMDs = Record[0]; + SeenModuleValuesRecord = true; + assert(MetadataList.size() == 0); + MetadataList.resize(NumModuleMDs); + break; } Record.clear(); } } +/// Helper to read the header common to all bitcode files. +static bool hasValidBitcodeHeader(BitstreamCursor &Stream) { + // Sniff for the signature. + if (Stream.Read(8) != 'B' || + Stream.Read(8) != 'C' || + Stream.Read(4) != 0x0 || + Stream.Read(4) != 0xC || + Stream.Read(4) != 0xE || + Stream.Read(4) != 0xD) + return false; + return true; +} + std::error_code BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, Module *M, bool ShouldLazyLoadMetadata) { @@ -3131,12 +3728,7 @@ BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, return EC; // Sniff for the signature. 
- if (Stream.Read(8) != 'B' || - Stream.Read(8) != 'C' || - Stream.Read(4) != 0x0 || - Stream.Read(4) != 0xC || - Stream.Read(4) != 0xE || - Stream.Read(4) != 0xD) + if (!hasValidBitcodeHeader(Stream)) return error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily @@ -3153,8 +3745,13 @@ BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, if (Entry.Kind != BitstreamEntry::SubBlock) return error("Malformed block"); + if (Entry.ID == bitc::IDENTIFICATION_BLOCK_ID) { + parseBitcodeVersion(); + continue; + } + if (Entry.ID == bitc::MODULE_BLOCK_ID) - return parseModule(false, ShouldLazyLoadMetadata); + return parseModule(0, ShouldLazyLoadMetadata); if (Stream.SkipBlock()) return error("Invalid record"); @@ -3204,12 +3801,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() { return EC; // Sniff for the signature. - if (Stream.Read(8) != 'B' || - Stream.Read(8) != 'C' || - Stream.Read(4) != 0x0 || - Stream.Read(4) != 0xC || - Stream.Read(4) != 0xE || - Stream.Read(4) != 0xD) + if (!hasValidBitcodeHeader(Stream)) return error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily @@ -3239,6 +3831,41 @@ ErrorOr<std::string> BitcodeReader::parseTriple() { } } +ErrorOr<std::string> BitcodeReader::parseIdentificationBlock() { + if (std::error_code EC = initStream(nullptr)) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + // We expect a number of well-defined blocks, though we don't necessarily + // need to understand them all. + while (1) { + BitstreamEntry Entry = Stream.advance(); + switch (Entry.Kind) { + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + + case BitstreamEntry::SubBlock: + if (Entry.ID == bitc::IDENTIFICATION_BLOCK_ID) { + if (std::error_code EC = parseBitcodeVersion()) + return EC; + return ProducerIdentification; + } + // Ignore other sub-blocks. + if (Stream.SkipBlock()) + return error("Malformed block"); + continue; + case BitstreamEntry::Record: + Stream.skipRecord(Entry.ID); + continue; + } + } +} + /// Parse metadata attachments. std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID)) @@ -3274,7 +3901,7 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { auto K = MDKindMap.find(Record[I]); if (K == MDKindMap.end()) return error("Invalid ID"); - Metadata *MD = MDValueList.getValueFwdRef(Record[I + 1]); + Metadata *MD = MetadataList.getValueFwdRef(Record[I + 1]); F.setMetadata(K->second, cast<MDNode>(MD)); } continue; @@ -3288,7 +3915,7 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { MDKindMap.find(Kind); if (I == MDKindMap.end()) return error("Invalid ID"); - Metadata *Node = MDValueList.getValueFwdRef(Record[i + 1]); + Metadata *Node = MetadataList.getValueFwdRef(Record[i + 1]); if (isa<LocalAsMetadata>(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. 
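// The newly factored hasValidBitcodeHeader reads two 8-bit characters and
// four 4-bit nibbles. Because the bitstream packs the low nibble of each
// byte first, the check is equivalent to matching the raw bytes
// 'B' 'C' 0xC0 0xDE at the start of the buffer. A sketch of that byte-level
// check; the helper name is ours.

#include <cassert>
#include <cstddef>
#include <cstdint>

bool looksLikeBitcode(const uint8_t *Buf, size_t Size) {
  // 'B', 'C', then nibbles 0x0/0xC and 0xE/0xD packed low-first.
  return Size >= 4 && Buf[0] == 'B' && Buf[1] == 'C' &&
         Buf[2] == 0xC0 && Buf[3] == 0xDE;
}

int main() {
  const uint8_t Good[] = {'B', 'C', 0xC0, 0xDE, 0x35};
  const uint8_t Bad[] = {0x7F, 'E', 'L', 'F'};
  assert(looksLikeBitcode(Good, sizeof(Good)));
  assert(!looksLikeBitcode(Bad, sizeof(Bad)));
}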
@@ -3303,17 +3930,17 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { } } -static std::error_code typeCheckLoadStoreInst(DiagnosticHandlerFunction DH, - Type *ValType, Type *PtrType) { +static std::error_code typeCheckLoadStoreInst(Type *ValType, Type *PtrType) { + LLVMContext &Context = PtrType->getContext(); if (!isa<PointerType>(PtrType)) - return error(DH, "Load/Store operand is not a pointer type"); + return error(Context, "Load/Store operand is not a pointer type"); Type *ElemType = cast<PointerType>(PtrType)->getElementType(); if (ValType && ValType != ElemType) - return error(DH, "Explicit load/store type does not match pointee type of " - "pointer operand"); + return error(Context, "Explicit load/store type does not match pointee " + "type of pointer operand"); if (!PointerType::isLoadableOrStorableType(ElemType)) - return error(DH, "Cannot load/store from pointer"); + return error(Context, "Cannot load/store from pointer"); return std::error_code(); } @@ -3324,11 +3951,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { InstructionList.clear(); unsigned ModuleValueListSize = ValueList.size(); - unsigned ModuleMDValueListSize = MDValueList.size(); + unsigned ModuleMetadataListSize = MetadataList.size(); // Add all the function arguments to the value table. - for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - ValueList.push_back(I); + for (Argument &I : F->args()) + ValueList.push_back(&I); unsigned NextValueNo = ValueList.size(); BasicBlock *CurBB = nullptr; @@ -3344,6 +3971,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { return nullptr; }; + std::vector<OperandBundleDef> OperandBundles; + // Read all the records. SmallVector<uint64_t, 64> Record; while (1) { @@ -3452,8 +4081,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { unsigned ScopeID = Record[2], IAID = Record[3]; MDNode *Scope = nullptr, *IA = nullptr; - if (ScopeID) Scope = cast<MDNode>(MDValueList.getValueFwdRef(ScopeID-1)); - if (IAID) IA = cast<MDNode>(MDValueList.getValueFwdRef(IAID-1)); + if (ScopeID) + Scope = cast<MDNode>(MetadataList.getValueFwdRef(ScopeID - 1)); + if (IAID) + IA = cast<MDNode>(MetadataList.getValueFwdRef(IAID - 1)); LastLoc = DebugLoc::get(Line, Col, Scope, IA); I->setDebugLoc(LastLoc); I = nullptr; @@ -3515,7 +4146,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { CurBB->getInstList().push_back(Temp); } } else { - I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy); + auto CastOp = (Instruction::CastOps)Opc; + if (!CastInst::castIsValid(CastOp, Op, ResTy)) + return error("Invalid cast"); + I = CastInst::Create(CastOp, Op, ResTy); } InstructionList.push_back(I); break; @@ -3811,6 +4445,110 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } break; } + case bitc::FUNC_CODE_INST_CLEANUPRET: { // CLEANUPRET: [val] or [val,bb#] + if (Record.size() != 1 && Record.size() != 2) + return error("Invalid record"); + unsigned Idx = 0; + Value *CleanupPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + if (!CleanupPad) + return error("Invalid record"); + BasicBlock *UnwindDest = nullptr; + if (Record.size() == 2) { + UnwindDest = getBasicBlock(Record[Idx++]); + if (!UnwindDest) + return error("Invalid record"); + } + + I = CleanupReturnInst::Create(CleanupPad, UnwindDest); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHRET: { // CATCHRET: [val,bb#] + if (Record.size() != 2) + return error("Invalid 
record"); + unsigned Idx = 0; + Value *CatchPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + if (!CatchPad) + return error("Invalid record"); + BasicBlock *BB = getBasicBlock(Record[Idx++]); + if (!BB) + return error("Invalid record"); + + I = CatchReturnInst::Create(CatchPad, BB); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHSWITCH: { // CATCHSWITCH: [tok,num,(bb)*,bb?] + // We must have, at minimum, the outer scope and the number of arguments. + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Idx = 0; + + Value *ParentPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + + unsigned NumHandlers = Record[Idx++]; + + SmallVector<BasicBlock *, 2> Handlers; + for (unsigned Op = 0; Op != NumHandlers; ++Op) { + BasicBlock *BB = getBasicBlock(Record[Idx++]); + if (!BB) + return error("Invalid record"); + Handlers.push_back(BB); + } + + BasicBlock *UnwindDest = nullptr; + if (Idx + 1 == Record.size()) { + UnwindDest = getBasicBlock(Record[Idx++]); + if (!UnwindDest) + return error("Invalid record"); + } + + if (Record.size() != Idx) + return error("Invalid record"); + + auto *CatchSwitch = + CatchSwitchInst::Create(ParentPad, UnwindDest, NumHandlers); + for (BasicBlock *Handler : Handlers) + CatchSwitch->addHandler(Handler); + I = CatchSwitch; + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHPAD: + case bitc::FUNC_CODE_INST_CLEANUPPAD: { // [tok,num,(ty,val)*] + // We must have, at minimum, the outer scope and the number of arguments. + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Idx = 0; + + Value *ParentPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + + unsigned NumArgOperands = Record[Idx++]; + + SmallVector<Value *, 2> Args; + for (unsigned Op = 0; Op != NumArgOperands; ++Op) { + Value *Val; + if (getValueTypePair(Record, Idx, NextValueNo, Val)) + return error("Invalid record"); + Args.push_back(Val); + } + + if (Record.size() != Idx) + return error("Invalid record"); + + if (BitCode == bitc::FUNC_CODE_INST_CLEANUPPAD) + I = CleanupPadInst::Create(ParentPad, Args); + else + I = CatchPadInst::Create(ParentPad, Args); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...] // Check magic if ((Record[0] >> 16) == SWITCH_INST_MAGIC) { @@ -3973,10 +4711,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } } - I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops); + I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops, OperandBundles); + OperandBundles.clear(); InstructionList.push_back(I); - cast<InvokeInst>(I) - ->setCallingConv(static_cast<CallingConv::ID>(~(1U << 13) & CCInfo)); + cast<InvokeInst>(I)->setCallingConv( + static_cast<CallingConv::ID>(CallingConv::MaxID & CCInfo)); cast<InvokeInst>(I)->setAttributes(PAL); break; } @@ -4081,6 +4820,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { uint64_t AlignRecord = Record[3]; const uint64_t InAllocaMask = uint64_t(1) << 5; const uint64_t ExplicitTypeMask = uint64_t(1) << 6; + // Reserve bit 7 for SwiftError flag. 
+ // const uint64_t SwiftErrorMask = uint64_t(1) << 7; const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask; bool InAlloca = AlignRecord & InAllocaMask; Type *Ty = getTypeByID(Record[0]); @@ -4115,8 +4856,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 3 == Record.size()) Ty = getTypeByID(Record[OpNum++]); - if (std::error_code EC = - typeCheckLoadStoreInst(DiagnosticHandler, Ty, Op->getType())) + if (std::error_code EC = typeCheckLoadStoreInst(Ty, Op->getType())) return EC; if (!Ty) Ty = cast<PointerType>(Op->getType())->getElementType(); @@ -4140,8 +4880,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 5 == Record.size()) Ty = getTypeByID(Record[OpNum++]); - if (std::error_code EC = - typeCheckLoadStoreInst(DiagnosticHandler, Ty, Op->getType())) + if (std::error_code EC = typeCheckLoadStoreInst(Ty, Op->getType())) return EC; if (!Ty) Ty = cast<PointerType>(Op->getType())->getElementType(); @@ -4175,8 +4914,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OpNum + 2 != Record.size()) return error("Invalid record"); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Val->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) return EC; unsigned Align; if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align)) @@ -4199,8 +4938,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OpNum + 4 != Record.size()) return error("Invalid record"); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Val->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) return EC; AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]); if (Ordering == NotAtomic || Ordering == Acquire || @@ -4237,8 +4976,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 2]); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Cmp->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Cmp->getType(), Ptr->getType())) return EC; AtomicOrdering FailureOrdering; if (Record.size() < 7) @@ -4299,7 +5038,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { break; } case bitc::FUNC_CODE_INST_CALL: { - // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...] + // CALL: [paramattrs, cc, fmf, fnty, fnid, arg0, arg1...] 
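// Several records in this file pack independent flags into one word: the
// alloca record above reserves bits 5, 6 and (now) 7 of its alignment
// operand for inalloca, explicit-type and future swifterror markers, and the
// CCInfo word decoded just below packs tail-call kind, explicit-type and FMF
// bits around the calling convention. A sketch of that pack/unpack
// discipline; the bit positions mirror the alloca masks above, everything
// else is illustrative.

#include <cassert>
#include <cstdint>

constexpr uint64_t InAllocaMask = uint64_t(1) << 5;
constexpr uint64_t ExplicitTypeMask = uint64_t(1) << 6;
constexpr uint64_t SwiftErrorMask = uint64_t(1) << 7; // reserved above
constexpr uint64_t AlignMask = (uint64_t(1) << 5) - 1;

uint64_t pack(unsigned Log2Align, bool InAlloca, bool ExplicitType) {
  return (Log2Align & AlignMask) | (InAlloca ? InAllocaMask : 0) |
         (ExplicitType ? ExplicitTypeMask : 0);
}

void unpack(uint64_t Word, unsigned &Log2Align, bool &InAlloca,
            bool &ExplicitType) {
  Log2Align = Word & AlignMask;
  InAlloca = Word & InAllocaMask;
  ExplicitType = Word & ExplicitTypeMask;
  // A careful reader leaves room to reject bits it does not understand:
  assert(!(Word & SwiftErrorMask) && "reserved bit set");
}

int main() {
  unsigned A; bool IA, ET;
  unpack(pack(4, true, false), A, IA, ET);
  assert(A == 4 && IA && !ET); // 2^4 = 16-byte alignment round-trips
}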
if (Record.size() < 3) return error("Invalid record"); @@ -4307,8 +5046,15 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { AttributeSet PAL = getAttributes(Record[OpNum++]); unsigned CCInfo = Record[OpNum++]; + FastMathFlags FMF; + if ((CCInfo >> bitc::CALL_FMF) & 1) { + FMF = getDecodedFastMathFlags(Record[OpNum++]); + if (!FMF.any()) + return error("Fast math flags indicator set for call with no FMF"); + } + FunctionType *FTy = nullptr; - if (CCInfo >> 15 & 1 && + if (CCInfo >> bitc::CALL_EXPLICIT_TYPE & 1 && !(FTy = dyn_cast<FunctionType>(getTypeByID(Record[OpNum++])))) return error("Explicit call type is not a function type"); @@ -4354,17 +5100,26 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } } - I = CallInst::Create(FTy, Callee, Args); + I = CallInst::Create(FTy, Callee, Args, OperandBundles); + OperandBundles.clear(); InstructionList.push_back(I); cast<CallInst>(I)->setCallingConv( - static_cast<CallingConv::ID>((~(1U << 14) & CCInfo) >> 1)); + static_cast<CallingConv::ID>((0x7ff & CCInfo) >> bitc::CALL_CCONV)); CallInst::TailCallKind TCK = CallInst::TCK_None; - if (CCInfo & 1) + if (CCInfo & 1 << bitc::CALL_TAIL) TCK = CallInst::TCK_Tail; - if (CCInfo & (1 << 14)) + if (CCInfo & (1 << bitc::CALL_MUSTTAIL)) TCK = CallInst::TCK_MustTail; + if (CCInfo & (1 << bitc::CALL_NOTAIL)) + TCK = CallInst::TCK_NoTail; cast<CallInst>(I)->setTailCallKind(TCK); cast<CallInst>(I)->setAttributes(PAL); + if (FMF.any()) { + if (!isa<FPMathOperator>(I)) + return error("Fast-math-flags specified for call without " + "floating-point scalar or vector return type"); + I->setFastMathFlags(FMF); + } break; } case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty] @@ -4379,6 +5134,28 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; } + + case bitc::FUNC_CODE_OPERAND_BUNDLE: { + // A call or an invoke can be optionally prefixed with some variable + // number of operand bundle blocks. These blocks are read into + // OperandBundles and consumed at the next call or invoke instruction. + + if (Record.size() < 1 || Record[0] >= BundleTags.size()) + return error("Invalid record"); + + std::vector<Value *> Inputs; + + unsigned OpNum = 1; + while (OpNum != Record.size()) { + Value *Op; + if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + return error("Invalid record"); + Inputs.push_back(Op); + } + + OperandBundles.emplace_back(BundleTags[Record[0]], std::move(Inputs)); + continue; + } } // Add instruction to end of current BB. If there is no current BB, reject @@ -4387,6 +5164,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { delete I; return error("Invalid instruction with no BB"); } + if (!OperandBundles.empty()) { + delete I; + return error("Operand bundles found with no consumer"); + } CurBB->getInstList().push_back(I); // If this was a terminator instruction, move to the next block. @@ -4402,6 +5183,9 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OutOfRecordLoop: + if (!OperandBundles.empty()) + return error("Operand bundles found with no consumer"); + // Check the function list for unresolved values. if (Argument *A = dyn_cast<Argument>(ValueList.back())) { if (!A->getParent()) { @@ -4421,7 +5205,7 @@ OutOfRecordLoop: // Trim the value list down to the size it was before we parsed this function. 
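// The FUNC_CODE_OPERAND_BUNDLE records above accumulate into OperandBundles
// and are consumed by the next call or invoke; anything left over is
// rejected ("Operand bundles found with no consumer"). A toy of that
// accumulate-then-consume protocol; the types are stand-ins for
// OperandBundleDef, not the LLVM classes.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy bundle: a tag plus its inputs.
struct Bundle {
  std::string Tag;
  std::vector<int> Inputs;
};

struct Reader {
  std::vector<Bundle> Pending; // filled by bundle records

  void readBundleRecord(std::string Tag, std::vector<int> Inputs) {
    Pending.push_back(Bundle{std::move(Tag), std::move(Inputs)});
  }

  // The next call/invoke takes every pending bundle with it.
  std::vector<Bundle> readCallRecord() {
    return std::exchange(Pending, {});
  }

  // At scope end, unconsumed bundles are a malformed-input error.
  bool finish() const { return Pending.empty(); }
};

int main() {
  Reader R;
  R.readBundleRecord("deopt", {1, 2, 3});
  auto Bundles = R.readCallRecord(); // the call consumes the bundle
  assert(Bundles.size() == 1 && Bundles[0].Tag == "deopt");
  assert(R.finish());                // nothing left dangling
}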
ValueList.shrinkTo(ModuleValueListSize); - MDValueList.shrinkTo(ModuleMDValueListSize); + MetadataList.shrinkTo(ModuleMetadataListSize); std::vector<BasicBlock*>().swap(FunctionBBs); return std::error_code(); } @@ -4431,11 +5215,14 @@ std::error_code BitcodeReader::findFunctionInStream( Function *F, DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator) { while (DeferredFunctionInfoIterator->second == 0) { - if (Stream.AtEndOfStream()) - return error("Could not find function in stream"); - // ParseModule will parse the next body in the stream and set its - // position in the DeferredFunctionInfo map. - if (std::error_code EC = parseModule(true)) + // This is the fallback handling for the old format bitcode that + // didn't contain the function index in the VST, or when we have + // an anonymous function which would not have a VST entry. + // Assert that we have one of those two cases. + assert(VSTOffset == 0 || !F->hasName()); + // Parse the next body in the stream and set its position in the + // DeferredFunctionInfo map. + if (std::error_code EC = rememberAndSkipFunctionBodies()) return EC; } return std::error_code(); @@ -4448,8 +5235,12 @@ std::error_code BitcodeReader::findFunctionInStream( void BitcodeReader::releaseBuffer() { Buffer.release(); } std::error_code BitcodeReader::materialize(GlobalValue *GV) { - if (std::error_code EC = materializeMetadata()) - return EC; + // In older bitcode we must materialize the metadata before parsing + // any functions, in order to set up the MetadataList properly. + if (!SeenModuleValuesRecord) { + if (std::error_code EC = materializeMetadata()) + return EC; + } Function *F = dyn_cast<Function>(GV); // If it's not a function or is already material, ignore the request. @@ -4476,7 +5267,8 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) { // Upgrade any old intrinsic calls in the function. for (auto &I : UpgradedIntrinsics) { - for (auto UI = I.first->user_begin(), UE = I.first->user_end(); UI != UE;) { + for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end(); + UI != UE;) { User *U = *UI; ++UI; if (CallInst *CI = dyn_cast<CallInst>(U)) @@ -4484,41 +5276,16 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) { } } + // Finish fn->subprogram upgrade for materialized functions. + if (DISubprogram *SP = FunctionsWithSPs.lookup(F)) + F->setSubprogram(SP); + // Bring in any functions that this function forward-referenced via // blockaddresses. return materializeForwardReferencedFunctions(); } -bool BitcodeReader::isDematerializable(const GlobalValue *GV) const { - const Function *F = dyn_cast<Function>(GV); - if (!F || F->isDeclaration()) - return false; - - // Dematerializing F would leave dangling references that wouldn't be - // reconnected on re-materialization. - if (BlockAddressesTaken.count(F)) - return false; - - return DeferredFunctionInfo.count(const_cast<Function*>(F)); -} - -void BitcodeReader::dematerialize(GlobalValue *GV) { - Function *F = dyn_cast<Function>(GV); - // If this function isn't dematerializable, this is a noop. - if (!F || !isDematerializable(F)) - return; - - assert(DeferredFunctionInfo.count(F) && "No info to read function later?"); - - // Just forget the function body, we can remat it later. 
- F->dropAllReferences(); - F->setIsMaterializable(true); -} - -std::error_code BitcodeReader::materializeModule(Module *M) { - assert(M == TheModule && - "Can only Materialize the Module this BitcodeReader is attached to."); - +std::error_code BitcodeReader::materializeModule() { if (std::error_code EC = materializeMetadata()) return EC; @@ -4527,16 +5294,16 @@ std::error_code BitcodeReader::materializeModule(Module *M) { // Iterate over the module, deserializing any functions that are still on // disk. - for (Module::iterator F = TheModule->begin(), E = TheModule->end(); - F != E; ++F) { - if (std::error_code EC = materialize(F)) + for (Function &F : *TheModule) { + if (std::error_code EC = materialize(&F)) return EC; } - // At this point, if there are any function bodies, the current bit is - // pointing to the END_BLOCK record after them. Now make sure the rest - // of the bits in the module have been read. - if (NextUnreadBit) - parseModule(true); + // At this point, if there are any function bodies, parse the rest of + // the bits in the module past the last function block we have recorded + // through either lazy scanning or the VST. + if (LastFunctionBlockBit || NextUnreadBit) + parseModule(LastFunctionBlockBit > NextUnreadBit ? LastFunctionBlockBit + : NextUnreadBit); // Check that all block address forward references got resolved (as we // promised above). @@ -4561,7 +5328,7 @@ std::error_code BitcodeReader::materializeModule(Module *M) { for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++) UpgradeInstWithTBAATag(InstsWithTBAATag[I]); - UpgradeDebugInfo(*M); + UpgradeDebugInfo(*TheModule); return std::error_code(); } @@ -4622,6 +5389,416 @@ BitcodeReader::initLazyStream(std::unique_ptr<DataStreamer> Streamer) { return std::error_code(); } +std::error_code FunctionIndexBitcodeReader::error(BitcodeError E, + const Twine &Message) { + return ::error(DiagnosticHandler, make_error_code(E), Message); +} + +std::error_code FunctionIndexBitcodeReader::error(const Twine &Message) { + return ::error(DiagnosticHandler, + make_error_code(BitcodeError::CorruptedBitcode), Message); +} + +std::error_code FunctionIndexBitcodeReader::error(BitcodeError E) { + return ::error(DiagnosticHandler, make_error_code(E)); +} + +FunctionIndexBitcodeReader::FunctionIndexBitcodeReader( + MemoryBuffer *Buffer, DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy, bool CheckFuncSummaryPresenceOnly) + : DiagnosticHandler(DiagnosticHandler), Buffer(Buffer), IsLazy(IsLazy), + CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {} + +FunctionIndexBitcodeReader::FunctionIndexBitcodeReader( + DiagnosticHandlerFunction DiagnosticHandler, bool IsLazy, + bool CheckFuncSummaryPresenceOnly) + : DiagnosticHandler(DiagnosticHandler), Buffer(nullptr), IsLazy(IsLazy), + CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {} + +void FunctionIndexBitcodeReader::freeState() { Buffer = nullptr; } + +void FunctionIndexBitcodeReader::releaseBuffer() { Buffer.release(); } + +// Specialized value symbol table parser used when reading function index +// blocks where we don't actually create global values. +// At the end of this routine the function index is populated with a map +// from function name to FunctionInfo. The function info contains +// the function block's bitcode offset as well as the offset into the +// function summary section. 
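// The FunctionIndexBitcodeReader introduced here builds exactly the map its
// comment describes: function name -> FunctionInfo, each entry holding a
// bitcode offset plus, when summaries are parsed eagerly, a FunctionSummary
// (e.g. an instruction count) that ThinLTO importing can consult without
// loading the whole module. A toy of that index shape; the types below are
// stand-ins, not the LLVM classes.

#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <string>

// Toy per-function summary, like the fields read from FS_CODE_PERMODULE_ENTRY.
struct Summary {
  unsigned InstCount;
  std::string ModulePath;
};

// Toy FunctionInfo: where the body lives, plus an optional summary.
struct FuncInfo {
  uint64_t BitcodeOffset;
  std::unique_ptr<Summary> Sum; // null when parsing lazily
};

int main() {
  std::map<std::string, FuncInfo> Index; // name -> info, as in the VST parse
  FuncInfo FI{/*offset*/ 4096, std::make_unique<Summary>(Summary{42, "a.o"})};
  Index.emplace("foo", std::move(FI));

  // An importer can now rank candidates by summary without reading a.o:
  const FuncInfo &Foo = Index.at("foo");
  assert(Foo.Sum && Foo.Sum->InstCount == 42 && Foo.BitcodeOffset == 4096);
}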
+std::error_code FunctionIndexBitcodeReader::parseValueSymbolTable() { + if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + // Read all the records for this value table. + SmallString<128> ValueName; + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore (e.g. VST_CODE_BBENTRY records). + break; + case bitc::VST_CODE_FNENTRY: { + // VST_FNENTRY: [valueid, offset, namechar x N] + if (convertToString(Record, 2, ValueName)) + return error("Invalid record"); + unsigned ValueID = Record[0]; + uint64_t FuncOffset = Record[1]; + std::unique_ptr<FunctionInfo> FuncInfo = + llvm::make_unique<FunctionInfo>(FuncOffset); + if (foundFuncSummary() && !IsLazy) { + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI = + SummaryMap.find(ValueID); + assert(SMI != SummaryMap.end() && "Summary info not found"); + FuncInfo->setFunctionSummary(std::move(SMI->second)); + } + TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo)); + + ValueName.clear(); + break; + } + case bitc::VST_CODE_COMBINED_FNENTRY: { + // VST_COMBINED_FNENTRY: [offset, namechar x N] + if (convertToString(Record, 1, ValueName)) + return error("Invalid record"); + uint64_t FuncSummaryOffset = Record[0]; + std::unique_ptr<FunctionInfo> FuncInfo = + llvm::make_unique<FunctionInfo>(FuncSummaryOffset); + if (foundFuncSummary() && !IsLazy) { + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI = + SummaryMap.find(FuncSummaryOffset); + assert(SMI != SummaryMap.end() && "Summary info not found"); + FuncInfo->setFunctionSummary(std::move(SMI->second)); + } + TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo)); + + ValueName.clear(); + break; + } + } + } +} + +// Parse just the blocks needed for function index building out of the module. +// At the end of this routine the function Index is populated with a map +// from function name to FunctionInfo. The function info contains +// either the parsed function summary information (when parsing summaries +// eagerly), or just the function summary record's offset +// if parsing lazily (IsLazy). +std::error_code FunctionIndexBitcodeReader::parseModule() { + if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) + return error("Invalid record"); + + // Read the function index for this module. + while (1) { + BitstreamEntry Entry = Stream.advance(); + + switch (Entry.Kind) { + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + + case BitstreamEntry::SubBlock: + if (CheckFuncSummaryPresenceOnly) { + if (Entry.ID == bitc::FUNCTION_SUMMARY_BLOCK_ID) { + SeenFuncSummary = true; + // No need to parse the rest since we found the summary. + return std::error_code(); + } + if (Stream.SkipBlock()) + return error("Invalid record"); + continue; + } + switch (Entry.ID) { + default: // Skip unknown content. + if (Stream.SkipBlock()) + return error("Invalid record"); + break; + case bitc::BLOCKINFO_BLOCK_ID: + // Need to parse these to get abbrev ids (e.g. 
for VST) + if (Stream.ReadBlockInfoBlock()) + return error("Malformed block"); + break; + case bitc::VALUE_SYMTAB_BLOCK_ID: + if (std::error_code EC = parseValueSymbolTable()) + return EC; + break; + case bitc::FUNCTION_SUMMARY_BLOCK_ID: + SeenFuncSummary = true; + if (IsLazy) { + // Lazy parsing of summary info, skip it. + if (Stream.SkipBlock()) + return error("Invalid record"); + } else if (std::error_code EC = parseEntireSummary()) + return EC; + break; + case bitc::MODULE_STRTAB_BLOCK_ID: + if (std::error_code EC = parseModuleStringTable()) + return EC; + break; + } + continue; + + case BitstreamEntry::Record: + Stream.skipRecord(Entry.ID); + continue; + } + } +} + +// Eagerly parse the entire function summary block (i.e. for all functions +// in the index). This populates the FunctionSummary objects in +// the index. +std::error_code FunctionIndexBitcodeReader::parseEntireSummary() { + if (Stream.EnterSubBlock(bitc::FUNCTION_SUMMARY_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. The record format depends on whether this + // is a per-module index or a combined index file. In the per-module + // case the records contain the associated value's ID for correlation + // with VST entries. In the combined index the correlation is done + // via the bitcode offset of the summary records (which were saved + // in the combined index VST entries). The records also contain + // information used for ThinLTO renaming and importing. + Record.clear(); + uint64_t CurRecordBit = Stream.GetCurrentBitNo(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore. + break; + // FS_PERMODULE_ENTRY: [valueid, islocal, instcount] + case bitc::FS_CODE_PERMODULE_ENTRY: { + unsigned ValueID = Record[0]; + bool IsLocal = Record[1]; + unsigned InstCount = Record[2]; + std::unique_ptr<FunctionSummary> FS = + llvm::make_unique<FunctionSummary>(InstCount); + FS->setLocalFunction(IsLocal); + // The module path string ref set in the summary must be owned by the + // index's module string table. Since we don't have a module path + // string table section in the per-module index, we create a single + // module path string table entry with an empty (0) ID to take + // ownership. + FS->setModulePath( + TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0)); + SummaryMap[ValueID] = std::move(FS); + break; + } + // FS_COMBINED_ENTRY: [modid, instcount] + case bitc::FS_CODE_COMBINED_ENTRY: { + uint64_t ModuleId = Record[0]; + unsigned InstCount = Record[1]; + std::unique_ptr<FunctionSummary> FS = + llvm::make_unique<FunctionSummary>(InstCount); + FS->setModulePath(ModuleIdMap[ModuleId]); + SummaryMap[CurRecordBit] = std::move(FS); + break; + } + } + } + llvm_unreachable("Exit infinite loop"); +} + +// Parse the module string table block into the Index. +// This populates the ModulePathStringTable map in the index. 
+std::error_code FunctionIndexBitcodeReader::parseModuleStringTable() { + if (Stream.EnterSubBlock(bitc::MODULE_STRTAB_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + SmallString<128> ModulePath; + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + Record.clear(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore. + break; + case bitc::MST_CODE_ENTRY: { + // MST_ENTRY: [modid, namechar x N] + if (convertToString(Record, 1, ModulePath)) + return error("Invalid record"); + uint64_t ModuleId = Record[0]; + StringRef ModulePathInMap = TheIndex->addModulePath(ModulePath, ModuleId); + ModuleIdMap[ModuleId] = ModulePathInMap; + ModulePath.clear(); + break; + } + } + } + llvm_unreachable("Exit infinite loop"); +} + +// Parse the function info index from the bitcode streamer into the given index. +std::error_code FunctionIndexBitcodeReader::parseSummaryIndexInto( + std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I) { + TheIndex = I; + + if (std::error_code EC = initStream(std::move(Streamer))) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + // We expect a number of well-defined blocks, though we don't necessarily + // need to understand them all. + while (1) { + if (Stream.AtEndOfStream()) { + // We didn't really read a proper Module block. + return error("Malformed block"); + } + + BitstreamEntry Entry = + Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); + + if (Entry.Kind != BitstreamEntry::SubBlock) + return error("Malformed block"); + + // If we see a MODULE_BLOCK, parse it to find the blocks needed for + // building the function summary index. + if (Entry.ID == bitc::MODULE_BLOCK_ID) + return parseModule(); + + if (Stream.SkipBlock()) + return error("Invalid record"); + } +} + +// Parse the function information at the given offset in the buffer into +// the index. Used to support lazy parsing of function summaries from the +// combined index during importing. +// TODO: This function is not yet complete as it won't have a consumer +// until ThinLTO function importing is added. +std::error_code FunctionIndexBitcodeReader::parseFunctionSummary( + std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I, + size_t FunctionSummaryOffset) { + TheIndex = I; + + if (std::error_code EC = initStream(std::move(Streamer))) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + Stream.JumpToBit(FunctionSummaryOffset); + + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + default: + return error("Malformed block"); + case BitstreamEntry::Record: + // The expected case. + break; + } + + // TODO: Read a record. This interface will be completed when ThinLTO + // importing is added so that it can be tested. 
+ SmallVector<uint64_t, 64> Record; + switch (Stream.readRecord(Entry.ID, Record)) { + case bitc::FS_CODE_COMBINED_ENTRY: + default: + return error("Invalid record"); + } + + return std::error_code(); +} + +std::error_code +FunctionIndexBitcodeReader::initStream(std::unique_ptr<DataStreamer> Streamer) { + if (Streamer) + return initLazyStream(std::move(Streamer)); + return initStreamFromBuffer(); +} + +std::error_code FunctionIndexBitcodeReader::initStreamFromBuffer() { + const unsigned char *BufPtr = (const unsigned char *)Buffer->getBufferStart(); + const unsigned char *BufEnd = BufPtr + Buffer->getBufferSize(); + + if (Buffer->getBufferSize() & 3) + return error("Invalid bitcode signature"); + + // If we have a wrapper header, parse it and ignore the non-bc file contents. + // The magic number is 0x0B17C0DE stored in little endian. + if (isBitcodeWrapper(BufPtr, BufEnd)) + if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true)) + return error("Invalid bitcode wrapper header"); + + StreamFile.reset(new BitstreamReader(BufPtr, BufEnd)); + Stream.init(&*StreamFile); + + return std::error_code(); +} + +std::error_code FunctionIndexBitcodeReader::initLazyStream( + std::unique_ptr<DataStreamer> Streamer) { + // Check and strip off the bitcode wrapper; BitstreamReader expects never to + // see it. + auto OwnedBytes = + llvm::make_unique<StreamingMemoryObject>(std::move(Streamer)); + StreamingMemoryObject &Bytes = *OwnedBytes; + StreamFile = llvm::make_unique<BitstreamReader>(std::move(OwnedBytes)); + Stream.init(&*StreamFile); + + unsigned char buf[16]; + if (Bytes.readBytes(buf, 16, 0) != 16) + return error("Invalid bitcode signature"); + + if (!isBitcode(buf, buf + 16)) + return error("Invalid bitcode signature"); + + if (isBitcodeWrapper(buf, buf + 4)) { + const unsigned char *bitcodeStart = buf; + const unsigned char *bitcodeEnd = buf + 16; + SkipBitcodeWrapperHeader(bitcodeStart, bitcodeEnd, false); + Bytes.dropLeadingBytes(bitcodeStart - buf); + Bytes.setKnownObjectSize(bitcodeEnd - bitcodeStart); + } + return std::error_code(); +} + namespace { class BitcodeErrorCategoryType : public std::error_category { const char *name() const LLVM_NOEXCEPT override { @@ -4669,7 +5846,7 @@ getBitcodeModuleImpl(std::unique_ptr<DataStreamer> Streamer, StringRef Name, if (MaterializeAll) { // Read in the entire module, and destroy the BitcodeReader. - if (std::error_code EC = M->materializeAllPermanently()) + if (std::error_code EC = M->materializeAll()) return cleanupOnError(EC); } else { // Resolve forward references from blockaddresses. 
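Both initStream variants above have to cope with the Darwin wrapper around raw bitcode. As the writer-side comment later in this diff notes, the wrapper magic 0x0B17C0DE is stored little-endian, so detection amounts to a four-byte compare. A hedged sketch of that check follows; checkWrapper is a hypothetical name, and LLVM's real entry point is the isBitcodeWrapper() call used above.

static bool checkWrapper(const unsigned char *BufPtr,
                         const unsigned char *BufEnd) {
  // 0x0B17C0DE stored little-endian: DE C0 17 0B in memory.
  return BufEnd - BufPtr >= 4 && BufPtr[0] == 0xDE && BufPtr[1] == 0xC0 &&
         BufPtr[2] == 0x17 && BufPtr[3] == 0x0B;
}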
@@ -4690,10 +5867,8 @@ getBitcodeModuleImpl(std::unique_ptr<DataStreamer> Streamer, StringRef Name, static ErrorOr<std::unique_ptr<Module>> getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer, LLVMContext &Context, bool MaterializeAll, - DiagnosticHandlerFunction DiagnosticHandler, bool ShouldLazyLoadMetadata = false) { - BitcodeReader *R = - new BitcodeReader(Buffer.get(), Context, DiagnosticHandler); + BitcodeReader *R = new BitcodeReader(Buffer.get(), Context); ErrorOr<std::unique_ptr<Module>> Ret = getBitcodeModuleImpl(nullptr, Buffer->getBufferIdentifier(), R, Context, @@ -4705,41 +5880,124 @@ getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer, return Ret; } -ErrorOr<std::unique_ptr<Module>> llvm::getLazyBitcodeModule( - std::unique_ptr<MemoryBuffer> &&Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler, bool ShouldLazyLoadMetadata) { +ErrorOr<std::unique_ptr<Module>> +llvm::getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer, + LLVMContext &Context, bool ShouldLazyLoadMetadata) { return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false, - DiagnosticHandler, ShouldLazyLoadMetadata); + ShouldLazyLoadMetadata); } -ErrorOr<std::unique_ptr<Module>> llvm::getStreamedBitcodeModule( - StringRef Name, std::unique_ptr<DataStreamer> Streamer, - LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { +ErrorOr<std::unique_ptr<Module>> +llvm::getStreamedBitcodeModule(StringRef Name, + std::unique_ptr<DataStreamer> Streamer, + LLVMContext &Context) { std::unique_ptr<Module> M = make_unique<Module>(Name, Context); - BitcodeReader *R = new BitcodeReader(Context, DiagnosticHandler); + BitcodeReader *R = new BitcodeReader(Context); return getBitcodeModuleImpl(std::move(Streamer), Name, R, Context, false, false); } -ErrorOr<std::unique_ptr<Module>> -llvm::parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) { +ErrorOr<std::unique_ptr<Module>> llvm::parseBitcodeFile(MemoryBufferRef Buffer, + LLVMContext &Context) { std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); - return getLazyBitcodeModuleImpl(std::move(Buf), Context, true, - DiagnosticHandler); + return getLazyBitcodeModuleImpl(std::move(Buf), Context, true); // TODO: Restore the use-lists to the in-memory state when the bitcode was // written. We must defer until the Module has been fully materialized. } -std::string -llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) { +std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, + LLVMContext &Context) { std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); - auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context, - DiagnosticHandler); + auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context); ErrorOr<std::string> Triple = R->parseTriple(); if (Triple.getError()) return ""; return Triple.get(); } + +std::string llvm::getBitcodeProducerString(MemoryBufferRef Buffer, + LLVMContext &Context) { + std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); + BitcodeReader R(Buf.release(), Context); + ErrorOr<std::string> ProducerString = R.parseIdentificationBlock(); + if (ProducerString.getError()) + return ""; + return ProducerString.get(); +} + +// Parse the specified bitcode buffer, returning the function info index. +// If IsLazy is false, parse the entire function summary into +// the index. 
+// the index. Otherwise skip the function summary section, and only create
+// an index object with a map from function name to function summary offset.
+// The index is used to perform lazy function summary reading later.
+ErrorOr<std::unique_ptr<FunctionInfoIndex>>
+llvm::getFunctionInfoIndex(MemoryBufferRef Buffer,
+                           DiagnosticHandlerFunction DiagnosticHandler,
+                           bool IsLazy) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy);
+
+  auto Index = llvm::make_unique<FunctionInfoIndex>();
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return EC;
+  };
+
+  if (std::error_code EC = R.parseSummaryIndexInto(nullptr, Index.get()))
+    return cleanupOnError(EC);
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+  return std::move(Index);
+}
+
+// Check if the given bitcode buffer contains a function summary block.
+bool llvm::hasFunctionSummary(MemoryBufferRef Buffer,
+                              DiagnosticHandlerFunction DiagnosticHandler) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, false, true);
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return false;
+  };
+
+  if (std::error_code EC = R.parseSummaryIndexInto(nullptr, nullptr))
+    return cleanupOnError(EC);
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+  return R.foundFuncSummary();
+}
+
+// This method supports lazy reading of function summary data from the combined
+// index during ThinLTO function importing. When reading the combined index
+// file, getFunctionInfoIndex is first invoked with IsLazy=true.
+// Then this method is called for each function considered for importing,
+// to parse the summary information for the given function name into
+// the index.
+std::error_code llvm::readFunctionSummary(
+    MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler,
+    StringRef FunctionName, std::unique_ptr<FunctionInfoIndex> Index) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler);
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return EC;
+  };
+
+  // Look up the given function name in the FunctionMap, which may
+  // contain a list of function infos in the case of a COMDAT. Walk through
+  // and parse each function summary info at the function summary offset
+  // recorded when parsing the value symbol table.
+  for (const auto &FI : Index->getFunctionInfoList(FunctionName)) {
+    size_t FunctionSummaryOffset = FI->bitcodeIndex();
+    if (std::error_code EC =
+            R.parseFunctionSummary(nullptr, Index.get(), FunctionSummaryOffset))
+      return cleanupOnError(EC);
+  }
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+ return std::error_code(); +} diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1a70ba5..a899a0c 100644 --- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -13,14 +13,18 @@ #include "llvm/Bitcode/ReaderWriter.h" #include "ValueEnumerator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/BitstreamWriter.h" #include "llvm/Bitcode/LLVMBitCodes.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/UseListOrder.h" @@ -174,6 +178,10 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_IN_ALLOCA; case Attribute::Cold: return bitc::ATTR_KIND_COLD; + case Attribute::InaccessibleMemOnly: + return bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY; + case Attribute::InaccessibleMemOrArgMemOnly: + return bitc::ATTR_KIND_INACCESSIBLEMEM_OR_ARGMEMONLY; case Attribute::InlineHint: return bitc::ATTR_KIND_INLINE_HINT; case Attribute::InReg: @@ -198,6 +206,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_IMPLICIT_FLOAT; case Attribute::NoInline: return bitc::ATTR_KIND_NO_INLINE; + case Attribute::NoRecurse: + return bitc::ATTR_KIND_NO_RECURSE; case Attribute::NonLazyBind: return bitc::ATTR_KIND_NON_LAZY_BIND; case Attribute::NonNull: @@ -405,6 +415,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; + case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; @@ -573,10 +584,41 @@ static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) { } } -// Emit top-level description of module, including target triple, inline asm, -// descriptors for global variables, and function prototype info. -static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, - BitstreamWriter &Stream) { +/// Write a record that will eventually hold the word offset of the +/// module-level VST. For now the offset is 0, which will be backpatched +/// after the real VST is written. Returns the bit offset to backpatch. +static uint64_t WriteValueSymbolTableForwardDecl(const ValueSymbolTable &VST, + BitstreamWriter &Stream) { + if (VST.empty()) + return 0; + + // Write a placeholder value in for the offset of the real VST, + // which is written after the function blocks so that it can include + // the offset of each function. The placeholder offset will be + // updated when the real VST is written. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_VSTOFFSET)); + // Blocks are 32-bit aligned, so we can use a 32-bit word offset to + // hold the real VST offset. Must use fixed instead of VBR as we don't + // know how many VBR chunks to reserve ahead of time. 
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); + unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(Abbv); + + // Emit the placeholder + uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0}; + Stream.EmitRecordWithAbbrev(VSTOffsetAbbrev, Vals); + + // Compute and return the bit offset to the placeholder, which will be + // patched when the real VST is written. We can simply subtract the 32-bit + // fixed size from the current bit number to get the location to backpatch. + return Stream.GetCurrentBitNo() - 32; +} + +/// Emit top-level description of module, including target triple, inline asm, +/// descriptors for global variables, and function prototype info. +/// Returns the bit offset to backpatch with the location of the real VST. +static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE, + BitstreamWriter &Stream) { // Emit various pieces of data attached to a module. if (!M->getTargetTriple().empty()) WriteStringRecord(bitc::MODULE_CODE_TRIPLE, M->getTargetTriple(), @@ -725,7 +767,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, // Emit the alias information. for (const GlobalAlias &A : M->aliases()) { // ALIAS: [alias type, aliasee val#, linkage, visibility] - Vals.push_back(VE.getTypeID(A.getType())); + Vals.push_back(VE.getTypeID(A.getValueType())); + Vals.push_back(A.getType()->getAddressSpace()); Vals.push_back(VE.getValueID(A.getAliasee())); Vals.push_back(getEncodedLinkage(A)); Vals.push_back(getEncodedVisibility(A)); @@ -736,6 +779,25 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse); Vals.clear(); } + + // Write a record indicating the number of module-level metadata IDs + // This is needed because the ids of metadata are assigned implicitly + // based on their ordering in the bitcode, with the function-level + // metadata ids starting after the module-level metadata ids. For + // function importing where we lazy load the metadata as a postpass, + // we want to avoid parsing the module-level metadata before parsing + // the imported functions. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_METADATA_VALUES)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + unsigned MDValsAbbrev = Stream.EmitAbbrev(Abbv); + Vals.push_back(VE.numMDs()); + Stream.EmitRecord(bitc::MODULE_CODE_METADATA_VALUES, Vals, MDValsAbbrev); + Vals.clear(); + + uint64_t VSTOffsetPlaceholder = + WriteValueSymbolTableForwardDecl(M->getValueSymbolTable(), Stream); + return VSTOffsetPlaceholder; } static uint64_t GetOptimizationFlags(const Value *V) { @@ -943,7 +1005,8 @@ static void WriteDICompileUnit(const DICompileUnit *N, BitstreamWriter &Stream, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + assert(N->isDistinct() && "Expected distinct compile units"); + Record.push_back(/* IsDistinct */ true); Record.push_back(N->getSourceLanguage()); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(VE.getMetadataOrNullID(N->getRawProducer())); @@ -958,6 +1021,7 @@ static void WriteDICompileUnit(const DICompileUnit *N, Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables().get())); Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities().get())); Record.push_back(N->getDWOId()); + Record.push_back(VE.getMetadataOrNullID(N->getMacros().get())); Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev); Record.clear(); @@ -982,7 +1046,6 @@ static void WriteDISubprogram(const DISubprogram *N, const ValueEnumerator &VE, Record.push_back(N->getVirtualIndex()); Record.push_back(N->getFlags()); Record.push_back(N->isOptimized()); - Record.push_back(VE.getMetadataOrNullID(N->getRawFunction())); Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get())); Record.push_back(VE.getMetadataOrNullID(N->getDeclaration())); Record.push_back(VE.getMetadataOrNullID(N->getVariables().get())); @@ -1034,6 +1097,33 @@ static void WriteDINamespace(const DINamespace *N, const ValueEnumerator &VE, Record.clear(); } +static void WriteDIMacro(const DIMacro *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawValue())); + + Stream.EmitRecord(bitc::METADATA_MACRO, Record, Abbrev); + Record.clear(); +} + +static void WriteDIMacroFile(const DIMacroFile *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl<uint64_t> &Record, + unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getFile())); + Record.push_back(VE.getMetadataOrNullID(N->getElements().get())); + + Stream.EmitRecord(bitc::METADATA_MACRO_FILE, Record, Abbrev); + Record.clear(); +} + static void WriteDIModule(const DIModule *N, const ValueEnumerator &VE, BitstreamWriter &Stream, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { @@ -1100,7 +1190,6 @@ static void WriteDILocalVariable(const DILocalVariable *N, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { Record.push_back(N->isDistinct()); - Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); @@ -1310,16 +1399,15 @@ static void WriteMetadataAttachment(const 
Function &F,
     Record.clear();
   }
 
-  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
+  for (const BasicBlock &BB : F)
+    for (const Instruction &I : BB) {
       MDs.clear();
-      I->getAllMetadataOtherThanDebugLoc(MDs);
+      I.getAllMetadataOtherThanDebugLoc(MDs);
 
       // If no metadata, ignore instruction.
       if (MDs.empty()) continue;
 
-      Record.push_back(VE.getInstructionID(I));
+      Record.push_back(VE.getInstructionID(&I));
 
       for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
         Record.push_back(MDs[i].first);
@@ -1342,7 +1430,7 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
   if (Names.empty()) return;
 
-  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+  Stream.EnterSubblock(bitc::METADATA_KIND_BLOCK_ID, 3);
 
   for (unsigned MDKindID = 0, e = Names.size(); MDKindID != e; ++MDKindID) {
     Record.push_back(MDKindID);
@@ -1356,6 +1444,33 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
   Stream.ExitBlock();
 }
 
+static void WriteOperandBundleTags(const Module *M, BitstreamWriter &Stream) {
+  // Write operand bundle tags.
+  //
+  // OPERAND_BUNDLE_TAGS_BLOCK_ID : N x OPERAND_BUNDLE_TAG
+  //
+  // OPERAND_BUNDLE_TAG - [strchr x N]
+
+  SmallVector<StringRef, 8> Tags;
+  M->getOperandBundleTags(Tags);
+
+  if (Tags.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+
+  for (auto Tag : Tags) {
+    Record.append(Tag.begin(), Tag.end());
+
+    Stream.EmitRecord(bitc::OPERAND_BUNDLE_TAG, Record, 0);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
 static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
   if ((int64_t)V >= 0)
     Vals.push_back(V << 1);
@@ -1515,19 +1630,10 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
     if (isa<IntegerType>(EltTy)) {
       for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
         Record.push_back(CDS->getElementAsInteger(i));
-    } else if (EltTy->isFloatTy()) {
-      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
-        union { float F; uint32_t I; };
-        F = CDS->getElementAsFloat(i);
-        Record.push_back(I);
-      }
     } else {
-      assert(EltTy->isDoubleTy() && "Unknown ConstantData element type");
-      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
-        union { double F; uint64_t I; };
-        F = CDS->getElementAsDouble(i);
-        Record.push_back(I);
-      }
+      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
+        Record.push_back(
+            CDS->getElementAsAPFloat(i).bitcastToAPInt().getLimitedValue());
     }
   } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
              isa<ConstantVector>(C)) {
@@ -1664,6 +1770,23 @@ static bool PushValueAndType(const Value *V, unsigned InstID,
   return false;
 }
 
+static void WriteOperandBundles(BitstreamWriter &Stream, ImmutableCallSite CS,
+                                unsigned InstID, ValueEnumerator &VE) {
+  SmallVector<unsigned, 64> Record;
+  LLVMContext &C = CS.getInstruction()->getContext();
+
+  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
+    const auto &Bundle = CS.getOperandBundleAt(i);
+    Record.push_back(C.getOperandBundleTagID(Bundle.getTagName()));
+
+    for (auto &Input : Bundle.Inputs)
+      PushValueAndType(Input, InstID, Record, VE);
+
+    Stream.EmitRecord(bitc::FUNC_CODE_OPERAND_BUNDLE, Record);
+    Record.clear();
+  }
+}
+
 /// pushValue - Like PushValueAndType, but where the type of the value is
 /// omitted (perhaps it was already encoded in an earlier operand).
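// Editor's sketch (not part of the patch): WriteOperandBundles above emits one
// FUNC_CODE_OPERAND_BUNDLE record per bundle, the tag id first and then each
// input via PushValueAndType. countBundleInputs is a hypothetical helper that
// shows the same traversal from the IR side; pushValue, documented just above,
// follows it.
static unsigned countBundleInputs(ImmutableCallSite CS) {
  unsigned NumInputs = 0;
  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
    const auto &Bundle = CS.getOperandBundleAt(i);
    NumInputs += Bundle.Inputs.size(); // One PushValueAndType call per input.
  }
  return NumInputs;
}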
static void pushValue(const Value *V, unsigned InstID, @@ -1806,10 +1929,9 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); pushValue(SI.getCondition(), InstID, Vals, VE); Vals.push_back(VE.getValueID(SI.getDefaultDest())); - for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); - i != e; ++i) { - Vals.push_back(VE.getValueID(i.getCaseValue())); - Vals.push_back(VE.getValueID(i.getCaseSuccessor())); + for (SwitchInst::ConstCaseIt Case : SI.cases()) { + Vals.push_back(VE.getValueID(Case.getCaseValue())); + Vals.push_back(VE.getValueID(Case.getCaseSuccessor())); } } break; @@ -1826,6 +1948,10 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, const InvokeInst *II = cast<InvokeInst>(&I); const Value *Callee = II->getCalledValue(); FunctionType *FTy = II->getFunctionType(); + + if (II->hasOperandBundles()) + WriteOperandBundles(Stream, II, InstID, VE); + Code = bitc::FUNC_CODE_INST_INVOKE; Vals.push_back(VE.getAttributeID(II->getAttributes())); @@ -1851,6 +1977,49 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Code = bitc::FUNC_CODE_INST_RESUME; PushValueAndType(I.getOperand(0), InstID, Vals, VE); break; + case Instruction::CleanupRet: { + Code = bitc::FUNC_CODE_INST_CLEANUPRET; + const auto &CRI = cast<CleanupReturnInst>(I); + pushValue(CRI.getCleanupPad(), InstID, Vals, VE); + if (CRI.hasUnwindDest()) + Vals.push_back(VE.getValueID(CRI.getUnwindDest())); + break; + } + case Instruction::CatchRet: { + Code = bitc::FUNC_CODE_INST_CATCHRET; + const auto &CRI = cast<CatchReturnInst>(I); + pushValue(CRI.getCatchPad(), InstID, Vals, VE); + Vals.push_back(VE.getValueID(CRI.getSuccessor())); + break; + } + case Instruction::CleanupPad: + case Instruction::CatchPad: { + const auto &FuncletPad = cast<FuncletPadInst>(I); + Code = isa<CatchPadInst>(FuncletPad) ? bitc::FUNC_CODE_INST_CATCHPAD + : bitc::FUNC_CODE_INST_CLEANUPPAD; + pushValue(FuncletPad.getParentPad(), InstID, Vals, VE); + + unsigned NumArgOperands = FuncletPad.getNumArgOperands(); + Vals.push_back(NumArgOperands); + for (unsigned Op = 0; Op != NumArgOperands; ++Op) + PushValueAndType(FuncletPad.getArgOperand(Op), InstID, Vals, VE); + break; + } + case Instruction::CatchSwitch: { + Code = bitc::FUNC_CODE_INST_CATCHSWITCH; + const auto &CatchSwitch = cast<CatchSwitchInst>(I); + + pushValue(CatchSwitch.getParentPad(), InstID, Vals, VE); + + unsigned NumHandlers = CatchSwitch.getNumHandlers(); + Vals.push_back(NumHandlers); + for (const BasicBlock *CatchPadBB : CatchSwitch.handlers()) + Vals.push_back(VE.getValueID(CatchPadBB)); + + if (CatchSwitch.hasUnwindDest()) + Vals.push_back(VE.getValueID(CatchSwitch.getUnwindDest())); + break; + } case Instruction::Unreachable: Code = bitc::FUNC_CODE_INST_UNREACHABLE; AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; @@ -1902,6 +2071,8 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64"); AlignRecord |= AI.isUsedWithInAlloca() << 5; AlignRecord |= 1 << 6; + // Reserve bit 7 for SwiftError flag. 
+ // AlignRecord |= AI.isSwiftError() << 7; Vals.push_back(AlignRecord); break; } @@ -1971,11 +2142,23 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, const CallInst &CI = cast<CallInst>(I); FunctionType *FTy = CI.getFunctionType(); + if (CI.hasOperandBundles()) + WriteOperandBundles(Stream, &CI, InstID, VE); + Code = bitc::FUNC_CODE_INST_CALL; Vals.push_back(VE.getAttributeID(CI.getAttributes())); - Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall()) | - unsigned(CI.isMustTailCall()) << 14 | 1 << 15); + + unsigned Flags = GetOptimizationFlags(&I); + Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV | + unsigned(CI.isTailCall()) << bitc::CALL_TAIL | + unsigned(CI.isMustTailCall()) << bitc::CALL_MUSTTAIL | + 1 << bitc::CALL_EXPLICIT_TYPE | + unsigned(CI.isNoTailCall()) << bitc::CALL_NOTAIL | + unsigned(Flags != 0) << bitc::CALL_FMF); + if (Flags != 0) + Vals.push_back(Flags); + Vals.push_back(VE.getTypeID(FTy)); PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee @@ -2008,56 +2191,149 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.clear(); } -// Emit names for globals/functions etc. -static void WriteValueSymbolTable(const ValueSymbolTable &VST, - const ValueEnumerator &VE, - BitstreamWriter &Stream) { - if (VST.empty()) return; +enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 }; + +/// Determine the encoding to use for the given string name and length. +static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) { + bool isChar6 = true; + for (const char *C = Str, *E = C + StrLen; C != E; ++C) { + if (isChar6) + isChar6 = BitCodeAbbrevOp::isChar6(*C); + if ((unsigned char)*C & 128) + // don't bother scanning the rest. + return SE_Fixed8; + } + if (isChar6) + return SE_Char6; + else + return SE_Fixed7; +} + +/// Emit names for globals/functions etc. The VSTOffsetPlaceholder, +/// BitcodeStartBit and FunctionIndex are only passed for the module-level +/// VST, where we are including a function bitcode index and need to +/// backpatch the VST forward declaration record. +static void WriteValueSymbolTable( + const ValueSymbolTable &VST, const ValueEnumerator &VE, + BitstreamWriter &Stream, uint64_t VSTOffsetPlaceholder = 0, + uint64_t BitcodeStartBit = 0, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> *FunctionIndex = + nullptr) { + if (VST.empty()) { + // WriteValueSymbolTableForwardDecl should have returned early as + // well. Ensure this handling remains in sync by asserting that + // the placeholder offset is not set. + assert(VSTOffsetPlaceholder == 0); + return; + } + + if (VSTOffsetPlaceholder > 0) { + // Get the offset of the VST we are writing, and backpatch it into + // the VST forward declaration record. + uint64_t VSTOffset = Stream.GetCurrentBitNo(); + // The BitcodeStartBit was the stream offset of the actual bitcode + // (e.g. excluding any initial darwin header). + VSTOffset -= BitcodeStartBit; + assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned"); + Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32); + } + Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); + // For the module-level VST, add abbrev Ids for the VST_CODE_FNENTRY + // records, which are not used in the per-function VSTs. + unsigned FnEntry8BitAbbrev; + unsigned FnEntry7BitAbbrev; + unsigned FnEntry6BitAbbrev; + if (VSTOffsetPlaceholder > 0) { + // 8-bit fixed-width VST_FNENTRY function strings. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width VST_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 VST_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + } + // FIXME: Set up the abbrev, we know how many values there are! // FIXME: We know if the type names can use 7-bit ascii. SmallVector<unsigned, 64> NameVals; - for (ValueSymbolTable::const_iterator SI = VST.begin(), SE = VST.end(); - SI != SE; ++SI) { - - const ValueName &Name = *SI; - + for (const ValueName &Name : VST) { // Figure out the encoding to use for the name. - bool is7Bit = true; - bool isChar6 = true; - for (const char *C = Name.getKeyData(), *E = C+Name.getKeyLength(); - C != E; ++C) { - if (isChar6) - isChar6 = BitCodeAbbrevOp::isChar6(*C); - if ((unsigned char)*C & 128) { - is7Bit = false; - break; // don't bother scanning the rest. - } - } + StringEncoding Bits = + getStringEncoding(Name.getKeyData(), Name.getKeyLength()); unsigned AbbrevToUse = VST_ENTRY_8_ABBREV; + NameVals.push_back(VE.getValueID(Name.getValue())); + + Function *F = dyn_cast<Function>(Name.getValue()); + if (!F) { + // If value is an alias, need to get the aliased base object to + // see if it is a function. + auto *GA = dyn_cast<GlobalAlias>(Name.getValue()); + if (GA && GA->getBaseObject()) + F = dyn_cast<Function>(GA->getBaseObject()); + } // VST_ENTRY: [valueid, namechar x N] + // VST_FNENTRY: [valueid, funcoffset, namechar x N] // VST_BBENTRY: [bbid, namechar x N] unsigned Code; - if (isa<BasicBlock>(SI->getValue())) { + if (isa<BasicBlock>(Name.getValue())) { Code = bitc::VST_CODE_BBENTRY; - if (isChar6) + if (Bits == SE_Char6) AbbrevToUse = VST_BBENTRY_6_ABBREV; + } else if (F && !F->isDeclaration()) { + // Must be the module-level VST, where we pass in the Index and + // have a VSTOffsetPlaceholder. The function-level VST should not + // contain any Function symbols. + assert(FunctionIndex); + assert(VSTOffsetPlaceholder > 0); + + // Save the word offset of the function (from the start of the + // actual bitcode written to the stream). 
+ assert(FunctionIndex->count(F) == 1); + uint64_t BitcodeIndex = + (*FunctionIndex)[F]->bitcodeIndex() - BitcodeStartBit; + assert((BitcodeIndex & 31) == 0 && "function block not 32-bit aligned"); + NameVals.push_back(BitcodeIndex / 32); + + Code = bitc::VST_CODE_FNENTRY; + AbbrevToUse = FnEntry8BitAbbrev; + if (Bits == SE_Char6) + AbbrevToUse = FnEntry6BitAbbrev; + else if (Bits == SE_Fixed7) + AbbrevToUse = FnEntry7BitAbbrev; } else { Code = bitc::VST_CODE_ENTRY; - if (isChar6) + if (Bits == SE_Char6) AbbrevToUse = VST_ENTRY_6_ABBREV; - else if (is7Bit) + else if (Bits == SE_Fixed7) AbbrevToUse = VST_ENTRY_7_ABBREV; } - NameVals.push_back(VE.getValueID(SI->getValue())); - for (const char *P = Name.getKeyData(), - *E = Name.getKeyData()+Name.getKeyLength(); P != E; ++P) - NameVals.push_back((unsigned char)*P); + for (const auto P : Name.getKey()) + NameVals.push_back((unsigned char)P); // Emit the finished record. Stream.EmitRecord(Code, NameVals, AbbrevToUse); @@ -2066,6 +2342,66 @@ static void WriteValueSymbolTable(const ValueSymbolTable &VST, Stream.ExitBlock(); } +/// Emit function names and summary offsets for the combined index +/// used by ThinLTO. +static void WriteCombinedValueSymbolTable(const FunctionInfoIndex &Index, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); + + // 8-bit fixed-width VST_COMBINED_FNENTRY function strings. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + unsigned FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width VST_COMBINED_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + unsigned FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 VST_COMBINED_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + unsigned FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + + // FIXME: We know if the type names can use 7-bit ascii. + SmallVector<unsigned, 64> NameVals; + + for (const auto &FII : Index) { + for (const auto &FI : FII.getValue()) { + NameVals.push_back(FI->bitcodeIndex()); + + StringRef FuncName = FII.first(); + + // Figure out the encoding to use for the name. + StringEncoding Bits = getStringEncoding(FuncName.data(), FuncName.size()); + + // VST_COMBINED_FNENTRY: [funcsumoffset, namechar x N] + unsigned AbbrevToUse = FnEntry8BitAbbrev; + if (Bits == SE_Char6) + AbbrevToUse = FnEntry6BitAbbrev; + else if (Bits == SE_Fixed7) + AbbrevToUse = FnEntry7BitAbbrev; + + for (const auto P : FuncName) + NameVals.push_back((unsigned char)P); + + // Emit the finished record. 
+ Stream.EmitRecord(bitc::VST_CODE_COMBINED_FNENTRY, NameVals, AbbrevToUse); + NameVals.clear(); + } + } + Stream.ExitBlock(); +} + static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order, BitstreamWriter &Stream) { assert(Order.Shuffle.size() >= 2 && "Shuffle too small"); @@ -2100,9 +2436,34 @@ static void WriteUseListBlock(const Function *F, ValueEnumerator &VE, Stream.ExitBlock(); } -/// WriteFunction - Emit a function body to the module stream. -static void WriteFunction(const Function &F, ValueEnumerator &VE, - BitstreamWriter &Stream) { +/// \brief Save information for the given function into the function index. +/// +/// At a minimum this saves the bitcode index of the function record that +/// was just written. However, if we are emitting function summary information, +/// for example for ThinLTO, then a \a FunctionSummary object is created +/// to hold the provided summary information. +static void SaveFunctionInfo( + const Function &F, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + unsigned NumInsts, uint64_t BitcodeIndex, bool EmitFunctionSummary) { + std::unique_ptr<FunctionSummary> FuncSummary; + if (EmitFunctionSummary) { + FuncSummary = llvm::make_unique<FunctionSummary>(NumInsts); + FuncSummary->setLocalFunction(F.hasLocalLinkage()); + } + FunctionIndex[&F] = + llvm::make_unique<FunctionInfo>(BitcodeIndex, std::move(FuncSummary)); +} + +/// Emit a function body to the module stream. +static void WriteFunction( + const Function &F, ValueEnumerator &VE, BitstreamWriter &Stream, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + bool EmitFunctionSummary) { + // Save the bitcode index of the start of this function block for recording + // in the VST. + uint64_t BitcodeIndex = Stream.GetCurrentBitNo(); + Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4); VE.incorporateFunction(F); @@ -2128,6 +2489,7 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, bool NeedsMetadataAttachment = F.hasMetadata(); DILocation *LastDL = nullptr; + unsigned NumInsts = 0; // Finally, emit all the instructions, in order. for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) @@ -2135,6 +2497,9 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, I != E; ++I) { WriteInstruction(*I, InstID, VE, Stream, Vals); + if (!isa<DbgInfoIntrinsic>(I)) + ++NumInsts; + if (!I->getType()->isVoidTy()) ++InstID; @@ -2171,6 +2536,9 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, WriteUseListBlock(&F, VE, Stream); VE.purgeFunction(); Stream.ExitBlock(); + + SaveFunctionInfo(F, FunctionIndex, NumInsts, BitcodeIndex, + EmitFunctionSummary); } // Emit blockinfo, which defines the standard abbreviations etc. @@ -2348,9 +2716,183 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) { Stream.ExitBlock(); } +/// Write the module path strings, currently only used when generating +/// a combined index file. +static void WriteModStrings(const FunctionInfoIndex &I, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::MODULE_STRTAB_BLOCK_ID, 3); + + // TODO: See which abbrev sizes we actually need to emit + + // 8-bit fixed-width MST_ENTRY strings. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + unsigned Abbrev8Bit = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width MST_ENTRY strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + unsigned Abbrev7Bit = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 MST_ENTRY strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (const StringMapEntry<uint64_t> &MPSE : I.modPathStringEntries()) { + StringEncoding Bits = + getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size()); + unsigned AbbrevToUse = Abbrev8Bit; + if (Bits == SE_Char6) + AbbrevToUse = Abbrev6Bit; + else if (Bits == SE_Fixed7) + AbbrevToUse = Abbrev7Bit; + + NameVals.push_back(MPSE.getValue()); + + for (const auto P : MPSE.getKey()) + NameVals.push_back((unsigned char)P); + + // Emit the finished record. + Stream.EmitRecord(bitc::MST_CODE_ENTRY, NameVals, AbbrevToUse); + NameVals.clear(); + } + Stream.ExitBlock(); +} + +// Helper to emit a single function summary record. +static void WritePerModuleFunctionSummaryRecord( + SmallVector<unsigned, 64> &NameVals, FunctionSummary *FS, unsigned ValueID, + unsigned FSAbbrev, BitstreamWriter &Stream) { + assert(FS); + NameVals.push_back(ValueID); + NameVals.push_back(FS->isLocalFunction()); + NameVals.push_back(FS->instCount()); + + // Emit the finished record. + Stream.EmitRecord(bitc::FS_CODE_PERMODULE_ENTRY, NameVals, FSAbbrev); + NameVals.clear(); +} + +/// Emit the per-module function summary section alongside the rest of +/// the module's bitcode. +static void WritePerModuleFunctionSummary( + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + const Module *M, const ValueEnumerator &VE, BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3); + + // Abbrev for FS_CODE_PERMODULE_ENTRY. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_PERMODULE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // islocal + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + unsigned FSAbbrev = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (auto &I : FunctionIndex) { + // Skip anonymous functions. We will emit a function summary for + // any aliases below. 
+ if (!I.first->hasName()) + continue; + + WritePerModuleFunctionSummaryRecord( + NameVals, I.second->functionSummary(), + VE.getValueID(M->getValueSymbolTable().lookup(I.first->getName())), + FSAbbrev, Stream); + } + + for (const GlobalAlias &A : M->aliases()) { + if (!A.getBaseObject()) + continue; + const Function *F = dyn_cast<Function>(A.getBaseObject()); + if (!F || F->isDeclaration()) + continue; + + assert(FunctionIndex.count(F) == 1); + WritePerModuleFunctionSummaryRecord( + NameVals, FunctionIndex[F]->functionSummary(), + VE.getValueID(M->getValueSymbolTable().lookup(A.getName())), FSAbbrev, + Stream); + } + + Stream.ExitBlock(); +} + +/// Emit the combined function summary section into the combined index +/// file. +static void WriteCombinedFunctionSummary(const FunctionInfoIndex &I, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3); + + // Abbrev for FS_CODE_COMBINED_ENTRY. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_COMBINED_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + unsigned FSAbbrev = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (const auto &FII : I) { + for (auto &FI : FII.getValue()) { + FunctionSummary *FS = FI->functionSummary(); + assert(FS); + + NameVals.push_back(I.getModuleId(FS->modulePath())); + NameVals.push_back(FS->instCount()); + + // Record the starting offset of this summary entry for use + // in the VST entry. Add the current code size since the + // reader will invoke readRecord after the abbrev id read. + FI->setBitcodeIndex(Stream.GetCurrentBitNo() + Stream.GetAbbrevIDWidth()); + + // Emit the finished record. + Stream.EmitRecord(bitc::FS_CODE_COMBINED_ENTRY, NameVals, FSAbbrev); + NameVals.clear(); + } + } + + Stream.ExitBlock(); +} + +// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the +// current llvm version, and a record for the epoch number. +static void WriteIdentificationBlock(const Module *M, BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5); + + // Write the "user readable" string identifying the bitcode producer + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_STRING)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + auto StringAbbrev = Stream.EmitAbbrev(Abbv); + WriteStringRecord(bitc::IDENTIFICATION_CODE_STRING, + "LLVM" LLVM_VERSION_STRING, StringAbbrev, Stream); + + // Write the epoch version + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + auto EpochAbbrev = Stream.EmitAbbrev(Abbv); + SmallVector<unsigned, 1> Vals = {bitc::BITCODE_CURRENT_EPOCH}; + Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev); + Stream.ExitBlock(); +} + /// WriteModule - Emit the specified module to the bitstream. 
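// Editor's sketch (not part of the patch): the VSTOFFSET machinery in this
// diff is classic backpatching. WriteValueSymbolTableForwardDecl emits a
// 32-bit zero and returns its bit position; once the real VST location is
// known, WriteValueSymbolTable patches that word in place. backpatchVSTOffset
// is a hypothetical refactoring of the patch-in-place step, using only the
// BitstreamWriter calls that appear in this patch. WriteModule, documented
// just above, drives the sequence.
static void backpatchVSTOffset(BitstreamWriter &Stream, uint64_t Placeholder,
                               uint64_t BitcodeStartBit) {
  // Blocks are 32-bit aligned, so the offset is stored in 32-bit words,
  // relative to the start of the actual bitcode (after any darwin header).
  uint64_t VSTOffset = Stream.GetCurrentBitNo() - BitcodeStartBit;
  Stream.BackpatchWord(Placeholder, VSTOffset / 32);
}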
static void WriteModule(const Module *M, BitstreamWriter &Stream, - bool ShouldPreserveUseListOrder) { + bool ShouldPreserveUseListOrder, + uint64_t BitcodeStartBit, bool EmitFunctionSummary) { Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3); SmallVector<unsigned, 1> Vals; @@ -2377,7 +2919,7 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream, // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. - WriteModuleInfo(M, VE, Stream); + uint64_t VSTOffsetPlaceholder = WriteModuleInfo(M, VE, Stream); // Emit constants. WriteModuleConstants(VE, Stream); @@ -2388,17 +2930,25 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream, // Emit metadata. WriteModuleMetadataStore(M, Stream); - // Emit names for globals/functions etc. - WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream); - // Emit module-level use-lists. if (VE.shouldPreserveUseListOrder()) WriteUseListBlock(nullptr, VE, Stream); + WriteOperandBundleTags(M, Stream); + // Emit function bodies. + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> FunctionIndex; for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) if (!F->isDeclaration()) - WriteFunction(*F, VE, Stream); + WriteFunction(*F, VE, Stream, FunctionIndex, EmitFunctionSummary); + + // Need to write after the above call to WriteFunction which populates + // the summary information in the index. + if (EmitFunctionSummary) + WritePerModuleFunctionSummary(FunctionIndex, M, VE, Stream); + + WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream, + VSTOffsetPlaceholder, BitcodeStartBit, &FunctionIndex); Stream.ExitBlock(); } @@ -2473,10 +3023,22 @@ static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer, Buffer.push_back(0); } +/// Helper to write the header common to all bitcode files. +static void WriteBitcodeHeader(BitstreamWriter &Stream) { + // Emit the file header. + Stream.Emit((unsigned)'B', 8); + Stream.Emit((unsigned)'C', 8); + Stream.Emit(0x0, 4); + Stream.Emit(0xC, 4); + Stream.Emit(0xE, 4); + Stream.Emit(0xD, 4); +} + /// WriteBitcodeToFile - Write the specified module to the specified output /// stream. void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, - bool ShouldPreserveUseListOrder) { + bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) { SmallVector<char, 0> Buffer; Buffer.reserve(256*1024); @@ -2489,17 +3051,20 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, // Emit the module into the buffer. { BitstreamWriter Stream(Buffer); + // Save the start bit of the actual bitcode, in case there is space + // saved at the start for the darwin header above. The reader stream + // will start at the bitcode, and we need the offset of the VST + // to line up. + uint64_t BitcodeStartBit = Stream.GetCurrentBitNo(); // Emit the file header. - Stream.Emit((unsigned)'B', 8); - Stream.Emit((unsigned)'C', 8); - Stream.Emit(0x0, 4); - Stream.Emit(0xC, 4); - Stream.Emit(0xE, 4); - Stream.Emit(0xD, 4); + WriteBitcodeHeader(Stream); + + WriteIdentificationBlock(M, Stream); // Emit the module. - WriteModule(M, Stream, ShouldPreserveUseListOrder); + WriteModule(M, Stream, ShouldPreserveUseListOrder, BitcodeStartBit, + EmitFunctionSummary); } if (TT.isOSDarwin()) @@ -2508,3 +3073,38 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, // Write the generated bitstream to "Out". 
Out.write((char*)&Buffer.front(), Buffer.size()); } + +// Write the specified function summary index to the given raw output stream, +// where it will be written in a new bitcode block. This is used when +// writing the combined index file for ThinLTO. +void llvm::WriteFunctionSummaryToFile(const FunctionInfoIndex &Index, + raw_ostream &Out) { + SmallVector<char, 0> Buffer; + Buffer.reserve(256 * 1024); + + BitstreamWriter Stream(Buffer); + + // Emit the bitcode header. + WriteBitcodeHeader(Stream); + + Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3); + + SmallVector<unsigned, 1> Vals; + unsigned CurVersion = 1; + Vals.push_back(CurVersion); + Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals); + + // Write the module paths in the combined index. + WriteModStrings(Index, Stream); + + // Write the function summary combined index records. + WriteCombinedFunctionSummary(Index, Stream); + + // Need a special VST writer for the combined index (we don't have a + // real VST and real values when this is invoked). + WriteCombinedValueSymbolTable(Index, Stream); + + Stream.ExitBlock(); + + Out.write((char *)&Buffer.front(), Buffer.size()); +} diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index 3165743..24de99a 100644 --- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -19,7 +19,7 @@ using namespace llvm; PreservedAnalyses BitcodeWriterPass::run(Module &M) { - WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder); + WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, EmitFunctionSummary); return PreservedAnalyses::all(); } @@ -27,17 +27,21 @@ namespace { class WriteBitcodePass : public ModulePass { raw_ostream &OS; // raw_ostream to print on bool ShouldPreserveUseListOrder; + bool EmitFunctionSummary; public: static char ID; // Pass identification, replacement for typeid - explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder) + explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) : ModulePass(ID), OS(o), - ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} + ShouldPreserveUseListOrder(ShouldPreserveUseListOrder), + EmitFunctionSummary(EmitFunctionSummary) {} const char *getPassName() const override { return "Bitcode Writer"; } bool runOnModule(Module &M) override { - WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder); + WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, + EmitFunctionSummary); return false; } }; @@ -46,6 +50,8 @@ namespace { char WriteBitcodePass::ID = 0; ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str, - bool ShouldPreserveUseListOrder) { - return new WriteBitcodePass(Str, ShouldPreserveUseListOrder); + bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) { + return new WriteBitcodePass(Str, ShouldPreserveUseListOrder, + EmitFunctionSummary); } diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 44dd604..e07563b 100644 --- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -87,15 +87,9 @@ static OrderMap orderModule(const Module &M) { if (!isa<GlobalValue>(A.getAliasee())) orderValue(A.getAliasee(), OM); for (const Function &F : M) { - if (F.hasPrefixData()) - if (!isa<GlobalValue>(F.getPrefixData())) - orderValue(F.getPrefixData(), OM); - if (F.hasPrologueData()) - if 
(!isa<GlobalValue>(F.getPrologueData())) - orderValue(F.getPrologueData(), OM); - if (F.hasPersonalityFn()) - if (!isa<GlobalValue>(F.getPersonalityFn())) - orderValue(F.getPersonalityFn(), OM); + for (const Use &U : F.operands()) + if (!isa<GlobalValue>(U.get())) + orderValue(U.get(), OM); } OM.LastGlobalConstantID = OM.size(); @@ -273,12 +267,8 @@ static UseListOrderStack predictUseListOrder(const Module &M) { for (const GlobalAlias &A : M.aliases()) predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack); for (const Function &F : M) { - if (F.hasPrefixData()) - predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack); - if (F.hasPrologueData()) - predictValueUseListOrder(F.getPrologueData(), nullptr, OM, Stack); - if (F.hasPersonalityFn()) - predictValueUseListOrder(F.getPersonalityFn(), nullptr, OM, Stack); + for (const Use &U : F.operands()) + predictValueUseListOrder(U.get(), nullptr, OM, Stack); } return Stack; @@ -321,20 +311,10 @@ ValueEnumerator::ValueEnumerator(const Module &M, for (const GlobalAlias &GA : M.aliases()) EnumerateValue(GA.getAliasee()); - // Enumerate the prefix data constants. + // Enumerate any optional Function data. for (const Function &F : M) - if (F.hasPrefixData()) - EnumerateValue(F.getPrefixData()); - - // Enumerate the prologue data constants. - for (const Function &F : M) - if (F.hasPrologueData()) - EnumerateValue(F.getPrologueData()); - - // Enumerate the personality functions. - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasPersonalityFn()) - EnumerateValue(I->getPersonalityFn()); + for (const Use &U : F.operands()) + EnumerateValue(U.get()); // Enumerate the metadata type. // @@ -425,7 +405,7 @@ unsigned ValueEnumerator::getValueID(const Value *V) const { void ValueEnumerator::dump() const { print(dbgs(), ValueMap, "Default"); dbgs() << '\n'; - print(dbgs(), MDValueMap, "MetaData"); + print(dbgs(), MetadataMap, "MetaData"); dbgs() << '\n'; } @@ -512,10 +492,8 @@ void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) { /// Insert all of the values referenced by named metadata in the specified /// module. void ValueEnumerator::EnumerateNamedMetadata(const Module &M) { - for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); - I != E; ++I) - EnumerateNamedMDNode(I); + for (const auto &I : M.named_metadata()) + EnumerateNamedMDNode(&I); } void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) { @@ -544,7 +522,7 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { // EnumerateMDNodeOperands() from re-visiting MD in a cyclic graph. // // Return early if there's already an ID. - if (!MDValueMap.insert(std::make_pair(MD, 0)).second) + if (!MetadataMap.insert(std::make_pair(MD, 0)).second) return; // Visit operands first to minimize RAUW. @@ -557,10 +535,10 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { HasDILocation |= isa<DILocation>(MD); HasGenericDINode |= isa<GenericDINode>(MD); - // Replace the dummy ID inserted above with the correct one. MDValueMap may + // Replace the dummy ID inserted above with the correct one. MetadataMap may // have changed by inserting operands, so we need a fresh lookup here. 
MDs.push_back(MD); - MDValueMap[MD] = MDs.size(); + MetadataMap[MD] = MDs.size(); } /// EnumerateFunctionLocalMetadata - Incorporate function-local metadata @@ -568,12 +546,12 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { void ValueEnumerator::EnumerateFunctionLocalMetadata( const LocalAsMetadata *Local) { // Check to see if it's already in! - unsigned &MDValueID = MDValueMap[Local]; - if (MDValueID) + unsigned &MetadataID = MetadataMap[Local]; + if (MetadataID) return; MDs.push_back(Local); - MDValueID = MDs.size(); + MetadataID = MDs.size(); EnumerateValue(Local->getValue()); @@ -729,23 +707,20 @@ void ValueEnumerator::incorporateFunction(const Function &F) { NumModuleMDs = MDs.size(); // Adding function arguments to the value table. - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I) - EnumerateValue(I); + for (const auto &I : F.args()) + EnumerateValue(&I); FirstFuncConstantID = Values.size(); // Add all function-level constants to the value table. - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) - for (User::const_op_iterator OI = I->op_begin(), E = I->op_end(); - OI != E; ++OI) { - if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) || - isa<InlineAsm>(*OI)) - EnumerateValue(*OI); + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) + for (const Use &OI : I.operands()) { + if ((isa<Constant>(OI) && !isa<GlobalValue>(OI)) || isa<InlineAsm>(OI)) + EnumerateValue(OI); } - BasicBlocks.push_back(BB); - ValueMap[BB] = BasicBlocks.size(); + BasicBlocks.push_back(&BB); + ValueMap[&BB] = BasicBlocks.size(); } // Optimize the constant layout. @@ -759,18 +734,17 @@ void ValueEnumerator::incorporateFunction(const Function &F) { SmallVector<LocalAsMetadata *, 8> FnLocalMDVector; // Add all of the instructions. - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) { - for (User::const_op_iterator OI = I->op_begin(), E = I->op_end(); - OI != E; ++OI) { - if (auto *MD = dyn_cast<MetadataAsValue>(&*OI)) + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + for (const Use &OI : I.operands()) { + if (auto *MD = dyn_cast<MetadataAsValue>(&OI)) if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata())) // Enumerate metadata after the instructions they might refer to.
FnLocalMDVector.push_back(Local); } - if (!I->getType()->isVoidTy()) - EnumerateValue(I); + if (!I.getType()->isVoidTy()) + EnumerateValue(&I); } } @@ -784,7 +758,7 @@ void ValueEnumerator::purgeFunction() { for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i) ValueMap.erase(Values[i].first); for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i) - MDValueMap.erase(MDs[i]); + MetadataMap.erase(MDs[i]); for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i) ValueMap.erase(BasicBlocks[i]); @@ -797,8 +771,8 @@ void ValueEnumerator::purgeFunction() { static void IncorporateFunctionInfoGlobalBBIDs(const Function *F, DenseMap<const BasicBlock*, unsigned> &IDMap) { unsigned Counter = 0; - for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - IDMap[BB] = ++Counter; + for (const BasicBlock &BB : *F) + IDMap[&BB] = ++Counter; } /// getGlobalBasicBlockID - This returns the function-specific ID for the diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h index 92d166e..9fb8325 100644 --- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h +++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h @@ -63,7 +63,7 @@ private: std::vector<const Metadata *> MDs; SmallVector<const LocalAsMetadata *, 8> FunctionLocalMDs; typedef DenseMap<const Metadata *, unsigned> MetadataMapType; - MetadataMapType MDValueMap; + MetadataMapType MetadataMap; bool HasMDString; bool HasDILocation; bool HasGenericDINode; @@ -93,7 +93,7 @@ private: /// before incorporation. unsigned NumModuleValues; - /// When a function is incorporated, this is the size of the MDValues list + /// When a function is incorporated, this is the size of the Metadatas list /// before incorporation. unsigned NumModuleMDs; @@ -117,8 +117,9 @@ public: return ID - 1; } unsigned getMetadataOrNullID(const Metadata *MD) const { - return MDValueMap.lookup(MD); + return MetadataMap.lookup(MD); } + unsigned numMDs() const { return MDs.size(); } bool hasMDString() const { return HasMDString; } bool hasDILocation() const { return HasDILocation; } diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 5fe4c4b..4060db7 100644 --- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -142,16 +142,15 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { assert(!State); State = new AggressiveAntiDepState(TRI->getNumRegs(), BB); - bool IsReturnBlock = (!BB->empty() && BB->back().isReturn()); + bool IsReturnBlock = BB->isReturnBlock(); std::vector<unsigned> &KillIndices = State->GetKillIndices(); std::vector<unsigned> &DefIndices = State->GetDefIndices(); // Examine the live-in regs of all successors. for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + for (const auto &LI : (*SI)->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); @@ -365,9 +364,11 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. 
Also assume all registers - // defined in a call must not be changed (ABI). + // defined in a call must not be changed (ABI). Inline assembly may + // reference either system calls or the register directly. Skip it until we + // can tell user specified registers from compiler-specified. if (MI->isCall() || MI->hasExtraDefRegAllocReq() || - TII->isPredicated(MI)) { + TII->isPredicated(MI) || MI->isInlineAsm()) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -429,6 +430,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // If MI's uses have special allocation requirement, don't allow // any use registers to be changed. Also assume all registers // used in a call must not be changed (ABI). + // Inline Assembly register uses also cannot be safely changed. // FIXME: The issue with predicated instruction is more complex. We are being // conservative here because the kill markers cannot be trusted after // if-conversion: @@ -444,7 +446,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // changed. bool Special = MI->isCall() || MI->hasExtraSrcRegAllocReq() || - TII->isPredicated(MI); + TII->isPredicated(MI) || MI->isInlineAsm(); // Scan the register uses for this instruction and update // live-ranges, groups and RegRefs. @@ -509,15 +511,8 @@ BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) { // Check all references that need rewriting for Reg. For each, use // the corresponding register class to narrow the set of registers // that are appropriate for renaming. - std::pair<std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator, - std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator> - Range = State->GetRegRefs().equal_range(Reg); - for (std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first, - QE = Range.second; Q != QE; ++Q) { - const TargetRegisterClass *RC = Q->second.RC; + for (const auto &Q : make_range(State->GetRegRefs().equal_range(Reg))) { + const TargetRegisterClass *RC = Q.second.RC; if (!RC) continue; BitVector RCBV = TRI->getAllocatableSet(MF, RC); @@ -685,9 +680,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also // defines 'NewReg' via an early-clobber operand. - auto Range = RegRefs.equal_range(Reg); - for (auto Q = Range.first, QE = Range.second; Q != QE; ++Q) { - auto UseMI = Q->second.Operand->getParent(); + for (const auto &Q : make_range(RegRefs.equal_range(Reg))) { + MachineInstr *UseMI = Q.second.Operand->getParent(); int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI); if (Idx == -1) continue; @@ -698,6 +692,20 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( } } + // Also, we cannot rename 'Reg' to 'NewReg' if the instruction defining + // 'Reg' is an early-clobber define and that instruction also uses + // 'NewReg'. + for (const auto &Q : make_range(RegRefs.equal_range(Reg))) { + if (!Q.second.Operand->isDef() || !Q.second.Operand->isEarlyClobber()) + continue; + + MachineInstr *DefMI = Q.second.Operand->getParent(); + if (DefMI->readsRegister(NewReg, TRI)) { + DEBUG(dbgs() << "(ec)"); + goto next_super_reg; + } + } + + // Record that 'Reg' can be renamed to 'NewReg'.
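The added loop above enforces the mirror image of the early-clobber check before it: renaming Reg to NewReg is also illegal when Reg's own definition is an early-clobber operand and the defining instruction reads NewReg, because an early-clobber def is written before the instruction's uses are consumed. A reduced sketch of that predicate (hypothetical helper, not part of the patch), before the RenameMap insertion below records the candidate:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    // True when renaming the register defined by the early-clobber operand
    // MO to NewReg would let the write clobber a value the same instruction
    // still needs to read.
    static bool earlyClobberRenameClashes(const MachineOperand &MO,
                                          unsigned NewReg,
                                          const TargetRegisterInfo *TRI) {
      if (!MO.isDef() || !MO.isEarlyClobber())
        return false;
      return MO.getParent()->readsRegister(NewReg, TRI);
    }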
RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg)); } @@ -920,23 +928,16 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // Update the references to the old register CurrReg to // refer to the new register NewReg. - std::pair<std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator, - std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator> - Range = RegRefs.equal_range(CurrReg); - for (std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator - Q = Range.first, QE = Range.second; Q != QE; ++Q) { - Q->second.Operand->setReg(NewReg); + for (const auto &Q : make_range(RegRefs.equal_range(CurrReg))) { + Q.second.Operand->setReg(NewReg); // If the SU for the instruction being updated has debug // information related to the anti-dependency register, make // sure to update that as well. - const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()]; + const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()]; if (!SU) continue; for (DbgValueVector::iterator DVI = DbgValues.begin(), DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q->second.Operand->getParent()) + if (DVI->second == Q.second.Operand->getParent()) UpdateDbgValue(DVI->first, AntiDepReg, NewReg); } diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp index dc9bcff..40451c0 100644 --- a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp @@ -29,12 +29,13 @@ using namespace llvm; // Compare VirtRegMap::getRegAllocPref(). AllocationOrder::AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const RegisterClassInfo &RegClassInfo) + const RegisterClassInfo &RegClassInfo, + const LiveRegMatrix *Matrix) : Pos(0) { const MachineFunction &MF = VRM.getMachineFunction(); const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo(); Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg)); - TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM); + TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM, Matrix); rewind(); DEBUG({ diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.h b/contrib/llvm/lib/CodeGen/AllocationOrder.h index 02b2d92..2aee3a6 100644 --- a/contrib/llvm/lib/CodeGen/AllocationOrder.h +++ b/contrib/llvm/lib/CodeGen/AllocationOrder.h @@ -24,6 +24,7 @@ namespace llvm { class RegisterClassInfo; class VirtRegMap; +class LiveRegMatrix; class LLVM_LIBRARY_VISIBILITY AllocationOrder { SmallVector<MCPhysReg, 16> Hints; @@ -37,7 +38,8 @@ public: /// @param RegClassInfo Information about reserved and allocatable registers. AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const RegisterClassInfo &RegClassInfo); + const RegisterClassInfo &RegClassInfo, + const LiveRegMatrix *Matrix); /// Get the allocation order without reordered hints. 
ArrayRef<MCPhysReg> getOrder() const { return Order; } diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp index 98d4c8a..75579a2 100644 --- a/contrib/llvm/lib/CodeGen/Analysis.cpp +++ b/contrib/llvm/lib/CodeGen/Analysis.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -25,6 +26,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -515,7 +517,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { if (isa<DbgInfoIntrinsic>(BBI)) continue; if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() || - !isSafeToSpeculativelyExecute(BBI)) + !isSafeToSpeculativelyExecute(&*BBI)) return false; } @@ -643,3 +645,97 @@ bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) { return !GS.IsCompared; } + +static void collectFuncletMembers( + DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet, + const MachineBasicBlock *MBB) { + // Add this MBB to our funclet. + auto P = FuncletMembership.insert(std::make_pair(MBB, Funclet)); + + // Don't revisit blocks. + if (!P.second) { + assert(P.first->second == Funclet && "MBB is part of two funclets!"); + return; + } + + bool IsReturn = false; + int NumTerminators = 0; + for (const MachineInstr &MI : MBB->terminators()) { + IsReturn |= MI.isReturn(); + ++NumTerminators; + } + assert((!IsReturn || NumTerminators == 1) && + "Expected only one terminator when a return is present!"); + + // Returns are boundaries where funclet transfer can occur, don't follow + // successors. + if (IsReturn) + return; + + for (const MachineBasicBlock *SMBB : MBB->successors()) + if (!SMBB->isEHPad()) + collectFuncletMembers(FuncletMembership, Funclet, SMBB); +} + +DenseMap<const MachineBasicBlock *, int> +llvm::getFuncletMembership(const MachineFunction &MF) { + DenseMap<const MachineBasicBlock *, int> FuncletMembership; + + // We don't have anything to do if there aren't any EH pads. + if (!MF.getMMI().hasEHFunclets()) + return FuncletMembership; + + int EntryBBNumber = MF.front().getNumber(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + SmallVector<const MachineBasicBlock *, 16> FuncletBlocks; + SmallVector<const MachineBasicBlock *, 16> UnreachableBlocks; + SmallVector<const MachineBasicBlock *, 16> SEHCatchPads; + SmallVector<std::pair<const MachineBasicBlock *, int>, 16> CatchRetSuccessors; + for (const MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) { + FuncletBlocks.push_back(&MBB); + } else if (IsSEH && MBB.isEHPad()) { + SEHCatchPads.push_back(&MBB); + } else if (MBB.pred_empty()) { + UnreachableBlocks.push_back(&MBB); + } + + MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator(); + // CatchPads are not funclets for SEH so do not consider CatchRet to + // transfer control to another funclet. + if (MBBI->getOpcode() != TII->getCatchReturnOpcode()) + continue; + + // FIXME: SEH CatchPads are not necessarily in the parent function: + // they could be inside a finally block. 
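getFuncletMembership, whose body continues below, amounts to a series of flood fills: the function entry, each funclet entry, and each catchret target seeds collectFuncletMembers, which colors blocks along successor edges, stops at returns and EH pads, and asserts if a block would receive two colors. A toy model of the coloring over a plain successor graph (names hypothetical):

    #include <cassert>
    #include <map>
    #include <vector>

    struct Block { std::vector<const Block *> Succs; bool IsReturn = false; };

    static void colorFunclet(std::map<const Block *, int> &Color, int Funclet,
                             const Block *B) {
      if (!Color.insert({B, Funclet}).second) {
        assert(Color.at(B) == Funclet && "block is part of two funclets");
        return; // Already colored; don't revisit.
      }
      if (B->IsReturn)
        return; // A return ends the funclet; don't follow successors.
      for (const Block *S : B->Succs)
        colorFunclet(Color, Funclet, S);
    }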
+ const MachineBasicBlock *Successor = MBBI->getOperand(0).getMBB(); + const MachineBasicBlock *SuccessorColor = MBBI->getOperand(1).getMBB(); + CatchRetSuccessors.push_back( + {Successor, IsSEH ? EntryBBNumber : SuccessorColor->getNumber()}); + } + + // We don't have anything to do if there aren't any EH pads. + if (FuncletBlocks.empty()) + return FuncletMembership; + + // Identify all the basic blocks reachable from the function entry. + collectFuncletMembers(FuncletMembership, EntryBBNumber, &MF.front()); + // All blocks not part of a funclet are in the parent function. + for (const MachineBasicBlock *MBB : UnreachableBlocks) + collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB); + // Next, identify all the blocks inside the funclets. + for (const MachineBasicBlock *MBB : FuncletBlocks) + collectFuncletMembers(FuncletMembership, MBB->getNumber(), MBB); + // SEH CatchPads aren't really funclets, handle them separately. + for (const MachineBasicBlock *MBB : SEHCatchPads) + collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB); + // Finally, identify all the targets of a catchret. + for (std::pair<const MachineBasicBlock *, int> CatchRetPair : + CatchRetSuccessors) + collectFuncletMembers(FuncletMembership, CatchRetPair.second, + CatchRetPair.first); + return FuncletMembership; +} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp index 0bad795..ade2d71 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -73,7 +73,6 @@ void ARMException::endFunction(const MachineFunction *MF) { const Function *Per = nullptr; if (F->hasPersonalityFn()) Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); - assert(!MMI->getPersonality() || Per == MMI->getPersonality()); bool forceEmitPersonality = F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) && F->needsUnwindTableEntry(); @@ -115,9 +114,7 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding) { Entry = TypeInfos.size(); } - for (std::vector<const GlobalValue *>::const_reverse_iterator - I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) { - const GlobalValue *GV = *I; + for (const GlobalValue *GV : reverse(TypeInfos)) { if (VerboseAsm) Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--)); Asm->EmitTTypeReference(GV, TTypeEncoding); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 125047e..5f67d3d 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -135,11 +135,14 @@ const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const { return *TM.getObjFileLowering(); } -/// getDataLayout - Return information about data layout. const DataLayout &AsmPrinter::getDataLayout() const { - return *TM.getDataLayout(); + return MMI->getModule()->getDataLayout(); } +// Do not use the cached DataLayout because some clients use it without a Module +// (llvm-dsymutil, llvm-dwarfdump). +unsigned AsmPrinter::getPointerSize() const { return TM.getPointerSize(); } + const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const { assert(MF && "getSubtargetInfo requires a valid MachineFunction!"); return MF->getSubtarget<MCSubtargetInfo>(); } @@ -189,14 +192,26 @@ bool AsmPrinter::doInitialization(Module &M) { // use the directive, where it would need the same conditionalization // anyway.
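The doInitialization hunk below replaces the old two-way macOS/iOS branch with a four-way choice of version-min directive, since watchOS and tvOS now carry their own deployment targets. The selection logic, condensed into a sketch (the MCVM_* values and Triple accessors are the real ones; the helper itself is hypothetical):

    #include "llvm/ADT/Triple.h"
    #include "llvm/MC/MCDirectives.h"
    using namespace llvm;

    // Returns the directive kind and fills in the version triple; a Major of
    // zero tells the caller to emit nothing.
    static MCVersionMinType pickVersionMin(const Triple &TT, unsigned &Major,
                                           unsigned &Minor, unsigned &Update) {
      if (TT.isWatchOS()) {
        TT.getWatchOSVersion(Major, Minor, Update);
        return MCVM_WatchOSVersionMin;
      }
      if (TT.isTvOS()) { // tvOS shares the iOS version numbering.
        TT.getiOSVersion(Major, Minor, Update);
        return MCVM_TvOSVersionMin;
      }
      if (TT.isMacOSX()) {
        if (!TT.getMacOSXVersion(Major, Minor, Update))
          Major = 0; // Unparseable version: suppress the directive.
        return MCVM_OSXVersionMin;
      }
      TT.getiOSVersion(Major, Minor, Update);
      return MCVM_IOSVersionMin;
    }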
Triple TT(getTargetTriple()); - if (TT.isOSDarwin()) { + // If there is a version specified, Major will be non-zero. + if (TT.isOSDarwin() && TT.getOSMajorVersion() != 0) { unsigned Major, Minor, Update; - TT.getOSVersion(Major, Minor, Update); - // If there is a version specified, Major will be non-zero. - if (Major) - OutStreamer->EmitVersionMin((TT.isMacOSX() ? - MCVM_OSXVersionMin : MCVM_IOSVersionMin), - Major, Minor, Update); + MCVersionMinType VersionType; + if (TT.isWatchOS()) { + VersionType = MCVM_WatchOSVersionMin; + TT.getWatchOSVersion(Major, Minor, Update); + } else if (TT.isTvOS()) { + VersionType = MCVM_TvOSVersionMin; + TT.getiOSVersion(Major, Minor, Update); + } else if (TT.isMacOSX()) { + VersionType = MCVM_OSXVersionMin; + if (!TT.getMacOSXVersion(Major, Minor, Update)) + Major = 0; + } else { + VersionType = MCVM_IOSVersionMin; + TT.getiOSVersion(Major, Minor, Update); + } + if (Major != 0) + OutStreamer->EmitVersionMin(VersionType, Major, Minor, Update); } // Allow the target to emit any magic that it wants at the start of the file. @@ -224,28 +239,20 @@ bool AsmPrinter::doInitialization(Module &M) { TM.getTargetFeatureString())); OutStreamer->AddComment("Start of file scope inline assembly"); OutStreamer->AddBlankLine(); - EmitInlineAsm(M.getModuleInlineAsm()+"\n", *STI, TM.Options.MCOptions); + EmitInlineAsm(M.getModuleInlineAsm()+"\n", + OutContext.getSubtargetCopy(*STI), TM.Options.MCOptions); OutStreamer->AddComment("End of file scope inline assembly"); OutStreamer->AddBlankLine(); } if (MAI->doesSupportDebugInformation()) { - bool skip_dwarf = false; - if (TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) { + bool EmitCodeView = MMI->getModule()->getCodeViewFlag(); + if (EmitCodeView && TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) { Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this), DbgTimerName, CodeViewLineTablesGroupName)); - // FIXME: Don't emit DWARF debug info if there's at least one function - // with AddressSanitizer instrumentation. - // This is a band-aid fix for PR22032. - for (auto &F : M.functions()) { - if (F.hasFnAttribute(Attribute::SanitizeAddress)) { - skip_dwarf = true; - break; - } - } } - if (!skip_dwarf) { + if (!EmitCodeView || MMI->getModule()->getDwarfVersion()) { DD = new DwarfDebug(this, &M); Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName)); } @@ -340,8 +347,51 @@ MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const { return TM.getSymbol(GV, *Mang); } +static MCSymbol *getOrCreateEmuTLSControlSym(MCSymbol *GVSym, MCContext &C) { + return C.getOrCreateSymbol(Twine("__emutls_v.") + GVSym->getName()); +} + +static MCSymbol *getOrCreateEmuTLSInitSym(MCSymbol *GVSym, MCContext &C) { + return C.getOrCreateSymbol(Twine("__emutls_t.") + GVSym->getName()); +} + +/// EmitEmulatedTLSControlVariable - Emit the control variable for an emulated TLS variable. 
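EmitEmulatedTLSControlVariable, defined below, materializes the record that the emutls runtime expects for every thread-local variable x: four pointer-sized words under the name __emutls_v.x giving the variable's size, its alignment, a zero word for the runtime's lazily assigned slot, and either null or a pointer to an __emutls_t.x template holding the initial value. In C++ terms the emitted object corresponds roughly to:

    #include <cstddef>

    // Approximate shape of the __emutls_v.<name> record; illustrative only,
    // since the AsmPrinter below writes the four words directly.
    struct EmuTLSControl {
      std::size_t Size;  // byte size of the TLS variable
      std::size_t Align; // 1 << AlignLog, its required alignment
      std::size_t Index; // 0 at compile time; the runtime's slot once used
      const void *Templ; // __emutls_t.<name>, or null when zero-initialized
    };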
+void AsmPrinter::EmitEmulatedTLSControlVariable(const GlobalVariable *GV, + MCSymbol *EmittedSym, + bool AllZeroInitValue) { + MCSection *TLSVarSection = getObjFileLowering().getDataSection(); + OutStreamer->SwitchSection(TLSVarSection); + MCSymbol *GVSym = getSymbol(GV); + EmitLinkage(GV, EmittedSym); // same linkage as GV + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + unsigned AlignLog = getGVAlignmentLog2(GV, DL); + unsigned WordSize = DL.getPointerSize(); + unsigned Alignment = DL.getPointerABIAlignment(); + EmitAlignment(Log2_32(Alignment)); + OutStreamer->EmitLabel(EmittedSym); + OutStreamer->EmitIntValue(Size, WordSize); + OutStreamer->EmitIntValue((1 << AlignLog), WordSize); + OutStreamer->EmitIntValue(0, WordSize); + if (GV->hasInitializer() && !AllZeroInitValue) { + OutStreamer->EmitSymbolValue( + getOrCreateEmuTLSInitSym(GVSym, OutContext), WordSize); + } else + OutStreamer->EmitIntValue(0, WordSize); + if (MAI->hasDotTypeDotSizeDirective()) + OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedSym), + MCConstantExpr::create(4 * WordSize, OutContext)); + OutStreamer->AddBlankLine(); // End of the __emutls_v.* variable. +} + /// EmitGlobalVariable - Emit the specified global variable to the .s file. void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + bool IsEmuTLSVar = + GV->getThreadLocalMode() != llvm::GlobalVariable::NotThreadLocal && + TM.Options.EmulatedTLS; + assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) && + "No emulated TLS variables in the common section"); + if (GV->hasInitializer()) { // Check to see if this is a special global used by LLVM, if so, emit it. if (EmitSpecialLLVMGlobal(GV)) @@ -352,7 +402,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GlobalGOTEquivs.count(getSymbol(GV))) return; - if (isVerbose()) { + if (isVerbose() && !IsEmuTLSVar) { + // When printing the control variable __emutls_v.*, + // we don't need to print the original TLS variable name. GV->printAsOperand(OutStreamer->GetCommentOS(), /*PrintType=*/false, GV->getParent()); OutStreamer->GetCommentOS() << '\n'; @@ -360,7 +412,12 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { } MCSymbol *GVSym = getSymbol(GV); - EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + MCSymbol *EmittedSym = IsEmuTLSVar ? + getOrCreateEmuTLSControlSym(GVSym, OutContext) : GVSym; + // getOrCreateEmuTLSControlSym only creates the symbol with name and default attributes. + // GV's or GVSym's attributes will be used for the EmittedSym. + + EmitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration()); if (!GV->hasInitializer()) // External globals require no extra code. return; @@ -371,17 +428,29 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { "' is already defined"); if (MAI->hasDotTypeDotSizeDirective()) - OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject); + OutStreamer->EmitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject); SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM); - const DataLayout *DL = TM.getDataLayout(); - uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType()); + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); // If the alignment is specified, we *must* obey it. 
Overaligning a global // with a specified alignment is a prompt way to break globals emitted to // sections and expected to be contiguous (e.g. ObjC metadata). - unsigned AlignLog = getGVAlignmentLog2(GV, *DL); + unsigned AlignLog = getGVAlignmentLog2(GV, DL); + + bool AllZeroInitValue = false; + const Constant *InitValue = GV->getInitializer(); + if (isa<ConstantAggregateZero>(InitValue)) + AllZeroInitValue = true; + else { + const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue); + if (InitIntValue && InitIntValue->isZero()) + AllZeroInitValue = true; + } + if (IsEmuTLSVar) + EmitEmulatedTLSControlVariable(GV, EmittedSym, AllZeroInitValue); for (const HandlerInfo &HI : Handlers) { NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); @@ -390,6 +459,8 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Handle common and BSS local symbols (.lcomm). if (GVKind.isCommon() || GVKind.isBSSLocal()) { + assert(!(IsEmuTLSVar && GVKind.isCommon()) && + "No emulated TLS variables in the common section"); if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. unsigned Align = 1 << AlignLog; @@ -434,12 +505,21 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { return; } - MCSection *TheSection = + if (IsEmuTLSVar && AllZeroInitValue) + return; // No need of initialization values. + + MCSymbol *EmittedInitSym = IsEmuTLSVar ? + getOrCreateEmuTLSInitSym(GVSym, OutContext) : GVSym; + // getOrCreateEmuTLSInitSym only creates the symbol with name and default attributes. + // GV's or GVSym's attributes will be used for the EmittedInitSym. + + MCSection *TheSection = IsEmuTLSVar ? + getObjFileLowering().getReadOnlySection() : getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM); // Handle the zerofill directive on darwin, which is a special form of BSS // emission. - if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective()) { + if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective() && !IsEmuTLSVar) { if (Size == 0) Size = 1; // zerofill of 0 bytes is undefined. // .globl _foo @@ -459,7 +539,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // TLOF class. This will also make it more obvious that stuff like // MCStreamer::EmitTBSSSymbol is macho specific and only called from macho // specific code. 
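For Mach-O targets the hunk below keeps the two-object scheme the comment describes: the initializer lands in a <name>$tlv$init symbol, while the visible symbol is a three-word thread-local-variable descriptor. A sketch of that descriptor's layout (illustrative; the words are emitted directly):

    // Darwin TLV descriptor, one per thread-local symbol.
    struct TLVDescriptor {
      void *Thunk; // _tlv_bootstrap: ensures runtime support is linked in
      void *Spare; // zero in the object file; owned by the runtime
      void *Init;  // the <name>$tlv$init symbol with the initial value
    };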
- if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) { + if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective() && !IsEmuTLSVar) { // Emit the .tbss symbol MCSymbol *MangSym = OutContext.getOrCreateSymbol(GVSym->getName() + Twine("$tlv$init")); @@ -473,7 +553,8 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { EmitAlignment(AlignLog, GV); OutStreamer->EmitLabel(MangSym); - EmitGlobalConstant(GV->getInitializer()); + EmitGlobalConstant(GV->getParent()->getDataLayout(), + GV->getInitializer()); } OutStreamer->AddBlankLine(); @@ -490,7 +571,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - __tlv_bootstrap - used to make sure support exists // - spare pointer, used when mapped by the runtime // - pointer to mangled symbol above with initializer - unsigned PtrSize = DL->getPointerTypeSize(GV->getType()); + unsigned PtrSize = DL.getPointerTypeSize(GV->getType()); OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), PtrSize); OutStreamer->EmitIntValue(0, PtrSize); @@ -502,16 +583,18 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { OutStreamer->SwitchSection(TheSection); - EmitLinkage(GV, GVSym); + // emutls_t.* symbols are only used in the current compilation unit. + if (!IsEmuTLSVar) + EmitLinkage(GV, EmittedInitSym); EmitAlignment(AlignLog, GV); - OutStreamer->EmitLabel(GVSym); + OutStreamer->EmitLabel(EmittedInitSym); - EmitGlobalConstant(GV->getInitializer()); + EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); if (MAI->hasDotTypeDotSizeDirective()) // .size foo, 42 - OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym), + OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedInitSym), MCConstantExpr::create(Size, OutContext)); OutStreamer->AddBlankLine(); @@ -545,7 +628,7 @@ void AsmPrinter::EmitFunctionHeader() { // Emit the prefix data. if (F->hasPrefixData()) - EmitGlobalConstant(F->getPrefixData()); + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); // Emit the CurrentFnSym. This is a virtual function to allow targets to // do their wild and crazy things as required. @@ -580,7 +663,7 @@ void AsmPrinter::EmitFunctionHeader() { // Emit the prologue data. if (F->hasPrologueData()) - EmitGlobalConstant(F->getPrologueData()); + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrologueData()); } /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the @@ -640,19 +723,27 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { /// that is an implicit def. void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const { unsigned RegNo = MI->getOperand(0).getReg(); - OutStreamer->AddComment(Twine("implicit-def: ") + - MMI->getContext().getRegisterInfo()->getName(RegNo)); + + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << "implicit-def: " + << PrintReg(RegNo, MF->getSubtarget().getRegisterInfo()); + + OutStreamer->AddComment(OS.str()); OutStreamer->AddBlankLine(); } static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { - std::string Str = "kill:"; + std::string Str; + raw_string_ostream OS(Str); + OS << "kill:"; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &Op = MI->getOperand(i); assert(Op.isReg() && "KILL instruction must have only register operands"); - Str += ' '; - Str += AP.MMI->getContext().getRegisterInfo()->getName(Op.getReg()); - Str += (Op.isDef() ? 
"<def>" : "<kill>"); + OS << ' ' + << PrintReg(Op.getReg(), + AP.MF->getSubtarget().getRegisterInfo()) + << (Op.isDef() ? "<def>" : "<kill>"); } AP.OutStreamer->AddComment(Str); AP.OutStreamer->AddBlankLine(); @@ -688,6 +779,31 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm(); int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0; + for (unsigned i = 0; i < Expr->getNumElements(); ++i) { + if (Deref) { + // We currently don't support extra Offsets or derefs after the first + // one. Bail out early instead of emitting an incorrect comment + OS << " [complex expression]"; + AP.OutStreamer->emitRawComment(OS.str()); + return true; + } + uint64_t Op = Expr->getElement(i); + if (Op == dwarf::DW_OP_deref) { + Deref = true; + continue; + } else if (Op == dwarf::DW_OP_bit_piece) { + // There can't be any operands after this in a valid expression + break; + } + uint64_t ExtraOffset = Expr->getElement(i++); + if (Op == dwarf::DW_OP_plus) + Offset += ExtraOffset; + else { + assert(Op == dwarf::DW_OP_minus); + Offset -= ExtraOffset; + } + } + // Register or immediate value. Register 0 means undef. if (MI->getOperand(0).isFPImm()) { APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF()); @@ -727,7 +843,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { } if (Deref) OS << '['; - OS << AP.MMI->getContext().getRegisterInfo()->getName(Reg); + OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo()); } if (Deref) @@ -888,7 +1004,7 @@ void AsmPrinter::EmitFunctionBody() { EmitFunctionBodyEnd(); if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() || - MAI->hasDotTypeDotSizeDirective()) { + MMI->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) { // Create a symbol for the end of function. CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->EmitLabel(CurrentFnEnd); @@ -1047,20 +1163,17 @@ bool AsmPrinter::doFinalization(Module &M) { // Output stubs for external and common global variables. MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { - OutStreamer->SwitchSection(TLOF.getDataRelSection()); - const DataLayout *DL = TM.getDataLayout(); + OutStreamer->SwitchSection(TLOF.getDataSection()); + const DataLayout &DL = M.getDataLayout(); for (const auto &Stub : Stubs) { OutStreamer->EmitLabel(Stub.first); OutStreamer->EmitSymbolValue(Stub.second.getPointer(), - DL->getPointerSize()); + DL.getPointerSize()); } } } - // Make sure we wrote out everything we need. - OutStreamer->Flush(); - // Finalize debug and EH information. for (const HandlerInfo &HI : Handlers) { NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, @@ -1103,10 +1216,29 @@ bool AsmPrinter::doFinalization(Module &M) { else assert(Alias.hasLocalLinkage() && "Invalid alias linkage"); + // Set the symbol type to function if the alias has a function type. + // This affects codegen when the aliasee is not a function. + if (Alias.getType()->getPointerElementType()->isFunctionTy()) + OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction); + EmitVisibility(Name, Alias.getVisibility()); // Emit the directives as assignments aka .set: OutStreamer->EmitAssignment(Name, lowerConstant(Alias.getAliasee())); + + // If the aliasee does not correspond to a symbol in the output, i.e. the + // alias is not of an object or the aliased object is private, then set the + // size of the alias symbol from the type of the alias. 
We don't do this in + // other situations as the alias and aliasee having differing types but same + // size may be intentional. + const GlobalObject *BaseObject = Alias.getBaseObject(); + if (MAI->hasDotTypeDotSizeDirective() && Alias.getValueType()->isSized() && + (!BaseObject || BaseObject->hasPrivateLinkage())) { + const DataLayout &DL = M.getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(Alias.getValueType()); + OutStreamer->emitELFSize(cast<MCSymbolELF>(Name), + MCConstantExpr::create(Size, OutContext)); + } } GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); @@ -1120,16 +1252,16 @@ bool AsmPrinter::doFinalization(Module &M) { // Emit __morestack address if needed for indirect calls. if (MMI->usesMorestackAddr()) { - MCSection *ReadOnlySection = - getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(), - /*C=*/nullptr); + MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant( + getDataLayout(), SectionKind::getReadOnly(), + /*C=*/nullptr); OutStreamer->SwitchSection(ReadOnlySection); MCSymbol *AddrSymbol = OutContext.getOrCreateSymbol(StringRef("__morestack_addr")); OutStreamer->EmitLabel(AddrSymbol); - unsigned PtrSize = TM.getDataLayout()->getPointerSize(0); + unsigned PtrSize = M.getDataLayout().getPointerSize(0); OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"), PtrSize); } @@ -1169,7 +1301,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurExceptionSym = nullptr; bool NeedsLocalForSize = MAI->needsLocalForSize(); if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() || - NeedsLocalForSize) { + MMI->hasEHFunclets() || NeedsLocalForSize) { CurrentFnBegin = createTempSymbol("func_begin"); if (NeedsLocalForSize) CurrentFnSymForSize = CurrentFnBegin; @@ -1206,14 +1338,14 @@ void AsmPrinter::EmitConstantPool() { const MachineConstantPoolEntry &CPE = CP[i]; unsigned Align = CPE.getAlignment(); - SectionKind Kind = - CPE.getSectionKind(TM.getDataLayout()); + SectionKind Kind = CPE.getSectionKind(&getDataLayout()); const Constant *C = nullptr; if (!CPE.isMachineConstantPoolEntry()) C = CPE.Val.ConstVal; - MCSection *S = getObjFileLowering().getSectionForConstant(Kind, C); + MCSection *S = + getObjFileLowering().getSectionForConstant(getDataLayout(), Kind, C); // The number of sections are small, just do a linear search from the // last section to the first. @@ -1260,14 +1392,13 @@ void AsmPrinter::EmitConstantPool() { OutStreamer->EmitZeros(NewOffset - Offset); Type *Ty = CPE.getType(); - Offset = NewOffset + - TM.getDataLayout()->getTypeAllocSize(Ty); + Offset = NewOffset + getDataLayout().getTypeAllocSize(Ty); OutStreamer->EmitLabel(Sym); if (CPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(CPE.Val.MachineCPVal); else - EmitGlobalConstant(CPE.Val.ConstVal); + EmitGlobalConstant(getDataLayout(), CPE.Val.ConstVal); } } } @@ -1276,7 +1407,7 @@ void AsmPrinter::EmitConstantPool() { /// by the current function to the current output stream. 
/// void AsmPrinter::EmitJumpTableInfo() { - const DataLayout *DL = MF->getTarget().getDataLayout(); + const DataLayout &DL = MF->getDataLayout(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (!MJTI) return; if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return; @@ -1296,8 +1427,7 @@ void AsmPrinter::EmitJumpTableInfo() { OutStreamer->SwitchSection(ReadOnlySection); } - EmitAlignment(Log2_32( - MJTI->getEntryAlignment(*TM.getDataLayout()))); + EmitAlignment(Log2_32(MJTI->getEntryAlignment(DL))); // Jump tables in code sections are marked with a data_region directive // where that's supported. @@ -1335,7 +1465,7 @@ void AsmPrinter::EmitJumpTableInfo() { // before each jump table. The first label is never referenced, but tells // the assembler and linker the extents of the jump table object. The // second label is actually referenced by the code. - if (JTInDiffSection && DL->hasLinkerPrivateGlobalPrefix()) + if (JTInDiffSection && DL.hasLinkerPrivateGlobalPrefix()) // FIXME: This doesn't have to have any specific name, just any randomly // named and numbered 'l' label would work. Simplify GetJTISymbol. OutStreamer->EmitLabel(GetJTISymbol(JTI, true)); @@ -1409,8 +1539,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, assert(Value && "Unknown entry kind!"); - unsigned EntrySize = - MJTI->getEntrySize(*TM.getDataLayout()); + unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); OutStreamer->EmitValue(Value, EntrySize); } @@ -1435,7 +1564,8 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { assert(GV->hasInitializer() && "Not a special LLVM global!"); if (GV->getName() == "llvm.global_ctors") { - EmitXXStructorList(GV->getInitializer(), /* isCtor */ true); + EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(), + /* isCtor */ true); if (TM.getRelocationModel() == Reloc::Static && MAI->hasStaticCtorDtorReferenceInStaticMode()) { @@ -1447,7 +1577,8 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { } if (GV->getName() == "llvm.global_dtors") { - EmitXXStructorList(GV->getInitializer(), /* isCtor */ false); + EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(), + /* isCtor */ false); if (TM.getRelocationModel() == Reloc::Static && MAI->hasStaticCtorDtorReferenceInStaticMode()) { @@ -1485,7 +1616,8 @@ struct Structor { /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init /// priority. -void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { +void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List, + bool isCtor) { // Should be an array of '{ int, void ()* }' structs. The first value is the // init priority. 
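EmitXXStructorList, continuing below, parses that array into (priority, function) records, stable-sorts by priority, and only then emits the pointers. A freestanding sketch of the ordering step (Structor mirrors the local struct in this file; EmitPtr is a hypothetical emission callback):

    #include <algorithm>
    #include <vector>

    struct Structor { int Priority; const void *Func; };

    // Stable sort: entries with equal priority keep their source order, so
    // same-priority constructors still run in declaration order.
    static void emitInPriorityOrder(std::vector<Structor> &Structors,
                                    void (*EmitPtr)(const void *)) {
      std::stable_sort(Structors.begin(), Structors.end(),
                       [](const Structor &L, const Structor &R) {
                         return L.Priority < R.Priority;
                       });
      for (const Structor &S : Structors)
        EmitPtr(S.Func);
    }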
if (!isa<ConstantArray>(List)) return; @@ -1520,8 +1652,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { } // Emit the function pointers in the target-specific order - const DataLayout *DL = TM.getDataLayout(); - unsigned Align = Log2_32(DL->getPointerPrefAlignment()); + unsigned Align = Log2_32(DL.getPointerPrefAlignment()); std::stable_sort(Structors.begin(), Structors.end(), [](const Structor &L, const Structor &R) { return L.Priority < R.Priority; }); @@ -1542,7 +1673,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { OutStreamer->SwitchSection(OutputSection); if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection()) EmitAlignment(Align); - EmitXXStructor(S.Func); + EmitXXStructor(DL, S.Func); } } @@ -1621,8 +1752,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, // void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const { if (GV) - NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), - NumBits); + NumBits = getGVAlignmentLog2(GV, GV->getParent()->getDataLayout(), NumBits); if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment. @@ -1668,7 +1798,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { // If the code isn't optimized, there may be outstanding folding // opportunities. Attempt to fold the expression using DataLayout as a // last resort before giving up. - if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout())) + if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout())) if (C != CE) return lowerConstant(C); @@ -1682,11 +1812,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { report_fatal_error(OS.str()); } case Instruction::GetElementPtr: { - const DataLayout &DL = *TM.getDataLayout(); - // Generate a symbolic expression for the byte address - APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0); - cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI); + APInt OffsetAI(getDataLayout().getPointerTypeSizeInBits(CE->getType()), 0); + cast<GEPOperator>(CE)->accumulateConstantOffset(getDataLayout(), OffsetAI); const MCExpr *Base = lowerConstant(CE->getOperand(0)); if (!OffsetAI) @@ -1707,7 +1835,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { return lowerConstant(CE->getOperand(0)); case Instruction::IntToPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. @@ -1718,7 +1846,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } case Instruction::PtrToInt: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Support only foldable casts to/from pointers that can be eliminated by // changing the pointer to the appropriately sized integer type. @@ -1769,10 +1897,13 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } } -static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP, +static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C, + AsmPrinter &AP, const Constant *BaseCV = nullptr, uint64_t Offset = 0); +static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP); + /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. 
If it is not a repeated sequence, return -1. @@ -1789,9 +1920,9 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) { /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. -static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { +static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { - uint64_t Size = TM.getDataLayout()->getTypeAllocSizeInBits(V->getType()); + uint64_t Size = DL.getTypeAllocSizeInBits(V->getType()); assert(Size % 8 == 0); // Extend the element to take zero padding into account. @@ -1806,7 +1937,7 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { // byte. assert(CA->getNumOperands() != 0 && "Should be a CAZ"); Constant *Op0 = CA->getOperand(0); - int Byte = isRepeatedByteSequence(Op0, TM); + int Byte = isRepeatedByteSequence(Op0, DL); if (Byte == -1) return -1; @@ -1823,15 +1954,14 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { return -1; } -static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, - AsmPrinter &AP){ +static void emitGlobalConstantDataSequential(const DataLayout &DL, + const ConstantDataSequential *CDS, + AsmPrinter &AP) { // See if we can aggregate this into a .fill, if so, emit it as such. - int Value = isRepeatedByteSequence(CDS, AP.TM); + int Value = isRepeatedByteSequence(CDS, DL); if (Value != -1) { - uint64_t Bytes = - AP.TM.getDataLayout()->getTypeAllocSize( - CDS->getType()); + uint64_t Bytes = DL.getTypeAllocSize(CDS->getType()); // Don't emit a 1-byte object as a .fill. if (Bytes > 1) return AP.OutStreamer->EmitFill(Bytes, Value); @@ -1851,37 +1981,11 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i), ElementByteSize); } - } else if (ElementByteSize == 4) { - // FP Constants are printed as integer constants to avoid losing - // precision. - assert(CDS->getElementType()->isFloatTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - float F; - uint32_t I; - }; - - F = CDS->getElementAsFloat(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "float " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 4); - } } else { - assert(CDS->getElementType()->isDoubleTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - double F; - uint64_t I; - }; - - F = CDS->getElementAsDouble(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "double " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 8); - } + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) + emitGlobalConstantFP(cast<ConstantFP>(CDS->getElementAsConstant(I)), AP); } - const DataLayout &DL = *AP.TM.getDataLayout(); unsigned Size = DL.getTypeAllocSize(CDS->getType()); unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) * CDS->getNumElements(); @@ -1890,12 +1994,12 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, } -static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP, +static void emitGlobalConstantArray(const DataLayout &DL, + const ConstantArray *CA, AsmPrinter &AP, const Constant *BaseCV, uint64_t Offset) { // See if we can aggregate some values. Make sure it can be // represented as a series of bytes of the constant value. 
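isRepeatedByteSequence is what lets the hunk below collapse a whole aggregate into a single .fill directive: when every byte of the constant has the same value, only that byte and the total size need to be encoded. A self-contained model of the integer case (the real code also widens the element to its alloc size so zero padding participates in the comparison):

    #include <cstdint>

    // Return the repeated byte of an integer spanning Bytes bytes, or -1 if
    // the bytes differ. E.g. 0x01010101 -> 1, but 0x00000102 -> -1.
    static int repeatedByte(uint64_t V, unsigned Bytes) {
      int Byte = int(V & 0xFF);
      for (unsigned I = 1; I != Bytes; ++I)
        if (int(uint8_t(V >> (8 * I))) != Byte)
          return -1;
      return Byte;
    }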
- int Value = isRepeatedByteSequence(CA, AP.TM); - const DataLayout &DL = *AP.TM.getDataLayout(); + int Value = isRepeatedByteSequence(CA, DL); if (Value != -1) { uint64_t Bytes = DL.getTypeAllocSize(CA->getType()); @@ -1903,17 +2007,17 @@ static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP, } else { for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) { - emitGlobalConstantImpl(CA->getOperand(i), AP, BaseCV, Offset); + emitGlobalConstantImpl(DL, CA->getOperand(i), AP, BaseCV, Offset); Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType()); } } } -static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) { +static void emitGlobalConstantVector(const DataLayout &DL, + const ConstantVector *CV, AsmPrinter &AP) { for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i) - emitGlobalConstantImpl(CV->getOperand(i), AP); + emitGlobalConstantImpl(DL, CV->getOperand(i), AP); - const DataLayout &DL = *AP.TM.getDataLayout(); unsigned Size = DL.getTypeAllocSize(CV->getType()); unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) * CV->getType()->getNumElements(); @@ -1921,21 +2025,21 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) { AP.OutStreamer->EmitZeros(Padding); } -static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP, +static void emitGlobalConstantStruct(const DataLayout &DL, + const ConstantStruct *CS, AsmPrinter &AP, const Constant *BaseCV, uint64_t Offset) { // Print the fields in successive locations. Pad to align if needed! - const DataLayout *DL = AP.TM.getDataLayout(); - unsigned Size = DL->getTypeAllocSize(CS->getType()); - const StructLayout *Layout = DL->getStructLayout(CS->getType()); + unsigned Size = DL.getTypeAllocSize(CS->getType()); + const StructLayout *Layout = DL.getStructLayout(CS->getType()); uint64_t SizeSoFar = 0; for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) { const Constant *Field = CS->getOperand(i); // Print the actual field value. - emitGlobalConstantImpl(Field, AP, BaseCV, Offset+SizeSoFar); + emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar); // Check if padding is needed and insert one or more 0s. - uint64_t FieldSize = DL->getTypeAllocSize(Field->getType()); + uint64_t FieldSize = DL.getTypeAllocSize(Field->getType()); uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1)) - Layout->getElementOffset(i)) - FieldSize; SizeSoFar += FieldSize + PadSize; @@ -1974,8 +2078,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { // PPC's long double has odd notions of endianness compared to how LLVM // handles it: p[0] goes first for *big* endian on PPC. - if (AP.TM.getDataLayout()->isBigEndian() && - !CFP->getType()->isPPC_FP128Ty()) { + if (AP.getDataLayout().isBigEndian() && !CFP->getType()->isPPC_FP128Ty()) { int Chunk = API.getNumWords() - 1; if (TrailingBytes) @@ -1993,13 +2096,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { } // Emit the tail padding for the long double. 
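The tail padding emitted just below exists because a type's store size can be smaller than its alloc size: x86_fp80, for instance, stores 10 bytes of data but occupies 12 or 16 bytes depending on ABI alignment, and the gap must be zero-filled so the next constant starts at its proper offset. The computation in isolation (DL assumed to be the module's DataLayout):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"
    #include <cstdint>

    // Bytes of zeros needed after the raw value of Ty has been written.
    static uint64_t tailPadding(const llvm::DataLayout &DL, llvm::Type *Ty) {
      return DL.getTypeAllocSize(Ty) - DL.getTypeStoreSize(Ty);
    }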
- const DataLayout &DL = *AP.TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(CFP->getType()) - DL.getTypeStoreSize(CFP->getType())); } static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { - const DataLayout *DL = AP.TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); unsigned BitWidth = CI->getBitWidth(); // Copy the value as we may massage the layout for constants whose bit width @@ -2016,7 +2119,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // Big endian: // * Record the extra bits to emit. // * Realign the raw data to emit the chunks of 64-bits. - if (DL->isBigEndian()) { + if (DL.isBigEndian()) { // Basically the structure of the raw data is a chunk of 64-bits cells: // 0 1 BitWidth / 64 // [chunk1][chunk2] ... [chunkN]. @@ -2037,7 +2140,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // quantities at a time. const uint64_t *RawData = Realigned.getRawData(); for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) { - uint64_t Val = DL->isBigEndian() ? RawData[e - i - 1] : RawData[i]; + uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i]; AP.OutStreamer->EmitIntValue(Val, 8); } @@ -2045,8 +2148,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // Emit the extra bits after the 64-bits chunks. // Emit a directive that fills the expected size. - uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize( - CI->getType()); + uint64_t Size = AP.getDataLayout().getTypeAllocSize(CI->getType()); Size -= (BitWidth / 64) * 8; assert(Size && Size * 8 >= ExtraBitsSize && (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize))) @@ -2094,7 +2196,7 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, if (!AP.GlobalGOTEquivs.count(GOTEquivSym)) return; - const GlobalValue *BaseGV = dyn_cast<GlobalValue>(BaseCst); + const GlobalValue *BaseGV = dyn_cast_or_null<GlobalValue>(BaseCst); if (!BaseGV) return; @@ -2149,10 +2251,10 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses); } -static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, - const Constant *BaseCV, uint64_t Offset) { - const DataLayout *DL = AP.TM.getDataLayout(); - uint64_t Size = DL->getTypeAllocSize(CV->getType()); +static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, + AsmPrinter &AP, const Constant *BaseCV, + uint64_t Offset) { + uint64_t Size = DL.getTypeAllocSize(CV->getType()); // Globals with sub-elements such as combinations of arrays and structs // are handled recursively by emitGlobalConstantImpl. 
Keep track of the @@ -2189,32 +2291,32 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, } if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV)) - return emitGlobalConstantDataSequential(CDS, AP); + return emitGlobalConstantDataSequential(DL, CDS, AP); if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) - return emitGlobalConstantArray(CVA, AP, BaseCV, Offset); + return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset); if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) - return emitGlobalConstantStruct(CVS, AP, BaseCV, Offset); + return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of // vectors). if (CE->getOpcode() == Instruction::BitCast) - return emitGlobalConstantImpl(CE->getOperand(0), AP); + return emitGlobalConstantImpl(DL, CE->getOperand(0), AP); if (Size > 8) { // If the constant expression's size is greater than 64-bits, then we have // to emit the value in chunks. Try to constant fold the value and emit it // that way. - Constant *New = ConstantFoldConstantExpression(CE, *DL); + Constant *New = ConstantFoldConstantExpression(CE, DL); if (New && New != CE) - return emitGlobalConstantImpl(New, AP); + return emitGlobalConstantImpl(DL, New, AP); } } if (const ConstantVector *V = dyn_cast<ConstantVector>(CV)) - return emitGlobalConstantVector(V, AP); + return emitGlobalConstantVector(DL, V, AP); // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it // through the streamer with EmitValue. @@ -2230,11 +2332,10 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, } /// EmitGlobalConstant - Print a general LLVM constant to the .s file. -void AsmPrinter::EmitGlobalConstant(const Constant *CV) { - uint64_t Size = - TM.getDataLayout()->getTypeAllocSize(CV->getType()); +void AsmPrinter::EmitGlobalConstant(const DataLayout &DL, const Constant *CV) { + uint64_t Size = DL.getTypeAllocSize(CV->getType()); if (Size) - emitGlobalConstantImpl(CV, *this); + emitGlobalConstantImpl(DL, CV, *this); else if (MAI->hasSubsectionsViaSymbols()) { // If the global has zero size, emit a single byte so that two labels don't // look like they are at the same location. @@ -2272,10 +2373,10 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const { /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { - const DataLayout *DL = TM.getDataLayout(); - return OutContext.getOrCreateSymbol - (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) - + "_" + Twine(CPID)); + const DataLayout &DL = getDataLayout(); + return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "CPI" + Twine(getFunctionNumber()) + "_" + + Twine(CPID)); } /// GetJTISymbol - Return the symbol for the specified jump table entry. @@ -2286,10 +2387,10 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const { /// GetJTSetSymbol - Return the symbol for the specified jump table .set /// FIXME: privatize to AsmPrinter.
MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const { - const DataLayout *DL = TM.getDataLayout(); - return OutContext.getOrCreateSymbol - (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" + - Twine(UID) + "_set_" + Twine(MBBID)); + const DataLayout &DL = getDataLayout(); + return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + Twine(getFunctionNumber()) + "_" + + Twine(UID) + "_set_" + Twine(MBBID)); } MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV, @@ -2301,7 +2402,7 @@ MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV, /// Return the MCSymbol for the specified ExternalSymbol. MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const { SmallString<60> NameStr; - Mangler::getNameWithPrefix(NameStr, Sym, *TM.getDataLayout()); + Mangler::getNameWithPrefix(NameStr, Sym, getDataLayout()); return OutContext.getOrCreateSymbol(NameStr); } @@ -2376,6 +2477,14 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { + // End the previous funclet and start a new one. + if (MBB.isEHFuncletEntry()) { + for (const HandlerInfo &HI : Handlers) { + HI.Handler->endFunclet(); + HI.Handler->beginFunclet(MBB); + } + } + // Emit an alignment directive for this block, if needed. if (unsigned Align = MBB.getAlignment()) EmitAlignment(Align); @@ -2389,20 +2498,28 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { if (isVerbose()) OutStreamer->AddComment("Block address taken"); - for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) - OutStreamer->EmitLabel(Sym); + // MBBs can have their address taken as part of CodeGen without having + // their corresponding BB's address taken in IR + if (BB->hasAddressTaken()) + for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) + OutStreamer->EmitLabel(Sym); } // Print some verbose block comments. if (isVerbose()) { - if (const BasicBlock *BB = MBB.getBasicBlock()) - if (BB->hasName()) - OutStreamer->AddComment("%" + BB->getName()); + if (const BasicBlock *BB = MBB.getBasicBlock()) { + if (BB->hasName()) { + BB->printAsOperand(OutStreamer->GetCommentOS(), + /*PrintType=*/false, BB->getModule()); + OutStreamer->GetCommentOS() << '\n'; + } + } emitBasicBlockLoopComments(MBB, LI, *this); } // Print the main label for the block. - if (MBB.pred_empty() || isBlockOnlyReachableByFallthrough(&MBB)) { + if (MBB.pred_empty() || + (isBlockOnlyReachableByFallthrough(&MBB) && !MBB.isEHFuncletEntry())) { if (isVerbose()) { // NOTE: Want this comment at start of line, don't emit with AddComment. OutStreamer->emitRawComment(" BB#" + Twine(MBB.getNumber()) + ":", false); @@ -2440,7 +2557,7 @@ bool AsmPrinter:: isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { // If this is a landing pad, it isn't a fall through. If it has no preds, // then nothing falls through to it. - if (MBB->isLandingPad() || MBB->pred_empty()) + if (MBB->isEHPad() || MBB->pred_empty()) return false; // If there isn't exactly one predecessor, it can't be a fall through. 
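The symbol helpers changed above all build names the same way: the DataLayout's private-global prefix, a fixed tag, then function-local counters, so constant-pool entry 1 of function 3 becomes .LCPI3_1 with the usual ELF prefix (or LCPI3_1 under Mach-O's plain L). A sketch of the construction (prefix values illustrative):

    #include <string>

    // Mirrors GetCPISymbol's name construction: <prefix>CPI<fn>_<id>.
    static std::string cpiSymbolName(const std::string &PrivatePrefix,
                                     unsigned FnNumber, unsigned CPID) {
      return PrivatePrefix + "CPI" + std::to_string(FnNumber) + "_" +
             std::to_string(CPID);
    }
    // cpiSymbolName(".L", 3, 1) == ".LCPI3_1" on a typical ELF target.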
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index ad180b6..504c5d2 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -47,7 +47,7 @@ void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const { OutStreamer->EmitSLEB128IntValue(Value); } -/// EmitULEB128 - emit the specified signed leb128 value. +/// EmitULEB128 - emit the specified unsigned leb128 value. void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc, unsigned PadTo) const { if (isVerbose() && Desc) @@ -56,18 +56,6 @@ void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc, OutStreamer->EmitULEB128IntValue(Value, PadTo); } -/// EmitCFAByte - Emit a .byte 42 directive for a DW_CFA_xxx value. -void AsmPrinter::EmitCFAByte(unsigned Val) const { - if (isVerbose()) { - if (Val >= dwarf::DW_CFA_offset && Val < dwarf::DW_CFA_offset + 64) - OutStreamer->AddComment("DW_CFA_offset + Reg (" + - Twine(Val - dwarf::DW_CFA_offset) + ")"); - else - OutStreamer->AddComment(dwarf::CallFrameString(Val)); - } - OutStreamer->EmitIntValue(Val, 1); -} - static const char *DecodeDWARFEncoding(unsigned Encoding) { switch (Encoding) { case dwarf::DW_EH_PE_absptr: @@ -134,7 +122,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const { default: llvm_unreachable("Invalid encoded value."); case dwarf::DW_EH_PE_absptr: - return TM.getDataLayout()->getPointerSize(); + return MF->getDataLayout().getPointerSize(); case dwarf::DW_EH_PE_udata2: return 2; case dwarf::DW_EH_PE_udata4: @@ -228,6 +216,9 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpDefCfaOffset: OutStreamer->EmitCFIDefCfaOffset(Inst.getOffset()); break; + case MCCFIInstruction::OpAdjustCfaOffset: + OutStreamer->EmitCFIAdjustCfaOffset(Inst.getOffset()); + break; case MCCFIInstruction::OpDefCfa: OutStreamer->EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset()); break; @@ -246,6 +237,12 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpSameValue: OutStreamer->EmitCFISameValue(Inst.getRegister()); break; + case MCCFIInstruction::OpGnuArgsSize: + OutStreamer->EmitCFIGnuArgsSize(Inst.getOffset()); + break; + case MCCFIInstruction::OpEscape: + OutStreamer->EmitCFIEscape(Inst.getValues()); + break; } } @@ -284,17 +281,10 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const { } } -void -AsmPrinter::emitDwarfAbbrevs(const std::vector<DIEAbbrev *>& Abbrevs) const { - // For each abbrevation. - for (const DIEAbbrev *Abbrev : Abbrevs) { - // Emit the abbrevations code (base 1 index.) - EmitULEB128(Abbrev->getNumber(), "Abbreviation Code"); - - // Emit the abbreviations data. - Abbrev->Emit(this); - } +void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const { + // Emit the abbreviations code (base 1 index.) + EmitULEB128(Abbrev.getNumber(), "Abbreviation Code"); - // Mark end of abbreviations. - EmitULEB128(0, "EOM(3)"); + // Emit the abbreviations data. 
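// Abbreviation codes, like most variable-size integers in DWARF, are written
// with the EmitULEB128 helper this file wraps. A standalone sketch of the
// encoding itself (illustrative only, not LLVM's implementation):
#include <cstdint>
#include <vector>

static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f; // take the low seven bits
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;              // continuation bit: more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
}
// The DWARF spec's worked example: 624485 encodes as 0xE5 0x8E 0x26.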
+ Abbrev.Emit(this); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h index f1efe9d..e59961f 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h @@ -19,6 +19,7 @@ namespace llvm { +class MachineBasicBlock; class MachineFunction; class MachineInstr; class MCSymbol; @@ -50,6 +51,11 @@ public: /// beginFunction at all. virtual void endFunction(const MachineFunction *MF) = 0; + /// \brief Emit target-specific EH funclet machinery. + virtual void beginFunclet(const MachineBasicBlock &MBB, + MCSymbol *Sym = nullptr) {} + virtual void endFunclet() {} + /// \brief Process beginning of an instruction. virtual void beginInstruction(const MachineInstr *MI) = 0; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 793e629..4171657 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -127,19 +127,13 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, std::unique_ptr<MCAsmParser> Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI)); - // Create a temporary copy of the original STI because the parser may modify - // it. For example, when switching between arm and thumb mode. If the target - // needs to emit code to return to the original state it can do so in - // emitInlineAsmEnd(). - MCSubtargetInfo TmpSTI = STI; - // We create a new MCInstrInfo here since we might be at the module level // and not have a MachineFunction to initialize the TargetInstrInfo from and // we only need MCInstrInfo for asm parsing. We create one unconditionally // because it's not subtarget dependent. std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo()); std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser( - TmpSTI, *Parser, *MII, MCOptions)); + STI, *Parser, *MII, MCOptions)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" " we don't have an asm parser for this target\n"); @@ -154,7 +148,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, // Don't implicitly switch to the text section before the asm. int Res = Parser->Run(/*NoInitialTextSection*/ true, /*NoFinalize*/ true); - emitInlineAsmEnd(STI, &TmpSTI); + emitInlineAsmEnd(STI, &TAP->getSTI()); if (Res && !HasDiagHandler) report_fatal_error("Error parsing inline asm\n"); } @@ -512,9 +506,9 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { /// for their own strange codes. 
void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS, const char *Code) const { - const DataLayout *DL = TM.getDataLayout(); if (!strcmp(Code, "private")) { - OS << DL->getPrivateGlobalPrefix(); + const DataLayout &DL = MF->getDataLayout(); + OS << DL.getPrivateGlobalPrefix(); } else if (!strcmp(Code, "comment")) { OS << MAI->getCommentString(); } else if (!strcmp(Code, "uid")) { diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h index 0cc829f..df1997b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h @@ -24,16 +24,19 @@ namespace llvm { class ByteStreamer { - public: - virtual ~ByteStreamer() {} + protected: + ~ByteStreamer() = default; + ByteStreamer(const ByteStreamer&) = default; + ByteStreamer() = default; + public: // For now we're just handling the calls we need for dwarf emission/hashing. virtual void EmitInt8(uint8_t Byte, const Twine &Comment = "") = 0; virtual void EmitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0; virtual void EmitULEB128(uint64_t DWord, const Twine &Comment = "") = 0; }; -class APByteStreamer : public ByteStreamer { +class APByteStreamer final : public ByteStreamer { private: AsmPrinter &AP; @@ -53,7 +56,7 @@ public: } }; -class HashingByteStreamer : public ByteStreamer { +class HashingByteStreamer final : public ByteStreamer { private: DIEHash &Hash; public: @@ -69,7 +72,7 @@ class HashingByteStreamer : public ByteStreamer { } }; -class BufferByteStreamer : public ByteStreamer { +class BufferByteStreamer final : public ByteStreamer { private: SmallVectorImpl<char> &Buffer; SmallVectorImpl<std::string> &Comments; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 46dbc76..7b0cdbd 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -32,6 +32,39 @@ using namespace llvm; //===----------------------------------------------------------------------===// +// EmittingAsmStreamer Implementation +//===----------------------------------------------------------------------===// +unsigned EmittingAsmStreamer::emitULEB128(uint64_t Value, const char *Desc, + unsigned PadTo) { + AP->EmitULEB128(Value, Desc, PadTo); + return 0; +} + +unsigned EmittingAsmStreamer::emitInt8(unsigned char Value) { + AP->EmitInt8(Value); + return 0; +} + +unsigned EmittingAsmStreamer::emitBytes(StringRef Data) { + AP->OutStreamer->EmitBytes(Data); + return 0; +} + +//===----------------------------------------------------------------------===// +// SizeReporterAsmStreamer Implementation +//===----------------------------------------------------------------------===// +unsigned SizeReporterAsmStreamer::emitULEB128(uint64_t Value, const char *Desc, + unsigned PadTo) { + return getULEB128Size(Value); +} + +unsigned SizeReporterAsmStreamer::emitInt8(unsigned char Value) { return 1; } + +unsigned SizeReporterAsmStreamer::emitBytes(StringRef Data) { + return Data.size(); +} + +//===----------------------------------------------------------------------===// // DIEAbbrevData Implementation //===----------------------------------------------------------------------===// @@ -86,7 +119,7 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { AP->EmitULEB128(0, "EOM(2)"); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEAbbrev::print(raw_ostream &O) { O << "Abbreviation @" << format("0x%lx", (long)(intptr_t)this) @@ -104,12 +137,13 @@ void 
DIEAbbrev::print(raw_ostream &O) { << '\n'; } } + +LLVM_DUMP_METHOD void DIEAbbrev::dump() { print(dbgs()); } -#endif DIEAbbrev DIE::generateAbbrev() const { DIEAbbrev Abbrev(Tag, hasChildren()); - for (const DIEValue &V : Values) + for (const DIEValue &V : values()) Abbrev.AddAttribute(V.getAttribute(), V.getForm()); return Abbrev; } @@ -144,36 +178,35 @@ DIEValue DIE::findAttribute(dwarf::Attribute Attribute) const { return DIEValue(); } -#ifndef NDEBUG -void DIE::print(raw_ostream &O, unsigned IndentCount) const { - const std::string Indent(IndentCount, ' '); - bool isBlock = getTag() == 0; - - if (!isBlock) { - O << Indent - << "Die: " - << format("0x%lx", (long)(intptr_t)this) - << ", Offset: " << Offset - << ", Size: " << Size << "\n"; - - O << Indent - << dwarf::TagString(getTag()) - << " " - << dwarf::ChildrenString(hasChildren()) << "\n"; - } else { - O << "Size: " << Size << "\n"; - } +LLVM_DUMP_METHOD +static void printValues(raw_ostream &O, const DIEValueList &Values, + StringRef Type, unsigned Size, unsigned IndentCount) { + O << Type << ": Size: " << Size << "\n"; - IndentCount += 2; unsigned I = 0; - for (const auto &V : Values) { + const std::string Indent(IndentCount, ' '); + for (const auto &V : Values.values()) { O << Indent; + O << "Blk[" << I++ << "]"; + O << " " << dwarf::FormEncodingString(V.getForm()) << " "; + V.print(O); + O << "\n"; + } +} - if (!isBlock) - O << dwarf::AttributeString(V.getAttribute()); - else - O << "Blk[" << I++ << "]"; +LLVM_DUMP_METHOD +void DIE::print(raw_ostream &O, unsigned IndentCount) const { + const std::string Indent(IndentCount, ' '); + O << Indent << "Die: " << format("0x%lx", (long)(intptr_t) this) + << ", Offset: " << Offset << ", Size: " << Size << "\n"; + + O << Indent << dwarf::TagString(getTag()) << " " + << dwarf::ChildrenString(hasChildren()) << "\n"; + IndentCount += 2; + for (const auto &V : values()) { + O << Indent; + O << dwarf::AttributeString(V.getAttribute()); O << " " << dwarf::FormEncodingString(V.getForm()) << " "; V.print(O); O << "\n"; @@ -183,13 +216,13 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { for (const auto &Child : children()) Child.print(O, IndentCount + 4); - if (!isBlock) O << "\n"; + O << "\n"; } +LLVM_DUMP_METHOD void DIE::dump() { print(dbgs()); } -#endif void DIEValue::EmitValue(const AsmPrinter *AP) const { switch (Ty) { @@ -215,7 +248,7 @@ unsigned DIEValue::SizeOf(const AsmPrinter *AP) const { llvm_unreachable("Unknown DIE kind"); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEValue::print(raw_ostream &O) const { switch (Ty) { case isNone: @@ -228,10 +261,10 @@ void DIEValue::print(raw_ostream &O) const { } } +LLVM_DUMP_METHOD void DIEValue::dump() const { print(dbgs()); } -#endif //===----------------------------------------------------------------------===// // DIEInteger Implementation @@ -264,7 +297,8 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return; case dwarf::DW_FORM_addr: - Size = Asm->getDataLayout().getPointerSize(); break; + Size = Asm->getPointerSize(); + break; case dwarf::DW_FORM_ref_addr: Size = SizeOf(Asm, dwarf::DW_FORM_ref_addr); break; @@ -294,21 +328,21 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_GNU_addr_index: return getULEB128Size(Integer); case dwarf::DW_FORM_udata: return getULEB128Size(Integer); case dwarf::DW_FORM_sdata: return 
getSLEB128Size(Integer); - case dwarf::DW_FORM_addr: return AP->getDataLayout().getPointerSize(); + case dwarf::DW_FORM_addr: + return AP->getPointerSize(); case dwarf::DW_FORM_ref_addr: if (AP->OutStreamer->getContext().getDwarfVersion() == 2) - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); return sizeof(int32_t); default: llvm_unreachable("DIE Value form not supported yet"); } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEInteger::print(raw_ostream &O) const { O << "Int: " << (int64_t)Integer << " 0x"; O.write_hex(Integer); } -#endif //===----------------------------------------------------------------------===// // DIEExpr Implementation @@ -326,12 +360,11 @@ unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; } -#endif //===----------------------------------------------------------------------===// // DIELabel Implementation @@ -352,12 +385,11 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELabel::print(raw_ostream &O) const { O << "Lbl: " << Label->getName(); } -#endif //===----------------------------------------------------------------------===// // DIEDelta Implementation @@ -375,14 +407,13 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEDelta::print(raw_ostream &O) const { O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName(); } -#endif //===----------------------------------------------------------------------===// // DIEString Implementation @@ -431,11 +462,10 @@ unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return DIEInteger(S.getOffset()).SizeOf(AP, Form); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEString::print(raw_ostream &O) const { O << "String: " << S.getString(); } -#endif //===----------------------------------------------------------------------===// // DIEEntry Implementation @@ -472,15 +502,14 @@ unsigned DIEEntry::getRefAddrSize(const AsmPrinter *AP) { const DwarfDebug *DD = AP->getDwarfDebug(); assert(DD && "Expected Dwarf Debug info to be available"); if (DD->getDwarfVersion() == 2) - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); return sizeof(int32_t); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEEntry::print(raw_ostream &O) const { O << format("Die: 0x%lx", (long)(intptr_t)&Entry); } -#endif //===----------------------------------------------------------------------===// // DIETypeSignature Implementation @@ -491,11 +520,10 @@ void DIETypeSignature::EmitValue(const AsmPrinter *Asm, Asm->OutStreamer->EmitIntValue(Unit->getTypeSignature(), 8); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIETypeSignature::print(raw_ostream &O) const { O << format("Type Unit: 0x%lx", 
Unit->getTypeSignature()); } -#endif //===----------------------------------------------------------------------===// // DIELoc Implementation @@ -505,7 +533,7 @@ void DIETypeSignature::print(raw_ostream &O) const { /// unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const { if (!Size) { - for (const auto &V : Values) + for (const auto &V : values()) Size += V.SizeOf(AP); } @@ -525,7 +553,7 @@ void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { Asm->EmitULEB128(Size); break; } - for (const auto &V : Values) + for (const auto &V : values()) V.EmitValue(Asm); } @@ -543,12 +571,10 @@ unsigned DIELoc::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELoc::print(raw_ostream &O) const { - O << "ExprLoc: "; - DIE::print(O, 5); + printValues(O, *this, "ExprLoc", Size, 5); } -#endif //===----------------------------------------------------------------------===// // DIEBlock Implementation @@ -558,7 +584,7 @@ void DIELoc::print(raw_ostream &O) const { /// unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const { if (!Size) { - for (const auto &V : Values) + for (const auto &V : values()) Size += V.SizeOf(AP); } @@ -576,7 +602,7 @@ void DIEBlock::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_block: Asm->EmitULEB128(Size); break; } - for (const auto &V : Values) + for (const auto &V : values()) V.EmitValue(Asm); } @@ -592,12 +618,10 @@ unsigned DIEBlock::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEBlock::print(raw_ostream &O) const { - O << "Blk: "; - DIE::print(O, 5); + printValues(O, *this, "Blk", Size, 5); } -#endif //===----------------------------------------------------------------------===// // DIELocList Implementation @@ -608,7 +632,7 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } /// EmitValue - Emit label value. @@ -619,6 +643,5 @@ void DIELocList::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { AP->emitDwarfSymbolReference(Label, /*ForceOffset*/ DD->useSplitDwarf()); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; } -#endif diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index 5e60156..0201065 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -470,38 +470,6 @@ void DIEHash::computeHash(const DIE &Die) { } /// This is based on the type signature computation given in section 7.27 of the -/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE -/// with the exception that we are hashing only the context and the name of the -/// type. -uint64_t DIEHash::computeDIEODRSignature(const DIE &Die) { - - // Add the contexts to the hash. We won't be computing the ODR hash for - // function local types so it's safe to use the generic context hashing - // algorithm here. - // FIXME: If we figure out how to account for linkage in some way we could - // actually do this with a slight modification to the parent hash algorithm. - if (const DIE *Parent = Die.getParent()) - addParentContext(*Parent); - - // Add the current DIE information. - - // Add the DWARF tag of the DIE. - addULEB128(Die.getTag()); - - // Add the name of the type to the hash. 
- addString(getDIEStringAttr(Die, dwarf::DW_AT_name)); - - // Now get the result. - MD5::MD5Result Result; - Hash.final(Result); - - // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); -} - -/// This is based on the type signature computation given in section 7.27 of the /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h index 833ca02..44f0ce8 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h @@ -84,9 +84,6 @@ class DIEHash { public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} - /// \brief Computes the ODR signature. - uint64_t computeDIEODRSignature(const DIE &Die); - /// \brief Computes the CU signature. uint64_t computeCUSignature(const DIE &Die); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h index afffa83..bbe5324 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H + +#include "DebugLocStream.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -17,7 +19,6 @@ namespace llvm { class AsmPrinter; -class DebugLocStream; /// \brief This struct describes location entries emitted in the .debug_loc /// section. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index f8cdde2..4ad3e18 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -41,7 +41,7 @@ void DwarfAccelTable::AddName(DwarfStringPoolEntryRef Name, const DIE *die, DIEs.Values.push_back(new (Allocator) HashDataContents(die, Flags)); } -void DwarfAccelTable::ComputeBucketCount(void) { +void DwarfAccelTable::ComputeBucketCount() { // First get the number of unique hashes. 
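// Bucket sizing for the accelerator table starts from the number of *unique*
// hash values. A rough standalone sketch of that first step, assuming the
// same sort-then-dedupe approach (not the exact LLVM code):
#include <algorithm>
#include <cstdint>
#include <vector>

static size_t countUniqueHashes(std::vector<uint32_t> Hashes) {
  std::sort(Hashes.begin(), Hashes.end());
  // std::unique compacts adjacent duplicates; the distance to the new end is
  // the unique count.
  return static_cast<size_t>(
      std::unique(Hashes.begin(), Hashes.end()) - Hashes.begin());
}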
std::vector<uint32_t> uniques(Data.size()); for (size_t i = 0, e = Data.size(); i < e; ++i) diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 2c212c7..6665c16 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -78,12 +78,11 @@ void DwarfCFIException::endModule() { return; // Emit references to all used personality functions - const std::vector<const Function*> &Personalities = MMI->getPersonalities(); - for (size_t i = 0, e = Personalities.size(); i != e; ++i) { - if (!Personalities[i]) + for (const Function *Personality : MMI->getPersonalities()) { + if (!Personality) continue; - MCSymbol *Sym = Asm->getSymbol(Personalities[i]); - TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->TM, Sym); + MCSymbol *Sym = Asm->getSymbol(Personality); + TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->getDataLayout(), Sym); } } @@ -108,7 +107,6 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) { const Function *Per = nullptr; if (F->hasPersonalityFn()) Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); - assert(!MMI->getPersonality() || Per == MMI->getPersonality()); // Emit a personality function even when there are no landing pads bool forceEmitPersonality = diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index fc54a29..725063a 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -151,28 +151,33 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( DIELoc *Loc = new (DIEValueAllocator) DIELoc; const MCSymbol *Sym = Asm->getSymbol(Global); if (Global->isThreadLocal()) { - // FIXME: Make this work with -gsplit-dwarf. - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + if (Asm->TM.Options.EmulatedTLS) { + // TODO: add debug info for emulated thread local mode. } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); + // FIXME: Make this work with -gsplit-dwarf. + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size + addUInt(*Loc, dwarf::DW_FORM_data1, PointerSize == 4 + ? dwarf::DW_OP_const4u + : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. 
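// Steps 1) and 2) above, plus the TLS-lookup opcode added below as step 3),
// assemble a two-operation location expression. Assuming 4-byte pointers, a
// little-endian target, and the standard (non-GNU) opcode, the raw expression
// bytes would be:
#include <cstdint>
#include <vector>

static std::vector<uint8_t> tlsLocationExpr(uint32_t OffsetInTLSBlock) {
  std::vector<uint8_t> Expr;
  Expr.push_back(0x0c);                       // DW_OP_const4u
  for (int i = 0; i < 4; ++i)                 // 4-byte little-endian operand
    Expr.push_back((OffsetInTLSBlock >> (8 * i)) & 0xff);
  Expr.push_back(0x9b);                       // DW_OP_form_tls_address
  return Expr; // e.g. offset 16 gives: 0c 10 00 00 00 9b
}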
+ addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); + } + // 3) followed by an OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, + DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address + : dwarf::DW_OP_form_tls_address); } - // 3) followed by an OP to make the debugger do a TLS lookup. - addUInt(*Loc, dwarf::DW_FORM_data1, - DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address - : dwarf::DW_OP_form_tls_address); } else { DD->addArangeLabel(SymbolCU(this, Sym)); addOpAddress(*Loc, Sym); @@ -338,9 +343,9 @@ void DwarfCompileUnit::constructScopeDIE( // Skip imported directives in gmlt-like data. if (!includeMinimalInlineScopes()) { // There is no need to emit empty lexical block DIE. - for (const auto &E : DD->findImportedEntitiesForScope(DS)) + for (const auto *IE : ImportedEntities[DS]) Children.push_back( - constructImportedEntityDIE(cast<DIImportedEntity>(E.second))); + constructImportedEntityDIE(cast<DIImportedEntity>(IE))); } // If there are only other scopes as children, put them directly in the @@ -435,6 +440,9 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) { addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None, getOrCreateSourceID(IA->getFilename(), IA->getDirectory())); addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, IA->getLine()); + if (IA->getDiscriminator()) + addUInt(*ScopeDIE, dwarf::DW_AT_GNU_discriminator, None, + IA->getDiscriminator()); // Add name to the name table, we do this here because we're guaranteed // to have concrete versions of our DW_TAG_inlined_subprogram nodes. @@ -517,8 +525,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, unsigned FrameReg = 0; const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - assert(Expr != DV.getExpression().end() && - "Wrong number of expressions"); + assert(Expr != DV.getExpression().end() && "Wrong number of expressions"); DwarfExpr.AddMachineRegIndirect(FrameReg, Offset); DwarfExpr.AddExpression((*Expr)->expr_op_begin(), (*Expr)->expr_op_end()); ++Expr; @@ -597,8 +604,8 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, return ObjectPointer; } -void -DwarfCompileUnit::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( + LexicalScope *Scope) { DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()]; if (AbsDef) return; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 509c943..2e28467 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -39,6 +39,12 @@ class DwarfCompileUnit : public DwarfUnit { /// The start of the unit within its section. MCSymbol *LabelBegin; + typedef llvm::SmallVector<const MDNode *, 8> ImportedEntityList; + typedef llvm::DenseMap<const MDNode *, ImportedEntityList> + ImportedEntityMap; + + ImportedEntityMap ImportedEntities; + /// GlobalNames - A map of globally visible named entities for this unit. 
StringMap<const DIE *> GlobalNames; @@ -98,6 +104,10 @@ public: unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; + void addImportedEntity(const DIImportedEntity* IE) { + ImportedEntities[IE->getScope()].push_back(IE); + } + /// addRange - Add an address range to the list of ranges for this unit. void addRange(RangeSpan Range); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7d03a39..a4fb07e 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -104,6 +105,14 @@ DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, clEnumVal(Disable, "Disabled"), clEnumValEnd), cl::init(Default)); +static cl::opt<DefaultOnOff> +DwarfLinkageNames("dwarf-linkage-names", cl::Hidden, + cl::desc("Emit DWARF linkage-name attributes."), + cl::values(clEnumVal(Default, "Default for platform"), + clEnumVal(Enable, "Enabled"), + clEnumVal(Disable, "Disabled"), clEnumValEnd), + cl::init(Default)); + static const char *const DWARFGroupName = "DWARF Emission"; static const char *const DbgTimerName = "DWARF Debug Writer"; @@ -176,9 +185,9 @@ const DIType *DbgVariable::getType() const { if (tag == dwarf::DW_TAG_pointer_type) subType = resolve(cast<DIDerivedType>(Ty)->getBaseType()); - auto Elements = cast<DICompositeTypeBase>(subType)->getElements(); + auto Elements = cast<DICompositeType>(subType)->getElements(); for (unsigned i = 0, N = Elements.size(); i < N; ++i) { - auto *DT = cast<DIDerivedTypeBase>(Elements[i]); + auto *DT = cast<DIDerivedType>(Elements[i]); if (getName() == DT->getName()) return resolve(DT->getBaseType()); } @@ -194,45 +203,67 @@ static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = { DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) : Asm(A), MMI(Asm->MMI), DebugLocs(A->OutStreamer->isVerboseAsm()), PrevLabel(nullptr), InfoHolder(A, "info_string", DIEValueAllocator), - UsedNonDefaultText(false), SkeletonHolder(A, "skel_string", DIEValueAllocator), IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()), - IsPS4(Triple(A->getTargetTriple()).isPS4()), AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), AccelNamespace(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), - AccelTypes(TypeAtoms) { + AccelTypes(TypeAtoms), DebuggerTuning(DebuggerKind::Default) { CurFn = nullptr; CurMI = nullptr; + Triple TT(Asm->getTargetTriple()); + + // Make sure we know our "debugger tuning." The target option takes + // precedence; fall back to triple-based defaults. + if (Asm->TM.Options.DebuggerTuning != DebuggerKind::Default) + DebuggerTuning = Asm->TM.Options.DebuggerTuning; + else if (IsDarwin) + DebuggerTuning = DebuggerKind::LLDB; + else if (TT.isPS4CPU()) + DebuggerTuning = DebuggerKind::SCE; + else + DebuggerTuning = DebuggerKind::GDB; - // Turn on accelerator tables for Darwin by default, pubnames by - // default for non-Darwin/PS4, and handle split dwarf. + // Turn on accelerator tables for LLDB by default. 
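// The tuning logic above reduces to "an explicit option wins, otherwise infer
// from the target triple"; the per-feature defaults that follow (accelerator
// tables, split DWARF, pubsections, linkage names) then key off the tuneFor*()
// predicates. A standalone sketch of the precedence (enum values illustrative):
enum class Debugger { Default, GDB, LLDB, SCE };

static Debugger pickTuning(Debugger Requested, bool IsDarwin, bool IsPS4) {
  if (Requested != Debugger::Default)
    return Requested;        // explicit request from TargetOptions wins
  if (IsDarwin)
    return Debugger::LLDB;   // Darwin defaults to LLDB
  if (IsPS4)
    return Debugger::SCE;    // PS4 defaults to the SCE debugger
  return Debugger::GDB;      // everyone else defaults to GDB
}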
if (DwarfAccelTables == Default) - HasDwarfAccelTables = IsDarwin; + HasDwarfAccelTables = tuneForLLDB(); else HasDwarfAccelTables = DwarfAccelTables == Enable; + // Handle split DWARF. Off by default for now. if (SplitDwarf == Default) HasSplitDwarf = false; else HasSplitDwarf = SplitDwarf == Enable; + // Pubnames/pubtypes on by default for GDB. if (DwarfPubSections == Default) - HasDwarfPubSections = !IsDarwin && !IsPS4; + HasDwarfPubSections = tuneForGDB(); else HasDwarfPubSections = DwarfPubSections == Enable; + // SCE does not use linkage names. + if (DwarfLinkageNames == Default) + UseLinkageNames = !tuneForSCE(); + else + UseLinkageNames = DwarfLinkageNames == Enable; + unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion; DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber : MMI->getModule()->getDwarfVersion(); + // Use dwarf 4 by default if nothing is requested. + DwarfVersion = DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION; - // Darwin and PS4 use the standard TLS opcode (defined in DWARF 3). - // Everybody else uses GNU's. - UseGNUTLSOpcode = !(IsDarwin || IsPS4) || DwarfVersion < 3; + // Work around a GDB bug. GDB doesn't support the standard opcode; + // SCE doesn't support GNU's; LLDB prefers the standard opcode, which + // is defined as of DWARF 3. + // See GDB bug 11616 - DW_OP_form_tls_address is unimplemented + // https://sourceware.org/bugzilla/show_bug.cgi?id=11616 + UseGNUTLSOpcode = tuneForGDB() || DwarfVersion < 3; Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); @@ -300,18 +331,6 @@ void DwarfDebug::addSubprogramNames(const DISubprogram *SP, DIE &Die) { } } -/// isSubprogramContext - Return true if Context is either a subprogram -/// or another context nested inside a subprogram. -bool DwarfDebug::isSubprogramContext(const MDNode *Context) { - if (!Context) - return false; - if (isa<DISubprogram>(Context)) - return true; - if (auto *T = dyn_cast<DIType>(Context)) - return isSubprogramContext(resolve(T->getScope())); - return false; -} - /// Check whether we should create a DIE for the given Scope, return true /// if we don't create a DIE (the corresponding DIE is null). bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) { @@ -416,6 +435,16 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { else NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection()); + if (DIUnit->getDWOId()) { + // This CU is either a clang module DWO or a skeleton CU. + NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, + DIUnit->getDWOId()); + if (!DIUnit->getSplitDebugFilename().empty()) + // This is a prefabricated skeleton CU. + NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, + DIUnit->getSplitDebugFilename()); + } + CUMap.insert(std::make_pair(DIUnit, &NewCU)); CUDieMap.insert(std::make_pair(&Die, &NewCU)); return NewCU; @@ -436,8 +465,6 @@ void DwarfDebug::beginModule() { const Module *M = MMI->getModule(); - FunctionDIs = makeSubprogramMap(*M); - NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -449,12 +476,7 @@ void DwarfDebug::beginModule() { auto *CUNode = cast<DICompileUnit>(N); DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) - ScopesWithImportedEntities.push_back(std::make_pair(IE->getScope(), IE)); - // Stable sort to preserve the order of appearance of imported entities. - // This is to avoid out-of-order processing of interdependent declarations - // within the same scope, e.g. 
{ namespace A = base; namespace B = A; } - std::stable_sort(ScopesWithImportedEntities.begin(), - ScopesWithImportedEntities.end(), less_first()); + CU.addImportedEntity(IE); for (auto *GV : CUNode->getGlobalVariables()) CU.getOrCreateGlobalVariableDIE(GV); for (auto *SP : CUNode->getSubprograms()) @@ -467,7 +489,10 @@ void DwarfDebug::beginModule() { for (auto *Ty : CUNode->getRetainedTypes()) { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. - CU.getOrCreateTypeDIE(cast<DIType>(resolve(Ty->getRef()))); + DIType *RT = cast<DIType>(resolve(Ty->getRef())); + if (!RT->isExternalTypeRef()) + // There is no point in force-emitting a forward declaration. + CU.getOrCreateTypeDIE(RT); } // Emit imported_modules last so that the relevant context is already // available. @@ -536,6 +561,8 @@ void DwarfDebug::finalizeModuleInfo() { // Collect info for variables that were optimized out. collectDeadVariables(); + unsigned MacroOffset = 0; + std::unique_ptr<AsmStreamerBase> AS(new SizeReporterAsmStreamer(Asm)); // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -588,6 +615,15 @@ void DwarfDebug::finalizeModuleInfo() { U.setBaseAddress(TheCU.getRanges().front().getStart()); U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges()); } + + auto *CUNode = cast<DICompileUnit>(P.first); + if (CUNode->getMacros()) { + // Compile Unit has macros, emit "DW_AT_macro_info" attribute. + U.addUInt(U.getUnitDie(), dwarf::DW_AT_macro_info, + dwarf::DW_FORM_sec_offset, MacroOffset); + // Update macro section offset + MacroOffset += handleMacroNodes(AS.get(), CUNode->getMacros(), U); + } } // Compute DIE offsets and sizes. @@ -631,6 +667,9 @@ void DwarfDebug::endModule() { // Emit info into a debug ranges section. emitDebugRanges(); + // Emit info into a debug macinfo section. + emitDebugMacinfo(); + if (useSplitDwarf()) { emitDebugStrDWO(); emitDebugInfoDWO(); @@ -1061,12 +1100,8 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { for (const auto &MBB : *MF) for (const auto &MI : MBB) if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && - MI.getDebugLoc()) { - // Did the target forget to set the FrameSetup flag for CFI insns? - assert(!MI.isCFIInstruction() && - "First non-frame-setup instruction is a CFI instruction."); + MI.getDebugLoc()) return MI.getDebugLoc(); - } return DebugLoc(); } @@ -1079,8 +1114,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { if (!MMI->hasDebugInfo()) return; - auto DI = FunctionDIs.find(MF->getFunction()); - if (DI == FunctionDIs.end()) + auto DI = MF->getFunction()->getSubprogram(); + if (!DI) return; // Grab the lexical scopes for the function, if we don't have any of those @@ -1127,7 +1162,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // The first mention of a function argument gets the CurrentFnBegin // label, so arguments are visible when breaking at function entry. 
const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable(); - if (DIVar->getTag() == dwarf::DW_TAG_arg_variable && + if (DIVar->isParameter() && getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) { LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin(); if (Ranges.front().first->getDebugExpression()->isBitPiece()) { @@ -1171,7 +1206,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { "endFunction should be called with the same function as beginFunction"); if (!MMI->hasDebugInfo() || LScopes.empty() || - !FunctionDIs.count(MF->getFunction())) { + !MF->getFunction()->getSubprogram()) { // If we don't have a lexical scope for this function then there will // be a hole in the range information. Keep note of this by setting the // previously used section to nullptr. @@ -1812,6 +1847,70 @@ void DwarfDebug::emitDebugRanges() { } } +unsigned DwarfDebug::handleMacroNodes(AsmStreamerBase *AS, + DIMacroNodeArray Nodes, + DwarfCompileUnit &U) { + unsigned Size = 0; + for (auto *MN : Nodes) { + if (auto *M = dyn_cast<DIMacro>(MN)) + Size += emitMacro(AS, *M); + else if (auto *F = dyn_cast<DIMacroFile>(MN)) + Size += emitMacroFile(AS, *F, U); + else + llvm_unreachable("Unexpected DI type!"); + } + return Size; +} + +unsigned DwarfDebug::emitMacro(AsmStreamerBase *AS, DIMacro &M) { + int Size = 0; + Size += AS->emitULEB128(M.getMacinfoType()); + Size += AS->emitULEB128(M.getLine()); + StringRef Name = M.getName(); + StringRef Value = M.getValue(); + Size += AS->emitBytes(Name); + if (!Value.empty()) { + // There should be one space between macro name and macro value. + Size += AS->emitInt8(' '); + Size += AS->emitBytes(Value); + } + Size += AS->emitInt8('\0'); + return Size; +} + +unsigned DwarfDebug::emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F, + DwarfCompileUnit &U) { + int Size = 0; + assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file); + Size += AS->emitULEB128(dwarf::DW_MACINFO_start_file); + Size += AS->emitULEB128(F.getLine()); + DIFile *File = F.getFile(); + unsigned FID = + U.getOrCreateSourceID(File->getFilename(), File->getDirectory()); + Size += AS->emitULEB128(FID); + Size += handleMacroNodes(AS, F.getElements(), U); + Size += AS->emitULEB128(dwarf::DW_MACINFO_end_file); + return Size; +} + +// Emit visible names into a debug macinfo section. +void DwarfDebug::emitDebugMacinfo() { + if (MCSection *Macinfo = Asm->getObjFileLowering().getDwarfMacinfoSection()) { + // Start the dwarf macinfo section. + Asm->OutStreamer->SwitchSection(Macinfo); + } + std::unique_ptr<AsmStreamerBase> AS(new EmittingAsmStreamer(Asm)); + for (const auto &P : CUMap) { + auto &TheCU = *P.second; + auto *SkCU = TheCU.getSkeleton(); + DwarfCompileUnit &U = SkCU ? *SkCU : TheCU; + auto *CUNode = cast<DICompileUnit>(P.first); + handleMacroNodes(AS.get(), CUNode->getMacros(), U); + } + Asm->OutStreamer->AddComment("End Of Macro List Mark"); + Asm->EmitInt8(0); +} + // DWARF5 Experimental Separate Dwarf emitters. void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, @@ -1863,7 +1962,7 @@ void DwarfDebug::emitDebugLineDWO() { assert(useSplitDwarf() && "No split dwarf?"); Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfLineDWOSection()); - SplitTypeUnitFileTable.Emit(*Asm->OutStreamer); + SplitTypeUnitFileTable.Emit(*Asm->OutStreamer, MCDwarfLineTableParams()); } // Emit the .debug_str.dwo section for separated dwarf. 
This contains the @@ -1884,7 +1983,7 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) { return &SplitTypeUnitFileTable; } -static uint64_t makeTypeSignature(StringRef Identifier) { +uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); // ... take the least significant 8 bytes and return those. Our MD5 diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 01f34c6..460c186 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -33,6 +33,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MachineLocation.h" #include "llvm/Support/Allocator.h" +#include "llvm/Target/TargetOptions.h" #include <memory> namespace llvm { @@ -49,24 +50,6 @@ class DwarfUnit; class MachineModuleInfo; //===----------------------------------------------------------------------===// -/// This class is used to record source line correspondence. -class SrcLineInfo { - unsigned Line; // Source line number. - unsigned Column; // Source column. - unsigned SourceID; // Source ID number. - MCSymbol *Label; // Label in code ID number. -public: - SrcLineInfo(unsigned L, unsigned C, unsigned S, MCSymbol *label) - : Line(L), Column(C), SourceID(S), Label(label) {} - - // Accessors - unsigned getLine() const { return Line; } - unsigned getColumn() const { return Column; } - unsigned getSourceID() const { return SourceID; } - MCSymbol *getLabel() const { return Label; } -}; - -//===----------------------------------------------------------------------===// /// This class is used to track local variable information. /// /// Variables can be created from allocas, in which case they're generated from @@ -127,14 +110,14 @@ public: // Accessors. const DILocalVariable *getVariable() const { return Var; } const DILocation *getInlinedAt() const { return IA; } - const ArrayRef<const DIExpression *> getExpression() const { return Expr; } + ArrayRef<const DIExpression *> getExpression() const { return Expr; } void setDIE(DIE &D) { TheDIE = &D; } DIE *getDIE() const { return TheDIE; } void setDebugLocListIndex(unsigned O) { DebugLocListIndex = O; } unsigned getDebugLocListIndex() const { return DebugLocListIndex; } StringRef getName() const { return Var->getName(); } const MachineInstr *getMInsn() const { return MInsn; } - const ArrayRef<int> getFrameIndex() const { return FrameIndex; } + ArrayRef<int> getFrameIndex() const { return FrameIndex; } void addMMIEntry(const DbgVariable &V) { assert(DebugLocListIndex == ~0U && !MInsn && "not an MMI entry"); @@ -156,7 +139,8 @@ public: // Translate tag to proper Dwarf tag. dwarf::Tag getTag() const { - if (Var->getTag() == dwarf::DW_TAG_arg_variable) + // FIXME: Why don't we just infer this tag and store it all along? + if (Var->isParameter()) return dwarf::DW_TAG_formal_parameter; return dwarf::DW_TAG_variable; @@ -282,11 +266,6 @@ class DwarfDebug : public AsmPrinterHandler { /// Holders for the various debug information flags that we might need to /// have exposed. See accessor functions below for description. - /// Holder for imported entities. - typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32> - ImportedEntityMap; - ImportedEntityMap ScopesWithImportedEntities; - /// Map from MDNodes for user-defined types to the type units that /// describe them. 
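// makeTypeSignature, now exposed on DwarfDebug, MD5-hashes the unique
// identifier and keeps only the least-significant 8 bytes of the digest;
// those signatures identify the type units tracked in the map below. A
// standalone sketch of the byte extraction (LLVM reads it with
// support::endian::read64le(Result + 8)):
#include <cstdint>

static uint64_t low64OfDigest(const uint8_t Digest[16]) {
  // The low 8 bytes of the little-endian digest are Digest[8..15], least
  // significant first; fold them into a host integer.
  uint64_t Sig = 0;
  for (int i = 7; i >= 0; --i)
    Sig = (Sig << 8) | Digest[8 + i];
  return Sig;
}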
DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits; @@ -298,16 +277,12 @@ class DwarfDebug : public AsmPrinterHandler { /// Whether to emit the pubnames/pubtypes sections. bool HasDwarfPubSections; - /// Whether or not to use AT_ranges for compilation units. - bool HasCURanges; - - /// Whether we emitted a function into a section other than the - /// default text. - bool UsedNonDefaultText; - /// Whether to use the GNU TLS opcode (instead of the standard opcode). bool UseGNUTLSOpcode; + /// Whether to emit DW_AT_[MIPS_]linkage_name. + bool UseLinkageNames; + /// Version of dwarf we're emitting. unsigned DwarfVersion; @@ -338,7 +313,6 @@ class DwarfDebug : public AsmPrinterHandler { /// True iff there are multiple CUs in this module. bool SingleCU; bool IsDarwin; - bool IsPS4; AddressPool AddrPool; @@ -347,7 +321,8 @@ class DwarfDebug : public AsmPrinterHandler { DwarfAccelTable AccelNamespace; DwarfAccelTable AccelTypes; - DenseMap<const Function *, DISubprogram *> FunctionDIs; + // Identify a debugger for "tuning" the debug info. + DebuggerKind DebuggerTuning; MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &); @@ -372,12 +347,6 @@ class DwarfDebug : public AsmPrinterHandler { /// Construct a DIE for this abstract scope. void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); - /// Compute the size and offset of a DIE given an incoming Offset. - unsigned computeSizeAndOffset(DIE *Die, unsigned Offset); - - /// Compute the size and offset of all the DIEs. - void computeSizeAndOffsets(); - /// Collect info for variables that were optimized out. void collectDeadVariables(); @@ -431,20 +400,25 @@ class DwarfDebug : public AsmPrinterHandler { /// Emit visible names into a debug str section. void emitDebugStr(); - /// Emit visible names into a debug loc section. + /// Emit variable locations into a debug loc section. void emitDebugLoc(); - /// Emit visible names into a debug loc dwo section. + /// Emit variable locations into a debug loc dwo section. void emitDebugLocDWO(); - /// Emit visible names into a debug aranges section. + /// Emit address ranges into a debug aranges section. void emitDebugARanges(); - /// Emit visible names into a debug ranges section. + /// Emit address ranges into a debug ranges section. void emitDebugRanges(); - /// Emit inline info using custom format. - void emitDebugInlineInfo(); + /// Emit macros into a debug macinfo section. + void emitDebugMacinfo(); + unsigned emitMacro(AsmStreamerBase *AS, DIMacro &M); + unsigned emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F, + DwarfCompileUnit &U); + unsigned handleMacroNodes(AsmStreamerBase *AS, DIMacroNodeArray Nodes, + DwarfCompileUnit &U); /// DWARF 5 Experimental Split Dwarf Emitters @@ -456,10 +430,6 @@ class DwarfDebug : public AsmPrinterHandler { /// section. DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU); - /// Construct the split debug info compile unit for the debug info - /// section. - DwarfTypeUnit &constructSkeletonTU(DwarfTypeUnit &TU); - /// Emit the debug info dwo section. void emitDebugInfoDWO(); @@ -544,6 +514,9 @@ public: /// Process end of an instruction. void endInstruction() override; + /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits. + static uint64_t makeTypeSignature(StringRef Identifier); + /// Add a DIE to the set of types that we're going to pull into /// type units. 
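// Alongside the type-unit helpers, the macinfo emitters declared here
// serialize each macro as: macinfo type (ULEB128), line (ULEB128), then the
// name, one space, and the value when present, terminated by a single NUL.
// For "#define LIMIT 5" on line 3 the payload would be (an illustrative
// example, not captured output):
//
//   0x01  0x03  'L' 'I' 'M' 'I' 'T'  ' '  '5'  0x00
//    |     |
//    |     +-- line number 3 as ULEB128
//    +-- DW_MACINFO_define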
void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier, @@ -558,10 +531,22 @@ public: SymSize[Sym] = Size; } + /// Returns whether to emit DW_AT_[MIPS_]linkage_name. + bool useLinkageNames() const { return UseLinkageNames; } + /// Returns whether to use DW_OP_GNU_push_tls_address, instead of the /// standard DW_OP_form_tls_address opcode bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; } + /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger. + /// + /// Returns whether we are "tuning" for a given debugger. + /// @{ + bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; } + bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; } + bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; } + /// @} + // Experimental DWARF5 features. /// Returns whether or not to emit tables that dwarf consumers can @@ -604,9 +589,6 @@ public: DwarfCompileUnit *lookupUnit(const DIE *CU) const { return CUDieMap.lookup(CU); } - /// isSubprogramContext - Return true if Context is either a subprogram - /// or another context nested inside a subprogram. - bool isSubprogramContext(const MDNode *Context); void addSubprogramNames(const DISubprogram *SP, DIE &Die); @@ -622,14 +604,6 @@ public: const MachineFunction *getCurrentFunction() const { return CurFn; } - iterator_range<ImportedEntityMap::const_iterator> - findImportedEntitiesForScope(const MDNode *Scope) const { - return make_range(std::equal_range( - ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(), - std::pair<const MDNode *, const MDNode *>(Scope, nullptr), - less_first())); - } - /// A helper function to check whether the DIE for a given Scope is /// going to be null. bool isLexicalScopeDIENull(LexicalScope *Scope); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index a2799b8..7b5b831 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -211,12 +211,15 @@ bool DwarfExpression::AddMachineRegExpression(const DIExpression *Expr, return AddMachineRegPiece(MachineReg, SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits)); } - case dwarf::DW_OP_plus: { - // [DW_OP_reg,Offset,DW_OP_plus,DW_OP_deref] --> [DW_OP_breg,Offset]. + case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: { + // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset]. + // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset]. auto N = I.getNext(); if (N != E && N->getOp() == dwarf::DW_OP_deref) { unsigned Offset = I->getArg(0); - ValidReg = AddMachineRegIndirect(MachineReg, Offset); + ValidReg = AddMachineRegIndirect( + MachineReg, I->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); std::advance(I, 2); break; } else @@ -255,6 +258,12 @@ void DwarfExpression::AddExpression(DIExpression::expr_op_iterator I, EmitOp(dwarf::DW_OP_plus_uconst); EmitUnsigned(I->getArg(0)); break; + case dwarf::DW_OP_minus: + // There is no OP_minus_uconst. 
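// DWARF provides DW_OP_plus_uconst but no subtracting counterpart, so "minus
// a constant" lowers to the two-operation sequence emitted just below:
//
//   DW_OP_constu <c>   // push the constant operand
//   DW_OP_minus        // pop two values, push their difference
//
// e.g. "the value in R, minus 8" becomes: <ops for R>, DW_OP_constu 8,
// DW_OP_minus.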
+ EmitOp(dwarf::DW_OP_constu); + EmitUnsigned(I->getArg(0)); + EmitOp(dwarf::DW_OP_minus); + break; case dwarf::DW_OP_deref: EmitOp(dwarf::DW_OP_deref); break; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 3555822..d75fea5 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -192,18 +192,19 @@ void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) { DIEInteger(1)); } -void DwarfUnit::addUInt(DIE &Die, dwarf::Attribute Attribute, +void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer)); } -void DwarfUnit::addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer) { +void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form, + uint64_t Integer) { addUInt(Block, (dwarf::Attribute)0, Form, Integer); } -void DwarfUnit::addSInt(DIE &Die, dwarf::Attribute Attribute, +void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, int64_t Integer) { if (!Form) Form = DIEInteger::BestForm(true, Integer); @@ -222,9 +223,10 @@ void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute, DIEString(DU->getStringPool().getEntry(*Asm, String))); } -DIE::value_iterator DwarfUnit::addLabel(DIE &Die, dwarf::Attribute Attribute, - dwarf::Form Form, - const MCSymbol *Label) { +DIEValueList::value_iterator DwarfUnit::addLabel(DIEValueList &Die, + dwarf::Attribute Attribute, + dwarf::Form Form, + const MCSymbol *Label) { return Die.addValue(DIEValueAllocator, Attribute, Form, DIELabel(Label)); } @@ -277,6 +279,13 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type) { dwarf::DW_FORM_ref_sig8, DIETypeSignature(Type)); } +void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, + StringRef Identifier) { + uint64_t Signature = DD->makeTypeSignature(Identifier); + Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, + DIEInteger(Signature)); +} + void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry) { const DIE *DieCU = Die.getUnitOrNull(); @@ -292,8 +301,6 @@ void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, } DIE &DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N) { - assert(Tag != dwarf::DW_TAG_auto_variable && - Tag != dwarf::DW_TAG_arg_variable); DIE &Die = Parent.addChild(DIE::get(DIEValueAllocator, (dwarf::Tag)Tag)); if (N) insertDIE(N, &Die); @@ -445,7 +452,7 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Find the __forwarding field and the variable field in the __Block_byref // struct. - DINodeArray Fields = cast<DICompositeTypeBase>(TmpTy)->getElements(); + DINodeArray Fields = cast<DICompositeType>(TmpTy)->getElements(); const DIDerivedType *varField = nullptr; const DIDerivedType *forwardingField = nullptr; @@ -506,34 +513,35 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, /// Return true if type encoding is unsigned. static bool isUnsignedDIType(DwarfDebug *DD, const DIType *Ty) { - if (auto *DTy = dyn_cast<DIDerivedTypeBase>(Ty)) { + if (auto *CTy = dyn_cast<DICompositeType>(Ty)) { + // FIXME: Enums without a fixed underlying type have unknown signedness + // here, leading to incorrectly emitted constants. 
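// An assumed illustration of that FIXME, not observed output: a C enum such
// as "enum E { Big = 0x80000000 };" records no signedness in its metadata, so
// answering "signed" here can route Big through a signed DWARF form even
// though the source value is unsigned.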
+ if (CTy->getTag() == dwarf::DW_TAG_enumeration_type) + return false; + + // (Pieces of) aggregate types that get hacked apart by SROA may be + // represented by a constant. Encode them as unsigned bytes. + return true; + } + + if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { dwarf::Tag T = (dwarf::Tag)Ty->getTag(); // Encode pointer constants as unsigned bytes. This is used at least for // null pointer constant emission. - // (Pieces of) aggregate types that get hacked apart by SROA may also be - // represented by a constant. Encode them as unsigned bytes. // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed // here, but accept them for now due to a bug in SROA producing bogus // dbg.values. - if (T == dwarf::DW_TAG_array_type || - T == dwarf::DW_TAG_class_type || - T == dwarf::DW_TAG_pointer_type || + if (T == dwarf::DW_TAG_pointer_type || T == dwarf::DW_TAG_ptr_to_member_type || T == dwarf::DW_TAG_reference_type || - T == dwarf::DW_TAG_rvalue_reference_type || - T == dwarf::DW_TAG_structure_type || - T == dwarf::DW_TAG_union_type) + T == dwarf::DW_TAG_rvalue_reference_type) return true; assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type || T == dwarf::DW_TAG_volatile_type || - T == dwarf::DW_TAG_restrict_type || - T == dwarf::DW_TAG_enumeration_type); - if (DITypeRef Deriv = DTy->getBaseType()) - return isUnsignedDIType(DD, DD->resolve(Deriv)); - // FIXME: Enums without a fixed underlying type have unknown signedness - // here, leading to incorrectly emitted constants. - assert(DTy->getTag() == dwarf::DW_TAG_enumeration_type); - return false; + T == dwarf::DW_TAG_restrict_type); + DITypeRef Deriv = DTy->getBaseType(); + assert(Deriv && "Expected valid base type"); + return isUnsignedDIType(DD, DD->resolve(Deriv)); } auto *BTy = cast<DIBasicType>(Ty); @@ -659,7 +667,7 @@ void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) { } void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { - if (!LinkageName.empty()) + if (!LinkageName.empty() && DD->useLinkageNames()) addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, @@ -685,6 +693,8 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { return getOrCreateNameSpace(NS); if (auto *SP = dyn_cast<DISubprogram>(Context)) return getOrCreateSubprogramDIE(SP); + if (auto *M = dyn_cast<DIModule>(Context)) + return getOrCreateModule(M); return getDIE(Context); } @@ -700,7 +710,8 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); - updateAcceleratorTables(Context, Ty, TyDIE); + if (!Ty->isExternalTypeRef()) + updateAcceleratorTables(Context, Ty, TyDIE); return &TyDIE; } @@ -753,7 +764,7 @@ void DwarfUnit::updateAcceleratorTables(const DIScope *Context, const DIType *Ty, const DIE &TyDIE) { if (!Ty->getName().empty() && !Ty->isForwardDecl()) { bool IsImplementation = 0; - if (auto *CT = dyn_cast<DICompositeTypeBase>(Ty)) { + if (auto *CT = dyn_cast<DICompositeType>(Ty)) { // A runtime language of 0 actually means C/C++ and that any // non-negative value is some version of Objective-C/C++. IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete(); @@ -795,8 +806,7 @@ std::string DwarfUnit::getParentContextString(const DIScope *Context) const { // Reverse iterate over our list to go from the outermost construct to the // innermost. 
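// The rewritten loop below walks the collected scope chain outermost-first,
// appending "Name::" for each named scope and printing unnamed namespaces as
// "(anonymous namespace)". A rough standalone sketch of the string it builds:
#include <string>
#include <vector>

static std::string scopeQualifier(const std::vector<std::string> &OuterFirst) {
  // OuterFirst holds scope names ordered outermost to innermost, with unnamed
  // namespaces already replaced by "(anonymous namespace)".
  std::string CS;
  for (const std::string &Name : OuterFirst) {
    if (Name.empty())
      continue;        // skip scopes with no printable name
    CS += Name;
    CS += "::";        // e.g. yields "outer::inner::"
  }
  return CS;
}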
- for (auto I = Parents.rbegin(), E = Parents.rend(); I != E; ++I) { - const DIScope *Ctx = *I; + for (const DIScope *Ctx : make_range(Parents.rbegin(), Parents.rend())) { StringRef Name = Ctx->getName(); if (Name.empty() && isa<DINamespace>(Ctx)) Name = "(anonymous namespace)"; @@ -843,7 +853,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // Add size if non-zero (derived types might be zero-sized.) if (Size && Tag != dwarf::DW_TAG_pointer_type - && Tag != dwarf::DW_TAG_ptr_to_member_type) + && Tag != dwarf::DW_TAG_ptr_to_member_type + && Tag != dwarf::DW_TAG_reference_type + && Tag != dwarf::DW_TAG_rvalue_reference_type) addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size); if (Tag == dwarf::DW_TAG_ptr_to_member_type) @@ -899,6 +911,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { + if (CTy->isExternalTypeRef()) { + StringRef Identifier = CTy->getIdentifier(); + assert(!Identifier.empty() && "external type ref without identifier"); + addFlag(Buffer, dwarf::DW_AT_declaration); + return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); + } + // Add name if not anonymous or intermediate type. StringRef Name = CTy->getName(); @@ -1134,6 +1153,14 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, "definition DIE was created in " "getOrCreateSubprogramDIE"); DeclLinkageName = SPDecl->getLinkageName(); + unsigned DeclID = + getOrCreateSourceID(SPDecl->getFilename(), SPDecl->getDirectory()); + unsigned DefID = getOrCreateSourceID(SP->getFilename(), SP->getDirectory()); + if (DeclID != DefID) + addUInt(SPDie, dwarf::DW_AT_decl_file, None, DefID); + + if (SP->getLine() != SPDecl->getLine()) + addUInt(SPDie, dwarf::DW_AT_decl_line, None, SP->getLine()); } // Add function template parameters. @@ -1180,11 +1207,10 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, Language == dwarf::DW_LANG_ObjC)) addFlag(SPDie, dwarf::DW_AT_prototyped); - const DISubroutineType *SPTy = SP->getType(); - assert(SPTy->getTag() == dwarf::DW_TAG_subroutine_type && - "the type of a subprogram should be a subroutine"); + DITypeRefArray Args; + if (const DISubroutineType *SPTy = SP->getType()) + Args = SPTy->getTypeArray(); - auto Args = SPTy->getTypeArray(); // Add a return type. If this is a type like a C/C++ void type we don't add a // return type. if (Args.size()) diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 44d9d22..82760bf 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -113,13 +113,6 @@ protected: DwarfUnit(unsigned UID, dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); - /// Add a string attribute data and value. - /// - /// This is guaranteed to be in the local string pool instead of indirected. - void addLocalString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - - void addIndexedString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); public: @@ -162,9 +155,6 @@ public: virtual void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) {} - /// Add a new name to the namespace accelerator table. - void addAccelNamespace(StringRef Name, const DIE &Die); - /// Returns the DIE map slot for the specified debug variable. 
/// 
/// We delegate the request to DwarfDebug when the MDNode can be part of the 
@@ -186,14 +176,14 @@ public: 
 void addFlag(DIE &Die, dwarf::Attribute Attribute); 
 /// Add an unsigned integer attribute data and value. 
- void addUInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, 
- uint64_t Integer); 
+ void addUInt(DIEValueList &Die, dwarf::Attribute Attribute, 
+ Optional<dwarf::Form> Form, uint64_t Integer); 
- void addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer); 
+ void addUInt(DIEValueList &Block, dwarf::Form Form, uint64_t Integer); 
 /// Add a signed integer attribute data and value. 
- void addSInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, 
- int64_t Integer); 
+ void addSInt(DIEValueList &Die, dwarf::Attribute Attribute, 
+ Optional<dwarf::Form> Form, int64_t Integer); 
 void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer); 
@@ -206,8 +196,10 @@ public: 
 void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); 
 /// Add a Dwarf label attribute data and value. 
- DIE::value_iterator addLabel(DIE &Die, dwarf::Attribute Attribute, 
- dwarf::Form Form, const MCSymbol *Label); 
+ DIEValueList::value_iterator addLabel(DIEValueList &Die, 
+ dwarf::Attribute Attribute, 
+ dwarf::Form Form, 
+ const MCSymbol *Label); 
 void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label); 
@@ -228,7 +220,11 @@ public: 
 /// Add a DIE attribute data and value. 
 void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry); 
+ /// Add a type's DW_AT_signature and set the declaration flag. 
 void addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type); 
+ /// Add an attribute containing the type signature for a unique identifier. 
+ void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, 
+ StringRef Identifier); 
 /// Add block data. 
 void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); 
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
index 49ef8d3..e24dcb1 100644 
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
@@ -662,9 +662,8 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { 
 Entry = TypeInfos.size(); 
 } 
- for (std::vector<const GlobalValue *>::const_reverse_iterator 
- I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) { 
- const GlobalValue *GV = *I; 
+ for (const GlobalValue *GV : make_range(TypeInfos.rbegin(), 
+ TypeInfos.rend())) { 
 if (VerboseAsm) 
 Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--)); 
 Asm->EmitTTypeReference(GV, TTypeEncoding); 
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
index e42e082..c6a0e9d 100644 
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
@@ -76,10 +76,6 @@ protected: 
 SmallVectorImpl<ActionEntry> &Actions, 
 SmallVectorImpl<unsigned> &FirstActions); 
- /// Return `true' if this is a call to a function marked `nounwind'. Return 
- /// `false' otherwise. 
- bool callToNoUnwindFunction(const MachineInstr *MI); 
- 
 void computePadMap(const SmallVectorImpl<const LandingPadInfo *> &LandingPads, 
 RangeMapType &PadMap); 
@@ -131,6 +127,10 @@ public: 
 void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} 
 void beginInstruction(const MachineInstr *MI) override {} 
 void endInstruction() override {} 
+ 
+ /// Return `true' if this is a call to a function marked `nounwind'. Return 
+ /// `false' otherwise. 
+ static bool callToNoUnwindFunction(const MachineInstr *MI); }; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index eb9e4c1..6a023b9 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -48,7 +48,7 @@ void llvm::linkErlangGCPrinter() {} void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { MCStreamer &OS = *AP.OutStreamer; - unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize(); + unsigned IntPtrSize = M.getDataLayout().getPointerSize(); // Put this in a custom .note section. OS.SwitchSection( diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 2ceec61..c09ef6a 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -93,7 +93,7 @@ void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info, /// void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { - unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize(); + unsigned IntPtrSize = M.getDataLayout().getPointerSize(); AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection()); EmitCamlGlobal(M, AP, "code_end"); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 6610ac7..1e2f55b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -27,15 +27,15 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { auto *Scope = cast<DIScope>(S); StringRef Dir = Scope->getDirectory(), Filename = Scope->getFilename(); - char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; - if (Result) - return Result; + std::string &Filepath = + DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; + if (!Filepath.empty()) + return Filepath; // Clang emits directory and relative filename info into the IR, but CodeView // operates on full paths. We could change Clang to emit full paths too, but // that would increase the IR size and probably not needed for other users. // For now, just concatenate and canonicalize the path here. - std::string Filepath; if (Filename.find(':') == 1) Filepath = Filename; else @@ -74,8 +74,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos) Filepath.erase(Cursor, 1); - Result = strdup(Filepath.c_str()); - return StringRef(Result); + return Filepath; } void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, @@ -83,13 +82,24 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, const MDNode *Scope = DL.getScope(); if (!Scope) return; + unsigned LineNumber = DL.getLine(); + // Skip this line if it is longer than the maximum we can record. + if (LineNumber > COFF::CVL_MaxLineNumber) + return; + + unsigned ColumnNumber = DL.getCol(); + // Truncate the column number if it is longer than the maximum we can record. + if (ColumnNumber > COFF::CVL_MaxColumnNumber) + ColumnNumber = 0; + StringRef Filename = getFullFilepath(Scope); // Skip this instruction if it has the same file:line as the previous one. 
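A note on getFullFilepath above: it now caches std::string values directly in DirAndFilenameToFilepathMap instead of strdup'ed C strings that had to be freed in the destructor. Its textual canonicalization can be sketched in isolation; this simplified model joins and normalizes the path the same way but omits the "\.\" and "\..\" folding the real code also performs:

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Join dir + file unless the file already carries a drive letter, turn
    // slashes into backslashes, then collapse doubled backslashes.
    std::string fullFilepath(const std::string &Dir, const std::string &File) {
      std::string Path = (File.find(':') == 1) ? File : Dir + "\\" + File;
      std::replace(Path.begin(), Path.end(), '/', '\\');
      size_t Cursor = 0;
      while ((Cursor = Path.find("\\\\", Cursor)) != std::string::npos)
        Path.erase(Cursor, 1);
      return Path;
    }

    int main() {
      std::cout << fullFilepath("c:/src//llvm", "lib\\File.cpp") << '\n';
      // -> c:\src\llvm\lib\File.cpp
    }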
assert(CurFn); if (!CurFn->Instrs.empty()) { const InstrInfoTy &LastInstr = InstrInfo[CurFn->Instrs.back()]; - if (LastInstr.Filename == Filename && LastInstr.LineNumber == DL.getLine()) + if (LastInstr.Filename == Filename && LastInstr.LineNumber == LineNumber && + LastInstr.ColumnNumber == ColumnNumber) return; } FileNameRegistry.add(Filename); @@ -97,7 +107,7 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, MCSymbol *MCL = Asm->MMI->getContext().createTempSymbol(); Asm->OutStreamer->EmitLabel(MCL); CurFn->Instrs.push_back(MCL); - InstrInfo[MCL] = InstrInfoTy(Filename, DL.getLine(), DL.getCol()); + InstrInfo[MCL] = InstrInfoTy(Filename, LineNumber, ColumnNumber); } WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP) @@ -253,7 +263,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { } FilenameSegmentLengths[LastSegmentEnd] = FI.Instrs.size() - LastSegmentEnd; - // Emit a line table subsection, requred to do PC-to-file:line lookup. + // Emit a line table subsection, required to do PC-to-file:line lookup. Asm->OutStreamer->AddComment("Line table subsection for " + Twine(FuncName)); Asm->EmitInt32(COFF::DEBUG_LINE_TABLE_SUBSECTION); MCSymbol *LineTableBegin = Asm->MMI->getContext().createTempSymbol(), @@ -283,8 +293,9 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { ColSegEnd = ColSegI + FilenameSegmentLengths[LastSegmentStart]; ColSegI != ColSegEnd; ++ColSegI) { unsigned ColumnNumber = InstrInfo[FI.Instrs[ColSegI]].ColumnNumber; + assert(ColumnNumber <= COFF::CVL_MaxColumnNumber); Asm->EmitInt16(ColumnNumber); // Start column - Asm->EmitInt16(ColumnNumber); // End column + Asm->EmitInt16(0); // End column } Asm->OutStreamer->EmitLabel(FileSegmentEnd); }; @@ -321,7 +332,10 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { // The first PC with the given linenumber and the linenumber itself. EmitLabelDiff(*Asm->OutStreamer, Fn, Instr); - Asm->EmitInt32(InstrInfo[Instr].LineNumber); + uint32_t LineNumber = InstrInfo[Instr].LineNumber; + assert(LineNumber <= COFF::CVL_MaxLineNumber); + uint32_t LineData = LineNumber | COFF::CVL_IsStatement; + Asm->EmitInt32(LineData); } FinishPreviousChunk(); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index 43d1a43..78068e0 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -98,7 +98,7 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { } } FileNameRegistry; - typedef std::map<std::pair<StringRef, StringRef>, char *> + typedef std::map<std::pair<StringRef, StringRef>, std::string> DirAndFilenameToFilepathMapTy; DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap; StringRef getFullFilepath(const MDNode *S); @@ -116,14 +116,6 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { public: WinCodeViewLineTables(AsmPrinter *Asm); - ~WinCodeViewLineTables() override { - for (DirAndFilenameToFilepathMapTy::iterator - I = DirAndFilenameToFilepathMap.begin(), - E = DirAndFilenameToFilepathMap.end(); - I != E; ++I) - free(I->second); - } - void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {} /// \brief Emit the COFF section that holds the line table information. 
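The clamping added in maybeRecordLocation and the packing in emitDebugInfoForFunction come down to this: lines above COFF::CVL_MaxLineNumber are skipped entirely, oversized columns are truncated to zero so the row survives, and the emitted 32-bit datum is the line number with the statement bit set. A sketch with the constants spelled out (the values mirror LLVM's COFF::CVL_* definitions at this revision, so treat them as illustrative):

    #include <cstdint>
    #include <optional>

    constexpr uint32_t MaxLineNumber = (1u << 24) - 1; // 24-bit line field
    constexpr uint32_t IsStatement = 1u << 31;         // statement flag
    constexpr uint32_t MaxColumnNumber = UINT16_MAX;   // 16-bit columns

    // Returns the 32-bit line datum, or nothing when the location cannot be
    // recorded at all.
    std::optional<uint32_t> packLine(uint32_t Line, uint32_t &Column) {
      if (Line > MaxLineNumber)
        return std::nullopt; // too large to encode: drop the location
      if (Column > MaxColumnNumber)
        Column = 0;          // truncate the column, keep the row
      return Line | IsStatement;
    }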
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index a2b9316..4da5b58 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCWin64EH.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" @@ -37,6 +38,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; WinException::WinException(AsmPrinter *A) : EHStreamer(A) { @@ -62,9 +64,9 @@ void WinException::beginFunction(const MachineFunction *MF) { // If any landing pads survive, we need an EH table. bool hasLandingPads = !MMI->getLandingPads().empty(); + bool hasEHFunclets = MMI->hasEHFunclets(); const Function *F = MF->getFunction(); - const Function *ParentF = MMI->getWinEHParent(F); shouldEmitMoves = Asm->needsSEHMoves(); @@ -78,49 +80,23 @@ void WinException::beginFunction(const MachineFunction *MF) { F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) && F->needsUnwindTableEntry(); - shouldEmitPersonality = forceEmitPersonality || (hasLandingPads && - PerEncoding != dwarf::DW_EH_PE_omit && Per); + shouldEmitPersonality = + forceEmitPersonality || ((hasLandingPads || hasEHFunclets) && + PerEncoding != dwarf::DW_EH_PE_omit && Per); unsigned LSDAEncoding = TLOF.getLSDAEncoding(); shouldEmitLSDA = shouldEmitPersonality && LSDAEncoding != dwarf::DW_EH_PE_omit; - // If we're not using CFI, we don't want the CFI or the personality. If - // WinEHPrepare outlined something, we should emit the LSDA. + // If we're not using CFI, we don't want the CFI or the personality, but we + // might want EH tables if we had EH pads. if (!Asm->MAI->usesWindowsCFI()) { - bool HasOutlinedChildren = - F->hasFnAttribute("wineh-parent") && F == ParentF; - shouldEmitLSDA = HasOutlinedChildren; + shouldEmitLSDA = hasEHFunclets; shouldEmitPersonality = false; return; } - // If this was an outlined handler, we need to define the label corresponding - // to the offset of the parent frame relative to the stack pointer after the - // prologue. - if (F != ParentF) { - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(ParentF); - auto I = FuncInfo.CatchHandlerParentFrameObjOffset.find(F); - if (I != FuncInfo.CatchHandlerParentFrameObjOffset.end()) { - MCSymbol *HandlerTypeParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(F->getName())); - - // Emit a symbol assignment. - Asm->OutStreamer->EmitAssignment( - HandlerTypeParentFrameOffset, - MCConstantExpr::create(I->second, Asm->OutContext)); - } - } - - if (shouldEmitMoves || shouldEmitPersonality) - Asm->OutStreamer->EmitWinCFIStartProc(Asm->CurrentFnSym); - - if (shouldEmitPersonality) { - const MCSymbol *PersHandlerSym = - TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI); - Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true); - } + beginFunclet(MF->front(), Asm->CurrentFnSym); } /// endFunction - Gather and emit post-function exception information. 
@@ -134,43 +110,158 @@ void WinException::endFunction(const MachineFunction *MF) {
 if (F->hasPersonalityFn())
 Per = classifyEHPersonality(F->getPersonalityFn());
- // Get rid of any dead landing pads if we're not using a Windows EH scheme. In
- // Windows EH schemes, the landing pad is not actually reachable. It only
- // exists so that we can emit the right table data.
- if (!isMSVCEHPersonality(Per))
+ // Get rid of any dead landing pads if we're not using funclets. In funclet
+ // schemes, the landing pad is not actually reachable. It only exists so
+ // that we can emit the right table data.
+ if (!isFuncletEHPersonality(Per))
 MMI->TidyLandingPads();
+ endFunclet();
+
+ // endFunclet will emit the necessary .xdata tables for x64 SEH.
+ if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets())
+ return;
+
 if (shouldEmitPersonality || shouldEmitLSDA) {
 Asm->OutStreamer->PushSection();
- if (shouldEmitMoves || shouldEmitPersonality) {
- // Emit an UNWIND_INFO struct describing the prologue.
- Asm->OutStreamer->EmitWinEHHandlerData();
- } else {
- // Just switch sections to the right xdata section. This use of
- // CurrentFnSym assumes that we only emit the LSDA when ending the parent
- // function.
- MCSection *XData = WinEH::UnwindEmitter::getXDataSection(
- Asm->CurrentFnSym, Asm->OutContext);
- Asm->OutStreamer->SwitchSection(XData);
- }
+ // Just switch sections to the right xdata section. This use of CurrentFnSym
+ // assumes that we only emit the LSDA when ending the parent function.
+ MCSection *XData = WinEH::UnwindEmitter::getXDataSection(Asm->CurrentFnSym,
+ Asm->OutContext);
+ Asm->OutStreamer->SwitchSection(XData);
 // Emit the tables appropriate to the personality function in use. If we
 // don't recognize the personality, assume it uses an Itanium-style LSDA.
 if (Per == EHPersonality::MSVC_Win64SEH)
- emitCSpecificHandlerTable();
+ emitCSpecificHandlerTable(MF);
 else if (Per == EHPersonality::MSVC_X86SEH)
 emitExceptHandlerTable(MF);
 else if (Per == EHPersonality::MSVC_CXX)
 emitCXXFrameHandler3Table(MF);
+ else if (Per == EHPersonality::CoreCLR)
+ emitCLRExceptionTable(MF);
 else
 emitExceptionTable();
 Asm->OutStreamer->PopSection();
 }
+}
+
+/// Retrieve the MCSymbol for a GlobalValue or MachineBasicBlock.
+static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
+ const MachineBasicBlock *MBB) {
+ if (!MBB)
+ return nullptr;
+ assert(MBB->isEHFuncletEntry());
+
+ // Give catches and cleanups a name based off of their parent function and
+ // their funclet entry block's number.
+ const MachineFunction *MF = MBB->getParent();
+ const Function *F = MF->getFunction();
+ StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ MCContext &Ctx = MF->getContext();
+ StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
+ return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
+ Twine(MBB->getNumber()) + "@?0?" +
+ FuncLinkageName + "@4HA");
+}
+
+void WinException::beginFunclet(const MachineBasicBlock &MBB,
+ MCSymbol *Sym) {
+ CurrentFuncletEntry = &MBB;
+
+ const Function *F = Asm->MF->getFunction();
+ // If a symbol was not provided for the funclet, invent one.
+ if (!Sym) {
+ Sym = getMCSymbolForMBB(Asm, &MBB);
+
+ // Describe our funclet symbol as a function with internal linkage. 
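getMCSymbolForMBB above assembles an MSVC-compatible mangled name for each funclet from the handler kind, the entry block number, and the parent's linkage name. The same construction with plain std::string; the parent name in the usage example is a hypothetical MSVC mangling for void f():

    #include <iostream>
    #include <string>

    // "?<catch|dtor>$<block#>@?0?<parent>@4HA", as built above with Twine.
    std::string funcletSymbol(bool IsCleanup, int BlockNumber,
                              const std::string &ParentLinkageName) {
      const std::string Prefix = IsCleanup ? "dtor" : "catch";
      return "?" + Prefix + "$" + std::to_string(BlockNumber) + "@?0?" +
             ParentLinkageName + "@4HA";
    }

    int main() {
      std::cout << funcletSymbol(false, 1, "?f@@YAXXZ") << '\n';
      // -> ?catch$1@?0??f@@YAXXZ@4HA
    }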
+ Asm->OutStreamer->BeginCOFFSymbolDef(Sym); + Asm->OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + Asm->OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + Asm->OutStreamer->EndCOFFSymbolDef(); + + // We want our funclet's entry point to be aligned such that no nops will be + // present after the label. + Asm->EmitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()), + F); + + // Now that we've emitted the alignment directive, point at our funclet. + Asm->OutStreamer->EmitLabel(Sym); + } + + // Mark 'Sym' as starting our funclet. if (shouldEmitMoves || shouldEmitPersonality) + Asm->OutStreamer->EmitWinCFIStartProc(Sym); + + if (shouldEmitPersonality) { + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + const Function *PerFn = nullptr; + + // Determine which personality routine we are using for this funclet. + if (F->hasPersonalityFn()) + PerFn = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); + const MCSymbol *PersHandlerSym = + TLOF.getCFIPersonalitySymbol(PerFn, *Asm->Mang, Asm->TM, MMI); + + // Classify the personality routine so that we may reason about it. + EHPersonality Per = EHPersonality::Unknown; + if (F->hasPersonalityFn()) + Per = classifyEHPersonality(F->getPersonalityFn()); + + // Do not emit a .seh_handler directive if it is a C++ cleanup funclet. + if (Per != EHPersonality::MSVC_CXX || + !CurrentFuncletEntry->isCleanupFuncletEntry()) + Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true); + } +} + +void WinException::endFunclet() { + // No funclet to process? Great, we have nothing to do. + if (!CurrentFuncletEntry) + return; + + if (shouldEmitMoves || shouldEmitPersonality) { + const Function *F = Asm->MF->getFunction(); + EHPersonality Per = EHPersonality::Unknown; + if (F->hasPersonalityFn()) + Per = classifyEHPersonality(F->getPersonalityFn()); + + // The .seh_handlerdata directive implicitly switches section, push the + // current section so that we may return to it. + Asm->OutStreamer->PushSection(); + + // Emit an UNWIND_INFO struct describing the prologue. + Asm->OutStreamer->EmitWinEHHandlerData(); + + if (Per == EHPersonality::MSVC_CXX && shouldEmitPersonality && + !CurrentFuncletEntry->isCleanupFuncletEntry()) { + // If this is a C++ catch funclet (or the parent function), + // emit a reference to the LSDA for the parent function. + StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( + Twine("$cppxdata$", FuncLinkageName)); + Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); + } else if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets() && + !CurrentFuncletEntry->isEHFuncletEntry()) { + // If this is the parent function in Win64 SEH, emit the LSDA immediately + // following .seh_handlerdata. + emitCSpecificHandlerTable(Asm->MF); + } + + // Switch back to the previous section now that we are done writing to + // .xdata. + Asm->OutStreamer->PopSection(); + + // Emit a .seh_endproc directive to mark the end of the function. Asm->OutStreamer->EmitWinCFIEndProc(); + } + + // Let's make sure we don't try to end the same funclet twice. 
+ CurrentFuncletEntry = nullptr; } const MCExpr *WinException::create32bitRef(const MCSymbol *Value) { @@ -188,6 +279,202 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) { return create32bitRef(Asm->getSymbol(GV)); } +const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) { + return MCBinaryExpr::createAdd(create32bitRef(Label), + MCConstantExpr::create(1, Asm->OutContext), + Asm->OutContext); +} + +const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom) { + return MCBinaryExpr::createSub( + MCSymbolRefExpr::create(OffsetOf, Asm->OutContext), + MCSymbolRefExpr::create(OffsetFrom, Asm->OutContext), Asm->OutContext); +} + +const MCExpr *WinException::getOffsetPlusOne(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom) { + return MCBinaryExpr::createAdd(getOffset(OffsetOf, OffsetFrom), + MCConstantExpr::create(1, Asm->OutContext), + Asm->OutContext); +} + +int WinException::getFrameIndexOffset(int FrameIndex, + const WinEHFuncInfo &FuncInfo) { + const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering(); + unsigned UnusedReg; + if (Asm->MAI->usesWindowsCFI()) + return TFI.getFrameIndexReferenceFromSP(*Asm->MF, FrameIndex, UnusedReg); + // For 32-bit, offsets should be relative to the end of the EH registration + // node. For 64-bit, it's relative to SP at the end of the prologue. + assert(FuncInfo.EHRegNodeEndOffset != INT_MAX); + int Offset = TFI.getFrameIndexReference(*Asm->MF, FrameIndex, UnusedReg); + Offset += FuncInfo.EHRegNodeEndOffset; + return Offset; +} + +namespace { + +/// Top-level state used to represent unwind to caller +const int NullState = -1; + +struct InvokeStateChange { + /// EH Label immediately after the last invoke in the previous state, or + /// nullptr if the previous state was the null state. + const MCSymbol *PreviousEndLabel; + + /// EH label immediately before the first invoke in the new state, or nullptr + /// if the new state is the null state. + const MCSymbol *NewStartLabel; + + /// State of the invoke following NewStartLabel, or NullState to indicate + /// the presence of calls which may unwind to caller. + int NewState; +}; + +/// Iterator that reports all the invoke state changes in a range of machine +/// basic blocks. Changes to the null state are reported whenever a call that +/// may unwind to caller is encountered. The MBB range is expected to be an +/// entire function or funclet, and the start and end of the range are treated +/// as being in the NullState even if there's not an unwind-to-caller call +/// before the first invoke or after the last one (i.e., the first state change +/// reported is the first change to something other than NullState, and a +/// change back to NullState is always reported at the end of iteration). 
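Before the iterator itself, it may help to see the stream it produces on a toy input. A minimal model of the coalescing described above, with labels elided and per-call EH states given directly (the input sequence is invented for illustration):

    #include <iostream>
    #include <vector>

    int main() {
      const int NullState = -1; // unwind to caller
      // EH state of each call in layout order; -1 marks calls that may
      // unwind to the caller.
      const std::vector<int> CallStates = {2, 2, NullState, 1, 1, 0};
      int Current = NullState;
      for (int S : CallStates) {
        if (S == Current)
          continue; // same state: the current range simply grows
        std::cout << "change to state " << S << '\n';
        Current = S;
      }
      // A change back to the null state is always reported at the end.
      if (Current != NullState)
        std::cout << "change to state " << NullState << '\n';
    }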
+class InvokeStateChangeIterator {
+ InvokeStateChangeIterator(const WinEHFuncInfo &EHInfo,
+ MachineFunction::const_iterator MFI,
+ MachineFunction::const_iterator MFE,
+ MachineBasicBlock::const_iterator MBBI,
+ int BaseState)
+ : EHInfo(EHInfo), MFI(MFI), MFE(MFE), MBBI(MBBI), BaseState(BaseState) {
+ LastStateChange.PreviousEndLabel = nullptr;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ scan();
+ }
+
+public:
+ static iterator_range<InvokeStateChangeIterator>
+ range(const WinEHFuncInfo &EHInfo, MachineFunction::const_iterator Begin,
+ MachineFunction::const_iterator End, int BaseState = NullState) {
+ // Reject empty ranges to simplify bookkeeping by ensuring that we can get
+ // the end of the last block.
+ assert(Begin != End);
+ auto BlockBegin = Begin->begin();
+ auto BlockEnd = std::prev(End)->end();
+ return make_range(
+ InvokeStateChangeIterator(EHInfo, Begin, End, BlockBegin, BaseState),
+ InvokeStateChangeIterator(EHInfo, End, End, BlockEnd, BaseState));
+ }
+
+ // Iterator methods.
+ bool operator==(const InvokeStateChangeIterator &O) const {
+ assert(BaseState == O.BaseState);
+ // Must be visiting same block.
+ if (MFI != O.MFI)
+ return false;
+ // Must be visiting same instr.
+ if (MBBI != O.MBBI)
+ return false;
+ // At end of block/instr iteration, we can still have two distinct states:
+ // one to report the final EndLabel, and another indicating the end of the
+ // state change iteration. Check for CurrentEndLabel equality to
+ // distinguish these.
+ return CurrentEndLabel == O.CurrentEndLabel;
+ }
+
+ bool operator!=(const InvokeStateChangeIterator &O) const {
+ return !operator==(O);
+ }
+ InvokeStateChange &operator*() { return LastStateChange; }
+ InvokeStateChange *operator->() { return &LastStateChange; }
+ InvokeStateChangeIterator &operator++() { return scan(); }
+
+private:
+ InvokeStateChangeIterator &scan();
+
+ const WinEHFuncInfo &EHInfo;
+ const MCSymbol *CurrentEndLabel = nullptr;
+ MachineFunction::const_iterator MFI;
+ MachineFunction::const_iterator MFE;
+ MachineBasicBlock::const_iterator MBBI;
+ InvokeStateChange LastStateChange;
+ bool VisitingInvoke = false;
+ int BaseState;
+};
+
+} // end anonymous namespace
+
+InvokeStateChangeIterator &InvokeStateChangeIterator::scan() {
+ bool IsNewBlock = false;
+ for (; MFI != MFE; ++MFI, IsNewBlock = true) {
+ if (IsNewBlock)
+ MBBI = MFI->begin();
+ for (auto MBBE = MFI->end(); MBBI != MBBE; ++MBBI) {
+ const MachineInstr &MI = *MBBI;
+ if (!VisitingInvoke && LastStateChange.NewState != BaseState &&
+ MI.isCall() && !EHStreamer::callToNoUnwindFunction(&MI)) {
+ // Indicate a change of state to the null state. We don't have
+ // start/end EH labels handy but the caller won't expect them for
+ // null state regions.
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ CurrentEndLabel = nullptr;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+
+ // All other state changes are at EH labels before/after invokes. 
+ if (!MI.isEHLabel())
+ continue;
+ MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+ if (Label == CurrentEndLabel) {
+ VisitingInvoke = false;
+ continue;
+ }
+ auto InvokeMapIter = EHInfo.LabelToStateMap.find(Label);
+ // Ignore EH labels that aren't the ones inserted before an invoke
+ if (InvokeMapIter == EHInfo.LabelToStateMap.end())
+ continue;
+ auto &StateAndEnd = InvokeMapIter->second;
+ int NewState = StateAndEnd.first;
+ // Keep track of the fact that we're between EH start/end labels so
+ // we know not to treat the invoke we'll see as unwinding to caller.
+ VisitingInvoke = true;
+ if (NewState == LastStateChange.NewState) {
+ // The state isn't actually changing here. Record the new end and
+ // keep going.
+ CurrentEndLabel = StateAndEnd.second;
+ continue;
+ }
+ // Found a state change to report
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = Label;
+ LastStateChange.NewState = NewState;
+ // Start keeping track of the new current end
+ CurrentEndLabel = StateAndEnd.second;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+ }
+ // Iteration hit the end of the block range.
+ if (LastStateChange.NewState != BaseState) {
+ // Report the end of the last new state
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ // Leave CurrentEndLabel non-null to distinguish this state from end.
+ assert(CurrentEndLabel != nullptr);
+ return *this;
+ }
+ // We've reported all state changes and hit the end state.
+ CurrentEndLabel = nullptr;
+ return *this;
+}
+
 /// Emit the language-specific data that __C_specific_handler expects. This
 /// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
 /// up after faults with __try, __except, and __finally. The typeinfo values
@@ -216,135 +503,156 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
 /// imagerel32 LabelLPad; // Zero means __finally.
 /// } Entries[NumEntries];
 /// };
-void WinException::emitCSpecificHandlerTable() {
- const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
-
- // Simplifying assumptions for first implementation:
- // - Cleanups are not implemented.
- // - Filters are not implemented.
-
- // The Itanium LSDA table sorts similar landing pads together to simplify the
- // actions table, but we don't need that.
- SmallVector<const LandingPadInfo *, 64> LandingPads;
- LandingPads.reserve(PadInfos.size());
- for (const auto &LP : PadInfos)
- LandingPads.push_back(&LP);
-
- // Compute label ranges for call sites as we would for the Itanium LSDA, but
- // use an all zero action table because we aren't using these actions.
- SmallVector<unsigned, 64> FirstActions;
- FirstActions.resize(LandingPads.size());
- SmallVector<CallSiteEntry, 64> CallSites;
- computeCallSiteTable(CallSites, LandingPads, FirstActions);
-
- MCSymbol *EHFuncBeginSym = Asm->getFunctionBegin();
- MCSymbol *EHFuncEndSym = Asm->getFunctionEnd();
-
- // Emit the number of table entries.
- unsigned NumEntries = 0;
- for (const CallSiteEntry &CSE : CallSites) {
- if (!CSE.LPad)
- continue; // Ignore gaps. 
- NumEntries += CSE.LPad->SEHHandlers.size(); +void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { + auto &OS = *Asm->OutStreamer; + MCContext &Ctx = Asm->OutContext; + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + // Emit a label assignment with the SEH frame offset so we can use it for + // llvm.x86.seh.recoverfp. + StringRef FLinkageName = + GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + MCSymbol *ParentFrameOffset = + Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); + const MCExpr *MCOffset = + MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx); + Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset); + + // Use the assembler to compute the number of table entries through label + // difference and division. + MCSymbol *TableBegin = + Ctx.createTempSymbol("lsda_begin", /*AlwaysAddSuffix=*/true); + MCSymbol *TableEnd = + Ctx.createTempSymbol("lsda_end", /*AlwaysAddSuffix=*/true); + const MCExpr *LabelDiff = getOffset(TableEnd, TableBegin); + const MCExpr *EntrySize = MCConstantExpr::create(16, Ctx); + const MCExpr *EntryCount = MCBinaryExpr::createDiv(LabelDiff, EntrySize, Ctx); + AddComment("Number of call sites"); + OS.EmitValue(EntryCount, 4); + + OS.EmitLabel(TableBegin); + + // Iterate over all the invoke try ranges. Unlike MSVC, LLVM currently only + // models exceptions from invokes. LLVM also allows arbitrary reordering of + // the code, so our tables end up looking a bit different. Rather than + // trying to match MSVC's tables exactly, we emit a denormalized table. For + // each range of invokes in the same state, we emit table entries for all + // the actions that would be taken in that state. This means our tables are + // slightly bigger, which is OK. + const MCSymbol *LastStartLabel = nullptr; + int LastEHState = -1; + // Break out before we enter into a finally funclet. + // FIXME: We need to emit separate EH tables for cleanups. + MachineFunction::const_iterator End = MF->end(); + MachineFunction::const_iterator Stop = std::next(MF->begin()); + while (Stop != End && !Stop->isEHFuncletEntry()) + ++Stop; + for (const auto &StateChange : + InvokeStateChangeIterator::range(FuncInfo, MF->begin(), Stop)) { + // Emit all the actions for the state we just transitioned out of + // if it was not the null state + if (LastEHState != -1) + emitSEHActionsForRange(FuncInfo, LastStartLabel, + StateChange.PreviousEndLabel, LastEHState); + LastStartLabel = StateChange.NewStartLabel; + LastEHState = StateChange.NewState; } - Asm->OutStreamer->EmitIntValue(NumEntries, 4); - // If there are no actions, we don't need to iterate again. - if (NumEntries == 0) - return; + OS.EmitLabel(TableEnd); +} - // Emit the four-label records for each call site entry. The table has to be - // sorted in layout order, and the call sites should already be sorted. - for (const CallSiteEntry &CSE : CallSites) { - // Ignore gaps. Unlike the Itanium model, unwinding through a frame without - // an EH table entry will propagate the exception rather than terminating - // the program. - if (!CSE.LPad) - continue; - const LandingPadInfo *LPad = CSE.LPad; - - // Compute the label range. We may reuse the function begin and end labels - // rather than forming new ones. - const MCExpr *Begin = - create32bitRef(CSE.BeginLabel ? 
CSE.BeginLabel : EHFuncBeginSym); - const MCExpr *End; - if (CSE.EndLabel) { - // The interval is half-open, so we have to add one to include the return - // address of the last invoke in the range. - End = MCBinaryExpr::createAdd(create32bitRef(CSE.EndLabel), - MCConstantExpr::create(1, Asm->OutContext), - Asm->OutContext); +void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, + const MCSymbol *BeginLabel, + const MCSymbol *EndLabel, int State) { + auto &OS = *Asm->OutStreamer; + MCContext &Ctx = Asm->OutContext; + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + assert(BeginLabel && EndLabel); + while (State != -1) { + const SEHUnwindMapEntry &UME = FuncInfo.SEHUnwindMap[State]; + const MCExpr *FilterOrFinally; + const MCExpr *ExceptOrNull; + auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + if (UME.IsFinally) { + FilterOrFinally = create32bitRef(getMCSymbolForMBB(Asm, Handler)); + ExceptOrNull = MCConstantExpr::create(0, Ctx); } else { - End = create32bitRef(EHFuncEndSym); + // For an except, the filter can be 1 (catch-all) or a function + // label. + FilterOrFinally = UME.Filter ? create32bitRef(UME.Filter) + : MCConstantExpr::create(1, Ctx); + ExceptOrNull = create32bitRef(Handler->getSymbol()); } - // Emit an entry for each action. - for (SEHHandler Handler : LPad->SEHHandlers) { - Asm->OutStreamer->EmitValue(Begin, 4); - Asm->OutStreamer->EmitValue(End, 4); - - // Emit the filter or finally function pointer, if present. Otherwise, - // emit '1' to indicate a catch-all. - const Function *F = Handler.FilterOrFinally; - if (F) - Asm->OutStreamer->EmitValue(create32bitRef(Asm->getSymbol(F)), 4); - else - Asm->OutStreamer->EmitIntValue(1, 4); - - // Emit the recovery address, if present. Otherwise, this must be a - // finally. - const BlockAddress *BA = Handler.RecoverBA; - if (BA) - Asm->OutStreamer->EmitValue( - create32bitRef(Asm->GetBlockAddressSymbol(BA)), 4); - else - Asm->OutStreamer->EmitIntValue(0, 4); - } + AddComment("LabelStart"); + OS.EmitValue(getLabelPlusOne(BeginLabel), 4); + AddComment("LabelEnd"); + OS.EmitValue(getLabelPlusOne(EndLabel), 4); + AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction" + : "CatchAll"); + OS.EmitValue(FilterOrFinally, 4); + AddComment(UME.IsFinally ? "Null" : "ExceptionHandler"); + OS.EmitValue(ExceptOrNull, 4); + + assert(UME.ToState < State && "states should decrease"); + State = UME.ToState; } } void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { const Function *F = MF->getFunction(); - const Function *ParentF = MMI->getWinEHParent(F); auto &OS = *Asm->OutStreamer; - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(ParentF); + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef ParentLinkageName = - GlobalValue::getRealLinkageName(ParentF->getName()); + StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; if (shouldEmitPersonality) { - FuncInfoXData = Asm->OutContext.getOrCreateSymbol( - Twine("$cppxdata$", ParentLinkageName)); - OS.EmitValue(create32bitRef(FuncInfoXData), 4); - - extendIP2StateTable(MF, ParentF, FuncInfo); - - // Defer emission until we've visited the parent function and all the catch - // handlers. Cleanups don't contribute to the ip2state table, so don't count - // them. 
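The replacement emitSEHActionsForRange above walks the SEHUnwindMap chain from the innermost state outward until it reaches -1, emitting one table entry per enclosing level. The traversal in isolation, over an invented toy unwind map:

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Stand-in for SEHUnwindMapEntry with only the fields the walk needs.
    struct UnwindEntry { int ToState; bool IsFinally; };

    void emitActions(const std::vector<UnwindEntry> &Map, int State) {
      while (State != -1) {
        const UnwindEntry &UME = Map[State];
        std::cout << (UME.IsFinally ? "finally" : "except")
                  << " action for state " << State << '\n';
        assert(UME.ToState < State && "states should decrease");
        State = UME.ToState;
      }
    }

    int main() {
      // State 2 nests inside state 1, which nests inside state 0.
      emitActions({{-1, false}, {0, true}, {1, false}}, 2);
    }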
- if (ParentF != F && !FuncInfo.CatchHandlerMaxState.count(F)) - return; - ++FuncInfo.NumIPToStateFuncsVisited; - if (FuncInfo.NumIPToStateFuncsVisited != FuncInfo.CatchHandlerMaxState.size()) - return; + // If we're 64-bit, emit a pointer to the C++ EH data, and build a map from + // IPs to state numbers. + FuncInfoXData = + Asm->OutContext.getOrCreateSymbol(Twine("$cppxdata$", FuncLinkageName)); + computeIP2StateTable(MF, FuncInfo, IPToStateTable); } else { - FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(ParentLinkageName); - emitEHRegistrationOffsetLabel(FuncInfo, ParentLinkageName); + FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(FuncLinkageName); } + int UnwindHelpOffset = 0; + if (Asm->MAI->usesWindowsCFI()) + UnwindHelpOffset = + getFrameIndexOffset(FuncInfo.UnwindHelpFrameIdx, FuncInfo); + MCSymbol *UnwindMapXData = nullptr; MCSymbol *TryBlockMapXData = nullptr; MCSymbol *IPToStateXData = nullptr; - if (!FuncInfo.UnwindMap.empty()) + if (!FuncInfo.CxxUnwindMap.empty()) UnwindMapXData = Asm->OutContext.getOrCreateSymbol( - Twine("$stateUnwindMap$", ParentLinkageName)); + Twine("$stateUnwindMap$", FuncLinkageName)); if (!FuncInfo.TryBlockMap.empty()) - TryBlockMapXData = Asm->OutContext.getOrCreateSymbol( - Twine("$tryMap$", ParentLinkageName)); - if (!FuncInfo.IPToStateList.empty()) - IPToStateXData = Asm->OutContext.getOrCreateSymbol( - Twine("$ip2state$", ParentLinkageName)); + TryBlockMapXData = + Asm->OutContext.getOrCreateSymbol(Twine("$tryMap$", FuncLinkageName)); + if (!IPToStateTable.empty()) + IPToStateXData = + Asm->OutContext.getOrCreateSymbol(Twine("$ip2state$", FuncLinkageName)); + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; // FuncInfo { // uint32_t MagicNumber @@ -363,17 +671,38 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // EHFlags & 4 -> The function is noexcept(true), unwinding can't continue. 
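For reference, the record described by the comment block above, rendered as a plain struct with fields in emission order. UnwindHelp is only emitted when Windows CFI is in use (x64), so the x86 record omits that slot:

    #include <cstdint>

    struct FuncInfo {
      uint32_t MagicNumber;  // 0x19930522
      uint32_t MaxState;     // CxxUnwindMap.size()
      uint32_t UnwindMap;    // imagerel32
      uint32_t NumTryBlocks;
      uint32_t TryBlockMap;  // imagerel32
      uint32_t IPMapEntries;
      uint32_t IPToStateMap; // imagerel32
      int32_t  UnwindHelp;   // frame offset; present on x64 only
      uint32_t ESTypeList;   // 0 here
      uint32_t EHFlags;      // 1 here
    };
    static_assert(sizeof(FuncInfo) == 40, "ten packed 32-bit fields");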
OS.EmitValueToAlignment(4); OS.EmitLabel(FuncInfoXData); - OS.EmitIntValue(0x19930522, 4); // MagicNumber - OS.EmitIntValue(FuncInfo.UnwindMap.size(), 4); // MaxState - OS.EmitValue(create32bitRef(UnwindMapXData), 4); // UnwindMap - OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4); // NumTryBlocks - OS.EmitValue(create32bitRef(TryBlockMapXData), 4); // TryBlockMap - OS.EmitIntValue(FuncInfo.IPToStateList.size(), 4); // IPMapEntries - OS.EmitValue(create32bitRef(IPToStateXData), 4); // IPToStateMap - if (Asm->MAI->usesWindowsCFI()) - OS.EmitIntValue(FuncInfo.UnwindHelpFrameOffset, 4); // UnwindHelp - OS.EmitIntValue(0, 4); // ESTypeList - OS.EmitIntValue(1, 4); // EHFlags + + AddComment("MagicNumber"); + OS.EmitIntValue(0x19930522, 4); + + AddComment("MaxState"); + OS.EmitIntValue(FuncInfo.CxxUnwindMap.size(), 4); + + AddComment("UnwindMap"); + OS.EmitValue(create32bitRef(UnwindMapXData), 4); + + AddComment("NumTryBlocks"); + OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4); + + AddComment("TryBlockMap"); + OS.EmitValue(create32bitRef(TryBlockMapXData), 4); + + AddComment("IPMapEntries"); + OS.EmitIntValue(IPToStateTable.size(), 4); + + AddComment("IPToStateXData"); + OS.EmitValue(create32bitRef(IPToStateXData), 4); + + if (Asm->MAI->usesWindowsCFI()) { + AddComment("UnwindHelp"); + OS.EmitIntValue(UnwindHelpOffset, 4); + } + + AddComment("ESTypeList"); + OS.EmitIntValue(0, 4); + + AddComment("EHFlags"); + OS.EmitIntValue(1, 4); // UnwindMapEntry { // int32_t ToState; @@ -381,9 +710,14 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // }; if (UnwindMapXData) { OS.EmitLabel(UnwindMapXData); - for (const WinEHUnwindMapEntry &UME : FuncInfo.UnwindMap) { - OS.EmitIntValue(UME.ToState, 4); // ToState - OS.EmitValue(create32bitRef(UME.Cleanup), 4); // Action + for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) { + MCSymbol *CleanupSym = + getMCSymbolForMBB(Asm, UME.Cleanup.dyn_cast<MachineBasicBlock *>()); + AddComment("ToState"); + OS.EmitIntValue(UME.ToState, 4); + + AddComment("Action"); + OS.EmitValue(create32bitRef(CleanupSym), 4); } } @@ -398,33 +732,49 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { OS.EmitLabel(TryBlockMapXData); SmallVector<MCSymbol *, 1> HandlerMaps; for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) { - WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; - MCSymbol *HandlerMapXData = nullptr; + const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; + MCSymbol *HandlerMapXData = nullptr; if (!TBME.HandlerArray.empty()) HandlerMapXData = Asm->OutContext.getOrCreateSymbol(Twine("$handlerMap$") .concat(Twine(I)) .concat("$") - .concat(ParentLinkageName)); - + .concat(FuncLinkageName)); HandlerMaps.push_back(HandlerMapXData); - int CatchHigh = -1; - for (WinEHHandlerType &HT : TBME.HandlerArray) - CatchHigh = - std::max(CatchHigh, FuncInfo.CatchHandlerMaxState[HT.Handler]); - - assert(TBME.TryLow <= TBME.TryHigh); - OS.EmitIntValue(TBME.TryLow, 4); // TryLow - OS.EmitIntValue(TBME.TryHigh, 4); // TryHigh - OS.EmitIntValue(CatchHigh, 4); // CatchHigh - OS.EmitIntValue(TBME.HandlerArray.size(), 4); // NumCatches - OS.EmitValue(create32bitRef(HandlerMapXData), 4); // HandlerArray + // TBMEs should form intervals. 
+ assert(0 <= TBME.TryLow && "bad trymap interval"); + assert(TBME.TryLow <= TBME.TryHigh && "bad trymap interval"); + assert(TBME.TryHigh < TBME.CatchHigh && "bad trymap interval"); + assert(TBME.CatchHigh < int(FuncInfo.CxxUnwindMap.size()) && + "bad trymap interval"); + + AddComment("TryLow"); + OS.EmitIntValue(TBME.TryLow, 4); + + AddComment("TryHigh"); + OS.EmitIntValue(TBME.TryHigh, 4); + + AddComment("CatchHigh"); + OS.EmitIntValue(TBME.CatchHigh, 4); + + AddComment("NumCatches"); + OS.EmitIntValue(TBME.HandlerArray.size(), 4); + + AddComment("HandlerArray"); + OS.EmitValue(create32bitRef(HandlerMapXData), 4); + } + + // All funclets use the same parent frame offset currently. + unsigned ParentFrameOffset = 0; + if (shouldEmitPersonality) { + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + ParentFrameOffset = TFI->getWinEHParentFrameOffset(*MF); } for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) { - WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; + const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; MCSymbol *HandlerMapXData = HandlerMaps[I]; if (!HandlerMapXData) continue; @@ -438,32 +788,34 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { OS.EmitLabel(HandlerMapXData); for (const WinEHHandlerType &HT : TBME.HandlerArray) { // Get the frame escape label with the offset of the catch object. If - // the index is -1, then there is no catch object, and we should emit an - // offset of zero, indicating that no copy will occur. + // the index is INT_MAX, then there is no catch object, and we should + // emit an offset of zero, indicating that no copy will occur. const MCExpr *FrameAllocOffsetRef = nullptr; - if (HT.CatchObjRecoverIdx >= 0) { - MCSymbol *FrameAllocOffset = - Asm->OutContext.getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(ParentF->getName()), - HT.CatchObjRecoverIdx); - FrameAllocOffsetRef = MCSymbolRefExpr::create( - FrameAllocOffset, MCSymbolRefExpr::VK_None, Asm->OutContext); + if (HT.CatchObj.FrameIndex != INT_MAX) { + int Offset = getFrameIndexOffset(HT.CatchObj.FrameIndex, FuncInfo); + FrameAllocOffsetRef = MCConstantExpr::create(Offset, Asm->OutContext); } else { FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext); } - OS.EmitIntValue(HT.Adjectives, 4); // Adjectives - OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4); // Type - OS.EmitValue(FrameAllocOffsetRef, 4); // CatchObjOffset - OS.EmitValue(create32bitRef(HT.Handler), 4); // Handler + MCSymbol *HandlerSym = + getMCSymbolForMBB(Asm, HT.Handler.dyn_cast<MachineBasicBlock *>()); + + AddComment("Adjectives"); + OS.EmitIntValue(HT.Adjectives, 4); + + AddComment("Type"); + OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4); + + AddComment("CatchObjOffset"); + OS.EmitValue(FrameAllocOffsetRef, 4); + + AddComment("Handler"); + OS.EmitValue(create32bitRef(HandlerSym), 4); if (shouldEmitPersonality) { - MCSymbol *ParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(HT.Handler->getName())); - const MCSymbolRefExpr *ParentFrameOffsetRef = MCSymbolRefExpr::create( - ParentFrameOffset, Asm->OutContext); - OS.EmitValue(ParentFrameOffsetRef, 4); // ParentFrameOffset + AddComment("ParentFrameOffset"); + OS.EmitIntValue(ParentFrameOffset, 4); } } } @@ -475,87 +827,65 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // }; if (IPToStateXData) { OS.EmitLabel(IPToStateXData); - for (auto &IPStatePair : FuncInfo.IPToStateList) { - 
OS.EmitValue(create32bitRef(IPStatePair.first), 4); // IP - OS.EmitIntValue(IPStatePair.second, 4); // State + for (auto &IPStatePair : IPToStateTable) { + AddComment("IP"); + OS.EmitValue(IPStatePair.first, 4); + AddComment("ToState"); + OS.EmitIntValue(IPStatePair.second, 4); } } } -void WinException::extendIP2StateTable(const MachineFunction *MF, - const Function *ParentF, - WinEHFuncInfo &FuncInfo) { - const Function *F = MF->getFunction(); - - // The Itanium LSDA table sorts similar landing pads together to simplify the - // actions table, but we don't need that. - SmallVector<const LandingPadInfo *, 64> LandingPads; - const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); - LandingPads.reserve(PadInfos.size()); - for (const auto &LP : PadInfos) - LandingPads.push_back(&LP); - - RangeMapType PadMap; - computePadMap(LandingPads, PadMap); - - // The end label of the previous invoke or nounwind try-range. - MCSymbol *LastLabel = Asm->getFunctionBegin(); - - // Whether there is a potentially throwing instruction (currently this means - // an ordinary call) between the end of the previous try-range and now. - bool SawPotentiallyThrowing = false; - - int LastEHState = -2; - - // The parent function and the catch handlers contribute to the 'ip2state' - // table. - - // Include ip2state entries for the beginning of the main function and - // for catch handler functions. - if (F == ParentF) { - FuncInfo.IPToStateList.push_back(std::make_pair(LastLabel, -1)); - LastEHState = -1; - } else if (FuncInfo.HandlerBaseState.count(F)) { - FuncInfo.IPToStateList.push_back( - std::make_pair(LastLabel, FuncInfo.HandlerBaseState[F])); - LastEHState = FuncInfo.HandlerBaseState[F]; - } - for (const auto &MBB : *MF) { - for (const auto &MI : MBB) { - if (!MI.isEHLabel()) { - if (MI.isCall()) - SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI); - continue; +void WinException::computeIP2StateTable( + const MachineFunction *MF, const WinEHFuncInfo &FuncInfo, + SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable) { + + for (MachineFunction::const_iterator FuncletStart = MF->begin(), + FuncletEnd = MF->begin(), + End = MF->end(); + FuncletStart != End; FuncletStart = FuncletEnd) { + // Find the end of the funclet + while (++FuncletEnd != End) { + if (FuncletEnd->isEHFuncletEntry()) { + break; } + } - // End of the previous try-range? - MCSymbol *BeginLabel = MI.getOperand(0).getMCSymbol(); - if (BeginLabel == LastLabel) - SawPotentiallyThrowing = false; - - // Beginning of a new try-range? - RangeMapType::const_iterator L = PadMap.find(BeginLabel); - if (L == PadMap.end()) - // Nope, it was just some random label. - continue; - - const PadRange &P = L->second; - const LandingPadInfo *LandingPad = LandingPads[P.PadIndex]; - assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] && - "Inconsistent landing pad map!"); - - // FIXME: Should this be using FuncInfo.HandlerBaseState? - if (SawPotentiallyThrowing && LastEHState != -1) { - FuncInfo.IPToStateList.push_back(std::make_pair(LastLabel, -1)); - SawPotentiallyThrowing = false; - LastEHState = -1; - } + // Don't emit ip2state entries for cleanup funclets. Any interesting + // exceptional actions in cleanups must be handled in a separate IR + // function. 
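computeIP2StateTable here, like the CLR table writer further down, walks the function as a sequence of [FuncletStart, FuncletEnd) ranges delimited by blocks marked isEHFuncletEntry. The range discovery on its own, with blocks reduced to a flag vector (block 0 is the parent function's entry):

    #include <iostream>
    #include <vector>

    int main() {
      const std::vector<bool> IsFuncletEntry = {false, false, true, false, true};
      const size_t End = IsFuncletEntry.size();
      size_t Start = 0;
      while (Start != End) {
        size_t Next = Start + 1; // a range always contains its entry block
        while (Next != End && !IsFuncletEntry[Next])
          ++Next;
        std::cout << "funclet blocks [" << Start << ", " << Next << ")\n";
        Start = Next;
      }
    }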
+ if (FuncletStart->isCleanupFuncletEntry()) + continue; - if (LandingPad->WinEHState != LastEHState) - FuncInfo.IPToStateList.push_back( - std::make_pair(BeginLabel, LandingPad->WinEHState)); - LastEHState = LandingPad->WinEHState; - LastLabel = LandingPad->EndLabels[P.RangeIndex]; + MCSymbol *StartLabel; + int BaseState; + if (FuncletStart == MF->begin()) { + BaseState = NullState; + StartLabel = Asm->getFunctionBegin(); + } else { + auto *FuncletPad = + cast<FuncletPadInst>(FuncletStart->getBasicBlock()->getFirstNonPHI()); + assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0); + BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second; + StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart); + } + assert(StartLabel && "need local function start label"); + IPToStateTable.push_back( + std::make_pair(create32bitRef(StartLabel), BaseState)); + + for (const auto &StateChange : InvokeStateChangeIterator::range( + FuncInfo, FuncletStart, FuncletEnd, BaseState)) { + // Compute the label to report as the start of this entry; use the EH + // start label for the invoke if we have one, otherwise (this is a call + // which may unwind to our caller and does not have an EH start label, so) + // use the previous end label. + const MCSymbol *ChangeLabel = StateChange.NewStartLabel; + if (!ChangeLabel) + ChangeLabel = StateChange.PreviousEndLabel; + // Emit an entry indicating that PCs after 'Label' have this EH state. + IPToStateTable.push_back( + std::make_pair(getLabelPlusOne(ChangeLabel), StateChange.NewState)); + // FIXME: assert that NewState is between CatchLow and CatchHigh. } } } @@ -566,15 +896,15 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, // registration in order to recover the parent frame pointer. Now that we know // we've code generated the parent, we can emit the label assignment that // those helpers use to get the offset of the registration node. - assert(FuncInfo.EHRegNodeEscapeIndex != INT_MAX && - "no EH reg node localescape index"); + MCContext &Ctx = Asm->OutContext; MCSymbol *ParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol(FLinkageName); - MCSymbol *RegistrationOffsetSym = Asm->OutContext.getOrCreateFrameAllocSymbol( - FLinkageName, FuncInfo.EHRegNodeEscapeIndex); - const MCExpr *RegistrationOffsetSymRef = - MCSymbolRefExpr::create(RegistrationOffsetSym, Asm->OutContext); - Asm->OutStreamer->EmitAssignment(ParentFrameOffset, RegistrationOffsetSymRef); + Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); + unsigned UnusedReg; + const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); + int64_t Offset = TFI->getFrameIndexReference( + *Asm->MF, FuncInfo.EHRegNodeFrameIndex, UnusedReg); + const MCExpr *MCOffset = MCConstantExpr::create(Offset, Ctx); + Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset); } /// Emit the language-specific data that _except_handler3 and 4 expect. This is @@ -585,7 +915,13 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { const Function *F = MF->getFunction(); StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(F); + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); // Emit the __ehtable label that we use for llvm.x86.seh.lsda. 
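The scope table emitted for _except_handler3/4 in the next hunk writes one ToState/filter/handler triple per state in SEHUnwindMap, remapping the -1 "unwind to caller" sentinel to the shifted base state when the GS cookie header is present. A toy rendering (the entries themselves are invented for illustration):

    #include <iostream>
    #include <vector>

    struct SEHEntry { int ToState; bool IsFinally; const char *FilterOrHandler; };

    int main() {
      const int BaseState = -2; // -1 when no GS cookie header is emitted
      const std::vector<SEHEntry> UnwindMap = {
          {-1, false, "_filter$0"}, {0, true, "_finally$1"}};
      for (const SEHEntry &UME : UnwindMap) {
        int ToState = UME.ToState == -1 ? BaseState : UME.ToState;
        std::cout << "ToState " << ToState << ", "
                  << (UME.IsFinally ? "finally " : "filter ")
                  << UME.FilterOrHandler << '\n';
      }
    }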
@@ -611,58 +947,291 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { // // Only the EHCookieOffset field appears to vary, and it appears to be the // offset from the final saved SP value to the retaddr. + AddComment("GSCookieOffset"); OS.EmitIntValue(-2, 4); + AddComment("GSCookieXOROffset"); OS.EmitIntValue(0, 4); // FIXME: Calculate. + AddComment("EHCookieOffset"); OS.EmitIntValue(9999, 4); + AddComment("EHCookieXOROffset"); OS.EmitIntValue(0, 4); BaseState = -2; } - // Build a list of pointers to LandingPadInfos and then sort by WinEHState. - const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); - SmallVector<const LandingPadInfo *, 4> LPads; - LPads.reserve((PadInfos.size())); - for (const LandingPadInfo &LPInfo : PadInfos) - LPads.push_back(&LPInfo); - std::sort(LPads.begin(), LPads.end(), - [](const LandingPadInfo *L, const LandingPadInfo *R) { - return L->WinEHState < R->WinEHState; - }); - - // For each action in each lpad, emit one of these: - // struct ScopeTableEntry { - // int32_t EnclosingLevel; - // int32_t (__cdecl *Filter)(); - // void *HandlerOrFinally; - // }; - // - // The "outermost" action will use BaseState as its enclosing level. Each - // other action will refer to the previous state as its enclosing level. - int CurState = 0; - for (const LandingPadInfo *LPInfo : LPads) { - int EnclosingLevel = BaseState; - assert(CurState + int(LPInfo->SEHHandlers.size()) - 1 == - LPInfo->WinEHState && - "gaps in the SEH scope table"); - for (auto I = LPInfo->SEHHandlers.rbegin(), E = LPInfo->SEHHandlers.rend(); - I != E; ++I) { - const SEHHandler &Handler = *I; - const BlockAddress *BA = Handler.RecoverBA; - const Function *F = Handler.FilterOrFinally; - assert(F && "cannot catch all in 32-bit SEH without filter function"); - const MCExpr *FilterOrNull = - create32bitRef(BA ? Asm->getSymbol(F) : nullptr); - const MCExpr *ExceptOrFinally = create32bitRef( - BA ? Asm->GetBlockAddressSymbol(BA) : Asm->getSymbol(F)); - - OS.EmitIntValue(EnclosingLevel, 4); - OS.EmitValue(FilterOrNull, 4); - OS.EmitValue(ExceptOrFinally, 4); - - // The next state unwinds to this state. - EnclosingLevel = CurState; - CurState++; + assert(!FuncInfo.SEHUnwindMap.empty()); + for (const SEHUnwindMapEntry &UME : FuncInfo.SEHUnwindMap) { + auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + const MCSymbol *ExceptOrFinally = + UME.IsFinally ? getMCSymbolForMBB(Asm, Handler) : Handler->getSymbol(); + // -1 is usually the base state for "unwind to caller", but for + // _except_handler4 it's -2. Do that replacement here if necessary. + int ToState = UME.ToState == -1 ? BaseState : UME.ToState; + AddComment("ToState"); + OS.EmitIntValue(ToState, 4); + AddComment(UME.IsFinally ? "Null" : "FilterFunction"); + OS.EmitValue(create32bitRef(UME.Filter), 4); + AddComment(UME.IsFinally ? 
"FinallyFunclet" : "ExceptionHandler"); + OS.EmitValue(create32bitRef(ExceptOrFinally), 4); + } +} + +static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) { + int Rank = 0; + while (State != -1) { + ++Rank; + State = FuncInfo.ClrEHUnwindMap[State].TryParentState; + } + return Rank; +} + +static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) { + int LeftRank = getTryRank(FuncInfo, Left); + int RightRank = getTryRank(FuncInfo, Right); + + while (LeftRank < RightRank) { + Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState; + --RightRank; + } + + while (RightRank < LeftRank) { + Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState; + --LeftRank; + } + + while (Left != Right) { + Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState; + Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState; + } + + return Left; +} + +void WinException::emitCLRExceptionTable(const MachineFunction *MF) { + // CLR EH "states" are really just IDs that identify handlers/funclets; + // states, handlers, and funclets all have 1:1 mappings between them, and a + // handler/funclet's "state" is its index in the ClrEHUnwindMap. + MCStreamer &OS = *Asm->OutStreamer; + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); + MCSymbol *FuncBeginSym = Asm->getFunctionBegin(); + MCSymbol *FuncEndSym = Asm->getFunctionEnd(); + + // A ClrClause describes a protected region. + struct ClrClause { + const MCSymbol *StartLabel; // Start of protected region + const MCSymbol *EndLabel; // End of protected region + int State; // Index of handler protecting the protected region + int EnclosingState; // Index of funclet enclosing the protected region + }; + SmallVector<ClrClause, 8> Clauses; + + // Build a map from handler MBBs to their corresponding states (i.e. their + // indices in the ClrEHUnwindMap). + int NumStates = FuncInfo.ClrEHUnwindMap.size(); + assert(NumStates > 0 && "Don't need exception table!"); + DenseMap<const MachineBasicBlock *, int> HandlerStates; + for (int State = 0; State < NumStates; ++State) { + MachineBasicBlock *HandlerBlock = + FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>(); + HandlerStates[HandlerBlock] = State; + // Use this loop through all handlers to verify our assumption (used in + // the MinEnclosingState computation) that enclosing funclets have lower + // state numbers than their enclosed funclets. + assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State && + "ill-formed state numbering"); + } + // Map the main function to the NullState. + HandlerStates[&MF->front()] = NullState; + + // Write out a sentinel indicating the end of the standard (Windows) xdata + // and the start of the additional (CLR) info. + OS.EmitIntValue(0xffffffff, 4); + // Write out the number of funclets + OS.EmitIntValue(NumStates, 4); + + // Walk the machine blocks/instrs, computing and emitting a few things: + // 1. Emit a list of the offsets to each handler entry, in lexical order. + // 2. Compute a map (EndSymbolMap) from each funclet to the symbol at its end. + // 3. Compute the list of ClrClauses, in the required order (inner before + // outer, earlier before later; the order by which a forward scan with + // early termination will find the innermost enclosing clause covering + // a given address). + // 4. A map (MinClauseMap) from each handler index to the index of the + // outermost funclet/function which contains a try clause targeting the + // key handler. This will be used to determine IsDuplicate-ness when + // emitting ClrClauses. 
The NullState value is used to indicate that the + // top-level function contains a try clause targeting the key handler. + // HandlerStack is a stack of (PendingStartLabel, PendingState) pairs for + // try regions we entered before entering the PendingState try but which + // we haven't yet exited. + SmallVector<std::pair<const MCSymbol *, int>, 4> HandlerStack; + // EndSymbolMap and MinClauseMap are maps described above. + std::unique_ptr<MCSymbol *[]> EndSymbolMap(new MCSymbol *[NumStates]); + SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates); + + // Visit the root function and each funclet. + for (MachineFunction::const_iterator FuncletStart = MF->begin(), + FuncletEnd = MF->begin(), + End = MF->end(); + FuncletStart != End; FuncletStart = FuncletEnd) { + int FuncletState = HandlerStates[&*FuncletStart]; + // Find the end of the funclet + MCSymbol *EndSymbol = FuncEndSym; + while (++FuncletEnd != End) { + if (FuncletEnd->isEHFuncletEntry()) { + EndSymbol = getMCSymbolForMBB(Asm, &*FuncletEnd); + break; + } } + // Emit the function/funclet end and, if this is a funclet (and not the + // root function), record it in the EndSymbolMap. + OS.EmitValue(getOffset(EndSymbol, FuncBeginSym), 4); + if (FuncletState != NullState) { + // Record the end of the handler. + EndSymbolMap[FuncletState] = EndSymbol; + } + + // Walk the state changes in this function/funclet and compute its clauses. + // Funclets always start in the null state. + const MCSymbol *CurrentStartLabel = nullptr; + int CurrentState = NullState; + assert(HandlerStack.empty()); + for (const auto &StateChange : + InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) { + // Close any try regions we're not still under + int StillPendingState = + getTryAncestor(FuncInfo, CurrentState, StateChange.NewState); + while (CurrentState != StillPendingState) { + assert(CurrentState != NullState && + "Failed to find still-pending state!"); + // Close the pending clause + Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel, + CurrentState, FuncletState}); + // Now the next-outer try region is current + CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState; + // Pop the new start label from the handler stack if we've exited all + // inner try regions of the corresponding try region. + if (HandlerStack.back().second == CurrentState) + CurrentStartLabel = HandlerStack.pop_back_val().first; + } + + if (StateChange.NewState != CurrentState) { + // For each clause we're starting, update the MinClauseMap so we can + // know which is the topmost funclet containing a clause targeting + // it. + for (int EnteredState = StateChange.NewState; + EnteredState != CurrentState; + EnteredState = + FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) { + int &MinEnclosingState = MinClauseMap[EnteredState]; + if (FuncletState < MinEnclosingState) + MinEnclosingState = FuncletState; + } + // Save the previous current start/label on the stack and update to + // the newly-current start/state. + HandlerStack.emplace_back(CurrentStartLabel, CurrentState); + CurrentStartLabel = StateChange.NewStartLabel; + CurrentState = StateChange.NewState; + } + } + assert(HandlerStack.empty()); + } + + // Now emit the clause info, starting with the number of clauses. 
+ OS.EmitIntValue(Clauses.size(), 4); + for (ClrClause &Clause : Clauses) { + // Emit a CORINFO_EH_CLAUSE : + /* + struct CORINFO_EH_CLAUSE + { + CORINFO_EH_CLAUSE_FLAGS Flags; // actually a CorExceptionFlag + DWORD TryOffset; + DWORD TryLength; // actually TryEndOffset + DWORD HandlerOffset; + DWORD HandlerLength; // actually HandlerEndOffset + union + { + DWORD ClassToken; // use for catch clauses + DWORD FilterOffset; // use for filter clauses + }; + }; + + enum CORINFO_EH_CLAUSE_FLAGS + { + CORINFO_EH_CLAUSE_NONE = 0, + CORINFO_EH_CLAUSE_FILTER = 0x0001, // This clause is for a filter + CORINFO_EH_CLAUSE_FINALLY = 0x0002, // This clause is a finally clause + CORINFO_EH_CLAUSE_FAULT = 0x0004, // This clause is a fault clause + }; + typedef enum CorExceptionFlag + { + COR_ILEXCEPTION_CLAUSE_NONE, + COR_ILEXCEPTION_CLAUSE_FILTER = 0x0001, // This is a filter clause + COR_ILEXCEPTION_CLAUSE_FINALLY = 0x0002, // This is a finally clause + COR_ILEXCEPTION_CLAUSE_FAULT = 0x0004, // This is a fault clause + COR_ILEXCEPTION_CLAUSE_DUPLICATED = 0x0008, // duplicated clause. This + // clause was duplicated + // to a funclet which was + // pulled out of line + } CorExceptionFlag; + */ + // Add 1 to the start/end of the EH clause; the IP associated with a + // call when the runtime does its scan is the IP of the next instruction + // (the one to which control will return after the call), so we need + // to add 1 to the end of the clause to cover that offset. We also add + // 1 to the start of the clause to make sure that the ranges reported + // for all clauses are disjoint. Note that we'll need some additional + // logic when machine traps are supported, since in that case the IP + // that the runtime uses is the offset of the faulting instruction + // itself; if such an instruction immediately follows a call but the + // two belong to different clauses, we'll need to insert a nop between + // them so the runtime can distinguish the point to which the call will + // return from the point at which the fault occurs. + + const MCExpr *ClauseBegin = + getOffsetPlusOne(Clause.StartLabel, FuncBeginSym); + const MCExpr *ClauseEnd = getOffsetPlusOne(Clause.EndLabel, FuncBeginSym); + + const ClrEHUnwindMapEntry &Entry = FuncInfo.ClrEHUnwindMap[Clause.State]; + MachineBasicBlock *HandlerBlock = Entry.Handler.get<MachineBasicBlock *>(); + MCSymbol *BeginSym = getMCSymbolForMBB(Asm, HandlerBlock); + const MCExpr *HandlerBegin = getOffset(BeginSym, FuncBeginSym); + MCSymbol *EndSym = EndSymbolMap[Clause.State]; + const MCExpr *HandlerEnd = getOffset(EndSym, FuncBeginSym); + + uint32_t Flags = 0; + switch (Entry.HandlerType) { + case ClrHandlerType::Catch: + // Leaving bits 0-2 clear indicates catch. + break; + case ClrHandlerType::Filter: + Flags |= 1; + break; + case ClrHandlerType::Finally: + Flags |= 2; + break; + case ClrHandlerType::Fault: + Flags |= 4; + break; + } + if (Clause.EnclosingState != MinClauseMap[Clause.State]) { + // This is a "duplicate" clause; the handler needs to be entered from a + // frame above the one holding the invoke. 
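+ // In CorExceptionFlag terms this sets COR_ILEXCEPTION_CLAUSE_DUPLICATED
+ // (0x8). MinClauseMap[Clause.State] is the outermost funclet containing a
+ // try region that targets this handler, so if the clause being emitted
+ // lives in a strictly deeper funclet, the runtime can only reach the
+ // handler through an enclosing frame, and the duplicate marker tells it so.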
+ assert(Clause.EnclosingState > MinClauseMap[Clause.State]); + Flags |= 8; + } + OS.EmitIntValue(Flags, 4); + + // Write the clause start/end + OS.EmitValue(ClauseBegin, 4); + OS.EmitValue(ClauseEnd, 4); + + // Write out the handler start/end + OS.EmitValue(HandlerBegin, 4); + OS.EmitValue(HandlerEnd, 4); + + // Write out the type token or filter offset + assert(Entry.HandlerType != ClrHandlerType::Filter && "NYI: filters"); + OS.EmitIntValue(Entry.TypeToken, 4); } } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h index 669c9cc..acb3010 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h @@ -21,6 +21,7 @@ class Function; class GlobalValue; class MachineFunction; class MCExpr; +class Value; struct WinEHFuncInfo; class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { @@ -36,7 +37,14 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { /// True if this is a 64-bit target and we should use image relative offsets. bool useImageRel32 = false; - void emitCSpecificHandlerTable(); + /// Pointer to the current funclet entry BB. + const MachineBasicBlock *CurrentFuncletEntry = nullptr; + + void emitCSpecificHandlerTable(const MachineFunction *MF); + + void emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, + const MCSymbol *BeginLabel, + const MCSymbol *EndLabel, int State); /// Emit the EH table data for 32-bit and 64-bit functions using /// the __CxxFrameHandler3 personality. @@ -47,8 +55,11 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { /// tables. void emitExceptHandlerTable(const MachineFunction *MF); - void extendIP2StateTable(const MachineFunction *MF, const Function *ParentF, - WinEHFuncInfo &FuncInfo); + void emitCLRExceptionTable(const MachineFunction *MF); + + void computeIP2StateTable( + const MachineFunction *MF, const WinEHFuncInfo &FuncInfo, + SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable); /// Emits the label used with llvm.x86.seh.recoverfp, which is used by /// outlined funclets. @@ -57,6 +68,16 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { const MCExpr *create32bitRef(const MCSymbol *Value); const MCExpr *create32bitRef(const GlobalValue *GV); + const MCExpr *getLabelPlusOne(const MCSymbol *Label); + const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom); + const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom); + + /// Gets the offset that we should use in a table for a stack object with the + /// given index. For targets using CFI (Win64, etc), this is relative to the + /// established SP at the end of the prologue. For targets without CFI (Win32 + /// only), it is relative to the frame pointer. + int getFrameIndexOffset(int FrameIndex, const WinEHFuncInfo &FuncInfo); public: //===--------------------------------------------------------------------===// @@ -74,6 +95,10 @@ public: /// Gather and emit post-function exception information. void endFunction(const MachineFunction *) override; + + /// \brief Emit target-specific EH funclet machinery. 
+ void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym) override;
+ void endFunclet() override;
};
}
diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 530ab46..d12fdb2 100644
--- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -8,10 +8,14 @@
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
-// either (intrinsic-based) load-linked/store-conditional loops or AtomicCmpXchg.
+// target-specific instructions that implement the same semantics in a way
+// that better fits the target backend. This can include the use of either
+// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
+// type coercions.
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -20,6 +24,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -44,13 +49,17 @@ namespace {
private:
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
bool IsStore, bool IsLoad);
- bool expandAtomicLoad(LoadInst *LI);
+ IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
+ LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
+ bool tryExpandAtomicLoad(LoadInst *LI);
bool expandAtomicLoadToLL(LoadInst *LI);
bool expandAtomicLoadToCmpXchg(LoadInst *LI);
+ StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
bool expandAtomicStore(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
- bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
- bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
+ bool expandAtomicOpToLLSC(
+ Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
+ std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool isIdempotentRMW(AtomicRMWInst *AI);
bool simplifyIdempotentRMW(AtomicRMWInst *AI);
@@ -108,7 +117,7 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(Monotonic);
IsStore = IsLoad = true;
- } else if (CASI && !TLI->hasLoadLinkedStoreConditional() &&
+ } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isAtLeastRelease(CASI->getSuccessOrdering()) ||
isAtLeastAcquire(CASI->getSuccessOrdering()))) {
// If a compare and swap is lowered to LL/SC, we can do smarter fence
@@ -126,10 +135,28 @@
}
}
- if (LI && TLI->shouldExpandAtomicLoadInIR(LI)) {
- MadeChange |= expandAtomicLoad(LI);
- } else if (SI && TLI->shouldExpandAtomicStoreInIR(SI)) {
- MadeChange |= expandAtomicStore(SI);
+ if (LI) {
+ if (LI->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ LI = convertAtomicLoadToIntegerType(LI);
+ assert(LI->getType()->isIntegerTy() && "invariant broken");
+ MadeChange = true;
+ }
+
+ MadeChange |= tryExpandAtomicLoad(LI);
+ } else if (SI) {
+ if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ SI = convertAtomicStoreToIntegerType(SI);
+ assert(SI->getValueOperand()->getType()->isIntegerTy() &&
+ "invariant broken");
+ MadeChange = true;
+ }
+
+ if (TLI->shouldExpandAtomicStoreInIR(SI))
+ MadeChange |= expandAtomicStore(SI);
} else if (RMWI) {
// There are two different ways of expanding RMW instructions:
// - into a load if it is idempotent
@@ -141,7 +168,7 @@
} else {
MadeChange |= tryExpandAtomicRMW(RMWI);
}
- } else if (CASI && TLI->hasLoadLinkedStoreConditional()) {
+ } else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI)) {
MadeChange |= expandAtomicCmpXchg(CASI);
}
}
@@ -169,11 +196,56 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
return (LeadingFence || TrailingFence);
}
-bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
- if (TLI->hasLoadLinkedStoreConditional())
+/// Get the iX type with the same bitwidth as T.
+IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
+ const DataLayout &DL) {
+ EVT VT = TLI->getValueType(DL, T);
+ unsigned BitWidth = VT.getStoreSizeInBits();
+ assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
+ return IntegerType::get(T->getContext(), BitWidth);
+}
+
+/// Convert an atomic load of a non-integral type to an integer load of the
+/// equivalent bitwidth. See the function comment on
+/// convertAtomicStoreToIntegerType for background.
+LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
+ auto *M = LI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(LI->getType(),
+ M->getDataLayout());
+
+ IRBuilder<> Builder(LI);
+
+ Value *Addr = LI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ auto *NewLI = Builder.CreateLoad(NewAddr);
+ NewLI->setAlignment(LI->getAlignment());
+ NewLI->setVolatile(LI->isVolatile());
+ NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
+
+ Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ LI->replaceAllUsesWith(NewVal);
+ LI->eraseFromParent();
+ return NewLI;
+}
+
+bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
+ switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC:
+ return expandAtomicOpToLLSC(
+ LI, LI->getPointerOperand(), LI->getOrdering(),
+ [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+ case TargetLoweringBase::AtomicExpansionKind::LLOnly:
return expandAtomicLoadToLL(LI);
- else
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
return expandAtomicLoadToCmpXchg(LI);
+ }
+ llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}
bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
@@ -184,6 +256,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
// to be single-copy atomic by ARM is an ldrexd (A3.5.3).
Value *Val = TLI->emitLoadLinked(Builder, LI->getPointerOperand(),
LI->getOrdering());
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
LI->replaceAllUsesWith(Val);
LI->eraseFromParent();
@@ -209,6 +282,35 @@ bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
return true;
}
+/// Convert an atomic store of a non-integral type to an integer store of the
+/// equivalent bitwidth. We used to not support floating point or vector
+/// atomics in the IR at all. The backends learned to deal with the bitcast
+/// idiom because that was the only way of expressing the notion of an atomic
+/// float or vector store. The long term plan is to teach each backend to
+/// instruction select from the original atomic store, but as a migration
+/// mechanism, we convert back to the old format which the backends understand.
+/// Each backend will need individual work to recognize the new format.
+StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
+ IRBuilder<> Builder(SI);
+ auto *M = SI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
+ M->getDataLayout());
+ Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
+
+ Value *Addr = SI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
+ NewSI->setAlignment(SI->getAlignment());
+ NewSI->setVolatile(SI->isVolatile());
+ NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
+ SI->eraseFromParent();
+ return NewSI;
+}
+
bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
// This function is only called on atomic stores that are too large to be
// atomic if implemented as a native store.
So we replace them by an @@ -226,23 +328,15 @@ bool AtomicExpand::expandAtomicStore(StoreInst *SI) { return tryExpandAtomicRMW(AI); } -bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { - case TargetLoweringBase::AtomicRMWExpansionKind::None: - return false; - case TargetLoweringBase::AtomicRMWExpansionKind::LLSC: { - assert(TLI->hasLoadLinkedStoreConditional() && - "TargetLowering requested we expand AtomicRMW instruction into " - "load-linked/store-conditional combos, but such instructions aren't " - "supported"); - - return expandAtomicRMWToLLSC(AI); - } - case TargetLoweringBase::AtomicRMWExpansionKind::CmpXChg: { - return expandAtomicRMWToCmpXchg(AI); - } - } - llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); +static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, + Value *Loaded, Value *NewVal, + AtomicOrdering MemOpOrder, + Value *&Success, Value *&NewLoaded) { + Value* Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Success = Builder.CreateExtractValue(Pair, 1, "success"); + NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); } /// Emit IR to implement the given atomicrmw operation on values in registers, @@ -282,10 +376,28 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } } -bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); +bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + case TargetLoweringBase::AtomicExpansionKind::None: + return false; + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(), + [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), + Builder, Loaded, + AI->getValOperand()); + }); + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: + return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + default: + llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); + } +} + +bool AtomicExpand::expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function<Value *(IRBuilder<> &, Value *)> PerformOp) { + BasicBlock *BB = I->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); @@ -303,11 +415,11 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { // atomicrmw.end: // fence? // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); + // This grabs the DebugLoc from I. + IRBuilder<> Builder(I); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we might want a fence too. 
It's easiest to just remove @@ -320,8 +432,7 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(LoopBB); Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + Value *NewVal = PerformOp(Builder, Loaded); Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); @@ -331,72 +442,8 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); - - return true; -} - -bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = - AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - // Atomics require at least natural alignment. - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, MemOpOrder, - AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - - AI->replaceAllUsesWith(NewLoaded); - AI->eraseFromParent(); + I->replaceAllUsesWith(Loaded); + I->eraseFromParent(); return true; } @@ -424,7 +471,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %loaded = @load.linked(%addr) // %should_store = icmp eq %loaded, %desired // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.failure + // label %cmpxchg.nostore // cmpxchg.trystore: // %stored = @store_conditional(%new, %addr) // %success = icmp eq i32 %stored, 0 @@ -432,6 +479,9 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // cmpxchg.success: // fence? 
// br label %cmpxchg.end + // cmpxchg.nostore: + // @load_linked_fail_balance()? + // br label %cmpxchg.failure // cmpxchg.failure: // fence? // br label %cmpxchg.end @@ -440,9 +490,10 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end"); auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); - auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB); + auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); + auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB); auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); @@ -466,7 +517,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); + Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); Builder.SetInsertPoint(TryStoreBB); Value *StoreSuccess = TLI->emitStoreConditional( @@ -482,6 +533,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { /*IsLoad=*/true); Builder.CreateBr(ExitBB); + Builder.SetInsertPoint(NoStoreBB); + // In the failing case, where we don't execute the store-conditional, the + // target might want to balance out the load-linked with a dedicated + // instruction (e.g., on ARM, clearing the exclusive monitor). + TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); + Builder.CreateBr(FailureBB); + Builder.SetInsertPoint(FailureBB); TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, /*IsLoad=*/true); @@ -556,9 +614,77 @@ bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { - if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad)) - expandAtomicLoad(ResultingLoad); + tryExpandAtomicLoad(ResultingLoad); return true; } return false; } + +bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg) { + assert(AI); + + AtomicOrdering MemOpOrder = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. 
+ IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + // Atomics require at least natural alignment. + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *NewLoaded = nullptr; + Value *Success = nullptr; + + CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder, + Success, NewLoaded); + assert(Success && NewLoaded); + + Loaded->addIncoming(NewLoaded, LoopBB); + + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp index db00910..a67e194 100644 --- a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -33,6 +33,6 @@ cl::opt<unsigned> cl::desc("Threshold for partial unrolling"), cl::Hidden); -BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, Function &F) +BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index 6182667..df5cac5 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -12,7 +12,8 @@ // it then removes. // // Note that this pass must be run after register allocation, it cannot handle -// SSA form. +// SSA form. It also must handle virtual registers for targets that emit virtual +// ISA (e.g. NVPTX). // //===----------------------------------------------------------------------===// @@ -20,6 +21,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -95,7 +97,7 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structurized CFG. bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && - PassConfig->getEnableTailMerge(); + PassConfig->getEnableTailMerge(); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, getAnalysis<MachineBlockFrequencyInfo>(), getAnalysis<MachineBranchProbabilityInfo>()); @@ -132,6 +134,7 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { // Remove the block. 
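// The funclet-membership map is keyed by MachineBasicBlock pointers, so the
// erased block's entry has to be dropped as well or later lookups would see
// a dangling key (hence the FuncletMembership.erase added just below).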
MF->erase(MBB); + FuncletMembership.erase(MBB); } /// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def @@ -150,9 +153,13 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { if (!I->isImplicitDef()) break; unsigned Reg = I->getOperand(0).getReg(); - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - ImpDefRegs.insert(*SubRegs); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + ImpDefRegs.insert(*SubRegs); + } else { + ImpDefRegs.insert(Reg); + } ++I; } if (ImpDefRegs.empty()) @@ -163,8 +170,7 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { if (!TII->isUnpredicatedTerminator(I)) return false; // See if it uses any of the implicitly defined registers. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { if (!MO.isReg() || !MO.isUse()) continue; unsigned Reg = MO.getReg(); @@ -208,14 +214,17 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Fix CFG. The later algorithms expect it to be right. bool MadeChange = false; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) { - MachineBasicBlock *MBB = I, *TBB = nullptr, *FBB = nullptr; + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true)) - MadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); - MadeChange |= OptimizeImpDefsBlock(MBB); + if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, true)) + MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); + MadeChange |= OptimizeImpDefsBlock(&MBB); } + // Recalculate funclet membership. + FuncletMembership = getFuncletMembership(MF); + bool MadeChangeThisIteration = true; while (MadeChangeThisIteration) { MadeChangeThisIteration = TailMergeBlocks(MF); @@ -235,12 +244,9 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Walk the function to find jump tables that are live. BitVector JTIsLive(JTI->getJumpTables().size()); - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); - BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { - MachineOperand &Op = I->getOperand(op); + for (const MachineBasicBlock &BB : MF) { + for (const MachineInstr &I : BB) + for (const MachineOperand &Op : I.operands()) { if (!Op.isJTI()) continue; // Remember that this JT is live. @@ -365,7 +371,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } // Back past possible debugging pseudos at beginning of block. This matters // when one block differs from the other only by whether debugging pseudos - // are present at the beginning. (This way, the various checks later for + // are present at the beginning. (This way, the various checks later for // I1==MBB1->begin() work as expected.) if (I1 == MBB1->begin() && I2 != MBB2->begin()) { --I2; @@ -426,7 +432,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineFunction &MF = *CurMBB.getParent(); // Create the fall-through block. 
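// (Note the pattern in the pair of lines below: the implicit
// MachineBasicBlock-to-iterator conversion is gone from the ilist API in
// this import, so code now asks the block for its iterator explicitly.)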
- MachineFunction::iterator MBBI = &CurMBB; + MachineFunction::iterator MBBI = CurMBB.getIterator(); MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB); CurMBB.getParent()->insert(++MBBI, NewMBB); @@ -445,6 +451,11 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, // For targets that use the register scavenger, we must maintain LiveIns. MaintainLiveIns(&CurMBB, NewMBB); + // Add the new block to the funclet. + const auto &FuncletI = FuncletMembership.find(&CurMBB); + if (FuncletI != FuncletMembership.end()) + FuncletMembership[NewMBB] = FuncletI->second; + return NewMBB; } @@ -479,7 +490,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, DebugLoc dl; // FIXME: this is nowhere if (I != MF->end() && !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { - MachineBasicBlock *NextBB = I; + MachineBasicBlock *NextBB = &*I; if (TBB == NextBB && !Cond.empty() && !FBB) { if (!TII->ReverseBranchCondition(Cond)) { TII->RemoveBranch(*CurMBB); @@ -549,14 +560,23 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction /// in each block. -static bool ProfitableToMerge(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2, - unsigned minCommonTailLength, - unsigned &CommonTailLen, - MachineBasicBlock::iterator &I1, - MachineBasicBlock::iterator &I2, - MachineBasicBlock *SuccBB, - MachineBasicBlock *PredBB) { +static bool +ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, + unsigned minCommonTailLength, unsigned &CommonTailLen, + MachineBasicBlock::iterator &I1, + MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, + MachineBasicBlock *PredBB, + DenseMap<const MachineBasicBlock *, int> &FuncletMembership) { + // It is never profitable to tail-merge blocks from two different funclets. + if (!FuncletMembership.empty()) { + auto Funclet1 = FuncletMembership.find(MBB1); + assert(Funclet1 != FuncletMembership.end()); + auto Funclet2 = FuncletMembership.find(MBB2); + assert(Funclet2 != FuncletMembership.end()); + if (Funclet1->second != Funclet2->second) + return false; + } + CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2); if (CommonTailLen == 0) return false; @@ -600,12 +620,8 @@ static bool ProfitableToMerge(MachineBasicBlock *MBB1, // branch instruction, which is likely to be smaller than the 2 // instructions that would be deleted in the merge. 
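// optForSize() in the rewritten check below is slightly broader than the
// old explicit attribute test: it also returns true for minsize functions.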
MachineFunction *MF = MBB1->getParent(); - if (EffectiveTailLen >= 2 && - MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - (I1 == MBB1->begin() || I2 == MBB2->begin())) - return true; - - return false; + return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() && + (I1 == MBB1->begin() || I2 == MBB2->begin()); } /// ComputeSameTails - Look through all the blocks in MergePotentials that have @@ -634,7 +650,8 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash, if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(), minCommonTailLength, CommonTailLen, TrialBBI1, TrialBBI2, - SuccBB, PredBB)) { + SuccBB, PredBB, + FuncletMembership)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -727,18 +744,6 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } -static bool hasIdenticalMMOs(const MachineInstr *MI1, const MachineInstr *MI2) { - auto I1 = MI1->memoperands_begin(), E1 = MI1->memoperands_end(); - auto I2 = MI2->memoperands_begin(), E2 = MI2->memoperands_end(); - if ((E1 - I1) != (E2 - I2)) - return false; - for (; I1 != E1; ++I1, ++I2) { - if (**I1 != **I2) - return false; - } - return true; -} - static void removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, MachineBasicBlock &MBBCommon) { @@ -775,8 +780,7 @@ removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, assert(MBBICommon->isIdenticalTo(&*MBBI) && "Expected matching MIIs!"); if (MBBICommon->mayLoad() || MBBICommon->mayStore()) - if (!hasIdenticalMMOs(&*MBBI, &*MBBICommon)) - MBBICommon->clearMemRefs(); + MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI)); ++MBBI; ++MBBICommon; @@ -840,8 +844,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, // block, which we can't jump to), we can treat all blocks with this same // tail at once. Use PredBB if that is one of the possibilities, as that // will not introduce any extra branches. - MachineBasicBlock *EntryBB = MergePotentials.begin()->getBlock()-> - getParent()->begin(); + MachineBasicBlock *EntryBB = + &MergePotentials.front().getBlock()->getParent()->front(); unsigned commonTailIndex = SameTails.size(); // If there are two blocks, check to see if one can be made to fall through // into the other. @@ -917,12 +921,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // First find blocks with no successors. 
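// Each candidate is keyed by HashEndOfMBB, so only blocks whose final
// instructions hash alike are ever compared in detail later.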
MergePotentials.clear(); - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); - I != E && MergePotentials.size() < TailMergeThreshold; ++I) { - if (TriedMerging.count(I)) - continue; - if (I->succ_empty()) - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I)); + for (MachineBasicBlock &MBB : MF) { + if (MergePotentials.size() == TailMergeThreshold) + break; + if (!TriedMerging.count(&MBB) && MBB.succ_empty()) + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(&MBB), &MBB)); } // If this is a large problem, avoid visiting the same basic blocks @@ -958,13 +961,13 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { I != E; ++I) { if (I->pred_size() < 2) continue; SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; - MachineBasicBlock *IBB = I; - MachineBasicBlock *PredBB = std::prev(I); + MachineBasicBlock *IBB = &*I; + MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); - for (MachineBasicBlock::pred_iterator P = I->pred_begin(), - E2 = I->pred_end(); - P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { - MachineBasicBlock *PBB = *P; + for (MachineBasicBlock *PBB : I->predecessors()) { + if (MergePotentials.size() == TailMergeThreshold) + break; + if (TriedMerging.count(PBB)) continue; @@ -977,7 +980,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { continue; // Skip blocks which may jump to a landing pad. Can't tail merge these. - if (PBB->getLandingPadSuccessor()) + if (PBB->hasEHPadSuccessor()) continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -990,18 +993,21 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (TII->ReverseBranchCondition(NewCond)) continue; // This is the QBB case described above - if (!FBB) - FBB = std::next(MachineFunction::iterator(PBB)); + if (!FBB) { + auto Next = ++PBB->getIterator(); + if (Next != MF.end()) + FBB = &*Next; + } } // Failing case: the only way IBB can be reached from PBB is via // exception handling. Happens for landing pads. Would be nice to have // a bit in the edge so we didn't have to do all this. - if (IBB->isLandingPad()) { - MachineFunction::iterator IP = PBB; IP++; + if (IBB->isEHPad()) { + MachineFunction::iterator IP = ++PBB->getIterator(); MachineBasicBlock *PredNextBB = nullptr; if (IP != MF.end()) - PredNextBB = IP; + PredNextBB = &*IP; if (!TBB) { if (IBB != PredNextBB) // fallthrough continue; @@ -1027,7 +1033,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { NewCond, dl); } - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), PBB)); } } @@ -1042,7 +1048,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // Reinsert an unconditional branch if needed. The 1 below can occur as a // result of removing blocks in TryTailMergeBlocks. 
- PredBB = std::prev(I); // this may have been changed in TryTailMergeBlocks + PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks if (MergePotentials.size() == 1 && MergePotentials.begin()->getBlock() != PredBB) FixTail(MergePotentials.begin()->getBlock(), IBB, TII); @@ -1080,13 +1086,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); - for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); - SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + if (SumEdgeFreq > 0) { + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) { + auto Prob = BranchProbability::getBranchProbability( + EdgeFreq->getFrequency(), SumEdgeFreq); + TailMBB.setSuccProbability(SuccI, Prob); + } + } } //===----------------------------------------------------------------------===// @@ -1098,10 +1110,12 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { // Make sure blocks are numbered in order MF.RenumberBlocks(); + // Renumbering blocks alters funclet membership, recalculate it. + FuncletMembership = getFuncletMembership(MF); for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; MadeChange |= OptimizeBlock(MBB); // If it is dead, remove it. @@ -1111,6 +1125,7 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { ++NumDeadBlocks; } } + return MadeChange; } @@ -1167,20 +1182,31 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { MachineFunction &MF = *MBB->getParent(); ReoptimizeBlock: - MachineFunction::iterator FallThrough = MBB; + MachineFunction::iterator FallThrough = MBB->getIterator(); ++FallThrough; + // Make sure MBB and FallThrough belong to the same funclet. + bool SameFunclet = true; + if (!FuncletMembership.empty() && FallThrough != MF.end()) { + auto MBBFunclet = FuncletMembership.find(MBB); + assert(MBBFunclet != FuncletMembership.end()); + auto FallThroughFunclet = FuncletMembership.find(&*FallThrough); + assert(FallThroughFunclet != FuncletMembership.end()); + SameFunclet = MBBFunclet->second == FallThroughFunclet->second; + } + // If this block is empty, make everyone use its fall-through, not the block // explicitly. Landing pads should not do this since the landing-pad table // points to this block. Blocks with their addresses taken shouldn't be // optimized away. - if (IsEmptyBlock(MBB) && !MBB->isLandingPad() && !MBB->hasAddressTaken()) { + if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() && + SameFunclet) { // Dead block? Leave for cleanup later. if (MBB->pred_empty()) return MadeChange; if (FallThrough == MF.end()) { // TODO: Simplify preds to not branch here if possible! - } else if (FallThrough->isLandingPad()) { + } else if (FallThrough->isEHPad()) { // Don't rewrite to a landing pad fallthough. That could lead to the case // where a BB jumps to more than one landing pad. // TODO: Is it ever worth rewriting predecessors which don't already @@ -1190,12 +1216,12 @@ ReoptimizeBlock: // instead. 
while (!MBB->pred_empty()) { MachineBasicBlock *Pred = *(MBB->pred_end()-1); - Pred->ReplaceUsesOfBlockWith(MBB, FallThrough); + Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough); } // If MBB was the target of a jump table, update jump tables to go to the // fallthrough instead. if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo()) - MJTI->ReplaceMBBInJumpTables(MBB, FallThrough); + MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough); MadeChange = true; } return MadeChange; @@ -1237,7 +1263,7 @@ ReoptimizeBlock: // AnalyzeBranch. if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 && PrevBB.succ_size() == 1 && - !MBB->hasAddressTaken() && !MBB->isLandingPad()) { + !MBB->hasAddressTaken() && !MBB->isEHPad()) { DEBUG(dbgs() << "\nMerging into block: " << PrevBB << "From MBB: " << *MBB); // Remove redundant DBG_VALUEs first. @@ -1333,7 +1359,7 @@ ReoptimizeBlock: TII->InsertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl); // Move this block to the end of the function. - MBB->moveAfter(--MF.end()); + MBB->moveAfter(&MF.back()); MadeChange = true; ++NumBranchOpts; return MadeChange; @@ -1371,7 +1397,7 @@ ReoptimizeBlock: // other blocks across it. if (CurTBB && CurCond.empty() && !CurFBB && IsBranchOnlyBlock(MBB) && CurTBB != MBB && - !MBB->hasAddressTaken()) { + !MBB->hasAddressTaken() && !MBB->isEHPad()) { DebugLoc dl = getBranchDebugLoc(*MBB); // This block may contain just an unconditional branch. Because there can // be 'non-branch terminators' in the block, try removing the branch and @@ -1468,14 +1494,11 @@ ReoptimizeBlock: // see if it has a fall-through into its successor. bool CurFallsThru = MBB->canFallThrough(); - if (!MBB->isLandingPad()) { + if (!MBB->isEHPad()) { // Check all the predecessors of this block. If one of them has no fall // throughs, move this block right after it. - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - E = MBB->pred_end(); PI != E; ++PI) { + for (MachineBasicBlock *PredBB : MBB->predecessors()) { // Analyze the branch at the end of the pred. - MachineBasicBlock *PredBB = *PI; - MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough; MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; if (PredBB != MBB && !PredBB->canFallThrough() && @@ -1493,8 +1516,7 @@ ReoptimizeBlock: // B elsewhere // next: if (CurFallsThru) { - MachineBasicBlock *NextBB = - std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MBB->getIterator()); CurCond.clear(); TII->InsertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc()); } @@ -1507,11 +1529,9 @@ ReoptimizeBlock: if (!CurFallsThru) { // Check all successors to see if we can move this block before it. - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - E = MBB->succ_end(); SI != E; ++SI) { + for (MachineBasicBlock *SuccBB : MBB->successors()) { // Analyze the branch at the end of the block before the succ. - MachineBasicBlock *SuccBB = *SI; - MachineFunction::iterator SuccPrev = SuccBB; --SuccPrev; + MachineFunction::iterator SuccPrev = --SuccBB->getIterator(); // If this block doesn't already fall-through to that successor, and if // the succ doesn't already have a block that can fall through into it, @@ -1519,7 +1539,7 @@ ReoptimizeBlock: // fallthrough to happen. 
if (SuccBB != MBB && &*SuccPrev != MBB && !SuccPrev->canFallThrough() && !CurUnAnalyzable && - !SuccBB->isLandingPad()) { + !SuccBB->isEHPad()) { MBB->moveBefore(SuccBB); MadeChange = true; goto ReoptimizeBlock; @@ -1531,10 +1551,18 @@ ReoptimizeBlock: // removed, move this block to the end of the function. MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector<MachineOperand, 4> PrevCond; + // We're looking for cases where PrevBB could possibly fall through to + // FallThrough, but if FallThrough is an EH pad that wouldn't be useful + // so here we skip over any EH pads so we might have a chance to find + // a branch target from PrevBB. + while (FallThrough != MF.end() && FallThrough->isEHPad()) + ++FallThrough; + // Now check to see if the current block is sitting between PrevBB and + // a block to which it could fall through. if (FallThrough != MF.end() && !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && - PrevBB.isSuccessor(FallThrough)) { - MBB->moveAfter(--MF.end()); + PrevBB.isSuccessor(&*FallThrough)) { + MBB->moveAfter(&MF.back()); MadeChange = true; return MadeChange; } @@ -1553,7 +1581,7 @@ ReoptimizeBlock: bool BranchFolder::HoistCommonCode(MachineFunction &MF) { bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; MadeChange |= HoistCommonCodeInSuccs(MBB); } @@ -1564,15 +1592,23 @@ bool BranchFolder::HoistCommonCode(MachineFunction &MF) { /// its 'true' successor. static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, MachineBasicBlock *TrueBB) { - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), - E = BB->succ_end(); SI != E; ++SI) { - MachineBasicBlock *SuccBB = *SI; + for (MachineBasicBlock *SuccBB : BB->successors()) if (SuccBB != TrueBB) return SuccBB; - } return nullptr; } +template <class Container> +static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI, + Container &Set) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Set.insert(*AI); + } else { + Set.insert(Reg); + } +} + /// findHoistingInsertPosAndDeps - Find the location to move common instructions /// in successors to. The location is usually just before the terminator, /// however if the terminator is a conditional branch and its previous @@ -1590,16 +1626,14 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!TII->isUnpredicatedTerminator(Loc)) return MBB->end(); - for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = Loc->getOperand(i); + for (const MachineOperand &MO : Loc->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Uses.insert(*AI); + addRegAndItsAliases(Reg, TRI, Uses); } else { if (!MO.isDead()) // Don't try to hoist code in the rare case the terminator defines a @@ -1608,8 +1642,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, // If the terminator defines a register, make sure we don't hoist // the instruction whose def might be clobbered by the terminator. 
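// addRegAndItsAliases (defined above) handles both register kinds:
// physical registers expand to all of their aliases, while virtual
// registers, which have no alias sets, are inserted directly.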
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Defs.insert(*AI); + addRegAndItsAliases(Reg, TRI, Defs); } } @@ -1626,8 +1659,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, --PI; bool IsDef = false; - for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) { - const MachineOperand &MO = PI->getOperand(i); + for (const MachineOperand &MO : PI->operands()) { // If PI has a regmask operand, it is probably a call. Separate away. if (MO.isRegMask()) return Loc; @@ -1636,8 +1668,10 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - if (Uses.count(Reg)) + if (Uses.count(Reg)) { IsDef = true; + break; + } } if (!IsDef) // The condition setting instruction is not just before the conditional @@ -1657,23 +1691,22 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, // Find out what registers are live. Note this routine is ignoring other live // registers which are only used by instructions in successor blocks. - for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = PI->getOperand(i); + for (const MachineOperand &MO : PI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Uses.insert(*AI); + addRegAndItsAliases(Reg, TRI, Uses); } else { if (Uses.erase(Reg)) { - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - Uses.erase(*SubRegs); // Use sub-registers to be conservative + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Uses.erase(*SubRegs); // Use sub-registers to be conservative + } } - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Defs.insert(*AI); + addRegAndItsAliases(Reg, TRI, Defs); } } @@ -1737,8 +1770,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; bool IsSafe = true; - for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (MachineOperand &MO : TIB->operands()) { // Don't attempt to hoist instructions with register masks. if (MO.isRegMask()) { IsSafe = false; @@ -1793,28 +1825,29 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; // Remove kills from LocalDefsSet, these registers had short live ranges. - for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; unsigned Reg = MO.getReg(); if (!Reg || !LocalDefsSet.count(Reg)) continue; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.erase(*AI); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.erase(*AI); + } else { + LocalDefsSet.erase(Reg); + } } // Track local defs so we can update liveins. 
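// Defs are recorded twice: LocalDefs is an ordered list used for the final
// live-in update, while LocalDefsSet is alias-expanded for the membership
// tests above.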
- for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; LocalDefs.push_back(Reg); - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.insert(*AI); + addRegAndItsAliases(Reg, TRI, LocalDefsSet); } HasDups = true; diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h index 46c05dc..d759d53 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.h +++ b/contrib/llvm/lib/CodeGen/BranchFolding.h @@ -54,6 +54,7 @@ namespace llvm { typedef std::vector<MergePotentialsElt>::iterator MPIterator; std::vector<MergePotentialsElt> MergePotentials; SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging; + DenseMap<const MachineBasicBlock *, int> FuncletMembership; class SameTailElt { MPIterator MPIter; diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp index d08fae0..abc655a 100644 --- a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -24,6 +25,7 @@ using namespace llvm; void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF, + VirtRegMap *VRM, const MachineLoopInfo &MLI, const MachineBlockFrequencyInfo &MBFI, VirtRegAuxInfo::NormalizingFn norm) { @@ -31,7 +33,7 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, << "********** Function: " << MF.getName() << '\n'); MachineRegisterInfo &MRI = MF.getRegInfo(); - VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm); + VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm); for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); if (MRI.reg_nodbg_empty(Reg)) @@ -74,7 +76,10 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, // Check if all values in LI are rematerializable static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, + VirtRegMap *VRM, const TargetInstrInfo &TII) { + unsigned Reg = LI.reg; + unsigned Original = VRM ? VRM->getOriginal(Reg) : 0; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; @@ -86,6 +91,36 @@ static bool isRematerializable(const LiveInterval &LI, MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); assert(MI && "Dead valno in interval"); + // Trace copies introduced by live range splitting. The inline + // spiller can rematerialize through these copies, so the spill + // weight must reflect this. + if (VRM) { + while (MI->isFullCopy()) { + // The copy destination must match the interval register. + if (MI->getOperand(0).getReg() != Reg) + return false; + + // Get the source register. + Reg = MI->getOperand(1).getReg(); + + // If the original (pre-splitting) registers match this + // copy came from a split. + if (!TargetRegisterInfo::isVirtualRegister(Reg) || + VRM->getOriginal(Reg) != Original) + return false; + + // Follow the copy live-in value. 
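// The copy-chain walk being built up here, condensed into one place (names
// follow the hunk above; a null-check stands in for the assert): keep
// stepping through full copies while they stay inside the same original
// (pre-splitting) register, then test whatever instruction the walk lands
// on for rematerializability.
while (MI->isFullCopy()) {
  if (MI->getOperand(0).getReg() != Reg)   // copy must define this interval
    return false;
  Reg = MI->getOperand(1).getReg();        // step to the copy's source
  if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
      VRM->getOriginal(Reg) != Original)   // source is not a sibling split
    return false;
  const VNInfo *SrcVNI = LIS.getInterval(Reg).Query(VNI->def).valueIn();
  if (!SrcVNI || SrcVNI->isPHIDef())       // merged values cannot be traced
    return false;
  MI = LIS.getInstructionFromIndex(SrcVNI->def);
  VNI = SrcVNI;
}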
+ const LiveInterval &SrcLI = LIS.getInterval(Reg); + LiveQueryResult SrcQ = SrcLI.Query(VNI->def); + VNI = SrcQ.valueIn(); + assert(VNI && "Copy from non-existing value"); + if (VNI->isPHIDef()) + return false; + MI = LIS.getInstructionFromIndex(VNI->def); + assert(MI && "Dead valno in interval"); + } + } + if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis())) return false; } @@ -188,7 +223,7 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - if (isRematerializable(li, LIS, *MF.getSubtarget().getInstrInfo())) + if (isRematerializable(li, LIS, VRM, *MF.getSubtarget().getInstrInfo())) totalWeight *= 0.5F; li.weight = normalize(totalWeight, li.getSize(), numInstr); diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp index fb29b1d..23c0d54 100644 --- a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp @@ -32,6 +32,7 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, CallOrPrologue(Unknown) { // No stack is used. StackOffset = 0; + MaxStackArgAlign = 1; clearByValRegsInfo(); UsedRegs.resize((TRI.getNumRegs()+31)/32); @@ -192,6 +193,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) { void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, MVT VT, CCAssignFn Fn) { unsigned SavedStackOffset = StackOffset; + unsigned SavedMaxStackArgAlign = MaxStackArgAlign; unsigned NumLocs = Locs.size(); // Set the 'inreg' flag if it is used for this calling convention. @@ -223,6 +225,7 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, // as allocated so that future queries don't return the same registers, i.e. // when i64 and f64 are both passed in GPRs. 
StackOffset = SavedStackOffset; + MaxStackArgAlign = SavedMaxStackArgAlign; Locs.resize(NumLocs); } diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp index 155c5ec..dc13b5b 100644 --- a/contrib/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp @@ -29,6 +29,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeExpandISelPseudosPass(Registry); initializeExpandPostRAPass(Registry); initializeFinalizeMachineBundlesPass(Registry); + initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); @@ -66,6 +67,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); initializeStackMapLivenessPass(Registry); + initializeLiveDebugValuesPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); initializeTailDuplicatePassPass(Registry); diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp index 6ab6acc..03e5778 100644 --- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -63,6 +64,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " "computations were sunk"); STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); +STATISTIC(NumAndsAdded, + "Number of and mask instructions added to form ext loads"); +STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); @@ -109,25 +113,18 @@ static cl::opt<bool> StressExtLdPromotion( namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; -struct TypeIsSExt { - Type *Ty; - bool IsSExt; - TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {} -}; +typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. const TargetMachine *TM; const TargetLowering *TLI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; - /// CurInstIterator - As we scan instructions optimizing them, this is the - /// next instruction to optimize. Xforms that can invalidate this should - /// update it. + /// As we scan instructions optimizing them, this is the next instruction + /// to optimize. Transforms that can invalidate this should update it. BasicBlock::iterator CurInstIterator; /// Keeps track of non-local addresses that have been sunk into a block. @@ -141,10 +138,10 @@ class TypePromotionTransaction; /// promotion for the current function. InstrToOrigTy PromotedInsts; - /// ModifiedDT - If CFG is modified in anyway. + /// True if CFG is modified in any way. 
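// The save/restore discipline getRemainingRegParmsForType depends on,
// condensed from the hunk above: every piece of CCState that speculative
// allocation may touch is captured first and rolled back afterwards, and
// this patch adds the running MaxStackArgAlign to that set.
unsigned SavedStackOffset = StackOffset;
unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
unsigned NumLocs = Locs.size();
// ... repeatedly invoke the CCAssignFn to harvest candidate registers ...
StackOffset = SavedStackOffset;           // undo speculative stack allocation
MaxStackArgAlign = SavedMaxStackArgAlign; // undo alignment bookkeeping
Locs.resize(NumLocs);                     // drop speculative CCValAssigns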
bool ModifiedDT; - /// OptSize - True if optimizing for size. + /// True if optimizing for size. bool OptSize; /// DataLayout for the Function being processed. @@ -167,30 +164,33 @@ class TypePromotionTransaction; } private: - bool EliminateFallThrough(Function &F); - bool EliminateMostlyEmptyBlocks(Function &F); - bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; - void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT); - bool OptimizeInst(Instruction *I, bool& ModifiedDT); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, + bool eliminateFallThrough(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F); + bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; + void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); + bool optimizeInst(Instruction *I, bool& ModifiedDT); + bool optimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy, unsigned AS); - bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT); - bool MoveExtToFormExtLoad(Instruction *&I); - bool OptimizeExtUses(Instruction *I); - bool OptimizeSelectInst(SelectInst *SI); - bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); - bool OptimizeExtractElementInst(Instruction *Inst); - bool DupRetToEnableTailCallOpts(BasicBlock *BB); - bool PlaceDbgValues(Function &F); + bool optimizeInlineAsmInst(CallInst *CS); + bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); + bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeExtUses(Instruction *I); + bool optimizeLoadExt(LoadInst *I); + bool optimizeSelectInst(SelectInst *SI); + bool optimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool optimizeSwitchInst(SwitchInst *CI); + bool optimizeExtractElementInst(Instruction *Inst); + bool dupRetToEnableTailCallOpts(BasicBlock *BB); + bool placeDbgValues(Function &F); bool sinkAndCmp(Function &F); - bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, + bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, Instruction *&Inst, const SmallVectorImpl<Instruction *> &Exts, unsigned CreatedInstCost); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); + void stripInvariantGroupMetadata(Instruction &I); }; } @@ -218,25 +218,31 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLI = TM->getSubtargetImpl(F)->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - OptSize = F.hasFnAttribute(Attribute::OptimizeForSize); + OptSize = F.optForSize(); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap<unsigned int, unsigned int> &BypassWidths = TLI->getBypassSlowDivWidths(); - for (Function::iterator I = F.begin(); I != F.end(); I++) - EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); + BasicBlock* BB = &*F.begin(); + while (BB != nullptr) { + // bypassSlowDivision may create new BBs, but we don't want to reapply the + // optimization to those blocks. + BasicBlock* Next = BB->getNextNode(); + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + BB = Next; + } } // Eliminate blocks that contain only PHI nodes and an // unconditional branch. 
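// The iteration idiom the new bypassSlowDivision loop uses, shown in
// isolation (F is an arbitrary Function here): capture the next block
// before transforming the current one, so blocks the transform creates are
// never revisited.
for (BasicBlock *BB = &*F.begin(); BB != nullptr;) {
  BasicBlock *Next = BB->getNextNode(); // safe against insertions after BB
  // ... transform BB, possibly splitting it or adding new blocks ...
  BB = Next;
}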
- EverMadeChange |= EliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. - EverMadeChange |= PlaceDbgValues(F); + EverMadeChange |= placeDbgValues(F); // If there is a mask, compare against zero, and branch that can be combined // into a single target instruction, push the mask and compare into branch @@ -251,9 +257,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; - MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration); + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) @@ -296,7 +302,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. if (EverMadeChange || MadeChange) - MadeChange |= EliminateFallThrough(F); + MadeChange |= eliminateFallThrough(F); EverMadeChange |= MadeChange; } @@ -314,14 +320,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { return EverMadeChange; } -/// EliminateFallThrough - Merge basic blocks which are connected -/// by a single edge, where one of the basic blocks has a single successor -/// pointing to the other basic block, which has a single predecessor. -bool CodeGenPrepare::EliminateFallThrough(Function &F) { +/// Merge basic blocks which are connected by a single edge, where one of the +/// basic blocks has a single successor pointing to the other basic block, +/// which has a single predecessor. +bool CodeGenPrepare::eliminateFallThrough(Function &F) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; // If the destination block has a single pred, then this is a trivial // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); @@ -342,22 +348,21 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { BB->moveBefore(&BB->getParent()->getEntryBlock()); // We have erased a block. Update the iterator. - I = BB; + I = BB->getIterator(); } } return Changed; } -/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, -/// debug info directives, and an unconditional branch. Passes before isel -/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for -/// isel. Start by eliminating these blocks so we can split them the way we -/// want them. -bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { +/// Eliminate blocks that contain only PHI nodes, debug info directives, and an +/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split +/// edges in ways that are non-optimal for isel. Start by eliminating these +/// blocks so we can split them the way we want them. +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { bool MadeChange = false; // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; // If this block doesn't end with an uncond branch, ignore it. 
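// An IR-level picture (illustrative labels) of the blocks this routine
// eliminates: nothing but PHIs, optional debug intrinsics, and a single
// unconditional branch, so the block can be folded into its destination
// once canMergeBlocks approves.
//
//   bb:                                   ; preds = %a, %b
//     %p = phi i32 [ 0, %a ], [ 1, %b ]
//     br label %dest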
BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); @@ -366,7 +371,7 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { // If the instruction before the branch (skipping debug info) isn't a phi // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI; + BasicBlock::iterator BBI = BI->getIterator(); if (BBI != BB->begin()) { --BBI; while (isa<DbgInfoIntrinsic>(BBI)) { @@ -383,19 +388,19 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { if (DestBB == BB) continue; - if (!CanMergeBlocks(BB, DestBB)) + if (!canMergeBlocks(BB, DestBB)) continue; - EliminateMostlyEmptyBlock(BB); + eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; } -/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a -/// single uncond branch between them, and BB contains no other non-phi +/// Return true if we can merge BB into DestBB if there is a single +/// unconditional branch between them, and BB contains no other non-phi /// instructions. -bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, +bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const { // We only want to eliminate blocks whose phi nodes are used by phi nodes in // the successor. If there are more complex condition (e.g. preheaders), @@ -461,9 +466,9 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, } -/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and -/// an unconditional branch in it. -void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { +/// Eliminate a basic block that has only phi's and an unconditional branch in +/// it. +void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { BranchInst *BI = cast<BranchInst>(BB->getTerminator()); BasicBlock *DestBB = BI->getSuccessor(0); @@ -527,19 +532,17 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // Computes a map of base pointer relocation instructions to corresponding // derived pointer relocation instructions given a vector of all relocate calls static void computeBaseDerivedRelocateMap( - const SmallVectorImpl<User *> &AllRelocateCalls, - DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> & - RelocateInstMap) { + const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls, + DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> + &RelocateInstMap) { // Collect information in two maps: one primarily for locating the base object // while filling the second map; the second map is the final structure holding // a mapping between Base and corresponding Derived relocate calls - DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap; - for (auto &U : AllRelocateCalls) { - GCRelocateOperands ThisRelocate(U); - IntrinsicInst *I = cast<IntrinsicInst>(U); - auto K = std::make_pair(ThisRelocate.getBasePtrIndex(), - ThisRelocate.getDerivedPtrIndex()); - RelocateIdxMap.insert(std::make_pair(K, I)); + DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap; + for (auto *ThisRelocate : AllRelocateCalls) { + auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), + ThisRelocate->getDerivedPtrIndex()); + RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); } for (auto &Item : RelocateIdxMap) { std::pair<unsigned, unsigned> Key = Item.first; @@ -547,7 +550,7 @@ static void computeBaseDerivedRelocateMap( // Base relocation: nothing to insert continue; - IntrinsicInst *I = Item.second; + GCRelocateInst *I = Item.second; auto BaseKey = 
std::make_pair(Key.first, Key.first); // We're iterating over RelocateIdxMap so we cannot modify it. @@ -580,22 +583,27 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, // Takes a RelocatedBase (base pointer relocation instruction) and Targets to // replace, computes a replacement, and affects it. static bool -simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, - const SmallVectorImpl<IntrinsicInst *> &Targets) { +simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, + const SmallVectorImpl<GCRelocateInst *> &Targets) { bool MadeChange = false; - for (auto &ToReplace : Targets) { - GCRelocateOperands MasterRelocate(RelocatedBase); - GCRelocateOperands ThisRelocate(ToReplace); - - assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() && + for (GCRelocateInst *ToReplace : Targets) { + assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && "Not relocating a derived object of the original base object"); - if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) { + if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { // A duplicate relocate call. TODO: coalesce duplicates. continue; } - Value *Base = ThisRelocate.getBasePtr(); - auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr()); + if (RelocatedBase->getParent() != ToReplace->getParent()) { + // Base and derived relocates are in different basic blocks. + // In this case transform is only valid when base dominates derived + // relocate. However it would be too expensive to check dominance + // for each such relocate, so we skip the whole transformation. + continue; + } + + Value *Base = ToReplace->getBasePtr(); + auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; @@ -631,21 +639,20 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // In this case, we can not find the bitcast any more. So we insert a new bitcast // no matter there is already one or not. In this way, we can handle all cases, and // the extra bitcast should be optimized away in later passes. - Instruction *ActualRelocatedBase = RelocatedBase; + Value *ActualRelocatedBase = RelocatedBase; if (RelocatedBase->getType() != Base->getType()) { ActualRelocatedBase = - cast<Instruction>(Builder.CreateBitCast(RelocatedBase, Base->getType())); + Builder.CreateBitCast(RelocatedBase, Base->getType()); } Value *Replacement = Builder.CreateGEP( Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); - Instruction *ReplacementInst = cast<Instruction>(Replacement); Replacement->takeName(ToReplace); // If the newly generated derived pointer's type does not match the original derived // pointer's type, cast the new derived pointer to match it. Same reasoning as above. 
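// A hedged IR-level sketch of the rewrite performed here (names are
// illustrative, not taken from a real test case): instead of relocating the
// derived pointer separately, recompute it as the same small-constant GEP
// applied to the relocated base.
//
//   %base.rel    = <relocate of %base over the statepoint>
//   %derived.rel = <relocate of %derived>            ; replaced by:
//   %derived.new = getelementptr i8, i8* %base.rel, i64 <small const offset>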
- Instruction *ActualReplacement = ReplacementInst; - if (ReplacementInst->getType() != ToReplace->getType()) { + Value *ActualReplacement = Replacement; + if (Replacement->getType() != ToReplace->getType()) { ActualReplacement = - cast<Instruction>(Builder.CreateBitCast(ReplacementInst, ToReplace->getType())); + Builder.CreateBitCast(Replacement, ToReplace->getType()); } ToReplace->replaceAllUsesWith(ActualReplacement); ToReplace->eraseFromParent(); @@ -674,12 +681,12 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // %val = load %ptr' bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { bool MadeChange = false; - SmallVector<User *, 2> AllRelocateCalls; + SmallVector<GCRelocateInst *, 2> AllRelocateCalls; for (auto *U : I.users()) - if (isGCRelocate(dyn_cast<Instruction>(U))) + if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U)) // Collect all the relocate calls associated with a statepoint - AllRelocateCalls.push_back(U); + AllRelocateCalls.push_back(Relocate); // We need at least one base pointer relocation + one derived pointer // relocation to mangle @@ -688,7 +695,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { // RelocateInstMap is a mapping from the base relocate instruction to the // corresponding derived relocate instructions - DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap; + DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap; computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); if (RelocateInstMap.empty()) return false; @@ -723,6 +730,12 @@ static bool SinkCast(CastInst *CI) { // Preincrement use iterator so we don't invalidate it. ++UI; + // If the block selected to receive the cast is an EH pad that does not + // allow non-PHI instructions before the terminator, we can't sink the + // cast. + if (UserBB->getTerminator()->isEHPad()) + continue; + // If this user is in the same block as the cast, don't change the cast. if (UserBB == DefBB) continue; @@ -731,9 +744,9 @@ static bool SinkCast(CastInst *CI) { if (!InsertedCast) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCast = - CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", - InsertPt); + assert(InsertPt != UserBB->end()); + InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), + CI->getType(), "", &*InsertPt); } // Replace a use of the cast with a use of the new cast. @@ -751,10 +764,9 @@ static bool SinkCast(CastInst *CI) { return MadeChange; } -/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop -/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), -/// sink it into user blocks to reduce the number of virtual -/// registers that must be created and coalesced. +/// If the specified cast instruction is a noop copy (e.g. it's casting from +/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to +/// reduce the number of virtual registers that must be created and coalesced. /// /// Return true if any changes are made. /// @@ -789,8 +801,8 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, return SinkCast(CI); } -/// CombineUAddWithOverflow - try to combine CI into a call to the -/// llvm.uadd.with.overflow intrinsic if possible. +/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if +/// possible. /// /// Return true if any changes were made.
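// A hedged IR illustration of the combine named above: an add feeding an
// unsigned "did it wrap" compare becomes one intrinsic call whose two
// results replace the add and the compare.
//
//   %add = add i32 %x, %y
//   %cmp = icmp ult i32 %add, %x                   ; overflow check
// becomes:
//   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
//   %add  = extractvalue { i32, i1 } %uadd, 0
//   %cmp  = extractvalue { i32, i1 } %uadd, 1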
static bool CombineUAddWithOverflow(CmpInst *CI) { @@ -818,7 +830,7 @@ static bool CombineUAddWithOverflow(CmpInst *CI) { assert(*AddI->user_begin() == CI && "expected!"); #endif - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); auto *InsertPt = AddI->hasOneUse() ? CI : AddI; @@ -836,16 +848,16 @@ static bool CombineUAddWithOverflow(CmpInst *CI) { return true; } -/// SinkCmpExpression - Sink the given CmpInst into user blocks to reduce -/// the number of virtual registers that must be created and coalesced. This is -/// a clear win except on targets with multiple condition code registers -/// (PowerPC), where it might lose; some adjustment may be wanted there. +/// Sink the given CmpInst into user blocks to reduce the number of virtual +/// registers that must be created and coalesced. This is a clear win except on +/// targets with multiple condition code registers (PowerPC), where it might +/// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. static bool SinkCmpExpression(CmpInst *CI) { BasicBlock *DefBB = CI->getParent(); - /// InsertedCmp - Only insert a cmp in each block once. + /// Only insert a cmp in each block once. DenseMap<BasicBlock*, CmpInst*> InsertedCmps; bool MadeChange = false; @@ -872,10 +884,10 @@ static bool SinkCmpExpression(CmpInst *CI) { if (!InsertedCmp) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); InsertedCmp = - CmpInst::Create(CI->getOpcode(), - CI->getPredicate(), CI->getOperand(0), - CI->getOperand(1), "", InsertPt); + CmpInst::Create(CI->getOpcode(), CI->getPredicate(), + CI->getOperand(0), CI->getOperand(1), "", &*InsertPt); } // Replace a use of the cmp with a use of the new cmp. @@ -903,8 +915,8 @@ static bool OptimizeCmpExpression(CmpInst *CI) { return false; } -/// isExtractBitsCandidateUse - Check if the candidates could -/// be combined with shift instruction, which includes: +/// Check if the candidates could be combined with a shift instruction, which +/// includes: /// 1. Truncate instruction /// 2. And instruction and the imm is a mask of the low bits: /// imm & (imm+1) == 0 @@ -922,8 +934,7 @@ static bool isExtractBitsCandidateUse(Instruction *User) { return true; } -/// SinkShiftAndTruncate - sink both shift and truncate instruction -/// to the use of truncate's BB. +/// Sink both shift and truncate instruction to the use of truncate's BB. 
static bool SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts, @@ -970,20 +981,22 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, if (!InsertedShift && !InsertedTrunc) { BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); + assert(InsertPt != TruncUserBB->end()); // Sink the shift if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); // Sink the trunc BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); TruncInsertPt++; + assert(TruncInsertPt != TruncUserBB->end()); InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, - TruncI->getType(), "", TruncInsertPt); + TruncI->getType(), "", &*TruncInsertPt); MadeChange = true; @@ -993,10 +1006,10 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, return MadeChange; } -/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if -/// the uses could potentially be combined with this shift instruction and -/// generate BitExtract instruction. It will only be applied if the architecture -/// supports BitExtract instruction. Here is an example: +/// Sink the shift *right* instruction into user blocks if the uses could +/// potentially be combined with this shift instruction and generate BitExtract +/// instruction. It will only be applied if the architecture supports BitExtract +/// instruction. 
Here is an example: /// BB1: /// %x.extract.shift = lshr i64 %arg1, 32 /// BB2: @@ -1067,13 +1080,14 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, if (!InsertedShift) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); MadeChange = true; } @@ -1089,12 +1103,12 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// ScalarizeMaskedLoad() translates masked load intrinsic, like +// Translate a masked load intrinsic like // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, // <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, whith loading element one-by-one if +// to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set -// +// // %1 = bitcast i8* %addr to i32* // %2 = extractelement <16 x i1> %mask, i32 0 // %3 = icmp eq i1 %2, true @@ -1126,35 +1140,68 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, // static void ScalarizeMaskedLoad(CallInst *CI) { Value *Ptr = CI->getArgOperand(0); - Value *Src0 = CI->getArgOperand(3); + Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - Type *EltTy = VecType->getElementType(); + Value *Src0 = CI->getArgOperand(3); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); assert(VecType && "Unexpected return type of masked load intrinsic"); + Type *EltTy = CI->getType()->getVectorElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); BasicBlock *CondBlock = nullptr; BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); + Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa<Constant>(Mask) && + cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. 
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr from i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + Value *UndefVal = UndefValue::get(VecType); // The result vector Value *VResult = UndefVal; + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + PHINode *Phi = nullptr; Value *PrevPhi = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration @@ -1182,16 +1229,17 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // %Elt = load i32* %EltAddr // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); Builder.SetInsertPoint(InsertPt); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateLoad(Gep, false); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -1208,7 +1256,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) { CI->eraseFromParent(); } -// ScalarizeMaskedStore() translates masked store intrinsic, like +// Translate a masked store intrinsic, like // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, // <16 x i1> %mask) // to a chain of basic blocks, that stores element one-by-one if @@ -1224,12 +1272,12 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // %5 = getelementptr i32* %1, i32 0 // store i32 %4, i32* %5 // br label %else -// +// // else: ; preds = %0, %cond.store // %6 = extractelement <16 x i1> %mask, i32 1 // %7 = icmp eq i1 %6, true // br i1 %7, label %cond.store1, label %else2 -// +// // cond.store1: ; preds = %else // %8 = extractelement <16 x i32> %val, i32 1 // %9 = getelementptr i32* %1, i32 1 @@ -1237,34 +1285,61 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // br label %else2 // . . .
static void ScalarizeMaskedStore(CallInst *CI) { - Value *Ptr = CI->getArgOperand(1); Value *Src = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); Value *Mask = CI->getArgOperand(3); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); VectorType *VecType = dyn_cast<VectorType>(Src->getType()); - Type *EltTy = VecType->getElementType(); - assert(VecType && "Unexpected data type in masked store intrinsic"); + Type *EltTy = VecType->getElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa<Constant>(Mask) && + cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Builder.CreateAlignedStore(Src, Ptr, AlignVal); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr from i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); + + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.store, label %else + // br i1 %to_store, label %cond.store, label %else // Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, @@ -1276,13 +1351,259 @@ static void ScalarizeMaskedStore(CallInst *CI) { // %EltAddr = getelementptr i32* %1, i32 0 // %store i32 %OneElt, i32* %EltAddr // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + BasicBlock *CondBlock = + IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); Builder.SetInsertPoint(InsertPt); - + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateStore(OneElt, Gep); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +// Translate a masked gather intrinsic like +// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, +// <16 x i1> %Mask, <16 x i32> %Src) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1>
%Mask, i32 0 +// % ToLoad0 = icmp eq i1 % Mask0, true +// br i1 % ToLoad0, label %cond.load, label %else +// +// cond.load: +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// % Load0 = load i32, i32* % Ptr0, align 4 +// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 +// br label %else +// +// else: +// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] +// % Mask1 = extractelement <16 x i1> %Mask, i32 1 +// % ToLoad1 = icmp eq i1 % Mask1, true +// br i1 % ToLoad1, label %cond.load1, label %else2 +// +// cond.load1: +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// % Load1 = load i32, i32* % Ptr1, align 4 +// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 +// br label %else2 +// . . . +// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void ScalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx), + "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load 
= Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked scatter intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void ScalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa<VectorType>(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa<VectorType>(Ptrs->getType()) && + isa<PointerType>(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
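// How the branchy gather expansion above stitches lanes together, shown
// schematically (names follow the comments in the hunk): each "else" block
// starts with a PHI merging the vector produced when the previous lane was
// loaded with the vector that bypassed it, so VResult always holds every
// lane gathered so far.
//
//   else:
//     %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]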
+ bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = + Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); @@ -1295,7 +1616,86 @@ static void ScalarizeMaskedStore(CallInst *CI) { CI->eraseFromParent(); } -bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { +/// If counting leading or trailing zeros is an expensive operation and a zero +/// input is defined, add a check for zero to avoid calling the intrinsic. +/// +/// We want to transform: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) +/// +/// into: +/// entry: +/// %cmpz = icmp eq i64 %A, 0 +/// br i1 %cmpz, label %cond.end, label %cond.false +/// cond.false: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) +/// br label %cond.end +/// cond.end: +/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] +/// +/// If the transform is performed, return true and set ModifiedDT to true. +static bool despeculateCountZeros(IntrinsicInst *CountZeros, + const TargetLowering *TLI, + const DataLayout *DL, + bool &ModifiedDT) { + if (!TLI || !DL) + return false; + + // If a zero input is undefined, it doesn't make sense to despeculate that. + if (match(CountZeros->getOperand(1), m_One())) + return false; + + // If it's cheap to speculate, there's nothing to do. + auto IntrinsicID = CountZeros->getIntrinsicID(); + if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || + (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) + return false; + + // Only handle legal scalar cases. Anything else requires too much work. + Type *Ty = CountZeros->getType(); + unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); + if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize()) + return false; + + // The intrinsic will be sunk behind a compare against zero and branch. + BasicBlock *StartBlock = CountZeros->getParent(); + BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + + // Create another block after the count zero intrinsic. 
A PHI will be added + // in this block to select the result of the intrinsic or the bit-width + // constant if the input to the intrinsic is zero. + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); + BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + + // Set up a builder to create a compare, conditional branch, and PHI. + IRBuilder<> Builder(CountZeros->getContext()); + Builder.SetInsertPoint(StartBlock->getTerminator()); + Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); + + // Replace the unconditional branch that was created by the first split with + // a compare against zero and a conditional branch. + Value *Zero = Constant::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); + Builder.CreateCondBr(Cmp, EndBlock, CallBlock); + StartBlock->getTerminator()->eraseFromParent(); + + // Create a PHI in the end block to select either the output of the intrinsic + // or the bit width of the operand. + Builder.SetInsertPoint(&EndBlock->front()); + PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); + CountZeros->replaceAllUsesWith(PN); + Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); + PN->addIncoming(BitWidth, StartBlock); + PN->addIncoming(CountZeros, CallBlock); + + // We are explicitly handling the zero case, so we can set the intrinsic's + // undefined zero argument to 'true'. This will also prevent reprocessing the + // intrinsic; we only despeculate when a zero input is defined. + CountZeros->setArgOperand(1, Builder.getTrue()); + ModifiedDT = true; + return true; +} + +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. @@ -1311,7 +1711,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return true; } // Sink address computing for memory operands into the block. - if (OptimizeInlineAsmInst(CI)) + if (optimizeInlineAsmInst(CI)) return true; } @@ -1372,14 +1772,14 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. - WeakVH IterHandle(CurInstIterator); + WeakVH IterHandle(&*CurInstIterator); replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); // If the iterator instruction was recursively deleted, start over at the // start of the block. 
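// The control-flow surgery despeculateCountZeros performs, condensed (names
// from the hunk above): two splits isolate the intrinsic in its own block,
// then the unconditional branch the first split created is replaced by the
// zero test.
BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
Builder.SetInsertPoint(StartBlock->getTerminator());
Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
Builder.CreateCondBr(Cmp, EndBlock, CallBlock); // zero input skips the call
StartBlock->getTerminator()->eraseFromParent(); // drop the split's old br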
- if (IterHandle != CurInstIterator) { + if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { CurInstIterator = BB->begin(); SunkAddrs.clear(); } @@ -1387,7 +1787,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { } case Intrinsic::masked_load: { // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) { + if (!TTI->isLegalMaskedLoad(CI->getType())) { ScalarizeMaskedLoad(CI); ModifiedDT = true; return true; @@ -1395,13 +1795,29 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return false; } case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { ScalarizeMaskedStore(CI); ModifiedDT = true; return true; } return false; } + case Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + ScalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + ScalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); @@ -1415,6 +1831,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { InsertedInsts.insert(ExtVal); return true; } + case Intrinsic::invariant_group_barrier: + II->replaceAllUsesWith(II->getArgOperand(0)); + II->eraseFromParent(); + return true; + + case Intrinsic::cttz: + case Intrinsic::ctlz: + // If counting zeros is expensive, try to avoid it. + return despeculateCountZeros(II, TLI, DL, ModifiedDT); } if (TLI) { @@ -1426,7 +1851,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { Type *AccessTy; if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) return true; } } @@ -1447,9 +1872,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return false; } -/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return -/// instructions to the predecessor to enable tail call optimizations. The -/// case it is currently looking for is: +/// Look for opportunities to duplicate return instructions to the predecessor +/// to enable tail call optimizations. The case it is currently looking for is: /// @code /// bb0: /// %tmp0 = tail call i32 @f0() @@ -1478,7 +1902,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 /// @endcode -bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { +bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; @@ -1597,7 +2021,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { namespace { -/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode +/// This is an extended version of TargetLowering::AddrMode /// which holds actual Value*'s for register values. struct ExtAddrMode : public TargetLowering::AddrMode { Value *BaseReg; @@ -1709,10 +2133,10 @@ class TypePromotionTransaction { public: /// \brief Record the position of \p Inst. 
InsertionHandler(Instruction *Inst) { - BasicBlock::iterator It = Inst; + BasicBlock::iterator It = Inst->getIterator(); HasPrevInstruction = (It != (Inst->getParent()->begin())); if (HasPrevInstruction) - Point.PrevInst = --It; + Point.PrevInst = &*--It; else Point.BB = Inst->getParent(); } @@ -1724,7 +2148,7 @@ class TypePromotionTransaction { Inst->removeFromParent(); Inst->insertAfter(Point.PrevInst); } else { - Instruction *Position = Point.BB->getFirstInsertionPt(); + Instruction *Position = &*Point.BB->getFirstInsertionPt(); if (Inst->getParent()) Inst->moveBefore(Position); else @@ -1797,7 +2221,7 @@ class TypePromotionTransaction { Value *Val = Inst->getOperand(It); OriginalValues.push_back(Val); // Set a dummy one. - // We could use OperandSetter here, but that would implied an overhead + // We could use OperandSetter here, but that would imply an overhead // that we are not willing to pay. Inst->setOperand(It, UndefValue::get(Val->getType())); } @@ -2111,7 +2535,7 @@ class AddressingModeMatcher { unsigned AddrSpace; Instruction *MemoryInst; - /// AddrMode - This is the addressing mode that we're building up. This is + /// This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; @@ -2122,9 +2546,8 @@ class AddressingModeMatcher { /// The ongoing transaction where every action should be registered. TypePromotionTransaction &TPT; - /// IgnoreProfitability - This is set to true when we should not do - /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode - /// always returns true. + /// This is set to true when we should not do profitability checks. + /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, @@ -2143,7 +2566,7 @@ class AddressingModeMatcher { } public: - /// Match - Find the maximal addressing mode that a load/store of V can fold, + /// Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. /// \p InsertedInsts The instructions inserted by other CodeGenPrepare @@ -2161,32 +2584,32 @@ public: bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, MemoryInst, Result, InsertedInsts, - PromotedInsts, TPT).MatchAddr(V, 0); + PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; } private: - bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); - bool MatchAddr(Value *V, unsigned Depth); - bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, + bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); + bool matchAddr(Value *V, unsigned Depth); + bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, bool *MovedAway = nullptr); - bool IsProfitableToFoldIntoAddressingMode(Instruction *I, + bool isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter); - bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); - bool IsPromotionProfitable(unsigned NewCost, unsigned OldCost, + bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); + bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const; }; -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. 
+/// Try adding ScaleReg*Scale to the current addressing mode. /// Return true and update AddrMode if this addr mode is legal for the target, /// false if not. -bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, +bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth) { // If Scale is 1, then this is the same as adding ScaleReg to the addressing // mode. Just process that directly. if (Scale == 1) - return MatchAddr(ScaleReg, Depth); + return matchAddr(ScaleReg, Depth); // If the scale is 0, it takes nothing to add this. if (Scale == 0) @@ -2233,9 +2656,9 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, return true; } -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// This is a little filter, which returns true if an addressing computation +/// involving I might be folded into a load/store accessing it. +/// This doesn't need to be perfect, but needs to accept at least /// the set of instructions that MatchOperationAddr can. static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { @@ -2301,9 +2724,7 @@ class TypePromotionHelper { /// \brief Utility function to determine if \p OpIdx should be promoted when /// promoting \p Inst. static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { - if (isa<SelectInst>(Inst) && OpIdx == 0) - return false; - return true; + return !(isa<SelectInst>(Inst) && OpIdx == 0); } /// \brief Utility function to promote the operand of \p Ext when this @@ -2413,8 +2834,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Value *OpndVal = Inst->getOperand(0); // Check if we can use this operand in the extension. - // If the type is larger than the result type of the extension, - // we cannot. + // If the type is larger than the result type of the extension, we cannot. if (!OpndVal->getType()->isIntegerTy() || OpndVal->getType()->getIntegerBitWidth() > ConsideredExtType->getIntegerBitWidth()) @@ -2433,18 +2853,16 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // #1 get the type of the operand and check the kind of the extended bits. const Type *OpndType; InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); - if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt) - OpndType = It->second.Ty; + if (It != PromotedInsts.end() && It->second.getInt() == IsSExt) + OpndType = It->second.getPointer(); else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd))) OpndType = Opnd->getOperand(0)->getType(); else return false; - // #2 check that the truncate just drop extended bits. - if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth()) - return true; - - return false; + // #2 check that the truncate just drops extended bits. + return Inst->getType()->getIntegerBitWidth() >= + OpndType->getIntegerBitWidth(); } TypePromotionHelper::Action TypePromotionHelper::getAction( @@ -2553,7 +2971,7 @@ Value *TypePromotionHelper::promoteOperandForOther( } TPT.replaceAllUsesWith(ExtOpnd, Trunc); - // Restore the operand of Ext (which has been replace by the previous call + // Restore the operand of Ext (which has been replaced by the previous call // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. 
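      // (Illustration: Trunc was created as a truncate of Ext itself, so the
      // replaceAllUsesWith above also rewrote Ext's own operand to Trunc,
      // leaving Ext = ext(Trunc) while Trunc = trunc(Ext); restoring the
      // operand below breaks that cycle.)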
TPT.setOperand(Ext, 0, ExtOpnd);
 }
@@ -2631,8 +3049,7 @@ Value *TypePromotionHelper::promoteOperandForOther(
   return ExtOpnd;
 }
 
-/// IsPromotionProfitable - Check whether or not promoting an instruction
-/// to a wider type was profitable.
+/// Check whether or not promoting an instruction to a wider type is profitable.
 /// \p NewCost gives the cost of extension instructions created by the
 /// promotion.
 /// \p OldCost gives the cost of extension instructions before the promotion
 /// plus the number of instructions that have been
 /// matched in the addressing mode during the promotion.
 /// \p PromotedOperand is the value that has been promoted.
 /// \return True if the promotion is profitable, false otherwise.
-bool AddressingModeMatcher::IsPromotionProfitable(
+bool AddressingModeMatcher::isPromotionProfitable(
     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
   DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
   // The cost of the new extensions is greater than the cost of the
@@ -2656,9 +3073,9 @@ Value *TypePromotionHelper::promoteOperandForOther(
   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
 }
 
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode. If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
+/// Given an instruction or constant expr, see if we can fold the operation
+/// into the addressing mode. If so, update the addressing mode and return
+/// true, otherwise return false without modifying AddrMode.
 /// If \p MovedAway is not NULL, it contains the information of whether or
 /// not AddrInst has to be folded into the addressing mode on success.
 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
@@ -2667,7 +3084,7 @@ bool AddressingModeMatcher::IsPromotionProfitable(
 /// This state can happen when AddrInst is a sext, since it may be moved away.
 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
 /// not be referenced anymore.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
                                                unsigned Depth,
                                                bool *MovedAway) {
   // Avoid exponential behavior on extremely deep expression trees.
@@ -2680,13 +3097,13 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
   switch (Opcode) {
   case Instruction::PtrToInt:
     // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
+    return matchAddr(AddrInst->getOperand(0), Depth);
   case Instruction::IntToPtr: {
     auto AS = AddrInst->getType()->getPointerAddressSpace();
     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
     // This inttoptr is a no-op if the integer type is pointer sized.
     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   }
   case Instruction::BitCast:
@@ -2698,14 +3115,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     // and we don't want to mess around with them. Assume it knows what it
     // is doing.
AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   case Instruction::AddrSpaceCast: {
     unsigned SrcAS
       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
     if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   }
   case Instruction::Add: {
@@ -2719,8 +3136,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
 
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
+    if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
+        matchAddr(AddrInst->getOperand(0), Depth+1))
       return true;
 
     // Restore the old addr mode info.
@@ -2729,8 +3146,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     TPT.rollback(LastKnownGood);
 
     // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
+    if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
+        matchAddr(AddrInst->getOperand(1), Depth+1))
       return true;
 
     // Otherwise we definitely can't merge the ADD in.
@@ -2752,7 +3169,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     if (Opcode == Instruction::Shl)
       Scale = 1LL << Scale;
 
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+    return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
   }
   case Instruction::GetElementPtr: {
     // Scan the GEP. We check whether it contains constant offsets and at most
@@ -2791,7 +3208,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     if (ConstantOffset == 0 ||
         TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
       // Check to see if we can fold the base pointer in too.
-      if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+      if (matchAddr(AddrInst->getOperand(0), Depth+1))
         return true;
     }
     AddrMode.BaseOffs -= ConstantOffset;
@@ -2806,7 +3223,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     AddrMode.BaseOffs += ConstantOffset;
 
     // Match the base operand of the GEP.
-    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+    if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
       // If it couldn't be matched, just stuff the value in a register.
       if (AddrMode.HasBaseReg) {
         AddrMode = BackupAddrMode;
@@ -2818,7 +3235,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     }
 
     // Match the remaining variable portion of the GEP.
-    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+    if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
                           Depth)) {
       // If it couldn't be matched, try stuffing the base into a register
       // instead of matching it, and retrying the match of the scale.
@@ -2829,7 +3246,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
       AddrMode.HasBaseReg = true;
       AddrMode.BaseReg = AddrInst->getOperand(0);
       AddrMode.BaseOffs += ConstantOffset;
-      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+      if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
                             VariableScale, Depth)) {
         // If even that didn't work, bail.
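        // (Illustrative example of the shape matched by this GEP case: in
        //   %p = getelementptr [16 x i32], [16 x i32]* %base, i64 0, i64 %i
        // the constant part of the offset accumulates into AddrMode.BaseOffs
        // and %i becomes the scaled component with Scale = 4, the element
        // size; the IR names here are made up for the example.)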
AddrMode = BackupAddrMode; @@ -2879,12 +3296,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); - if (!MatchAddr(PromotedOperand, Depth) || - // The total of the new cost is equals to the cost of the created + if (!matchAddr(PromotedOperand, Depth) || + // The total of the new cost is equal to the cost of the created // instructions. - // The total of the old cost is equals to the cost of the extension plus + // The total of the old cost is equal to the cost of the extension plus // what we have saved in the addressing mode. - !IsPromotionProfitable(CreatedInstsCost, + !isPromotionProfitable(CreatedInstsCost, ExtCost + (AddrModeInsts.size() - OldSize), PromotedOperand)) { AddrMode = BackupAddrMode; @@ -2899,12 +3316,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return false; } -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. +/// If we can, try to add the value of 'Addr' into the current addressing mode. +/// If Addr can't be added to AddrMode this returns false and leaves AddrMode +/// unmodified. This assumes that Addr is either a pointer type or intptr_t +/// for the target. /// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { +bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { // Start a transaction at this point that we will rollback if the matching // fails. TypePromotionTransaction::ConstRestorationPt LastKnownGood = @@ -2929,8 +3346,8 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // Check to see if it is possible to fold this operation. bool MovedAway = false; - if (MatchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { - // This instruction may have been move away. If so, there is nothing + if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { + // This instruction may have been moved away. If so, there is nothing // to check here. if (MovedAway) return true; @@ -2938,7 +3355,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // *profitable* to do so. We use a simple cost model to avoid increasing // register pressure too much. if (I->hasOneUse() || - IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { AddrModeInsts.push_back(I); return true; } @@ -2950,7 +3367,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { TPT.rollback(LastKnownGood); } } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { - if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) + if (matchOperationAddr(CE, CE->getOpcode(), Depth)) return true; TPT.rollback(LastKnownGood); } else if (isa<ConstantPointerNull>(Addr)) { @@ -2983,9 +3400,8 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { return false; } -/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified -/// inline asm call are due to memory operands. If so, return true, otherwise -/// return false. +/// Check to see if all uses of OpVal by the specified inline asm call are due +/// to memory operands. If so, return true, otherwise return false. 
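/// For example (made-up asm, illustrative only): in
///   call void asm sideeffect "incl $0", "=*m"(i32* %p)
/// the only use of %p is an indirect memory operand, so this returns true
/// and %p's address computation becomes a candidate for optimizeMemoryInst.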
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetMachine &TM) { const Function *F = CI->getParent()->getParent(); @@ -3011,8 +3427,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, return true; } -/// FindAllMemoryUses - Recursively walk all the uses of I until we find a -/// memory use. If we find an obviously non-foldable instruction, return true. +/// Recursively walk all the uses of I until we find a memory use. +/// If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. static bool FindAllMemoryUses( Instruction *I, @@ -3059,11 +3475,11 @@ static bool FindAllMemoryUses( return false; } -/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at -/// the use site that we're folding it into. If so, there is no cost to -/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values -/// that we know are live at the instruction already. -bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, +/// Return true if Val is already known to be live at the use site that we're +/// folding it into. If so, there is no cost to include it in the addressing +/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the +/// instruction already. +bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, Value *KnownLive2) { // If Val is either of the known-live values, we know it is live! if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) @@ -3085,11 +3501,11 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, return Val->isUsedInBasicBlock(MemoryInst->getParent()); } -/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing -/// mode of the machine to fold the specified instruction into a load or store -/// that ultimately uses it. However, the specified instruction has multiple -/// uses. Given this, it may actually increase register pressure to fold it -/// into the load. For example, consider this code: +/// It is possible for the addressing mode of the machine to fold the specified +/// instruction into a load or store that ultimately uses it. +/// However, the specified instruction has multiple uses. +/// Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: /// /// X = ... /// Y = X+1 @@ -3107,7 +3523,7 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, /// X was live across 'load Z' for other reasons, we actually *would* want to /// fold the addressing mode in the Z case. This would make Y die earlier. bool AddressingModeMatcher:: -IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, +isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; @@ -3124,9 +3540,9 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. 
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
     BaseReg = nullptr;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
     ScaledReg = nullptr;
 
   // If folding this instruction (and its subexprs) didn't extend any live
@@ -3171,7 +3587,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
                                      MemoryInst, Result, InsertedInsts,
                                      PromotedInsts, TPT);
     Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
+    bool Success = Matcher.matchAddr(Address, 0);
     (void)Success;
     assert(Success && "Couldn't select *anything*?");
     // The match was to check the profitability, the changes made are not
@@ -3192,7 +3608,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
 }
 
 } // end anonymous namespace
 
-/// IsNonLocalValue - Return true if the specified values are defined in a
+/// Return true if the specified values are defined in a
 /// different basic block than BB.
 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   if (Instruction *I = dyn_cast<Instruction>(V))
@@ -3200,16 +3616,15 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   return false;
 }
 
-/// OptimizeMemoryInst - Load and Store Instructions often have
-/// addressing modes that can do significant amounts of computation. As such,
-/// instruction selection will try to get the load or store to do as much
-/// computation as possible for the program. The problem is that isel can only
-/// see within a single block. As such, we sink as much legal addressing mode
-/// stuff into the block as possible.
+/// Load and Store Instructions often have addressing modes that can do
+/// significant amounts of computation. As such, instruction selection will try
+/// to get the load or store to do as much computation as possible for the
+/// program. The problem is that isel can only see within a single block. As
+/// such, we sink as much legal addressing mode work into the block as possible.
 ///
 /// This method is used to optimize both load/store and inline asms with memory
 /// operands.
-bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                                         Type *AccessTy, unsigned AddrSpace) {
   Value *Repl = Addr;
 
@@ -3530,12 +3945,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   if (Repl->use_empty()) {
     // This can cause recursive deletion, which can invalidate our iterator.
     // Use a WeakVH to hold onto it in case this happens.
-    WeakVH IterHandle(CurInstIterator);
+    WeakVH IterHandle(&*CurInstIterator);
     BasicBlock *BB = CurInstIterator->getParent();
 
     RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
 
-    if (IterHandle != CurInstIterator) {
+    if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
       // If the iterator instruction was recursively deleted, start over at the
       // start of the block.
       CurInstIterator = BB->begin();
@@ -3546,10 +3961,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   return true;
 }
 
-/// OptimizeInlineAsmInst - If there are any memory operands, use
-/// OptimizeMemoryInst to sink their address computing into the block when
-/// possible / profitable.
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
+/// If there are any memory operands, use optimizeMemoryInst to sink their
+/// address computing into the block when possible / profitable.
+bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
   bool MadeChange = false;
 
   const TargetRegisterInfo *TRI =
@@ -3566,7 +3980,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         OpInfo.isIndirect) {
       Value *OpVal = CS->getArgOperand(ArgNo++);
-      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
+      MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
     } else if (OpInfo.Type == InlineAsm::isInput)
       ArgNo++;
   }
@@ -3646,7 +4060,7 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
 /// %add = add nuw i64 %zext, 4
 /// \endcode
 /// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
+bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
                                     LoadInst *&LI, Instruction *&Inst,
                                     const SmallVectorImpl<Instruction *> &Exts,
                                     unsigned CreatedInstsCost = 0) {
@@ -3696,7 +4110,7 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
   }
   // The promotion is profitable.
   // Check if it exposes an ext(load).
-  (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
+  (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
   if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
              // If we have created a new extension, i.e., now we have two
              // extensions. We must make sure one of them is merged with
@@ -3713,13 +4127,13 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
   return false;
 }
 
-/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
-/// basic block as the load, unless conditions are unfavorable. This allows
-/// SelectionDAG to fold the extend into the load.
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
 /// \p I[in/out] the extension may be modified during the process if some
 /// promotions apply.
 ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
   // Try to promote a chain of computation if it allows to form
   // an extended load.
   TypePromotionTransaction TPT;
@@ -3730,7 +4144,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
   // Look for a load being extended.
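  // (Illustrative only: starting from
  //    bb0:  %ld = load i32, i32* %p
  //    bb1:  %z  = zext i32 %ld to i64
  //  a successful match ends with %z, after any promotions, sitting next to
  //  %ld in bb0 so isel can fold the pair into one extending load.)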
LoadInst *LI = nullptr; Instruction *OldExt = I; - bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts); + bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); if (!LI || !I) { assert(!HasPromoted && !LI && "If we did not match any load instruction " "the code must remain the same"); @@ -3780,7 +4194,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) { return true; } -bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { +bool CodeGenPrepare::optimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); // If the result of a {s|z}ext and its source are both live out, rewrite all @@ -3838,7 +4252,8 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { if (!InsertedTrunc) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); + assert(InsertPt != UserBB->end()); + InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); InsertedInsts.insert(InsertedTrunc); } @@ -3851,9 +4266,202 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } -/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be -/// turned into an explicit branch. -static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { +// Find loads whose uses only use some of the loaded value's bits. Add an "and" +// just after the load if the target can fold this into one extload instruction, +// with the hope of eliminating some of the other later "and" instructions using +// the loaded value. "and"s that are made trivially redundant by the insertion +// of the new "and" are removed by this function, while others (e.g. those whose +// path from the load goes through a phi) are left for isel to potentially +// remove. +// +// For example: +// +// b0: +// x = load i32 +// ... +// b1: +// y = and x, 0xff +// z = use y +// +// becomes: +// +// b0: +// x = load i32 +// x' = and x, 0xff +// ... +// b1: +// z = use x' +// +// whereas: +// +// b0: +// x1 = load i32 +// ... +// b1: +// x2 = load i32 +// ... +// b2: +// x = phi x1, x2 +// y = and x, 0xff +// +// becomes (after a call to optimizeLoadExt for each load): +// +// b0: +// x1 = load i32 +// x1' = and x1, 0xff +// ... +// b1: +// x2 = load i32 +// x2' = and x2, 0xff +// ... +// b2: +// x = phi x1', x2' +// y = and x, 0xff +// + +bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { + + if (!Load->isSimple() || + !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) + return false; + + // Skip loads we've already transformed or have no reason to transform. + if (Load->hasOneUse()) { + User *LoadUser = *Load->user_begin(); + if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && + !dyn_cast<PHINode>(LoadUser)) + return false; + } + + // Look at all uses of Load, looking through phis, to determine how many bits + // of the loaded value are needed. + SmallVector<Instruction *, 8> WorkList; + SmallPtrSet<Instruction *, 16> Visited; + SmallVector<Instruction *, 8> AndsToMaybeRemove; + for (auto *U : Load->users()) + WorkList.push_back(cast<Instruction>(U)); + + EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); + unsigned BitWidth = LoadResultVT.getSizeInBits(); + APInt DemandBits(BitWidth, 0); + APInt WidestAndBits(BitWidth, 0); + + while (!WorkList.empty()) { + Instruction *I = WorkList.back(); + WorkList.pop_back(); + + // Break use-def graph loops. + if (!Visited.insert(I).second) + continue; + + // For a PHI node, push all of its users. 
+ if (auto *Phi = dyn_cast<PHINode>(I)) { + for (auto *U : Phi->users()) + WorkList.push_back(cast<Instruction>(U)); + continue; + } + + switch (I->getOpcode()) { + case llvm::Instruction::And: { + auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!AndC) + return false; + APInt AndBits = AndC->getValue(); + DemandBits |= AndBits; + // Keep track of the widest and mask we see. + if (AndBits.ugt(WidestAndBits)) + WidestAndBits = AndBits; + if (AndBits == WidestAndBits && I->getOperand(0) == Load) + AndsToMaybeRemove.push_back(I); + break; + } + + case llvm::Instruction::Shl: { + auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!ShlC) + return false; + uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); + auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); + DemandBits |= ShlDemandBits; + break; + } + + case llvm::Instruction::Trunc: { + EVT TruncVT = TLI->getValueType(*DL, I->getType()); + unsigned TruncBitWidth = TruncVT.getSizeInBits(); + auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); + DemandBits |= TruncBits; + break; + } + + default: + return false; + } + } + + uint32_t ActiveBits = DemandBits.getActiveBits(); + // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the + // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, + // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but + // (and (load x) 1) is not matched as a single instruction, rather as a LDR + // followed by an AND. + // TODO: Look into removing this restriction by fixing backends to either + // return false for isLoadExtLegal for i1 or have them select this pattern to + // a single instruction. + // + // Also avoid hoisting if we didn't see any ands with the exact DemandBits + // mask, since these are the only ands that will be removed by isel. + if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + WidestAndBits != DemandBits) + return false; + + LLVMContext &Ctx = Load->getType()->getContext(); + Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); + EVT TruncVT = TLI->getValueType(*DL, TruncTy); + + // Reject cases that won't be matched as extloads. + if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || + !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) + return false; + + IRBuilder<> Builder(Load->getNextNode()); + auto *NewAnd = dyn_cast<Instruction>( + Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + + // Replace all uses of load with new and (except for the use of load in the + // new and itself). + Load->replaceAllUsesWith(NewAnd); + NewAnd->setOperand(0, Load); + + // Remove any and instructions that are now redundant. + for (auto *And : AndsToMaybeRemove) + // Check that the and mask is the same as the one we decided to put on the + // new and. + if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) { + And->replaceAllUsesWith(NewAnd); + if (&*CurInstIterator == And) + CurInstIterator = std::next(And->getIterator()); + And->eraseFromParent(); + ++NumAndUses; + } + + ++NumAndsAdded; + return true; +} + +/// Check if V (an operand of a select instruction) is an expensive instruction +/// that is only used once. +static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { + auto *I = dyn_cast<Instruction>(V); + // If it's safe to speculatively execute, then it should not have side + // effects; therefore, it's safe to sink and possibly *not* execute. 
+ return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && + TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; +} + +/// Returns true if a SelectInst should be turned into an explicit branch. +static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, + SelectInst *SI) { // FIXME: This should use the same heuristics as IfConversion to determine // whether a select is better represented as a branch. This requires that // branch probability metadata is preserved for the select, which is not the @@ -3861,28 +4469,36 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); - // If the branch is predicted right, an out of order CPU can avoid blocking on - // the compare. Emit cmovs on compares with a memory operand as branches to - // avoid stalls on the load from memory. If the compare has more than one use - // there's probably another cmov or setcc around so it's not worth emitting a - // branch. - if (!Cmp) + // If a branch is predictable, an out-of-order CPU can avoid blocking on its + // comparison condition. If the compare has more than one use, there's + // probably another cmov or setcc around, so it's not worth emitting a branch. + if (!Cmp || !Cmp->hasOneUse()) return false; Value *CmpOp0 = Cmp->getOperand(0); Value *CmpOp1 = Cmp->getOperand(1); - // We check that the memory operand has one use to avoid uses of the loaded - // value directly after the compare, making branches unprofitable. - return Cmp->hasOneUse() && - ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || - (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); + // Emit "cmov on compare with a memory operand" as a branch to avoid stalls + // on a load from memory. But if the load is used more than once, do not + // change the select to a branch because the load is probably needed + // regardless of whether the branch is taken or not. + if ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || + (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())) + return true; + + // If either operand of the select is expensive and only needed on one side + // of the select, we should form a branch. + if (sinkSelectOperand(TTI, SI->getTrueValue()) || + sinkSelectOperand(TTI, SI->getFalseValue())) + return true; + + return false; } /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. -bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { +bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); // Can we convert the 'select' to CF ? @@ -3902,34 +4518,97 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { // We have efficient codegen support for the select instruction. // Check if it is profitable to keep this 'select'. 
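    // (Illustrative only: given
    //    %div = fdiv float %a, %b          ; expensive and single-use
    //    %sel = select i1 %cmp, float %div, float 2.0
    //  sinkSelectOperand(%div) holds, so the select is lowered to a branch
    //  and %div is sunk into the taken side instead of being executed
    //  unconditionally.)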
if (!TLI->isPredictableSelectExpensive() || - !isFormingBranchFromSelectProfitable(SI)) + !isFormingBranchFromSelectProfitable(TTI, SI)) return false; } ModifiedDT = true; + // Transform a sequence like this: + // start: + // %cmp = cmp uge i32 %a, %b + // %sel = select i1 %cmp, i32 %c, i32 %d + // + // Into: + // start: + // %cmp = cmp uge i32 %a, %b + // br i1 %cmp, label %select.true, label %select.false + // select.true: + // br label %select.end + // select.false: + // br label %select.end + // select.end: + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] + // + // In addition, we may sink instructions that produce %c or %d from + // the entry block into the destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + // First, we split the block containing the select into 2 blocks. BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); - BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - // Create a new block serving as the landing pad for the branch. - BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", - NextBlock->getParent(), NextBlock); - - // Move the unconditional branch from the block with the select in it into our - // landing pad block. + // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(NextBlock, SmallBlock); + + // These are the new basic blocks for the conditional branch. + // At least one will become an actual new basic block. + BasicBlock *TrueBlock = nullptr; + BasicBlock *FalseBlock = nullptr; + + // Sink expensive instructions into the conditional blocks to avoid executing + // them speculatively. + if (sinkSelectOperand(TTI, SI->getTrueValue())) { + TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + auto *TrueInst = cast<Instruction>(SI->getTrueValue()); + TrueInst->moveBefore(TrueBranch); + } + if (sinkSelectOperand(TTI, SI->getFalseValue())) { + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + auto *FalseInst = cast<Instruction>(SI->getFalseValue()); + FalseInst->moveBefore(FalseBranch); + } + + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + BranchInst::Create(EndBlock, FalseBlock); + } // Insert the real conditional branch based on the original condition. - BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. 
+ if (TrueBlock == nullptr) { + BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI); + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI); + FalseBlock = StartBlock; + } else { + BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI); + } // The select itself is replaced with a PHI Node. - PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); + PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); - PN->addIncoming(SI->getTrueValue(), StartBlock); - PN->addIncoming(SI->getFalseValue(), SmallBlock); + PN->addIncoming(SI->getTrueValue(), TrueBlock); + PN->addIncoming(SI->getFalseValue(), FalseBlock); + SI->replaceAllUsesWith(PN); SI->eraseFromParent(); @@ -3955,7 +4634,7 @@ static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases /// it's often worth sinking a shufflevector splat down to its use so that /// codegen can spot all lanes are identical. -bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { +bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { BasicBlock *DefBB = SVI->getParent(); // Only do this xform if variable vector shifts are particularly expensive. @@ -3987,9 +4666,10 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { if (!InsertedShuffle) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0), - SVI->getOperand(1), - SVI->getOperand(2), "", InsertPt); + assert(InsertPt != UserBB->end()); + InsertedShuffle = + new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), + SVI->getOperand(2), "", &*InsertPt); } UI->replaceUsesOfWith(SVI, InsertedShuffle); @@ -4005,6 +4685,49 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { return MadeChange; } +bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { + if (!TLI || !DL) + return false; + + Value *Cond = SI->getCondition(); + Type *OldType = Cond->getType(); + LLVMContext &Context = Cond->getContext(); + MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); + unsigned RegWidth = RegType.getSizeInBits(); + + if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth()) + return false; + + // If the register width is greater than the type width, expand the condition + // of the switch instruction and each case constant to the width of the + // register. By widening the type of the switch condition, subsequent + // comparisons (for case comparisons) will not need to be extended to the + // preferred register width, so we will potentially eliminate N-1 extends, + // where N is the number of cases in the switch. + auto *NewType = Type::getIntNTy(Context, RegWidth); + + // Zero-extend the switch condition and case constants unless the switch + // condition is a function argument that is already being sign-extended. + // In that case, we can avoid an unnecessary mask/extension by sign-extending + // everything instead. 
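  // (Illustrative only, assuming 32-bit general-purpose registers: a switch
  //  on an i8 condition becomes
  //    %wide = zext i8 %cond to i32
  //    switch i32 %wide, label %dflt [ i32 1, label %bb1
  //                                    i32 2, label %bb2 ]
  //  so isel no longer widens %cond separately for each case comparison.)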
+  Instruction::CastOps ExtType = Instruction::ZExt;
+  if (auto *Arg = dyn_cast<Argument>(Cond))
+    if (Arg->hasSExtAttr())
+      ExtType = Instruction::SExt;
+
+  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+  ExtInst->insertBefore(SI);
+  SI->setCondition(ExtInst);
+  for (SwitchInst::CaseIt Case : SI->cases()) {
+    APInt NarrowConst = Case.getCaseValue()->getValue();
+    APInt WideConst = (ExtType == Instruction::ZExt) ?
+                      NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move extractelement transitions downward.
@@ -4138,7 +4861,7 @@ class VectorPromoteHelper {
   /// \brief Generate a constant vector with \p Val with the same
   /// number of elements as the transition.
   /// \p UseSplat defines whether or not \p Val should be replicated
-  /// accross the whole vector.
+  /// across the whole vector.
   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
   /// otherwise we generate a vector with as many undef as possible:
   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
@@ -4320,7 +5043,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
 /// Some targets can do store(extractelement) with one instruction.
 /// Try to push the extractelement towards the stores when the target
 /// has this feature and this is profitable.
-bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
   unsigned CombineCost = UINT_MAX;
   if (DisableStoreExtract || !TLI ||
       (!StressStoreExtract &&
@@ -4372,7 +5095,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
if (InsertedInsts.count(I))
@@ -4413,8 +5136,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
         TargetLowering::TypeExpandInteger) {
       return SinkCast(CI);
     } else {
-      bool MadeChange = MoveExtToFormExtLoad(I);
-      return MadeChange | OptimizeExtUses(I);
+      bool MadeChange = moveExtToFormExtLoad(I);
+      return MadeChange | optimizeExtUses(I);
     }
   }
   return false;
@@ -4425,17 +5148,21 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
     return OptimizeCmpExpression(CI);
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
-      return OptimizeMemoryInst(I, SI->getOperand(1),
+      return optimizeMemoryInst(I, SI->getOperand(1),
                                 SI->getOperand(0)->getType(), AS);
     }
     return false;
@@ -4460,23 +5187,26 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC, ModifiedDT);
+      optimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI, ModifiedDT);
+    return optimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
-    return OptimizeSelectInst(SI);
+    return optimizeSelectInst(SI);
 
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
-    return OptimizeShuffleVectorInst(SVI);
+    return optimizeShuffleVectorInst(SVI);
+
+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
 
   if (isa<ExtractElementInst>(I))
-    return OptimizeExtractElementInst(I);
+    return optimizeExtractElementInst(I);
 
   return false;
 }
@@ -4484,17 +5214,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
   while (CurInstIterator != BB.end()) {
-    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
     if (ModifiedDT)
       return true;
   }
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
 }
@@ -4502,12 +5232,12 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
 // llvm.dbg.value is far away from the value then iSel may not be able to
 // handle it properly. iSel will drop llvm.dbg.value if it can not
 // find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+bool CodeGenPrepare::placeDbgValues(Function &F) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
-      Instruction *Insn = BI++;
+      Instruction *Insn = &*BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone.
These
      // intrinsics describe the address of a variable (= the alloca)
@@ -4521,10 +5251,14 @@ bool CodeGenPrepare::PlaceDbgValues(Function &F) {
       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
 
       if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+        // If VI is a phi in a block with an EHPad terminator, we can't insert
+        // after it.
+        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+          continue;
         DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
         DVI->removeFromParent();
         if (isa<PHINode>(VI))
-          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
+          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
         else
           DVI->insertAfter(VI);
         MadeChange = true;
@@ -4548,7 +5282,7 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
     return false;
   bool MadeChange = false;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
-    BasicBlock *BB = I++;
+    BasicBlock *BB = &*I++;
 
     // Does this BB end with the following?
     //   %andVal = and %val, #single-bit-set
@@ -4671,6 +5405,10 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
       continue;
 
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+      continue;
+
     unsigned Opc;
     Value *Cond1, *Cond2;
     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
@@ -4697,7 +5435,6 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
     // Update original basic block by using the first condition directly by the
     // branch instruction and removing the no longer needed and/or instruction.
-    auto *Br1 = cast<BranchInst>(BB.getTerminator());
     Br1->setCondition(Cond1);
     LogicOp->eraseFromParent();
 
@@ -4828,3 +5565,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
   }
   return MadeChange;
 }
+
+void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
+  if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
+    I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
+}
diff --git a/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp b/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
index 28c97ba..ff7c0d5 100644
--- a/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
+++ b/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
@@ -38,9 +38,9 @@ public:
     UsesMetadata = false;
     CustomRoots = false;
   }
-  Optional<bool> isGCManagedPointer(const Value *V) const override {
+  Optional<bool> isGCManagedPointer(const Type *Ty) const override {
     // Method is only valid on pointer typed values.
-    PointerType *PT = cast<PointerType>(V->getType());
+    const PointerType *PT = cast<PointerType>(Ty);
     // We pick addrspace(1) as our GC managed heap.
     return (1 == PT->getAddressSpace());
   }
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index dba280f..c924ba3 100644
--- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -52,14 +52,13 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   // Clear "do not change" set.
   KeepRegs.reset();
 
-  bool IsReturnBlock = (BBSize != 0 && BB->back().isReturn());
+  bool IsReturnBlock = BB->isReturnBlock();
 
   // Examine the live-in regs of all successors.
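  // (The loop below reflects the new MachineBasicBlock::liveins() interface,
  //  which yields RegisterMaskPair entries rather than bare register numbers,
  //  hence the explicit LI.PhysReg reads in the updated code.)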
for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + for (const auto &LI : (*SI)->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); KillIndices[Reg] = BBSize; diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp index 0a188c0..af6b6a3 100644 --- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -31,10 +31,39 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, const int (*SIT)[2], +// -------------------------------------------------------------------- +// Definitions shared between DFAPacketizer.cpp and DFAPacketizerEmitter.cpp + +namespace { + DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) { + return (Inp << DFA_MAX_RESOURCES) | FuncUnits; + } + + /// Return the DFAInput for an instruction class input vector. + /// This function is used in both DFAPacketizer.cpp and in + /// DFAPacketizerEmitter.cpp. + DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) { + DFAInput InsnInput = 0; + assert ((InsnClass.size() <= DFA_MAX_RESTERMS) && + "Exceeded maximum number of DFA terms"); + for (auto U : InsnClass) + InsnInput = addDFAFuncUnits(InsnInput, U); + return InsnInput; + } +} +// -------------------------------------------------------------------- + +DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, + const DFAStateInput (*SIT)[2], const unsigned *SET): InstrItins(I), CurrentState(0), DFAStateInputTable(SIT), - DFAStateEntryTable(SET) {} + DFAStateEntryTable(SET) { + // Make sure DFA types are large enough for the number of terms & resources. + assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAInput)) + && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput"); + assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)) + && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput"); +} // @@ -60,26 +89,42 @@ void DFAPacketizer::ReadTable(unsigned int state) { DFAStateInputTable[i][1]; } +// +// getInsnInput - Return the DFAInput for an instruction class. +// +DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) { + // Note: this logic must match that in DFAPacketizerDefs.h for input vectors. + DFAInput InsnInput = 0; + unsigned i = 0; + for (const InstrStage *IS = InstrItins->beginStage(InsnClass), + *IE = InstrItins->endStage(InsnClass); IS != IE; ++IS, ++i) { + InsnInput = addDFAFuncUnits(InsnInput, IS->getUnits()); + assert ((i < DFA_MAX_RESTERMS) && "Exceeded maximum number of DFA inputs"); + } + return InsnInput; +} + +// getInsnInput - Return the DFAInput for an instruction class input vector. +DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) { + return getDFAInsnInput(InsnClass); +} // canReserveResources - Check if the resources occupied by a MCInstrDesc // are available in the current state. 
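// (Illustrative only: with DFA_MAX_RESOURCES bits per term, an instruction
//  class whose two stages use functional-unit masks 0x3 and 0x8 packs into
//    addDFAFuncUnits(addDFAFuncUnits(0, 0x3), 0x8)
//      == (0x3 << DFA_MAX_RESOURCES) | 0x8
//  so each stage occupies its own bit-field of the composite DFAInput key.)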
bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
   unsigned InsnClass = MID->getSchedClass();
-  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
-  unsigned FuncUnits = IS->getUnits();
-  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  DFAInput InsnInput = getInsnInput(InsnClass);
+  UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
   ReadTable(CurrentState);
   return (CachedTable.count(StateTrans) != 0);
 }
 
-
 // reserveResources - Reserve the resources occupied by a MCInstrDesc and
 // change the current state to reflect that change.
 void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
   unsigned InsnClass = MID->getSchedClass();
-  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
-  unsigned FuncUnits = IS->getUnits();
-  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  DFAInput InsnInput = getInsnInput(InsnClass);
+  UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
   ReadTable(CurrentState);
   assert(CachedTable.count(StateTrans) != 0);
   CurrentState = CachedTable[StateTrans];
@@ -104,32 +149,35 @@ namespace llvm {
 // DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides
 // Schedule method to build the dependence graph.
 class DefaultVLIWScheduler : public ScheduleDAGInstrs {
+private:
+  AliasAnalysis *AA;
 public:
   DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
-                       bool IsPostRA);
+                       AliasAnalysis *AA);
   // Schedule - Actual scheduling work.
   void schedule() override;
 };
 }
 
 DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
-                                           MachineLoopInfo &MLI, bool IsPostRA)
-    : ScheduleDAGInstrs(MF, &MLI, IsPostRA) {
+                                           MachineLoopInfo &MLI,
+                                           AliasAnalysis *AA)
+    : ScheduleDAGInstrs(MF, &MLI), AA(AA) {
   CanHandleTerminators = true;
 }
 
 void DefaultVLIWScheduler::schedule() {
   // Build the scheduling graph.
-  buildSchedGraph(nullptr);
+  buildSchedGraph(AA);
 }
 
 // VLIWPacketizerList Ctor
 VLIWPacketizerList::VLIWPacketizerList(MachineFunction &MF,
-                                       MachineLoopInfo &MLI, bool IsPostRA)
-    : MF(MF) {
+                                       MachineLoopInfo &MLI, AliasAnalysis *AA)
+    : MF(MF), AA(AA) {
   TII = MF.getSubtarget().getInstrInfo();
   ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
-  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, IsPostRA);
+  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, AA);
 }
 
 // VLIWPacketizerList Dtor
@@ -147,7 +195,7 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, MachineInstr *MI) {
   if (CurrentPacketMIs.size() > 1) {
     MachineInstr *MIFirst = CurrentPacketMIs.front();
-    finalizeBundle(*MBB, MIFirst, MI);
+    finalizeBundle(*MBB, MIFirst->getIterator(), MI->getIterator());
   }
   CurrentPacketMIs.clear();
   ResourceTracker->clearResources();
@@ -191,7 +239,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
       // Ask DFA if machine resource is available for MI.
       bool ResourceAvail = ResourceTracker->canReserveResources(MI);
 
-      if (ResourceAvail) {
+      if (ResourceAvail && shouldAddToPacket(MI)) {
         // Dependency check for MI with instructions in CurrentPacketMIs.
         for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
              VE = CurrentPacketMIs.end(); VI != VE; ++VI) {
@@ -210,7 +258,8 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
         } // !isLegalToPacketizeTogether.
       } // For all instructions in CurrentPacketMIs.
     } else {
-      // End the packet if resource is not available.
+      // End the packet if resource is not available, or if the instruction
+      // should not be added to the current packet.
endPacket(MBB, MI);
     }
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 941129b..b11b497 100644
--- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -101,26 +101,22 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
   // Loop over all instructions in all blocks, from bottom to top, so that it's
   // more likely that chains of dependent but ultimately dead instructions will
   // be cleaned up.
-  for (MachineFunction::reverse_iterator I = MF.rbegin(), E = MF.rend();
-       I != E; ++I) {
-    MachineBasicBlock *MBB = &*I;
-
+  for (MachineBasicBlock &MBB : make_range(MF.rbegin(), MF.rend())) {
     // Start out assuming that reserved registers are live out of this block.
     LivePhysRegs = MRI->getReservedRegs();
 
     // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
     // live across blocks, but some targets (x86) can have flags live out of a
     // block.
-    for (MachineBasicBlock::succ_iterator S = MBB->succ_begin(),
-           E = MBB->succ_end(); S != E; S++)
-      for (MachineBasicBlock::livein_iterator LI = (*S)->livein_begin();
-           LI != (*S)->livein_end(); LI++)
-        LivePhysRegs.set(*LI);
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+           E = MBB.succ_end(); S != E; S++)
+      for (const auto &LI : (*S)->liveins())
+        LivePhysRegs.set(LI.PhysReg);
 
     // Now scan the instructions and delete dead ones, tracking physreg
     // liveness as we go.
-    for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
-           MIE = MBB->rend(); MII != MIE; ) {
+    for (MachineBasicBlock::reverse_iterator MII = MBB.rbegin(),
+           MIE = MBB.rend(); MII != MIE; ) {
       MachineInstr *MI = &*MII;
 
       // If the instruction is dead, delete it!
@@ -132,7 +128,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
         MI->eraseFromParentAndMarkDBGValuesForRemoval();
         AnyChanges = true;
         ++NumDeletes;
-        MIE = MBB->rend();
+        MIE = MBB.rend();
         // MII is now pointing to the next instruction to process,
         // so don't increment it.
         continue;
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index e019dfb..eae78a9 100644
--- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -192,9 +192,9 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
   if (Resumes.empty())
     return false;
 
-  // Check the personality, don't do anything if it's for MSVC.
+  // Check the personality, don't do anything if it's funclet-based.
   EHPersonality Pers = classifyEHPersonality(Fn.getPersonalityFn());
-  if (isMSVCEHPersonality(Pers))
+  if (isFuncletEHPersonality(Pers))
     return false;
 
   LLVMContext &Ctx = Fn.getContext();
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
index fbc4d97..f3536d7 100644
--- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -538,11 +538,11 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
   // Fix up the CFG, temporarily leave Head without any successors.
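  // (The boolean added to removeSuccessor below is NormalizeSuccProbs;
  //  passing true renormalizes the branch probabilities of the remaining
  //  successor edges once the edge is removed.)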
Head->removeSuccessor(TBB); - Head->removeSuccessor(FBB); + Head->removeSuccessor(FBB, true); if (TBB != Tail) - TBB->removeSuccessor(Tail); + TBB->removeSuccessor(Tail, true); if (FBB != Tail) - FBB->removeSuccessor(Tail); + FBB->removeSuccessor(Tail, true); // Fix up Head's terminators. // It should become a single branch or a fallthrough. diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp index 5b09cf1..c550008 100644 --- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp +++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp @@ -375,9 +375,8 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // This is the entry block. if (MBB->pred_empty()) { - for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(), - e = MBB->livein_end(); i != e; ++i) { - for (int rx : regIndices(*i)) { + for (const auto &LI : MBB->liveins()) { + for (int rx : regIndices(LI.PhysReg)) { // Treat function live-ins as if they were defined just before the first // instruction. Usually, function arguments are set up immediately // before the call. @@ -559,12 +558,11 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { MachineInstr *UndefMI = UndefReads.back().first; unsigned OpIdx = UndefReads.back().second; - for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend(); - I != E; ++I) { + for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { // Update liveness, including the current instruction's defs. - LiveRegSet.stepBackward(*I); + LiveRegSet.stepBackward(I); - if (UndefMI == &*I) { + if (UndefMI == &I) { if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) TII->breakPartialRegDependency(UndefMI, OpIdx, TRI); @@ -733,12 +731,13 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { // If no relevant registers are used in the function, we can skip it // completely. bool anyregs = false; - for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end(); - I != E; ++I) - if (MF->getRegInfo().isPhysRegUsed(*I)) { + const MachineRegisterInfo &MRI = mf.getRegInfo(); + for (unsigned Reg : *RC) { + if (MRI.isPhysRegUsed(Reg)) { anyregs = true; break; } + } if (!anyregs) return false; // Initialize the AliasMap on the first use. @@ -752,7 +751,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { AliasMap[*AI].push_back(i); } - MachineBasicBlock *Entry = MF->begin(); + MachineBasicBlock *Entry = &*MF->begin(); ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); SmallVector<MachineBasicBlock*, 16> Loops; for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator @@ -761,22 +760,19 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { enterBasicBlock(MBB); if (SeenUnknownBackEdge) Loops.push_back(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) - visitInstr(I); + for (MachineInstr &MI : *MBB) + visitInstr(&MI); processUndefReads(MBB); leaveBasicBlock(MBB); } // Visit all the loop blocks again in order to merge DomainValues from // back-edges. 
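The reverse walk in processUndefReads above, with its LiveRegSet.stepBackward(I) per instruction, is the standard backward liveness transfer. Stepping a live set over an instruction, from below it to above it, is just the gen/kill rule; a toy version over plain register IDs (ignoring register aliasing, which the real LivePhysRegs resolves through the target register info):

#include <set>
#include <vector>

struct ToyInstr {
  std::vector<unsigned> Defs; // registers written by the instruction
  std::vector<unsigned> Uses; // registers read by the instruction
};

// Compute live-before from live-after: live = (live \ defs) U uses.
void stepBackward(std::set<unsigned> &Live, const ToyInstr &I) {
  for (unsigned R : I.Defs)
    Live.erase(R); // a def ends liveness above this point
  for (unsigned R : I.Uses)
    Live.insert(R); // a use requires the register to be live above it
}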
- for (unsigned i = 0, e = Loops.size(); i != e; ++i) { - MachineBasicBlock *MBB = Loops[i]; + for (MachineBasicBlock *MBB : Loops) { enterBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) - if (!I->isDebugValue()) - processDefs(I, false); + for (MachineInstr &MI : *MBB) + if (!MI.isDebugValue()) + processDefs(&MI, false); processUndefReads(MBB); leaveBasicBlock(MBB); } diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp index 55e809e..90ddac9 100644 --- a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp @@ -50,7 +50,7 @@ bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { // Iterate through each instruction in the function, looking for pseudos. for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end(); MBBI != MBBE; ) { MachineInstr *MI = MBBI++; @@ -63,7 +63,7 @@ bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { // The expansion may involve new basic blocks. if (NewMBB != MBB) { MBB = NewMBB; - I = NewMBB; + I = NewMBB->getIterator(); MBBI = NewMBB->begin(); MBBE = NewMBB->end(); } diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp new file mode 100644 index 0000000..8b2f505 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp @@ -0,0 +1,55 @@ +//===-- FuncletLayout.cpp - Contiguously lay out funclets -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements basic block placement transformations which result in +// funclets being contiguous. +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +#define DEBUG_TYPE "funclet-layout" + +namespace { +class FuncletLayout : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + FuncletLayout() : MachineFunctionPass(ID) { + initializeFuncletLayoutPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +char FuncletLayout::ID = 0; +char &llvm::FuncletLayoutID = FuncletLayout::ID; +INITIALIZE_PASS(FuncletLayout, "funclet-layout", + "Contiguously Lay Out Funclets", false, false) + +bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { + DenseMap<const MachineBasicBlock *, int> FuncletMembership = + getFuncletMembership(F); + if (FuncletMembership.empty()) + return false; + + F.sort([&](MachineBasicBlock &X, MachineBasicBlock &Y) { + auto FuncletX = FuncletMembership.find(&X); + auto FuncletY = FuncletMembership.find(&Y); + assert(FuncletX != FuncletMembership.end()); + assert(FuncletY != FuncletMembership.end()); + return FuncletX->second < FuncletY->second; + }); + + // Conservatively assume we changed something. 
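The F.sort call above is the whole trick of the new FuncletLayout pass: blocks are stably sorted by a precomputed funclet number, so blocks of the same funclet become contiguous while keeping their original relative order. The comparator pattern in isolation, with a toy block type and an assumed rank map:

#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

struct ToyBlock { int ID; };

void groupByFunclet(std::vector<ToyBlock *> &Blocks,
                    const std::map<const ToyBlock *, int> &FuncletNum) {
  std::stable_sort(Blocks.begin(), Blocks.end(),
                   [&](const ToyBlock *X, const ToyBlock *Y) {
                     auto FX = FuncletNum.find(X);
                     auto FY = FuncletNum.find(Y);
                     // As in the pass, every block must have a funclet number.
                     assert(FX != FuncletNum.end() && FY != FuncletNum.end());
                     return FX->second < FY->second;
                   });
}

The stable sort matters: within one funclet, the existing layout (and thus fallthrough structure) is preserved.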
+ return true; +} diff --git a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp index d8edd7e..484d317 100644 --- a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp @@ -158,7 +158,7 @@ static bool InsertRootInitializers(Function &F, AllocaInst **Roots, // Search for initializers in the initial BB. SmallPtrSet<AllocaInst *, 16> InitedRoots; - for (; !CouldBecomeSafePoint(IP); ++IP) + for (; !CouldBecomeSafePoint(&*IP); ++IP) if (StoreInst *SI = dyn_cast<StoreInst>(IP)) if (AllocaInst *AI = dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts())) @@ -320,7 +320,9 @@ void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) { if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) { RI = FI->removeStackRoot(RI); } else { - RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num); + unsigned FrameReg; // FIXME: surely GCRoot ought to store the + // register that the offset is from? + RI->StackOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg); ++RI; } } diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp index 6f9e839..dd9a840 100644 --- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp @@ -108,10 +108,9 @@ EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, // FIXME: this could be a transitional option, and we probably need to remove // it if only we are sure this optimization could always benefit all targets. -static cl::opt<bool> +static cl::opt<cl::boolOrDefault> EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden, - cl::desc("Enable global merge pass on external linkage"), - cl::init(false)); + cl::desc("Enable global merge pass on external linkage")); STATISTIC(NumMerged, "Number of globals merged"); namespace { @@ -129,11 +128,14 @@ namespace { /// FIXME: This could learn about optsize, and be used in the cost model. bool OnlyOptimizeForSize; + /// Whether we should merge global variables that have external linkage. + bool MergeExternalGlobals; + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; /// \brief Merge everything in \p Globals for which the corresponding bit /// in \p GlobalSet is set. - bool doMerge(SmallVectorImpl<GlobalVariable *> &Globals, + bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals, const BitVector &GlobalSet, Module &M, bool isConst, unsigned AddrSpace) const; @@ -158,9 +160,11 @@ namespace { static char ID; // Pass identification, replacement for typeid. 
explicit GlobalMerge(const TargetMachine *TM = nullptr, unsigned MaximalOffset = 0, - bool OnlyOptimizeForSize = false) + bool OnlyOptimizeForSize = false, + bool MergeExternalGlobals = false) : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset), - OnlyOptimizeForSize(OnlyOptimizeForSize) { + OnlyOptimizeForSize(OnlyOptimizeForSize), + MergeExternalGlobals(MergeExternalGlobals) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -189,14 +193,11 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { auto &DL = M.getDataLayout(); // FIXME: Find better heuristics - std::stable_sort( - Globals.begin(), Globals.end(), - [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { - Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); - - return (DL.getTypeAllocSize(Ty1) < DL.getTypeAllocSize(Ty2)); - }); + std::stable_sort(Globals.begin(), Globals.end(), + [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { + return DL.getTypeAllocSize(GV1->getValueType()) < + DL.getTypeAllocSize(GV2->getValueType()); + }); // If we want to just blindly group all globals together, do so. if (!GlobalMergeGroupByUse) { @@ -207,7 +208,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, // If we want to be smarter, look at all uses of each global, to try to // discover all sets of globals used together, and how many times each of - // these sets occured. + // these sets occurred. // // Keep this reasonably efficient, by having an append-only list of all sets // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of @@ -302,8 +303,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Function *ParentFn = I->getParent()->getParent(); // If we're only optimizing for size, ignore non-minsize functions. - if (OnlyOptimizeForSize && - !ParentFn->hasFnAttribute(Attribute::MinSize)) + if (OnlyOptimizeForSize && !ParentFn->optForMinSize()) continue; size_t UGSIdx = GlobalUsesByFunction[ParentFn]; @@ -406,15 +406,14 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, return Changed; } -bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals, +bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals, const BitVector &GlobalSet, Module &M, bool isConst, unsigned AddrSpace) const { + assert(Globals.size() > 1); Type *Int32Ty = Type::getInt32Ty(M.getContext()); auto &DL = M.getDataLayout(); - assert(Globals.size() > 1); - DEBUG(dbgs() << " Trying to merge set, starts with #" << GlobalSet.find_first() << "\n"); @@ -425,58 +424,44 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals, std::vector<Type*> Tys; std::vector<Constant*> Inits; - bool HasExternal = false; - GlobalVariable *TheFirstExternal = 0; for (j = i; j != -1; j = GlobalSet.find_next(j)) { - Type *Ty = Globals[j]->getType()->getElementType(); + Type *Ty = Globals[j]->getValueType(); MergedSize += DL.getTypeAllocSize(Ty); if (MergedSize > MaxOffset) { break; } Tys.push_back(Ty); Inits.push_back(Globals[j]->getInitializer()); - - if (Globals[j]->hasExternalLinkage() && !HasExternal) { - HasExternal = true; - TheFirstExternal = Globals[j]; - } } - // If merged variables doesn't have external linkage, we needn't to expose - // the symbol after merging. - GlobalValue::LinkageTypes Linkage = HasExternal - ? 
GlobalValue::ExternalLinkage - : GlobalValue::InternalLinkage; - StructType *MergedTy = StructType::get(M.getContext(), Tys); Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - // If merged variables have external linkage, we use symbol name of the - // first variable merged as the suffix of global symbol name. This would - // be able to avoid the link-time naming conflict for globalm symbols. GlobalVariable *MergedGV = new GlobalVariable( - M, MergedTy, isConst, Linkage, MergedInit, - HasExternal ? "_MergedGlobals_" + TheFirstExternal->getName() - : "_MergedGlobals", - nullptr, GlobalVariable::NotThreadLocal, AddrSpace); + M, MergedTy, isConst, GlobalValue::PrivateLinkage, MergedInit, + "_MergedGlobals", nullptr, GlobalVariable::NotThreadLocal, AddrSpace); - for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k)) { + for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) { GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage(); std::string Name = Globals[k]->getName(); Constant *Idx[2] = { ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, idx++) + ConstantInt::get(Int32Ty, idx), }; Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx); Globals[k]->replaceAllUsesWith(GEP); Globals[k]->eraseFromParent(); - if (Linkage != GlobalValue::InternalLinkage) { - // Generate a new alias... - auto *PTy = cast<PointerType>(GEP->getType()); - GlobalAlias::create(PTy, Linkage, Name, GEP, &M); + // When the linkage is not internal we must emit an alias for the original + // variable name as it may be accessed from another object. On non-Mach-O + // we can also emit an alias for internal linkage as it's safe to do so. + // It's not safe on Mach-O as the alias (and thus the portion of the + // MergedGlobals variable) may be dead stripped at link time. + if (Linkage != GlobalValue::InternalLinkage || + !TM->getTargetTriple().isOSBinFormatMachO()) { + GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M); } NumMerged++; @@ -535,61 +520,57 @@ bool GlobalMerge::doInitialization(Module &M) { setMustKeepGlobalVariables(M); // Grab all non-const globals. - for (Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { + for (auto &GV : M.globals()) { // Merge is safe for "normal" internal or external globals only - if (I->isDeclaration() || I->isThreadLocal() || I->hasSection()) + if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection()) continue; - if (!(EnableGlobalMergeOnExternal && I->hasExternalLinkage()) && - !I->hasInternalLinkage()) + if (!(MergeExternalGlobals && GV.hasExternalLinkage()) && + !GV.hasInternalLinkage()) continue; - PointerType *PT = dyn_cast<PointerType>(I->getType()); + PointerType *PT = dyn_cast<PointerType>(GV.getType()); assert(PT && "Global variable is not a pointer!"); unsigned AddressSpace = PT->getAddressSpace(); // Ignore fancy-aligned globals for now. - unsigned Alignment = DL.getPreferredAlignment(I); - Type *Ty = I->getType()->getElementType(); + unsigned Alignment = DL.getPreferredAlignment(&GV); + Type *Ty = GV.getValueType(); if (Alignment > DL.getABITypeAlignment(Ty)) continue; // Ignore all 'special' globals. 
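To make the doMerge rewrite above concrete: each merged global becomes field k of the private _MergedGlobals struct, reachable through a constant inbounds GEP with indices {0, k}, and non-internal names survive as aliases onto those GEPs. A condensed two-global version using the same 3.8-era API calls that appear in the hunk (the alias and Mach-O handling, and all error checking, elided):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static GlobalVariable *mergeTwo(Module &M, GlobalVariable *A, GlobalVariable *B) {
  Type *Int32Ty = Type::getInt32Ty(M.getContext());
  StructType *MergedTy =
      StructType::get(M.getContext(), {A->getValueType(), B->getValueType()});
  Constant *MergedInit =
      ConstantStruct::get(MergedTy, {A->getInitializer(), B->getInitializer()});
  auto *MergedGV = new GlobalVariable(M, MergedTy, /*isConstant=*/false,
                                      GlobalValue::PrivateLinkage, MergedInit,
                                      "_MergedGlobals");
  GlobalVariable *Parts[] = {A, B};
  for (unsigned Idx = 0; Idx != 2; ++Idx) {
    Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0),
                          ConstantInt::get(Int32Ty, Idx)};
    Constant *GEP =
        ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, GEPIdx);
    Parts[Idx]->replaceAllUsesWith(GEP); // every use now indexes the struct
    Parts[Idx]->eraseFromParent();
  }
  return MergedGV;
}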
- if (I->getName().startswith("llvm.") || - I->getName().startswith(".llvm.")) + if (GV.getName().startswith("llvm.") || + GV.getName().startswith(".llvm.")) continue; // Ignore all "required" globals: - if (isMustKeepGlobalVariable(I)) + if (isMustKeepGlobalVariable(&GV)) continue; if (DL.getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, *TM).isBSSLocal()) - BSSGlobals[AddressSpace].push_back(I); - else if (I->isConstant()) - ConstGlobals[AddressSpace].push_back(I); + if (TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal()) + BSSGlobals[AddressSpace].push_back(&GV); + else if (GV.isConstant()) + ConstGlobals[AddressSpace].push_back(&GV); else - Globals[AddressSpace].push_back(I); + Globals[AddressSpace].push_back(&GV); } } - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = Globals.begin(), E = Globals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); + for (auto &P : Globals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, false, P.first); - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); + for (auto &P : BSSGlobals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, false, P.first); if (EnableGlobalMergeOnConst) - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, true, I->first); + for (auto &P : ConstGlobals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, true, P.first); return Changed; } @@ -604,6 +585,9 @@ bool GlobalMerge::doFinalization(Module &M) { } Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset, - bool OnlyOptimizeForSize) { - return new GlobalMerge(TM, Offset, OnlyOptimizeForSize); + bool OnlyOptimizeForSize, + bool MergeExternalByDefault) { + bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ? 
+ MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE); + return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal); } diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp index ee0532b..c38c9d2 100644 --- a/contrib/llvm/lib/CodeGen/IfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> using namespace llvm; @@ -190,10 +191,10 @@ namespace { private: bool ReverseBranchCondition(BBInfo &BBI); bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups, - const BranchProbability &Prediction) const; + BranchProbability Prediction) const; bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, bool FalseBranch, unsigned &Dups, - const BranchProbability &Prediction) const; + BranchProbability Prediction) const; bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, unsigned &Dups1, unsigned &Dups2) const; void ScanInstructions(BBInfo &BBI); @@ -218,7 +219,7 @@ namespace { bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Cycle, unsigned Extra, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra, Prediction); } @@ -227,7 +228,7 @@ namespace { unsigned TCycle, unsigned TExtra, MachineBasicBlock &FBB, unsigned FCycle, unsigned FExtra, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { return TCycle > 0 && FCycle > 0 && TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra, Prediction); @@ -462,11 +463,11 @@ bool IfConverter::ReverseBranchCondition(BBInfo &BBI) { /// getNextBlock - Returns the next block in the function blocks ordering. If /// it is the end, returns NULL. static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) { - MachineFunction::iterator I = BB; + MachineFunction::iterator I = BB->getIterator(); MachineFunction::iterator E = BB->getParent()->end(); if (++I == E) return nullptr; - return I; + return &*I; } /// ValidSimple - Returns true if the 'true' block (along with its @@ -474,7 +475,7 @@ static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) { /// number of instructions that the ifcvt would need to duplicate if performed /// in Dups. bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { Dups = 0; if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone) return false; @@ -501,7 +502,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups, /// if performed in 'Dups'. bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, bool FalseBranch, unsigned &Dups, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { Dups = 0; if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone) return false; @@ -530,10 +531,10 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB; if (!TExit && blockAlwaysFallThrough(TrueBBI)) { - MachineFunction::iterator I = TrueBBI.BB; + MachineFunction::iterator I = TrueBBI.BB->getIterator(); if (++I == TrueBBI.BB->getParent()->end()) return false; - TExit = I; + TExit = &*I; } return TExit && TExit == FalseBBI.BB; } @@ -948,10 +949,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB, /// candidates. 
void IfConverter::AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens) { - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *BB = I; - AnalyzeBlock(BB, Tokens); - } + for (auto &BB : MF) + AnalyzeBlock(&BB, Tokens); // Sort to favor more complex ifcvt scheme. std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp); @@ -961,14 +960,14 @@ void IfConverter::AnalyzeBlocks(MachineFunction &MF, /// that all the intervening blocks are empty (given BB can fall through to its /// next block). static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) { - MachineFunction::iterator PI = BB; + MachineFunction::iterator PI = BB->getIterator(); MachineFunction::iterator I = std::next(PI); - MachineFunction::iterator TI = ToBB; + MachineFunction::iterator TI = ToBB->getIterator(); MachineFunction::iterator E = BB->getParent()->end(); while (I != TI) { // Check isSuccessor to avoid case where the next block is empty, but // it's not a successor. - if (I == E || !I->empty() || !PI->isSuccessor(I)) + if (I == E || !I->empty() || !PI->isSuccessor(&*I)) return false; PI = I++; } @@ -1114,7 +1113,7 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { // RemoveExtraEdges won't work if the block has an unanalyzable branch, so // explicitly remove CvtBBI as a successor. - BBI.BB->removeSuccessor(CvtBBI->BB); + BBI.BB->removeSuccessor(CvtBBI->BB, true); } else { RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI); PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond); @@ -1153,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1231,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. - CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. 
+ CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1251,7 +1226,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // RemoveExtraEdges won't work if the block has an unanalyzable branch, so // explicitly remove CvtBBI as a successor. - BBI.BB->removeSuccessor(CvtBBI->BB); + BBI.BB->removeSuccessor(CvtBBI->BB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB); @@ -1268,22 +1243,23 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + if (NewTrueBBIter != BBI.BB->succ_end()) + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. - // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. - ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1526,7 +1502,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1536,7 +1512,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // which can happen here if TailBB is unanalyzable and is merged, so // explicitly remove BBI1 and BBI2 as successors. BBI.BB->removeSuccessor(BBI1->BB); - BBI.BB->removeSuccessor(BBI2->BB); + BBI.BB->removeSuccessor(BBI2->BB, true); RemoveExtraEdges(BBI); // Update block info. 
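The deleted ScaleWeights machinery above is what the switch from integer weights to BranchProbability buys back: probabilities compose directly. Reaching NextBBI after the triangle is converted means either falling through (BBNext) or going through the converted block and then leaving it the right way (BBCvt * CvtNext), hence NewNext = BBNext + BBCvt * CvtNext and NewFalse = BBCvt * CvtFalse. The algebra with a toy fixed-denominator fraction (the real class works in saturating fixed point):

#include <cstdint>

constexpr uint64_t ProbDenom = 1ULL << 31; // shared denominator

struct Prob { uint64_t N; }; // value = N / ProbDenom

inline Prob operator*(Prob A, Prob B) { return {A.N * B.N / ProbDenom}; }
inline Prob operator+(Prob A, Prob B) { return {A.N + B.N}; } // assumes sum <= 1

// New_Prob(BB, Next) = Prob(BB, Next) + Prob(BB, Cvt) * Prob(Cvt, Next)
inline Prob probThroughOrAround(Prob BBNext, Prob BBCvt, Prob CvtNext) {
  return BBNext + BBCvt * CvtNext;
}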
@@ -1686,25 +1662,94 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { ToBBI.BB->splice(ToBBI.BB->end(), FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end()); - std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(), - FromBBI.BB->succ_end()); + // Force normalizing the successors' probabilities of ToBBI.BB to convert all + // unknown probabilities into known ones. + // FIXME: This usage is too tricky and in the future we would like to + // eliminate all unknown probabilities in MBB. + ToBBI.BB->normalizeSuccProbs(); + + SmallVector<MachineBasicBlock *, 4> FromSuccs(FromBBI.BB->succ_begin(), + FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when + // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. + auto To2FromProb = BranchProbability::getZero(); + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } - for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - MachineBasicBlock *Succ = Succs[i]; + for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { + MachineBasicBlock *Succ = FromSuccs[i]; // Fallthrough edge can't be transferred. if (Succ == FallThrough) continue; + + auto NewProb = BranchProbability::getZero(); + if (AddEdges) { + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for exception). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); + + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // only happens when if-converting a diamond CFG and FromBBI.BB is the + // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; + } + FromBBI.BB->removeSuccessor(Succ); - if (AddEdges && !ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->addSuccessor(Succ); + + if (AddEdges) { + // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewProb to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D.
+ // + // Before ifcvt: After ifcvt (assume B->D is kept): + // + // A A + // /| /|\ + // / B / B| + // | /| | || + // |/ | | |/ + // C D C D + // + if (ToBBI.BB->isSuccessor(Succ)) + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); + else + ToBBI.BB->addSuccessor(Succ, NewProb); + } } // Now FromBBI always falls through to the next block! if (NBB && !FromBBI.BB->isSuccessor(NBB)) FromBBI.BB->addSuccessor(NBB); + // Normalize the probabilities of ToBBI.BB's successors with all adjustment + // we've done above. + ToBBI.BB->normalizeSuccProbs(); + ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end()); FromBBI.Predicate.clear(); diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 93e0487..39c1b9f 100644 --- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -107,6 +108,98 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; }; + +/// \brief Detect re-ordering hazards and dependencies. +/// +/// This class keeps track of defs and uses, and can be queried if a given +/// machine instruction can be re-ordered from after the machine instructions +/// seen so far to before them. +class HazardDetector { + DenseSet<unsigned> RegDefs; + DenseSet<unsigned> RegUses; + const TargetRegisterInfo &TRI; + bool hasSeenClobber; + +public: + explicit HazardDetector(const TargetRegisterInfo &TRI) : + TRI(TRI), hasSeenClobber(false) {} + + /// \brief Make a note of \p MI for later queries to isSafeToHoist. + /// + /// May clobber this HazardDetector instance. \see isClobbered. + void rememberInstruction(MachineInstr *MI); + + /// \brief Return true if it is safe to hoist \p MI from after all the + /// instructions seen so far (via rememberInstruction) to before it. + bool isSafeToHoist(MachineInstr *MI); + + /// \brief Return true if this instance of HazardDetector has been clobbered + /// (i.e. has no more useful information). + /// + /// A HazardDetector is clobbered when it sees a construct it cannot + /// understand, and it would have to return a conservative answer for all + /// future queries. Having a separate clobbered state lets the client code + /// bail early, without making queries about all of the future instructions + /// (which would have returned the most conservative answer anyway). + /// + /// Calling rememberInstruction or isSafeToHoist on a clobbered HazardDetector + /// is an error. + bool isClobbered() { return hasSeenClobber; } +}; +} + + +void HazardDetector::rememberInstruction(MachineInstr *MI) { + assert(!isClobbered() && + "Don't add instructions to a clobbered hazard detector"); + + if (MI->mayStore() || MI->hasUnmodeledSideEffects()) { + hasSeenClobber = true; + return; + } + + for (auto *MMO : MI->memoperands()) { + // Right now we don't want to worry about LLVM's memory model.
+ if (!MMO->isUnordered()) { + hasSeenClobber = true; + return; + } + } + + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.getReg()) + continue; + + if (MO.isDef()) + RegDefs.insert(MO.getReg()); + else + RegUses.insert(MO.getReg()); + } +} + +bool HazardDetector::isSafeToHoist(MachineInstr *MI) { + assert(!isClobbered() && "isSafeToHoist cannot do anything useful!"); + + // Right now we don't want to worry about LLVM's memory model. This can be + // made more precise later. + for (auto *MMO : MI->memoperands()) + if (!MMO->isUnordered()) + return false; + + for (auto &MO : MI->operands()) { + if (MO.isReg() && MO.getReg()) { + for (unsigned Reg : RegDefs) + if (TRI.regsOverlap(Reg, MO.getReg())) + return false; // We found a write-after-write or read-after-write + + if (MO.isDef()) + for (unsigned Reg : RegUses) + if (TRI.regsOverlap(Reg, MO.getReg())) + return false; // We found a write-after-read + } + } + + return true; } bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) { @@ -132,10 +225,10 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList) { typedef TargetInstrInfo::MachineBranchPredicate MachineBranchPredicate; - MDNode *BranchMD = - MBB.getBasicBlock() - ? MBB.getBasicBlock()->getTerminator()->getMetadata("make.implicit") - : nullptr; + MDNode *BranchMD = nullptr; + if (auto *BB = MBB.getBasicBlock()) + BranchMD = BB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit); + if (!BranchMD) return false; @@ -188,7 +281,7 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( // // we want to end up with // - // Def = TrappingLoad (%RAX + <offset>), LblNull + // Def = FaultingLoad (%RAX + <offset>), LblNull // jmp LblNotNull ;; explicit or fallthrough // // LblNotNull: @@ -199,38 +292,34 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( // LblNull: // callq throw_NullPointerException // + // + // To see why this is legal, consider the two possibilities: + // + // 1. %RAX is null: since we constrain <offset> to be less than PageSize, the + // load instruction dereferences the null page, causing a segmentation + // fault. + // + // 2. %RAX is not null: in this case we know that the load cannot fault, as + // otherwise the load would've faulted in the original program too and the + // original program would've been undefined. + // + // This reasoning cannot be extended to justify hoisting through arbitrary + // control flow. For instance, in the example below (in pseudo-C) + // + // if (ptr == null) { throw_npe(); unreachable; } + // if (some_cond) { return 42; } + // v = ptr->field; // LD + // ... + // + // we cannot (without code duplication) use the load marked "LD" to null check + // ptr -- clause (2) above does not apply in this case. In the above program + // the safety of ptr->field can be dependent on some_cond; and, for instance, + // ptr could be some non-null invalid reference that never gets loaded from + // because some_cond is always true. unsigned PointerReg = MBP.LHS.getReg(); - // As we scan NotNullSucc for a suitable load instruction, we keep track of - // the registers defined and used by the instructions we scan past. This bit - // of information lets us decide if it is legal to hoist the load instruction - // we find (if we do find such an instruction) to before NotNullSucc. - DenseSet<unsigned> RegDefs, RegUses; - - // Returns true if it is safe to reorder MI to before NotNullSucc. 
- auto IsSafeToHoist = [&](MachineInstr *MI) { - // Right now we don't want to worry about LLVM's memory model. This can be - // made more precise later. - for (auto *MMO : MI->memoperands()) - if (!MMO->isUnordered()) - return false; - - for (auto &MO : MI->operands()) { - if (MO.isReg() && MO.getReg()) { - for (unsigned Reg : RegDefs) - if (TRI->regsOverlap(Reg, MO.getReg())) - return false; // We found a write-after-write or read-after-write - - if (MO.isDef()) - for (unsigned Reg : RegUses) - if (TRI->regsOverlap(Reg, MO.getReg())) - return false; // We found a write-after-read - } - } - - return true; - }; + HazardDetector HD(*TRI); for (auto MII = NotNullSucc->begin(), MIE = NotNullSucc->end(); MII != MIE; ++MII) { @@ -238,37 +327,16 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( unsigned BaseReg, Offset; if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) if (MI->mayLoad() && !MI->isPredicable() && BaseReg == PointerReg && - Offset < PageSize && MI->getDesc().getNumDefs() == 1 && - IsSafeToHoist(MI)) { + Offset < PageSize && MI->getDesc().getNumDefs() <= 1 && + HD.isSafeToHoist(MI)) { NullCheckList.emplace_back(MI, MBP.ConditionDef, &MBB, NotNullSucc, NullSucc); return true; } - // MI did not match our criteria for conversion to a trapping load. Check - // if we can continue looking. - - if (MI->mayStore() || MI->hasUnmodeledSideEffects()) + HD.rememberInstruction(MI); + if (HD.isClobbered()) return false; - - for (auto *MMO : MI->memoperands()) - // Right now we don't want to worry about LLVM's memory model. - if (!MMO->isUnordered()) - return false; - - // It _may_ be okay to reorder a later load instruction across MI. Make a - // note of its operands so that we can make the legality check if we find a - // suitable load instruction: - - for (auto &MO : MI->operands()) { - if (!MO.isReg() || !MO.getReg()) - continue; - - if (MO.isDef()) - RegDefs.insert(MO.getReg()); - else - RegUses.insert(MO.getReg()); - } } return false; @@ -281,14 +349,19 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( MachineInstr *ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB, MCSymbol *HandlerLabel) { + const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for + // all targets. 
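The refactored scan above fixes the shape of the HazardDetector protocol: query the hoist candidate first, otherwise remember the instruction, and bail as soon as the detector is clobbered. As a skeleton, with an opaque instruction type standing in for MachineInstr (the member functions are the ones defined earlier in this file):

#include <vector>

struct Instr; // opaque stand-in for MachineInstr

// Interface shape of the HazardDetector defined above.
struct Detector {
  void rememberInstruction(Instr *I);
  bool isSafeToHoist(Instr *I);
  bool isClobbered();
};

// Returns the first candidate that may be hoisted above everything before it.
Instr *findHoistable(Detector &HD, const std::vector<Instr *> &Block,
                     bool (*IsCandidate)(Instr *)) {
  for (Instr *I : Block) {
    if (IsCandidate(I) && HD.isSafeToHoist(I))
      return I;
    HD.rememberInstruction(I); // I now blocks later hoists it conflicts with
    if (HD.isClobbered())
      break; // nothing after a clobber can be reasoned about
  }
  return nullptr;
}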
+ DebugLoc DL; unsigned NumDefs = LoadMI->getDesc().getNumDefs(); - assert(NumDefs == 1 && "other cases unhandled!"); - (void)NumDefs; + assert(NumDefs <= 1 && "other cases unhandled!"); - unsigned DefReg = LoadMI->defs().begin()->getReg(); - assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && - "expected exactly one def!"); + unsigned DefReg = NoRegister; + if (NumDefs != 0) { + DefReg = LoadMI->defs().begin()->getReg(); + assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && + "expected exactly one def!"); + } auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg) .addSym(HandlerLabel) diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp index 9989f23..e310132 100644 --- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp @@ -141,7 +141,7 @@ public: InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()), LSS(pass.getAnalysis<LiveStacks>()), - AA(&pass.getAnalysis<AliasAnalysis>()), + AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()), MDT(pass.getAnalysis<MachineDominatorTree>()), Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm), MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()), @@ -329,8 +329,8 @@ static raw_ostream &operator<<(raw_ostream &OS, if (SVI.KillsSource) OS << " kill"; OS << " deps["; - for (unsigned i = 0, e = SVI.Deps.size(); i != e; ++i) - OS << ' ' << SVI.Deps[i]->id << '@' << SVI.Deps[i]->def; + for (VNInfo *Dep : SVI.Deps) + OS << ' ' << Dep->id << '@' << Dep->def; OS << " ]"; if (SVI.DefMI) OS << " def: " << *SVI.DefMI; @@ -383,9 +383,8 @@ void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter, bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg); unsigned SpillDepth = ~0u; - for (TinyPtrVector<VNInfo*>::iterator DepI = Deps->begin(), - DepE = Deps->end(); DepI != DepE; ++DepI) { - SibValueMap::iterator DepSVI = SibValues.find(*DepI); + for (VNInfo *Dep : *Deps) { + SibValueMap::iterator DepSVI = SibValues.find(Dep); assert(DepSVI != SibValues.end() && "Dependent value not in SibValues"); SibValueInfo &DepSV = DepSVI->second; if (!DepSV.SpillMBB) @@ -566,12 +565,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, // Create entries for all the PHIs. Don't add them to the worklist, we // are processing all of them in one go here. - for (unsigned i = 0, e = PHIs.size(); i != e; ++i) - SibValues.insert(std::make_pair(PHIs[i], SibValueInfo(Reg, PHIs[i]))); + for (VNInfo *PHI : PHIs) + SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI))); // Add every PHI as a dependent of all the non-PHIs. - for (unsigned i = 0, e = NonPHIs.size(); i != e; ++i) { - VNInfo *NonPHI = NonPHIs[i]; + for (VNInfo *NonPHI : NonPHIs) { // Known value? Try an insertion. 
std::tie(SVI, Inserted) = SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI))); @@ -654,8 +652,7 @@ void InlineSpiller::analyzeSiblingValues() { return; LiveInterval &OrigLI = LIS.getInterval(Original); - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (LiveInterval::const_vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); VI != VE; ++VI) { @@ -831,9 +828,8 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) { if (VNI->isPHIDef()) { MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def); - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI)); + for (MachineBasicBlock *P : MBB->predecessors()) { + VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(P)); if (PVNI) WorkList.push_back(std::make_pair(LI, PVNI)); } @@ -920,8 +916,8 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, << *LIS.getInstructionFromIndex(DefIdx)); // Replace operands - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second); + for (const auto &OpPair : Ops) { + MachineOperand &MO = OpPair.first->getOperand(OpPair.second); if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) { MO.setReg(NewVReg); MO.setIsKill(); @@ -944,8 +940,7 @@ void InlineSpiller::reMaterializeAll() { // Try to remat before all uses of snippets. bool anyRemat = false; - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (MachineRegisterInfo::reg_bundle_iterator RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end(); @@ -963,8 +958,7 @@ void InlineSpiller::reMaterializeAll() { return; // Remove any values that were completely rematted. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { @@ -989,8 +983,7 @@ void InlineSpiller::reMaterializeAll() { // Get rid of deleted and empty intervals. unsigned ResultPos = 0; - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { if (!LIS.hasInterval(Reg)) continue; @@ -1098,9 +1091,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied // operands. SmallVector<unsigned, 8> FoldOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - unsigned Idx = Ops[i].second; - assert(MI == Ops[i].first && "Instruction conflict during operand folding"); + for (const auto &OpPair : Ops) { + unsigned Idx = OpPair.second; + assert(MI == OpPair.first && "Instruction conflict during operand folding"); MachineOperand &MO = MI->getOperand(Idx); if (MO.isImplicit()) { ImpReg = MO.getReg(); @@ -1139,7 +1132,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, continue; MIBundleOperands::PhysRegInfo RI = MIBundleOperands(FoldMI).analyzePhysReg(Reg, &TRI); - if (RI.Defines) + if (RI.FullyDefined) continue; // FoldMI does not define this physreg. Remove the LI segment. 
assert(MO->isDead() && "Cannot fold physreg def"); @@ -1152,10 +1145,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, // Insert any new instructions other than FoldMI into the LIS maps. assert(!MIS.empty() && "Unexpected empty span of instructions!"); - for (MachineBasicBlock::iterator MII = MIS.begin(), End = MIS.end(); - MII != End; ++MII) - if (&*MII != FoldMI) - LIS.InsertMachineInstrInMaps(&*MII); + for (MachineInstr &MI : MIS) + if (&MI != FoldMI) + LIS.InsertMachineInstrInMaps(&MI); // TII.foldMemoryOperand may have left some implicit operands on the // instruction. Strip them. @@ -1301,11 +1293,11 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Rewrite instruction operands. bool hasLiveDef = false; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second); + for (const auto &OpPair : Ops) { + MachineOperand &MO = OpPair.first->getOperand(OpPair.second); MO.setReg(NewVReg); if (MO.isUse()) { - if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second)) + if (!OpPair.first->isRegTiedToDefOperand(OpPair.second)) MO.setIsKill(); } else { if (!MO.isDead()) @@ -1335,14 +1327,14 @@ void InlineSpiller::spillAll() { VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot); assert(StackInt->getNumValNums() == 1 && "Bad stack interval values"); - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - StackInt->MergeSegmentsInAsValue(LIS.getInterval(RegsToSpill[i]), + for (unsigned Reg : RegsToSpill) + StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg), StackInt->getValNumInfo(0)); DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n'); // Spill around uses of all RegsToSpill. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - spillAroundUses(RegsToSpill[i]); + for (unsigned Reg : RegsToSpill) + spillAroundUses(Reg); // Hoisted spills may cause dead code. if (!DeadDefs.empty()) { @@ -1351,9 +1343,9 @@ void InlineSpiller::spillAll() { } // Finally delete the SnippetCopies. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { + for (unsigned Reg : RegsToSpill) { for (MachineRegisterInfo::reg_instr_iterator - RI = MRI.reg_instr_begin(RegsToSpill[i]), E = MRI.reg_instr_end(); + RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) { MachineInstr *MI = &*(RI++); assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy"); @@ -1364,8 +1356,8 @@ void InlineSpiller::spillAll() { } // Delete all spilled registers. 
- for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - Edit->eraseVirtReg(RegsToSpill[i]); + for (unsigned Reg : RegsToSpill) + Edit->eraseVirtReg(Reg); } void InlineSpiller::spill(LiveRangeEdit &edit) { diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp index fd5749b..f8cc247 100644 --- a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp +++ b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp @@ -144,7 +144,8 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { PrevPos = Start; } - MachineFunction::const_iterator MFI = MF->getBlockNumbered(MBBNum); + MachineFunction::const_iterator MFI = + MF->getBlockNumbered(MBBNum)->getIterator(); BlockInterference *BI = &Blocks[MBBNum]; ArrayRef<SlotIndex> RegMaskSlots; ArrayRef<const uint32_t*> RegMaskBits; diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 53c8adc..724f1d6 100644 --- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; static cl::opt<bool> LowerInterleavedAccesses( "lower-interleaved-accesses", cl::desc("Enable lowering interleaved accesses to intrinsics"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static unsigned MaxFactor; // The maximum supported interleave factor. @@ -271,7 +271,7 @@ bool InterleavedAccess::runOnFunction(Function &F) { SmallVector<Instruction *, 32> DeadInsts; bool Changed = false; - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { if (LoadInst *LI = dyn_cast<LoadInst>(&I)) Changed |= lowerInterleavedLoad(LI, DeadInsts); diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp index 2c95e9e..2962f87 100644 --- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -35,24 +35,24 @@ static void EnsureFunctionExists(Module &M, const char *Name, M.getOrInsertFunction(Name, FunctionType::get(RetTy, ParamTys, false)); } -static void EnsureFPIntrinsicsExist(Module &M, Function *Fn, +static void EnsureFPIntrinsicsExist(Module &M, Function &Fn, const char *FName, const char *DName, const char *LDName) { // Insert definitions for all the floating point types. - switch((int)Fn->arg_begin()->getType()->getTypeID()) { + switch((int)Fn.arg_begin()->getType()->getTypeID()) { case Type::FloatTyID: - EnsureFunctionExists(M, FName, Fn->arg_begin(), Fn->arg_end(), + EnsureFunctionExists(M, FName, Fn.arg_begin(), Fn.arg_end(), Type::getFloatTy(M.getContext())); break; case Type::DoubleTyID: - EnsureFunctionExists(M, DName, Fn->arg_begin(), Fn->arg_end(), + EnsureFunctionExists(M, DName, Fn.arg_begin(), Fn.arg_end(), Type::getDoubleTy(M.getContext())); break; case Type::X86_FP80TyID: case Type::FP128TyID: case Type::PPC_FP128TyID: - EnsureFunctionExists(M, LDName, Fn->arg_begin(), Fn->arg_end(), - Fn->arg_begin()->getType()); + EnsureFunctionExists(M, LDName, Fn.arg_begin(), Fn.arg_end(), + Fn.arg_begin()->getType()); break; } } @@ -67,7 +67,7 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, Type *RetTy) { // If we haven't already looked up this function, check to see if the // program already contains a function with this name. - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); // Get or insert the definition now. 
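The function being patched here, ReplaceCallWith, is the canonical lower-an-intrinsic-to-a-libcall sequence: derive a function type from the call's operands, getOrInsertFunction, build the replacement call in place, and forward the uses. Condensed into one self-contained helper using the same 3.8-era API (with the deletion of the old call folded in; the real code leaves that to its caller and caches the looked-up function):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static CallInst *lowerToLibcall(CallInst *CI, const char *Name) {
  Module *M = CI->getModule();
  SmallVector<Type *, 8> ParamTys;
  SmallVector<Value *, 8> Args;
  for (Value *Op : CI->arg_operands()) {
    ParamTys.push_back(Op->getType());
    Args.push_back(Op);
  }
  Constant *Callee = M->getOrInsertFunction(
      Name, FunctionType::get(CI->getType(), ParamTys, /*isVarArg=*/false));
  IRBuilder<> Builder(CI); // insert the new call right before the old one
  CallInst *NewCI = Builder.CreateCall(Callee, Args);
  NewCI->takeName(CI);
  if (!CI->use_empty())
    CI->replaceAllUsesWith(NewCI);
  CI->eraseFromParent();
  return NewCI;
}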
std::vector<Type *> ParamTys; for (ArgIt I = ArgBegin; I != ArgEnd; ++I) @@ -75,7 +75,7 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, Constant* FCache = M->getOrInsertFunction(NewFn, FunctionType::get(RetTy, ParamTys, false)); - IRBuilder<> Builder(CI->getParent(), CI); + IRBuilder<> Builder(CI->getParent(), CI->getIterator()); SmallVector<Value *, 8> Args(ArgBegin, ArgEnd); CallInst *NewCI = Builder.CreateCall(FCache, Args); NewCI->setName(CI->getName()); @@ -94,20 +94,20 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, void IntrinsicLowering::AddPrototypes(Module &M) { LLVMContext &Context = M.getContext(); - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->isDeclaration() && !I->use_empty()) - switch (I->getIntrinsicID()) { + for (auto &F : M) + if (F.isDeclaration() && !F.use_empty()) + switch (F.getIntrinsicID()) { default: break; case Intrinsic::setjmp: - EnsureFunctionExists(M, "setjmp", I->arg_begin(), I->arg_end(), + EnsureFunctionExists(M, "setjmp", F.arg_begin(), F.arg_end(), Type::getInt32Ty(M.getContext())); break; case Intrinsic::longjmp: - EnsureFunctionExists(M, "longjmp", I->arg_begin(), I->arg_end(), + EnsureFunctionExists(M, "longjmp", F.arg_begin(), F.arg_end(), Type::getVoidTy(M.getContext())); break; case Intrinsic::siglongjmp: - EnsureFunctionExists(M, "abort", I->arg_end(), I->arg_end(), + EnsureFunctionExists(M, "abort", F.arg_end(), F.arg_end(), Type::getVoidTy(M.getContext())); break; case Intrinsic::memcpy: @@ -132,31 +132,31 @@ void IntrinsicLowering::AddPrototypes(Module &M) { DL.getIntPtrType(Context), nullptr); break; case Intrinsic::sqrt: - EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl"); + EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl"); break; case Intrinsic::sin: - EnsureFPIntrinsicsExist(M, I, "sinf", "sin", "sinl"); + EnsureFPIntrinsicsExist(M, F, "sinf", "sin", "sinl"); break; case Intrinsic::cos: - EnsureFPIntrinsicsExist(M, I, "cosf", "cos", "cosl"); + EnsureFPIntrinsicsExist(M, F, "cosf", "cos", "cosl"); break; case Intrinsic::pow: - EnsureFPIntrinsicsExist(M, I, "powf", "pow", "powl"); + EnsureFPIntrinsicsExist(M, F, "powf", "pow", "powl"); break; case Intrinsic::log: - EnsureFPIntrinsicsExist(M, I, "logf", "log", "logl"); + EnsureFPIntrinsicsExist(M, F, "logf", "log", "logl"); break; case Intrinsic::log2: - EnsureFPIntrinsicsExist(M, I, "log2f", "log2", "log2l"); + EnsureFPIntrinsicsExist(M, F, "log2f", "log2", "log2l"); break; case Intrinsic::log10: - EnsureFPIntrinsicsExist(M, I, "log10f", "log10", "log10l"); + EnsureFPIntrinsicsExist(M, F, "log10f", "log10", "log10l"); break; case Intrinsic::exp: - EnsureFPIntrinsicsExist(M, I, "expf", "exp", "expl"); + EnsureFPIntrinsicsExist(M, F, "expf", "exp", "expl"); break; case Intrinsic::exp2: - EnsureFPIntrinsicsExist(M, I, "exp2f", "exp2", "exp2l"); + EnsureFPIntrinsicsExist(M, F, "exp2f", "exp2", "exp2l"); break; } } @@ -167,8 +167,8 @@ static Value *LowerBSWAP(LLVMContext &Context, Value *V, Instruction *IP) { assert(V->getType()->isIntegerTy() && "Can't bswap a non-integer type!"); unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); - - IRBuilder<> Builder(IP->getParent(), IP); + + IRBuilder<> Builder(IP); switch(BitSize) { default: llvm_unreachable("Unhandled type size of value to byteswap!"); @@ -268,7 +268,7 @@ static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) { 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL }; - IRBuilder<> Builder(IP->getParent(), IP); + IRBuilder<> Builder(IP); 
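The 64-bit mask constants just above drive the classic tree-reduction popcount that LowerCTPOP expands into IR: at each of six levels, adjacent fields are masked and added, doubling the field width. The same computation for one 64-bit word in plain C++:

#include <cstdint>

uint64_t popcount64(uint64_t V) {
  const uint64_t Masks[] = {
      0x5555555555555555ULL, //  1-bit fields
      0x3333333333333333ULL, //  2-bit fields
      0x0F0F0F0F0F0F0F0FULL, //  4-bit fields
      0x00FF00FF00FF00FFULL, //  8-bit fields
      0x0000FFFF0000FFFFULL, // 16-bit fields
      0x00000000FFFFFFFFULL, // 32-bit fields
  };
  for (unsigned I = 0, Shift = 1; I != 6; ++I, Shift <<= 1)
    V = (V & Masks[I]) + ((V >> Shift) & Masks[I]); // sum adjacent fields
  return V; // e.g. popcount64(0xFFULL) == 8
}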
unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); unsigned WordSize = (BitSize + 63) / 64; @@ -301,7 +301,7 @@ static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) { /// instruction IP. static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) { - IRBuilder<> Builder(IP->getParent(), IP); + IRBuilder<> Builder(IP); unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); for (unsigned i = 1; i < BitSize; i <<= 1) { @@ -338,7 +338,7 @@ static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname, } void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { - IRBuilder<> Builder(CI->getParent(), CI); + IRBuilder<> Builder(CI); LLVMContext &Context = CI->getContext(); const Function *Callee = CI->getCalledFunction(); @@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } + case Intrinsic::get_dynamic_area_offset: + errs() << "WARNING: this target does not support the custom llvm.get." + "dynamic.area.offset. It is being lowered to a constant 0\n"; + // Just lower it to a constant 0 because for most targets + // @llvm.get.dynamic.area.offset is lowered to zero. + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + break; case Intrinsic::returnaddress: case Intrinsic::frameaddress: errs() << "WARNING: this target does not support the llvm." @@ -589,7 +596,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) { return false; // Okay, we can do this xform, do so now. - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty); Value *Op = CI->getArgOperand(0); diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp index 37299eb..1c27377 100644 --- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp +++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp @@ -82,7 +82,7 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, } TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(BasicTTIImpl(this, F)); }); } @@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, PM.add(new MachineFunctionAnalysis(*TM, MFInitializer)); // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || (TM->getOptLevel() == CodeGenOpt::None && - EnableFastISelOption != cl::BOU_FALSE)) + TM->getO0WantsFastISel())) TM->setFastISel(true); // Ask the target for an isel. @@ -202,6 +203,7 @@ bool LLVMTargetMachine::addPassesToEmitFile( Triple T(getTargetTriple().str()); AsmStreamer.reset(getTarget().createMCObjectStreamer( T, *Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, + Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); break; } @@ -254,6 +256,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, const MCSubtargetInfo &STI = *getMCSubtargetInfo(); std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer( T, *Ctx, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, + Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); // Create the AsmPrinter, which takes ownership of AsmStreamer if successful. 
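Before the new LiveDebugValues file below, one subtlety from the LLVMTargetMachine hunk above: the -O0 FastISel preference is now recorded on the TargetMachine first (setO0WantsFastISel) and read back through getO0WantsFastISel, so later consumers observe the same decision instead of re-deriving it from the raw flag. The decision itself reduces to a small helper (a sketch of the logic in the hunk, not an LLVM API):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Explicit -fast-isel wins outright; at -O0, FastISel defaults to on unless
// the flag explicitly disabled it; at higher opt levels it stays off here.
static bool shouldEnableFastISel(cl::boolOrDefault Flag, bool OptLevelIsNone) {
  bool O0WantsFastISel = (Flag != cl::BOU_FALSE); // recorded on the TM above
  return Flag == cl::BOU_TRUE || (OptLevelIsNone && O0WantsFastISel);
}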
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp new file mode 100644 index 0000000..b9937e5 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -0,0 +1,416 @@ +//===------ LiveDebugValues.cpp - Tracking Debug Value MIs ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This pass implements a data flow analysis that propagates debug location +/// information by inserting additional DBG_VALUE instructions into the machine +/// instruction stream. The pass internally builds debug location liveness +/// ranges to determine the points where additional DBG_VALUEs need to be +/// inserted. +/// +/// This is a separate pass from DbgValueHistoryCalculator to facilitate +/// testing and improve modularity. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <queue> +#include <list> + +using namespace llvm; + +#define DEBUG_TYPE "live-debug-values" + +STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); + +namespace { + +class LiveDebugValues : public MachineFunctionPass { + +private: + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + + typedef std::pair<const DILocalVariable *, const DILocation *> + InlinedVariable; + + /// A potentially inlined instance of a variable. + struct DebugVariable { + const DILocalVariable *Var; + const DILocation *InlinedAt; + + DebugVariable(const DILocalVariable *_var, const DILocation *_inlinedAt) + : Var(_var), InlinedAt(_inlinedAt) {} + + bool operator==(const DebugVariable &DV) const { + return (Var == DV.Var) && (InlinedAt == DV.InlinedAt); + } + }; + + /// Member variables and functions for Range Extension across basic blocks. + struct VarLoc { + DebugVariable Var; + const MachineInstr *MI; // MachineInstr should be a DBG_VALUE instr. + + VarLoc(DebugVariable _var, const MachineInstr *_mi) : Var(_var), MI(_mi) {} + + bool operator==(const VarLoc &V) const; + }; + + typedef std::list<VarLoc> VarLocList; + typedef SmallDenseMap<const MachineBasicBlock *, VarLocList> VarLocInMBB; + + void transferDebugValue(MachineInstr &MI, VarLocList &OpenRanges); + void transferRegisterDef(MachineInstr &MI, VarLocList &OpenRanges); + bool transferTerminatorInst(MachineInstr &MI, VarLocList &OpenRanges, + VarLocInMBB &OutLocs); + bool transfer(MachineInstr &MI, VarLocList &OpenRanges, VarLocInMBB &OutLocs); + + bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs); + + bool ExtendRanges(MachineFunction &MF); + +public: + static char ID; + + /// Default construct and initialize the pass. + LiveDebugValues(); + + /// Tell the pass manager which passes we depend on and what + /// information we preserve. 
+ void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Print to ostream with a message. + void printVarLocInMBB(const VarLocInMBB &V, const char *msg, + raw_ostream &Out) const; + + /// Calculate the liveness information for the given machine function. + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // namespace + +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// + +char LiveDebugValues::ID = 0; +char &llvm::LiveDebugValuesID = LiveDebugValues::ID; +INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", + false, false) + +/// Default construct and initialize the pass. +LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) { + initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry()); +} + +/// Tell the pass manager which passes we depend on and what information we +/// preserve. +void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); +} + +// \brief If @MI is a DBG_VALUE with debug value described by a defined +// register, returns the number of this register. Otherwise, returns 0. +static unsigned isDescribedByReg(const MachineInstr &MI) { + assert(MI.isDebugValue()); + assert(MI.getNumOperands() == 4); + // If the location of the variable is described using a register (directly or + // indirectly), this register is always the first operand. + return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0; +} + +// \brief This function takes two DBG_VALUE instructions and returns true +// if their offsets are equal; otherwise returns false. +static bool areOffsetsEqual(const MachineInstr &MI1, const MachineInstr &MI2) { + assert(MI1.isDebugValue()); + assert(MI1.getNumOperands() == 4); + + assert(MI2.isDebugValue()); + assert(MI2.getNumOperands() == 4); + + if (!MI1.isIndirectDebugValue() && !MI2.isIndirectDebugValue()) + return true; + + // Check if both MIs are indirect and they are equal. + if (MI1.isIndirectDebugValue() && MI2.isIndirectDebugValue()) + return MI1.getOperand(1).getImm() == MI2.getOperand(1).getImm(); + + return false; +} + +//===----------------------------------------------------------------------===// +// Debug Range Extension Implementation +//===----------------------------------------------------------------------===// + +void LiveDebugValues::printVarLocInMBB(const VarLocInMBB &V, const char *msg, + raw_ostream &Out) const { + Out << "Printing " << msg << ":\n"; + for (const auto &L : V) { + Out << "MBB: " << L.first->getName() << ":\n"; + for (const auto &VLL : L.second) { + Out << " Var: " << VLL.Var.Var->getName(); + Out << " MI: "; + (*VLL.MI).dump(); + Out << "\n"; + } + } + Out << "\n"; +} + +bool LiveDebugValues::VarLoc::operator==(const VarLoc &V) const { + return (Var == V.Var) && (isDescribedByReg(*MI) == isDescribedByReg(*V.MI)) && + (areOffsetsEqual(*MI, *V.MI)); +} + +/// End all previous ranges related to @MI and start a new range from @MI +/// if it is a DBG_VALUE instr. +void LiveDebugValues::transferDebugValue(MachineInstr &MI, + VarLocList &OpenRanges) { + if (!MI.isDebugValue()) + return; + const DILocalVariable *RawVar = MI.getDebugVariable(); + assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) && + "Expected inlined-at fields to agree"); + DebugVariable Var(RawVar, MI.getDebugLoc()->getInlinedAt()); + + // End all previous ranges of Var. 
+ OpenRanges.erase( + std::remove_if(OpenRanges.begin(), OpenRanges.end(), + [&](const VarLoc &V) { return (Var == V.Var); }), + OpenRanges.end()); + + // Add Var to OpenRanges from this DBG_VALUE. + // TODO: Currently handles DBG_VALUE which has only reg as location. + if (isDescribedByReg(MI)) { + VarLoc V(Var, &MI); + OpenRanges.push_back(std::move(V)); + } +} + +/// A definition of a register may mark the end of a range. +void LiveDebugValues::transferRegisterDef(MachineInstr &MI, + VarLocList &OpenRanges) { + for (const MachineOperand &MO : MI.operands()) { + if (!(MO.isReg() && MO.isDef() && MO.getReg() && + TRI->isPhysicalRegister(MO.getReg()))) + continue; + // Remove ranges of all aliased registers. + for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) + OpenRanges.erase(std::remove_if(OpenRanges.begin(), OpenRanges.end(), + [&](const VarLoc &V) { + return (*RAI == + isDescribedByReg(*V.MI)); + }), + OpenRanges.end()); + } +} + +/// Terminate all open ranges at the end of the current basic block. +bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, + VarLocList &OpenRanges, + VarLocInMBB &OutLocs) { + bool Changed = false; + const MachineBasicBlock *CurMBB = MI.getParent(); + if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back()))) + return false; + + if (OpenRanges.empty()) + return false; + + VarLocList &VLL = OutLocs[CurMBB]; + + for (auto OR : OpenRanges) { + // Copy OpenRanges to OutLocs, if not already present. + assert(OR.MI->isDebugValue()); + DEBUG(dbgs() << "Add to OutLocs: "; OR.MI->dump();); + if (std::find_if(VLL.begin(), VLL.end(), + [&](const VarLoc &V) { return (OR == V); }) == VLL.end()) { + VLL.push_back(std::move(OR)); + Changed = true; + } + } + OpenRanges.clear(); + return Changed; +} + +/// This routine applies the transfer functions to @MI, updating OpenRanges +/// and OutLocs. +bool LiveDebugValues::transfer(MachineInstr &MI, VarLocList &OpenRanges, + VarLocInMBB &OutLocs) { + bool Changed = false; + transferDebugValue(MI, OpenRanges); + transferRegisterDef(MI, OpenRanges); + Changed = transferTerminatorInst(MI, OpenRanges, OutLocs); + return Changed; +} + +/// This routine joins the analysis results of all incoming edges in @MBB by +/// inserting a new DBG_VALUE instruction at the start of @MBB, if the same +/// source variable resides in the same location in all the predecessors of +/// @MBB. +bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, + VarLocInMBB &InLocs) { + DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n"); + bool Changed = false; + + VarLocList InLocsT; // Temporary incoming locations. + + // For all predecessors of this MBB, find the set of VarLocs that can be + // joined. + for (auto p : MBB.predecessors()) { + auto OL = OutLocs.find(p); + // The join is null if OutLocs is empty for any of the predecessors. + if (OL == OutLocs.end()) + return false; + + // Just copy over the Out locs to incoming locs for the first predecessor. + if (p == *MBB.pred_begin()) { + InLocsT = OL->second; + continue; + } + + // Join with this predecessor. + VarLocList &VLL = OL->second; + InLocsT.erase( + std::remove_if(InLocsT.begin(), InLocsT.end(), [&](VarLoc &ILT) { + return (std::find_if(VLL.begin(), VLL.end(), [&](const VarLoc &V) { + return (ILT == V); + }) == VLL.end()); + }), InLocsT.end()); + } + + if (InLocsT.empty()) + return false; + + VarLocList &ILL = InLocs[&MBB]; + + // Insert DBG_VALUE instructions, if not already inserted. 
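The per-predecessor step above is a plain erase-remove intersection: after seeding InLocsT from the first predecessor, any location missing from a later predecessor's out-set is dropped, and the loop that follows materializes DBG_VALUEs for the survivors. The same shape in isolation (a sketch; std::list stands in for VarLocList, and the helper name is illustrative only):

    #include <algorithm>
    #include <list>

    // Keep only the elements of InLocsT that also occur in the
    // predecessor's out-set VLL; applied once per predecessor, this
    // leaves the intersection of all predecessor out-sets.
    template <typename T>
    static void joinWithPredecessor(std::list<T> &InLocsT,
                                    const std::list<T> &VLL) {
      InLocsT.remove_if([&](const T &Loc) {
        return std::find(VLL.begin(), VLL.end(), Loc) == VLL.end();
      });
    }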
+ for (auto ILT : InLocsT) { + if (std::find_if(ILL.begin(), ILL.end(), [&](const VarLoc &I) { + return (ILT == I); + }) == ILL.end()) { + // This VarLoc is not found in InLocs, i.e., it is not yet inserted. So a + // new range is started for the var from the MBB's beginning by inserting + // a new DBG_VALUE. transfer() will end this range as appropriate. + const MachineInstr *DMI = ILT.MI; + MachineInstr *MI = + BuildMI(MBB, MBB.instr_begin(), DMI->getDebugLoc(), DMI->getDesc(), + DMI->isIndirectDebugValue(), DMI->getOperand(0).getReg(), 0, + DMI->getDebugVariable(), DMI->getDebugExpression()); + if (DMI->isIndirectDebugValue()) + MI->getOperand(1).setImm(DMI->getOperand(1).getImm()); + DEBUG(dbgs() << "Inserted: "; MI->dump();); + ++NumInserted; + Changed = true; + + VarLoc V(ILT.Var, MI); + ILL.push_back(std::move(V)); + } + } + return Changed; +} + +/// Calculate the liveness information for the given machine function and +/// extend ranges across basic blocks. +bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { + + DEBUG(dbgs() << "\nDebug Range Extension\n"); + + bool Changed = false; + bool OLChanged = false; + bool MBBJoined = false; + + VarLocList OpenRanges; // Ranges that are open until end of bb. + VarLocInMBB OutLocs; // Ranges that exist beyond bb. + VarLocInMBB InLocs; // Ranges that are incoming after joining. + + DenseMap<unsigned int, MachineBasicBlock *> OrderToBB; + DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; + std::priority_queue<unsigned int, std::vector<unsigned int>, + std::greater<unsigned int>> Worklist; + std::priority_queue<unsigned int, std::vector<unsigned int>, + std::greater<unsigned int>> Pending; + // Initialize every MBB with OutLocs. + for (auto &MBB : MF) + for (auto &MI : MBB) + transfer(MI, OpenRanges, OutLocs); + DEBUG(printVarLocInMBB(OutLocs, "OutLocs after initialization", dbgs())); + + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + unsigned int RPONumber = 0; + for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) { + OrderToBB[RPONumber] = *RI; + BBToOrder[*RI] = RPONumber; + Worklist.push(RPONumber); + ++RPONumber; + } + + // This is a standard "union of predecessor outs" dataflow problem. + // To solve it, we perform join() and transfer() using the two-worklist method + // until the ranges converge. + // Ranges have converged when both worklists are empty. + while (!Worklist.empty() || !Pending.empty()) { + // We track what is on the pending worklist to avoid inserting the same + // thing twice. We could avoid this with a custom priority queue, but this + // is probably not worth it. 
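Stripped of the VarLoc bookkeeping, the loop that follows is a generic two-worklist fixed-point driver over blocks numbered in reverse post-order. A self-contained sketch of that skeleton (block indices stand in for MachineBasicBlocks; all names are illustrative, not part of the patch):

    #include <functional>
    #include <queue>
    #include <set>
    #include <vector>

    // Re-run JoinAndTransfer on blocks in RPO until no out-set changes;
    // successors of changed blocks are queued on Pending for the next
    // round, so convergence is reached when both queues drain.
    static void solveJoinOverPreds(
        unsigned NumBlocks,
        const std::function<bool(unsigned)> &JoinAndTransfer,
        const std::function<std::vector<unsigned>(unsigned)> &Successors) {
      std::priority_queue<unsigned, std::vector<unsigned>,
                          std::greater<unsigned>> Worklist, Pending;
      for (unsigned B = 0; B != NumBlocks; ++B)
        Worklist.push(B); // Block numbers are assumed to be RPO numbers.
      while (!Worklist.empty() || !Pending.empty()) {
        std::set<unsigned> OnPending; // Avoid queueing a block twice.
        while (!Worklist.empty()) {
          unsigned B = Worklist.top();
          Worklist.pop();
          if (JoinAndTransfer(B))
            for (unsigned S : Successors(B))
              if (OnPending.insert(S).second)
                Pending.push(S);
        }
        Worklist.swap(Pending); // Pending is empty after the swap.
      }
    }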
+ SmallPtrSet<MachineBasicBlock *, 16> OnPending; + while (!Worklist.empty()) { + MachineBasicBlock *MBB = OrderToBB[Worklist.top()]; + Worklist.pop(); + MBBJoined = join(*MBB, OutLocs, InLocs); + + if (MBBJoined) { + MBBJoined = false; + Changed = true; + for (auto &MI : *MBB) + OLChanged |= transfer(MI, OpenRanges, OutLocs); + DEBUG(printVarLocInMBB(OutLocs, "OutLocs after propagating", dbgs())); + DEBUG(printVarLocInMBB(InLocs, "InLocs after propagating", dbgs())); + + if (OLChanged) { + OLChanged = false; + for (auto s : MBB->successors()) + if (!OnPending.count(s)) { + OnPending.insert(s); + Pending.push(BBToOrder[s]); + } + } + } + } + Worklist.swap(Pending); + // At this point, pending must be empty, since it was just the empty + // worklist + assert(Pending.empty() && "Pending should be empty"); + } + + DEBUG(printVarLocInMBB(OutLocs, "Final OutLocs", dbgs())); + DEBUG(printVarLocInMBB(InLocs, "Final InLocs", dbgs())); + return Changed; +} + +bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + + bool Changed = false; + + Changed |= ExtendRanges(MF); + + return Changed; +} diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp index 1571551..6dac7db 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -91,9 +91,7 @@ public: bool dominates(MachineBasicBlock *MBB) { if (LBlocks.empty()) LS.getMachineBasicBlocks(DL, LBlocks); - if (LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB)) - return true; - return false; + return LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB); } }; } // end anonymous namespace @@ -512,7 +510,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) { bool Changed = false; for (MachineFunction::iterator MFI = mf.begin(), MFE = mf.end(); MFI != MFE; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end(); MBBI != MBBE;) { if (!MBBI->isDebugValue()) { @@ -536,65 +534,49 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) { return Changed; } -void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, - LiveRange *LR, const VNInfo *VNI, - SmallVectorImpl<SlotIndex> *Kills, +/// We only propagate DBG_VALUES locally here. LiveDebugValues performs a +/// data-flow analysis to propagate them beyond basic block boundaries. +void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveRange *LR, + const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS) { - SmallVector<SlotIndex, 16> Todo; - Todo.push_back(Idx); - do { - SlotIndex Start = Todo.pop_back_val(); - MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start); - SlotIndex Stop = LIS.getMBBEndIdx(MBB); - LocMap::iterator I = locInts.find(Start); - - // Limit to VNI's live range. - bool ToEnd = true; - if (LR && VNI) { - LiveInterval::Segment *Segment = LR->getSegmentContaining(Start); - if (!Segment || Segment->valno != VNI) { - if (Kills) - Kills->push_back(Start); - continue; - } - if (Segment->end < Stop) - Stop = Segment->end, ToEnd = false; - } - - // There could already be a short def at Start. - if (I.valid() && I.start() <= Start) { - // Stop when meeting a different location or an already extended interval. - Start = Start.getNextSlot(); - if (I.value() != LocNo || I.stop() != Start) - continue; - // This is a one-slot placeholder. 
Just skip it. - ++I; + SlotIndex Start = Idx; + MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start); + SlotIndex Stop = LIS.getMBBEndIdx(MBB); + LocMap::iterator I = locInts.find(Start); + + // Limit to VNI's live range. + bool ToEnd = true; + if (LR && VNI) { + LiveInterval::Segment *Segment = LR->getSegmentContaining(Start); + if (!Segment || Segment->valno != VNI) { + if (Kills) + Kills->push_back(Start); + return; } + if (Segment->end < Stop) + Stop = Segment->end, ToEnd = false; + } - // Limited by the next def. - if (I.valid() && I.start() < Stop) - Stop = I.start(), ToEnd = false; - // Limited by VNI's live range. - else if (!ToEnd && Kills) - Kills->push_back(Stop); + // There could already be a short def at Start. + if (I.valid() && I.start() <= Start) { + // Stop when meeting a different location or an already extended interval. + Start = Start.getNextSlot(); + if (I.value() != LocNo || I.stop() != Start) + return; + // This is a one-slot placeholder. Just skip it. + ++I; + } - if (Start >= Stop) - continue; + // Limited by the next def. + if (I.valid() && I.start() < Stop) + Stop = I.start(), ToEnd = false; + // Limited by VNI's live range. + else if (!ToEnd && Kills) + Kills->push_back(Stop); + if (Start < Stop) I.insert(Start, Stop, LocNo); - - // If we extended to the MBB end, propagate down the dominator tree. - if (!ToEnd) - continue; - const std::vector<MachineDomTreeNode*> &Children = - MDT.getNode(MBB)->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) { - MachineBasicBlock *MBB = Children[i]->getBlock(); - if (UVS.dominates(MBB)) - Todo.push_back(LIS.getMBBStartIdx(MBB)); - } - } while (!Todo.empty()); } void @@ -763,7 +745,7 @@ static void removeDebugValues(MachineFunction &mf) { bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) { if (!EnableLDV) return false; - if (!FunctionDIs.count(mf.getFunction())) { + if (!mf.getFunction()->getSubprogram()) { removeDebugValues(mf); return false; } @@ -1004,11 +986,11 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, SlotIndex Stop = I.stop(); unsigned LocNo = I.value(); DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << LocNo); - MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start); - SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB); + MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator(); + SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB); DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd); - insertDebugValue(MBB, Start, LocNo, LIS, TII); + insertDebugValue(&*MBB, Start, LocNo, LIS, TII); // This interval may span multiple basic blocks. // Insert a DBG_VALUE into each one. 
while(Stop > MBBEnd) { @@ -1016,9 +998,9 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, Start = MBBEnd; if (++MBB == MFEnd) break; - MBBEnd = LIS.getMBBEndIdx(MBB); + MBBEnd = LIS.getMBBEndIdx(&*MBB); DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd); - insertDebugValue(MBB, Start, LocNo, LIS, TII); + insertDebugValue(&*MBB, Start, LocNo, LIS, TII); } DEBUG(dbgs() << '\n'); if (MBB == MFEnd) @@ -1047,7 +1029,6 @@ void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) { } bool LiveDebugVariables::doInitialization(Module &M) { - FunctionDIs = makeSubprogramMap(M); return Pass::doInitialization(M); } diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h index 694aa17..3d36f4d 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h @@ -33,7 +33,6 @@ class VirtRegMap; class LLVM_LIBRARY_VISIBILITY LiveDebugVariables : public MachineFunctionPass { void *pImpl; - DenseMap<const Function *, DISubprogram *> FunctionDIs; public: static char ID; // Pass identification, replacement for typeid diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp index d75e441..bb34883 100644 --- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp +++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> @@ -865,7 +864,7 @@ void LiveInterval::constructMainRangeFromSubranges( // - If any of the subranges is live at a point the main liverange has to be // live too, conversely if no subrange is live the main range mustn't be // live either. - // We do this by scannig through all the subranges simultaneously creating new + // We do this by scanning through all the subranges simultaneously creating new // segments in the main range as segments start/ends come up in the subranges. assert(hasSubRanges() && "expected subranges to be present"); assert(segments.empty() && valnos.empty() && "expected empty main range"); @@ -889,7 +888,7 @@ void LiveInterval::constructMainRangeFromSubranges( Segment CurrentSegment; bool ConstructingSegment = false; bool NeedVNIFixup = false; - unsigned ActiveMask = 0; + LaneBitmask ActiveMask = 0; SlotIndex Pos = First; while (true) { SlotIndex NextPos = Last; @@ -899,7 +898,7 @@ void LiveInterval::constructMainRangeFromSubranges( END_SEGMENT, } Event = NOTHING; // Which subregister lanes are affected by the current event. - unsigned EventMask = 0; + LaneBitmask EventMask = 0; // Whether a BEGIN_SEGMENT is also a valno definition point. bool IsDef = false; // Find the next begin or end of a subrange segment. Combine masks if we @@ -1066,7 +1065,7 @@ void LiveInterval::print(raw_ostream &OS) const { super::print(OS); // Print subranges for (const SubRange &SR : subranges()) { - OS << format(" L%04X ", SR.LaneMask) << SR; + OS << " L" << PrintLaneMask(SR.LaneMask) << ' ' << SR; } } @@ -1101,8 +1100,8 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { super::verify(); // Make sure SubRanges are fine and LaneMasks are disjoint. - unsigned Mask = 0; - unsigned MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u; + LaneBitmask Mask = 0; + LaneBitmask MaxMask = MRI != nullptr ? 
MRI->getMaxLaneMaskForVReg(reg) : ~0u; for (const SubRange &SR : subranges()) { // The subrange lane mask should be disjoint from any previous subrange masks. assert((Mask & SR.LaneMask) == 0); @@ -1110,6 +1109,8 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { // The subrange mask should be contained in the maximum lane mask for the vreg. assert((Mask & ~MaxMask) == 0); + // Empty subranges must be removed. + assert(!SR.empty()); SR.verify(); // Main liverange should cover subrange. @@ -1327,15 +1328,15 @@ void LiveRangeUpdater::flush() { LR->verify(); } -unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { +unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) { // Create initial equivalence classes. EqClass.clear(); - EqClass.grow(LI->getNumValNums()); + EqClass.grow(LR.getNumValNums()); const VNInfo *used = nullptr, *unused = nullptr; // Determine connections. - for (const VNInfo *VNI : LI->valnos) { + for (const VNInfo *VNI : LR.valnos) { // Group all unused values into one class. if (VNI->isUnused()) { if (unused) @@ -1350,14 +1351,14 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { // Connect to values live out of predecessors. for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) - if (const VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI))) + if (const VNInfo *PVNI = LR.getVNInfoBefore(LIS.getMBBEndIdx(*PI))) EqClass.join(VNI->id, PVNI->id); } else { // Normal value defined by an instruction. Check for two-addr redef. // FIXME: This could be coincidental. Should we really check for a tied // operand constraint? // Note that VNI->def may be a use slot for an early clobber def. - if (const VNInfo *UVNI = LI->getVNInfoBefore(VNI->def)) + if (const VNInfo *UVNI = LR.getVNInfoBefore(VNI->def)) EqClass.join(VNI->id, UVNI->id); } } @@ -1370,11 +1371,42 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { return EqClass.getNumClasses(); } -void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], - MachineRegisterInfo &MRI) { - assert(LIV[0] && "LIV[0] must be set"); - LiveInterval &LI = *LIV[0]; +template<typename LiveRangeT, typename EqClassesT> +static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[], + EqClassesT VNIClasses) { + // Move segments to new intervals. + LiveRange::iterator J = LR.begin(), E = LR.end(); + while (J != E && VNIClasses[J->valno->id] == 0) + ++J; + for (LiveRange::iterator I = J; I != E; ++I) { + if (unsigned eq = VNIClasses[I->valno->id]) { + assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) && + "New intervals should be empty"); + SplitLRs[eq-1]->segments.push_back(*I); + } else + *J++ = *I; + } + LR.segments.erase(J, E); + + // Transfer VNInfos to their new owners and renumber them. + unsigned j = 0, e = LR.getNumValNums(); + while (j != e && VNIClasses[j] == 0) + ++j; + for (unsigned i = j; i != e; ++i) { + VNInfo *VNI = LR.getValNumInfo(i); + if (unsigned eq = VNIClasses[i]) { + VNI->id = SplitLRs[eq-1]->getNumValNums(); + SplitLRs[eq-1]->valnos.push_back(VNI); + } else { + VNI->id = j; + LR.valnos[j++] = VNI; + } + } + LR.valnos.resize(j); +} +void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[], + MachineRegisterInfo &MRI) { // Rewrite instructions. for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg), RE = MRI.reg_end(); RI != RE;) { @@ -1396,38 +1428,41 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], // NULL. 
If the use is tied to a def, VNI will be the defined value. if (!VNI) continue; - MO.setReg(LIV[getEqClass(VNI)]->reg); - } - - // Move runs to new intervals. - LiveInterval::iterator J = LI.begin(), E = LI.end(); - while (J != E && EqClass[J->valno->id] == 0) - ++J; - for (LiveInterval::iterator I = J; I != E; ++I) { - if (unsigned eq = EqClass[I->valno->id]) { - assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) && - "New intervals should be empty"); - LIV[eq]->segments.push_back(*I); - } else - *J++ = *I; + if (unsigned EqClass = getEqClass(VNI)) + MO.setReg(LIV[EqClass-1]->reg); } - // TODO: do not cheat anymore by simply cleaning all subranges - LI.clearSubRanges(); - LI.segments.erase(J, E); - // Transfer VNInfos to their new owners and renumber them. - unsigned j = 0, e = LI.getNumValNums(); - while (j != e && EqClass[j] == 0) - ++j; - for (unsigned i = j; i != e; ++i) { - VNInfo *VNI = LI.getValNumInfo(i); - if (unsigned eq = EqClass[i]) { - VNI->id = LIV[eq]->getNumValNums(); - LIV[eq]->valnos.push_back(VNI); - } else { - VNI->id = j; - LI.valnos[j++] = VNI; + // Distribute subregister liveranges. + if (LI.hasSubRanges()) { + unsigned NumComponents = EqClass.getNumClasses(); + SmallVector<unsigned, 8> VNIMapping; + SmallVector<LiveInterval::SubRange*, 8> SubRanges; + BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); + for (LiveInterval::SubRange &SR : LI.subranges()) { + // Create new subranges in the split intervals and construct a mapping + // for the VNInfos in the subrange. + unsigned NumValNos = SR.valnos.size(); + VNIMapping.clear(); + VNIMapping.reserve(NumValNos); + SubRanges.clear(); + SubRanges.resize(NumComponents-1, nullptr); + for (unsigned I = 0; I < NumValNos; ++I) { + const VNInfo &VNI = *SR.valnos[I]; + const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def); + assert(MainRangeVNI != nullptr + && "SubRange def must have corresponding main range def"); + unsigned ComponentNum = getEqClass(MainRangeVNI); + VNIMapping.push_back(ComponentNum); + if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) { + SubRanges[ComponentNum-1] + = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask); + } + } + DistributeRange(SR, SubRanges.data(), VNIMapping); } + LI.removeEmptySubRanges(); } - LI.valnos.resize(j); + + // Distribute main liverange. 
+ DistributeRange(LI, LIV, EqClass); } diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp index c00b010..a506e05 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -48,7 +47,7 @@ char LiveIntervals::ID = 0; char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) @@ -76,8 +75,8 @@ cl::opt<bool> UseSegmentSetForPhysRegs( void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); // LiveVariables isn't really required by this analysis, it is only required // here to make sure it is live during TwoAddressInstructionPass and // PHIElimination. This is temporary. @@ -124,7 +123,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MRI = &MF->getRegInfo(); TRI = MF->getSubtarget().getRegisterInfo(); TII = MF->getSubtarget().getInstrInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); @@ -198,9 +197,16 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); + bool ShouldTrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(LI.reg); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); - LRCalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg)); - computeDeadValues(LI, nullptr); + LRCalc->calculate(LI, ShouldTrackSubRegLiveness); + bool SeparatedComponents = computeDeadValues(LI, nullptr); + if (SeparatedComponents) { + assert(ShouldTrackSubRegLiveness + && "Separated components should only occur for unused subreg defs"); + SmallVector<LiveInterval*, 8> SplitLIs; + splitSeparateComponents(LI, SplitLIs); + } } void LiveIntervals::computeVirtRegs() { @@ -216,19 +222,31 @@ void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()]; + for (MachineBasicBlock &MBB : *MF) { + std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); - for (MachineBasicBlock::iterator MI = MBB->begin(), ME = MBB->end(); - MI != ME; ++MI) - for (const MachineOperand &MO : MI->operands()) { + + // Some block starts, such as EH funclets, create masks. 
+ if (const uint32_t *Mask = MBB.getBeginClobberMask(TRI)) { + RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB)); + RegMaskBits.push_back(Mask); + } + + for (MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; - RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot()); - RegMaskBits.push_back(MO.getRegMask()); + RegMaskSlots.push_back(Indexes->getInstructionIndex(&MI).getRegSlot()); + RegMaskBits.push_back(MO.getRegMask()); } + } + + // Some block ends, such as funclet returns, create masks. + if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) { + RegMaskSlots.push_back(Indexes->getMBBEndIdx(&MBB)); + RegMaskBits.push_back(Mask); + } + // Compute the number of register mask instructions in this block. RMB.second = RegMaskSlots.size() - RMB.first; } @@ -296,18 +314,17 @@ void LiveIntervals::computeLiveInRegUnits() { // Check all basic blocks for live-ins. for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); MFI != MFE; ++MFI) { - const MachineBasicBlock *MBB = MFI; + const MachineBasicBlock *MBB = &*MFI; // We only care about ABI blocks: Entry + landing pads. - if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty()) + if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. SlotIndex Begin = Indexes->getMBBStartIdx(MBB); DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); - for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(), - LIE = MBB->livein_end(); LII != LIE; ++LII) { - for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) { + for (const auto &LI : MBB->liveins()) { + for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { @@ -396,9 +413,6 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, } } -/// shrinkToUses - After removing some uses of a register, shrink its live -/// range to just the remaining uses. This method does not compute reaching -/// defs for new uses, and it doesn't remove dead defs. bool LiveIntervals::shrinkToUses(LiveInterval *li, SmallVectorImpl<MachineInstr*> *dead) { DEBUG(dbgs() << "Shrink: " << *li << '\n'); @@ -406,9 +420,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, && "Can only shrink virtual registers"); // Shrink subregister live ranges. + bool NeedsCleanup = false; for (LiveInterval::SubRange &S : li->subranges()) { shrinkToUses(S, li->reg); + if (S.empty()) + NeedsCleanup = true; } + if (NeedsCleanup) + li->removeEmptySubRanges(); // Find all the values used, including PHI kills. ShrinkToUsesWorkList WorkList; @@ -456,7 +475,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { - bool PHIRemoved = false; + bool MayHaveSplitComponents = false; for (auto VNI : LI.valnos) { if (VNI->isUnused()) continue; @@ -466,10 +485,13 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI, // Is the register live before? Otherwise we may have to add a read-undef // flag for subregister defs. 
- if (MRI->shouldTrackSubRegLiveness(LI.reg)) { + bool DeadBeforeDef = false; + unsigned VReg = LI.reg; + if (MRI->shouldTrackSubRegLiveness(VReg)) { if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) { MachineInstr *MI = getInstructionFromIndex(Def); - MI->addRegisterDefReadUndef(LI.reg); + MI->setRegisterDefReadUndef(VReg); + DeadBeforeDef = true; } } @@ -480,19 +502,27 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI, VNI->markUnused(); LI.removeSegment(I); DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n"); - PHIRemoved = true; + MayHaveSplitComponents = true; } else { // This is a dead def. Make sure the instruction knows. MachineInstr *MI = getInstructionFromIndex(Def); assert(MI && "No instruction defining live value"); - MI->addRegisterDead(LI.reg, TRI); + MI->addRegisterDead(VReg, TRI); + + // If we have a dead def that is completely separate from the rest of + // the liverange then we rewrite it to use a different VReg to not violate + // the rule that the liveness of a virtual register forms a connected + // component. This should only happen if subregister liveness is tracked. + if (DeadBeforeDef) + MayHaveSplitComponents = true; + if (dead && MI->allDefsAreDead()) { DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI); dead->push_back(MI); } } } - return PHIRemoved; + return MayHaveSplitComponents; } void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) @@ -512,8 +542,8 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) // Maybe the operand is for a subregister we don't care about. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { - unsigned SubRegMask = TRI->getSubRegIndexLaneMask(SubReg); - if ((SubRegMask & SR.LaneMask) == 0) + LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg); + if ((LaneMask & SR.LaneMask) == 0) continue; } // We only need to visit each instruction once. @@ -712,7 +742,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // assign R0L to %vreg1, and R0 to %vreg2 because the low 32bits of R0 // are actually never written by %vreg2. After assignment the <kill> // flag at the read instruction is invalid. - unsigned DefinedLanesMask; + LaneBitmask DefinedLanesMask; if (!SRs.empty()) { // Compute a mask of lanes that are defined. DefinedLanesMask = 0; @@ -736,7 +766,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { continue; if (MO.isUse()) { // Reading any undefined lanes? - unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); if ((UseMask & ~DefinedLanesMask) != 0) goto CancelKill; } else if (MO.getSubReg() == 0) { @@ -944,7 +974,7 @@ public: LiveInterval &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { unsigned SubReg = MO.getSubReg(); - unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg); for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask) == 0) continue; @@ -968,7 +998,7 @@ public: private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. 
- void updateRange(LiveRange &LR, unsigned Reg, unsigned LaneMask) { + void updateRange(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) { if (!Updated.insert(&LR).second) return; DEBUG({ @@ -976,7 +1006,7 @@ private: if (TargetRegisterInfo::isVirtualRegister(Reg)) { dbgs() << PrintReg(Reg); if (LaneMask != 0) - dbgs() << format(" L%04X", LaneMask); + dbgs() << " L" << PrintLaneMask(LaneMask); } else { dbgs() << PrintRegUnit(Reg, &TRI); } @@ -1098,7 +1128,7 @@ private: /// Hoist kill to NewIdx, then scan for last kill between NewIdx and /// OldIdx. /// - void handleMoveUp(LiveRange &LR, unsigned Reg, unsigned LaneMask) { + void handleMoveUp(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) { // First look for a kill at OldIdx. LiveRange::iterator I = LR.find(OldIdx.getBaseIndex()); LiveRange::iterator E = LR.end(); @@ -1175,7 +1205,7 @@ private: } // Return the last use of reg between NewIdx and OldIdx. - SlotIndex findLastUseBefore(unsigned Reg, unsigned LaneMask) { + SlotIndex findLastUseBefore(unsigned Reg, LaneBitmask LaneMask) { if (TargetRegisterInfo::isVirtualRegister(Reg)) { SlotIndex LastUse = NewIdx; @@ -1255,7 +1285,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, const MachineBasicBlock::iterator End, const SlotIndex endIdx, LiveRange &LR, const unsigned Reg, - const unsigned LaneMask) { + LaneBitmask LaneMask) { LiveInterval::iterator LII = LR.find(endIdx); SlotIndex lastUseIdx; if (LII != LR.end() && LII->start < endIdx) @@ -1282,7 +1312,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, continue; unsigned SubReg = MO.getSubReg(); - unsigned Mask = TRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); if ((Mask & LaneMask) == 0) continue; @@ -1412,3 +1442,20 @@ void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) { } LI.removeEmptySubRanges(); } + +void LiveIntervals::splitSeparateComponents(LiveInterval &LI, + SmallVectorImpl<LiveInterval*> &SplitLIs) { + ConnectedVNInfoEqClasses ConEQ(*this); + unsigned NumComp = ConEQ.Classify(LI); + if (NumComp <= 1) + return; + DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n'); + unsigned Reg = LI.reg; + const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); + for (unsigned I = 1; I < NumComp; ++I) { + unsigned NewVReg = MRI->createVirtualRegister(RegClass); + LiveInterval &NewLI = createEmptyInterval(NewVReg); + SplitLIs.push_back(&NewLI); + } + ConEQ.Distribute(LI, SplitLIs.data(), *MRI); +} diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp index cbd98e3..efbbcbe 100644 --- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -68,7 +68,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { /// Simulates liveness when stepping forward over an instruction (bundle): Remove /// killed uses, add defs. This is not the recommended way, because it depends -/// on accurate kill flags. If possible use stepBackwards() instead of this +/// on accurate kill flags. If possible use stepBackward() instead of this /// function. void LivePhysRegs::stepForward(const MachineInstr &MI, SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) { @@ -128,8 +128,8 @@ void LivePhysRegs::dump() const { /// Add live-in registers of basic block \p MBB to \p LiveRegs. 
static void addLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) { - for (unsigned Reg : make_range(MBB.livein_begin(), MBB.livein_end())) - LiveRegs.addReg(Reg); + for (const auto &LI : MBB.liveins()) + LiveRegs.addReg(LI.PhysReg); } /// Add pristine registers to the given \p LiveRegs. This function removes @@ -147,11 +147,19 @@ static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, } void LivePhysRegs::addLiveOuts(const MachineBasicBlock *MBB, - bool AddPristines) { - if (AddPristines) { + bool AddPristinesAndCSRs) { + if (AddPristinesAndCSRs) { const MachineFunction &MF = *MBB->getParent(); addPristines(*this, MF, *TRI); + if (!MBB->isReturnBlock()) { + // The return block has no successors whose live-ins we could merge + // below. So instead we add the callee saved registers manually. + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + } } + + // To get the live-outs we simply merge the live-ins of all successors. for (const MachineBasicBlock *Succ : MBB->successors()) ::addLiveIns(*this, *Succ); } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp index bb2877a..c408615 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -64,23 +64,23 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) { unsigned SubReg = MO.getSubReg(); if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) { - unsigned Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg) - : MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg) + : MRI->getMaxLaneMaskForVReg(Reg); // If this is the first time we see a subregister def, initialize // subranges by creating a copy of the main range. if (!LI.hasSubRanges() && !LI.empty()) { - unsigned ClassMask = MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg); LI.createSubRangeFrom(*Alloc, ClassMask, LI); } for (LiveInterval::SubRange &S : LI.subranges()) { // A Mask for subregs common to the existing subrange and current def. - unsigned Common = S.LaneMask & Mask; + LaneBitmask Common = S.LaneMask & Mask; if (Common == 0) continue; // A Mask for subregs covered by the subrange but not the current def. - unsigned LRest = S.LaneMask & ~Mask; + LaneBitmask LRest = S.LaneMask & ~Mask; LiveInterval::SubRange *CommonRange; if (LRest != 0) { // Split current subrange into Common and LRest ranges. @@ -138,7 +138,8 @@ void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) { } -void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) { +void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, + LaneBitmask Mask) { // Visit all operands that read Reg. This may include partial defs. const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { @@ -157,7 +158,7 @@ void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) { continue; unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { - unsigned SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); // Ignore uses not covering the current subrange. 
if ((SubRegMask & Mask) == 0) continue; diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h index 34d9953..ff38c68 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h @@ -129,7 +129,7 @@ class LiveRangeCalc { /// /// All uses must be jointly dominated by existing liveness. PHI-defs are /// inserted as needed to preserve SSA form. - void extendToUses(LiveRange &LR, unsigned Reg, unsigned LaneMask); + void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask); /// Reset Map and Seen fields. void resetLiveOutMap(); diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp index 08bbe0c..5ce364a 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -226,7 +226,7 @@ bool LiveRangeEdit::useIsKill(const LiveInterval &LI, return true; const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); unsigned SubReg = MO.getSubReg(); - unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg); for (const LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask) != 0 && S.Query(Idx).isKill()) return true; @@ -349,8 +349,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, ToShrink.pop_back(); if (foldAsLoad(LI, Dead)) continue; + unsigned VReg = LI->reg; if (TheDelegate) - TheDelegate->LRE_WillShrinkVirtReg(LI->reg); + TheDelegate->LRE_WillShrinkVirtReg(VReg); if (!LIS.shrinkToUses(LI, &Dead)) continue; @@ -360,7 +361,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // them results in incorrect code. bool BeingSpilled = false; for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) { - if (LI->reg == RegsBeingSpilled[i]) { + if (VReg == RegsBeingSpilled[i]) { BeingSpilled = true; break; } @@ -370,29 +371,21 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // LI may have been separated, create new intervals. LI->RenumberValues(); - ConnectedVNInfoEqClasses ConEQ(LIS); - unsigned NumComp = ConEQ.Classify(LI); - if (NumComp <= 1) - continue; - ++NumFracRanges; - bool IsOriginal = VRM && VRM->getOriginal(LI->reg) == LI->reg; - DEBUG(dbgs() << NumComp << " components: " << *LI << '\n'); - SmallVector<LiveInterval*, 8> Dups(1, LI); - for (unsigned i = 1; i != NumComp; ++i) { - Dups.push_back(&createEmptyIntervalFrom(LI->reg)); + SmallVector<LiveInterval*, 8> SplitLIs; + LIS.splitSeparateComponents(*LI, SplitLIs); + if (!SplitLIs.empty()) + ++NumFracRanges; + + unsigned Original = VRM ? VRM->getOriginal(VReg) : 0; + for (const LiveInterval *SplitLI : SplitLIs) { // If LI is an original interval that hasn't been split yet, make the new // intervals their own originals instead of referring to LI. The original // interval must contain all the split products, and LI doesn't. 
- if (IsOriginal) - VRM->setIsSplitFromReg(Dups.back()->reg, 0); + if (Original != VReg && Original != 0) + VRM->setIsSplitFromReg(SplitLI->reg, Original); if (TheDelegate) - TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); + TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg, VReg); } - ConEQ.Distribute(&Dups[0], MRI); - DEBUG({ - for (unsigned i = 0; i != NumComp; ++i) - dbgs() << '\t' << *Dups[i] << '\n'; - }); } } @@ -411,7 +404,7 @@ void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF, const MachineLoopInfo &Loops, const MachineBlockFrequencyInfo &MBFI) { - VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI); + VirtRegAuxInfo VRAI(MF, LIS, VRM, Loops, MBFI); for (unsigned I = 0, Size = size(); I < Size; ++I) { LiveInterval &LI = LIS.getInterval(get(I)); if (MRI.recomputeRegClass(LI.reg)) diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp index 9ea031d..7ee87c1 100644 --- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -15,12 +15,11 @@ #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -49,7 +48,6 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); - MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); VRM = &getAnalysis<VirtRegMap>(); @@ -78,7 +76,7 @@ bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval, if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = (*Units).first; - unsigned Mask = (*Units).second; + LaneBitmask Mask = (*Units).second; for (LiveInterval::SubRange &S : VRegInterval.subranges()) { if (S.LaneMask & Mask) { if (Func(Unit, S)) @@ -101,7 +99,6 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { << " to " << PrintReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); - MRI->setPhysRegUsed(PhysReg); foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp index b355393..06b86d8 100644 --- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp @@ -522,11 +522,15 @@ void LiveVariables::runOnInstr(MachineInstr *MI, continue; unsigned MOReg = MO.getReg(); if (MO.isUse()) { - MO.setIsKill(false); + if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) && + MRI->isReserved(MOReg))) + MO.setIsKill(false); if (MO.readsReg()) UseRegs.push_back(MOReg); } else /*MO.isDef()*/ { - MO.setIsDead(false); + if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) && + MRI->isReserved(MOReg))) + MO.setIsDead(false); DefRegs.push_back(MOReg); } } @@ -559,11 +563,10 @@ void LiveVariables::runOnInstr(MachineInstr *MI, void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { // Mark live-in registers as live-in. 
SmallVector<unsigned, 4> Defs; - for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(), - EE = MBB->livein_end(); II != EE; ++II) { - assert(TargetRegisterInfo::isPhysicalRegister(*II) && + for (const auto &LI : MBB->liveins()) { + assert(TargetRegisterInfo::isPhysicalRegister(LI.PhysReg) && "Cannot have a live-in virtual register!"); - HandlePhysRegDef(*II, nullptr, Defs); + HandlePhysRegDef(LI.PhysReg, nullptr, Defs); } // Loop over all of the instructions, processing them. @@ -599,14 +602,12 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { MachineBasicBlock *SuccMBB = *SI; - if (SuccMBB->isLandingPad()) + if (SuccMBB->isEHPad()) continue; - for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(), - LE = SuccMBB->livein_end(); LI != LE; ++LI) { - unsigned LReg = *LI; - if (!TRI->isInAllocatableClass(LReg)) + for (const auto &LI : SuccMBB->liveins()) { + if (!TRI->isInAllocatableClass(LI.PhysReg)) // Ignore other live-ins, e.g. those that are live into landing pads. - LiveOuts.insert(LReg); + LiveOuts.insert(LI.PhysReg); } } @@ -640,7 +641,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // function. This guarantees that we will see the definition of a virtual // register before its uses due to dominance properties of SSA (except for PHI // nodes, which are treated as a special case). - MachineBasicBlock *Entry = MF->begin(); + MachineBasicBlock *Entry = &MF->front(); SmallPtrSet<MachineBasicBlock*,16> Visited; for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) { diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 8378429..eb60005 100644 --- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -325,7 +325,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // Sort the frame references by local offset array_pod_sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end()); - MachineBasicBlock *Entry = Fn.begin(); + MachineBasicBlock *Entry = &Fn.front(); unsigned BaseReg = 0; int64_t BaseOffset = 0; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 482c33a..28f9d4e 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MILexer.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include <cctype> @@ -54,15 +55,132 @@ public: } // end anonymous namespace +MIToken &MIToken::reset(TokenKind Kind, StringRef Range) { + this->Kind = Kind; + this->Range = Range; + return *this; +} + +MIToken &MIToken::setStringValue(StringRef StrVal) { + StringValue = StrVal; + return *this; +} + +MIToken &MIToken::setOwnedStringValue(std::string StrVal) { + StringValueStorage = std::move(StrVal); + StringValue = StringValueStorage; + return *this; +} + +MIToken &MIToken::setIntegerValue(APSInt IntVal) { + this->IntVal = std::move(IntVal); + return *this; +} + /// Skip the leading whitespace characters and return the updated cursor. 
static Cursor skipWhitespace(Cursor C) { - while (isspace(C.peek())) + while (isblank(C.peek())) + C.advance(); + return C; +} + +static bool isNewlineChar(char C) { return C == '\n' || C == '\r'; } + +/// Skip a line comment and return the updated cursor. +static Cursor skipComment(Cursor C) { + if (C.peek() != ';') + return C; + while (!isNewlineChar(C.peek()) && !C.isEOF()) C.advance(); return C; } +/// Return true if the given character satisfies the following regular +/// expression: [-a-zA-Z$._0-9] static bool isIdentifierChar(char C) { - return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.'; + return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.' || + C == '$'; +} + +/// Unescapes the given string value. +/// +/// Expects the string value to be quoted. +static std::string unescapeQuotedString(StringRef Value) { + assert(Value.front() == '"' && Value.back() == '"'); + Cursor C = Cursor(Value.substr(1, Value.size() - 2)); + + std::string Str; + Str.reserve(C.remaining().size()); + while (!C.isEOF()) { + char Char = C.peek(); + if (Char == '\\') { + if (C.peek(1) == '\\') { + // Two '\' become one + Str += '\\'; + C.advance(2); + continue; + } + if (isxdigit(C.peek(1)) && isxdigit(C.peek(2))) { + Str += hexDigitValue(C.peek(1)) * 16 + hexDigitValue(C.peek(2)); + C.advance(3); + continue; + } + } + Str += Char; + C.advance(); + } + return Str; +} + +/// Lex a string constant using the following regular expression: \"[^\"]*\" +static Cursor lexStringConstant( + Cursor C, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + assert(C.peek() == '"'); + for (C.advance(); C.peek() != '"'; C.advance()) { + if (C.isEOF() || isNewlineChar(C.peek())) { + ErrorCallback( + C.location(), + "end of machine instruction reached before the closing '\"'"); + return None; + } + } + C.advance(); + return C; +} + +static Cursor lexName( + Cursor C, MIToken &Token, MIToken::TokenKind Type, unsigned PrefixLength, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + auto Range = C; + C.advance(PrefixLength); + if (C.peek() == '"') { + if (Cursor R = lexStringConstant(C, ErrorCallback)) { + StringRef String = Range.upto(R); + Token.reset(Type, String) + .setOwnedStringValue( + unescapeQuotedString(String.drop_front(PrefixLength))); + return R; + } + Token.reset(MIToken::Error, Range.remaining()); + return Range; + } + while (isIdentifierChar(C.peek())) + C.advance(); + Token.reset(Type, Range.upto(C)) + .setStringValue(Range.upto(C).drop_front(PrefixLength)); + return C; +} + +static Cursor maybeLexIntegerType(Cursor C, MIToken &Token) { + if (C.peek() != 'i' || !isdigit(C.peek(1))) + return None; + auto Range = C; + C.advance(); // Skip 'i' + while (isdigit(C.peek())) + C.advance(); + Token.reset(MIToken::IntegerType, Range.upto(C)); + return C; } static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { @@ -70,32 +188,70 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("_", MIToken::underscore) .Case("implicit", MIToken::kw_implicit) .Case("implicit-def", MIToken::kw_implicit_define) + .Case("def", MIToken::kw_def) .Case("dead", MIToken::kw_dead) .Case("killed", MIToken::kw_killed) .Case("undef", MIToken::kw_undef) + .Case("internal", MIToken::kw_internal) + .Case("early-clobber", MIToken::kw_early_clobber) + .Case("debug-use", MIToken::kw_debug_use) + .Case("tied-def", MIToken::kw_tied_def) + .Case("frame-setup", MIToken::kw_frame_setup) + .Case("debug-location", MIToken::kw_debug_location) 
+ .Case(".cfi_same_value", MIToken::kw_cfi_same_value) + .Case(".cfi_offset", MIToken::kw_cfi_offset) + .Case(".cfi_def_cfa_register", MIToken::kw_cfi_def_cfa_register) + .Case(".cfi_def_cfa_offset", MIToken::kw_cfi_def_cfa_offset) + .Case(".cfi_def_cfa", MIToken::kw_cfi_def_cfa) + .Case("blockaddress", MIToken::kw_blockaddress) + .Case("target-index", MIToken::kw_target_index) + .Case("half", MIToken::kw_half) + .Case("float", MIToken::kw_float) + .Case("double", MIToken::kw_double) + .Case("x86_fp80", MIToken::kw_x86_fp80) + .Case("fp128", MIToken::kw_fp128) + .Case("ppc_fp128", MIToken::kw_ppc_fp128) + .Case("target-flags", MIToken::kw_target_flags) + .Case("volatile", MIToken::kw_volatile) + .Case("non-temporal", MIToken::kw_non_temporal) + .Case("invariant", MIToken::kw_invariant) + .Case("align", MIToken::kw_align) + .Case("stack", MIToken::kw_stack) + .Case("got", MIToken::kw_got) + .Case("jump-table", MIToken::kw_jump_table) + .Case("constant-pool", MIToken::kw_constant_pool) + .Case("call-entry", MIToken::kw_call_entry) + .Case("liveout", MIToken::kw_liveout) + .Case("address-taken", MIToken::kw_address_taken) + .Case("landing-pad", MIToken::kw_landing_pad) + .Case("liveins", MIToken::kw_liveins) + .Case("successors", MIToken::kw_successors) .Default(MIToken::Identifier); } static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) { - if (!isalpha(C.peek()) && C.peek() != '_') + if (!isalpha(C.peek()) && C.peek() != '_' && C.peek() != '.') return None; auto Range = C; while (isIdentifierChar(C.peek())) C.advance(); auto Identifier = Range.upto(C); - Token = MIToken(getIdentifierKind(Identifier), Identifier); + Token.reset(getIdentifierKind(Identifier), Identifier) + .setStringValue(Identifier); return C; } static Cursor maybeLexMachineBasicBlock( Cursor C, MIToken &Token, function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { - if (!C.remaining().startswith("%bb.")) + bool IsReference = C.remaining().startswith("%bb."); + if (!IsReference && !C.remaining().startswith("bb.")) return None; auto Range = C; - C.advance(4); // Skip '%bb.' + unsigned PrefixLength = IsReference ? 4 : 3; + C.advance(PrefixLength); // Skip '%bb.' or 'bb.' if (!isdigit(C.peek())) { - Token = MIToken(MIToken::Error, C.remaining()); + Token.reset(MIToken::Error, C.remaining()); ErrorCallback(C.location(), "expected a number after '%bb.'"); return C; } @@ -103,26 +259,103 @@ static Cursor maybeLexMachineBasicBlock( while (isdigit(C.peek())) C.advance(); StringRef Number = NumberRange.upto(C); - unsigned StringOffset = 4 + Number.size(); // Drop '%bb.<id>' + unsigned StringOffset = PrefixLength + Number.size(); // Drop '%bb.<id>' if (C.peek() == '.') { C.advance(); // Skip '.' ++StringOffset; while (isIdentifierChar(C.peek())) C.advance(); } - Token = MIToken(MIToken::MachineBasicBlock, Range.upto(C), APSInt(Number), - StringOffset); + Token.reset(IsReference ? 
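[Note: maybeLexMachineBasicBlock() above accepts both the '%bb.<id>' reference form and the new 'bb.<id>' label form, each with an optional '.<name>' suffix, and records both the numeric id and the trailing name on the token. A rough standalone equivalent over a plain std::string, for illustration only:

#include <cctype>
#include <string>

struct BBLabel { bool IsReference; unsigned ID; std::string Name; };

// Returns the number of characters consumed, or 0 on no match.
size_t scanBBLabel(const std::string &S, BBLabel &Out) {
  size_t I = 0;
  Out.IsReference = S.compare(0, 4, "%bb.") == 0;
  if (Out.IsReference)
    I = 4;
  else if (S.compare(0, 3, "bb.") == 0)
    I = 3;
  else
    return 0;
  if (I >= S.size() || !isdigit((unsigned char)S[I]))
    return 0; // the real parser reports "expected a number after '%bb.'"
  Out.ID = 0;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    Out.ID = Out.ID * 10 + (S[I++] - '0');
  Out.Name.clear();
  if (I < S.size() && S[I] == '.') { // optional '.<name>' suffix
    ++I;
    while (I < S.size() && (isalnum((unsigned char)S[I]) || S[I] == '_' ||
                            S[I] == '-' || S[I] == '.' || S[I] == '$'))
      Out.Name += S[I++];
  }
  return I;
}
]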
MIToken::MachineBasicBlock + : MIToken::MachineBasicBlockLabel, + Range.upto(C)) + .setIntegerValue(APSInt(Number)) + .setStringValue(Range.upto(C).drop_front(StringOffset)); + return C; +} + +static Cursor maybeLexIndex(Cursor C, MIToken &Token, StringRef Rule, + MIToken::TokenKind Kind) { + if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size()))) + return None; + auto Range = C; + C.advance(Rule.size()); + auto NumberRange = C; + while (isdigit(C.peek())) + C.advance(); + Token.reset(Kind, Range.upto(C)).setIntegerValue(APSInt(NumberRange.upto(C))); + return C; +} + +static Cursor maybeLexIndexAndName(Cursor C, MIToken &Token, StringRef Rule, + MIToken::TokenKind Kind) { + if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size()))) + return None; + auto Range = C; + C.advance(Rule.size()); + auto NumberRange = C; + while (isdigit(C.peek())) + C.advance(); + StringRef Number = NumberRange.upto(C); + unsigned StringOffset = Rule.size() + Number.size(); + if (C.peek() == '.') { + C.advance(); + ++StringOffset; + while (isIdentifierChar(C.peek())) + C.advance(); + } + Token.reset(Kind, Range.upto(C)) + .setIntegerValue(APSInt(Number)) + .setStringValue(Range.upto(C).drop_front(StringOffset)); return C; } +static Cursor maybeLexJumpTableIndex(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%jump-table.", MIToken::JumpTableIndex); +} + +static Cursor maybeLexStackObject(Cursor C, MIToken &Token) { + return maybeLexIndexAndName(C, Token, "%stack.", MIToken::StackObject); +} + +static Cursor maybeLexFixedStackObject(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%fixed-stack.", MIToken::FixedStackObject); +} + +static Cursor maybeLexConstantPoolItem(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%const.", MIToken::ConstantPoolItem); +} + +static Cursor maybeLexIRBlock( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + const StringRef Rule = "%ir-block."; + if (!C.remaining().startswith(Rule)) + return None; + if (isdigit(C.peek(Rule.size()))) + return maybeLexIndex(C, Token, Rule, MIToken::IRBlock); + return lexName(C, Token, MIToken::NamedIRBlock, Rule.size(), ErrorCallback); +} + +static Cursor maybeLexIRValue( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + const StringRef Rule = "%ir."; + if (!C.remaining().startswith(Rule)) + return None; + if (isdigit(C.peek(Rule.size()))) + return maybeLexIndex(C, Token, Rule, MIToken::IRValue); + return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback); +} + static Cursor lexVirtualRegister(Cursor C, MIToken &Token) { auto Range = C; C.advance(); // Skip '%' auto NumberRange = C; while (isdigit(C.peek())) C.advance(); - Token = MIToken(MIToken::VirtualRegister, Range.upto(C), - APSInt(NumberRange.upto(C))); + Token.reset(MIToken::VirtualRegister, Range.upto(C)) + .setIntegerValue(APSInt(NumberRange.upto(C))); return C; } @@ -135,41 +368,112 @@ static Cursor maybeLexRegister(Cursor C, MIToken &Token) { C.advance(); // Skip '%' while (isIdentifierChar(C.peek())) C.advance(); - Token = MIToken(MIToken::NamedRegister, Range.upto(C), - /*StringOffset=*/1); // Drop the '%' + Token.reset(MIToken::NamedRegister, Range.upto(C)) + .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%' return C; } -static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token) { +static Cursor maybeLexGlobalValue( + Cursor C, MIToken &Token, + 
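[Note: the '%jump-table.', '%stack.', '%fixed-stack.' and '%const.' rules above are all instances of one shape: a fixed prefix that must be immediately followed by at least one digit. '%ir-block.' and '%ir.' reuse the same shape and fall back to a named form via lexName() when no digit follows. A hedged sketch of that shared helper, standalone rather than Cursor-based:

#include <cctype>
#include <string>

bool scanPrefixedIndex(const std::string &S, const std::string &Rule,
                       unsigned &Index, size_t &Consumed) {
  // Match only when the prefix is immediately followed by a digit, exactly
  // as maybeLexIndex() checks C.peek(Rule.size()).
  if (S.compare(0, Rule.size(), Rule) != 0 || Rule.size() >= S.size() ||
      !isdigit((unsigned char)S[Rule.size()]))
    return false;
  size_t I = Rule.size();
  Index = 0;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    Index = Index * 10 + (S[I++] - '0');
  Consumed = I;
  return true;
}
]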
function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { if (C.peek() != '@') return None; + if (!isdigit(C.peek(1))) + return lexName(C, Token, MIToken::NamedGlobalValue, /*PrefixLength=*/1, + ErrorCallback); auto Range = C; - C.advance(); // Skip the '@' - // TODO: add support for quoted names. - if (!isdigit(C.peek())) { - while (isIdentifierChar(C.peek())) - C.advance(); - Token = MIToken(MIToken::NamedGlobalValue, Range.upto(C), - /*StringOffset=*/1); // Drop the '@' - return C; - } + C.advance(1); // Skip the '@' auto NumberRange = C; while (isdigit(C.peek())) C.advance(); - Token = - MIToken(MIToken::GlobalValue, Range.upto(C), APSInt(NumberRange.upto(C))); + Token.reset(MIToken::GlobalValue, Range.upto(C)) + .setIntegerValue(APSInt(NumberRange.upto(C))); return C; } -static Cursor maybeLexIntegerLiteral(Cursor C, MIToken &Token) { +static Cursor maybeLexExternalSymbol( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '$') + return None; + return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1, + ErrorCallback); +} + +static bool isValidHexFloatingPointPrefix(char C) { + return C == 'H' || C == 'K' || C == 'L' || C == 'M'; +} + +static Cursor maybeLexHexFloatingPointLiteral(Cursor C, MIToken &Token) { + if (C.peek() != '0' || C.peek(1) != 'x') + return None; + Cursor Range = C; + C.advance(2); // Skip '0x' + if (isValidHexFloatingPointPrefix(C.peek())) + C.advance(); + while (isxdigit(C.peek())) + C.advance(); + Token.reset(MIToken::FloatingPointLiteral, Range.upto(C)); + return C; +} + +static Cursor lexFloatingPointLiteral(Cursor Range, Cursor C, MIToken &Token) { + C.advance(); + // Skip over [0-9]*([eE][-+]?[0-9]+)? + while (isdigit(C.peek())) + C.advance(); + if ((C.peek() == 'e' || C.peek() == 'E') && + (isdigit(C.peek(1)) || + ((C.peek(1) == '-' || C.peek(1) == '+') && isdigit(C.peek(2))))) { + C.advance(2); + while (isdigit(C.peek())) + C.advance(); + } + Token.reset(MIToken::FloatingPointLiteral, Range.upto(C)); + return C; +} + +static Cursor maybeLexNumericalLiteral(Cursor C, MIToken &Token) { if (!isdigit(C.peek()) && (C.peek() != '-' || !isdigit(C.peek(1)))) return None; auto Range = C; C.advance(); while (isdigit(C.peek())) C.advance(); + if (C.peek() == '.') + return lexFloatingPointLiteral(Range, C, Token); StringRef StrVal = Range.upto(C); - Token = MIToken(MIToken::IntegerLiteral, StrVal, APSInt(StrVal)); + Token.reset(MIToken::IntegerLiteral, StrVal).setIntegerValue(APSInt(StrVal)); + return C; +} + +static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { + return StringSwitch<MIToken::TokenKind>(Identifier) + .Case("!tbaa", MIToken::md_tbaa) + .Case("!alias.scope", MIToken::md_alias_scope) + .Case("!noalias", MIToken::md_noalias) + .Case("!range", MIToken::md_range) + .Default(MIToken::Error); +} + +static Cursor maybeLexExlaim( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '!') + return None; + auto Range = C; + C.advance(1); + if (isdigit(C.peek()) || !isIdentifierChar(C.peek())) { + Token.reset(MIToken::exclaim, Range.upto(C)); + return C; + } + while (isIdentifierChar(C.peek())) + C.advance(); + StringRef StrVal = Range.upto(C); + Token.reset(getMetadataKeywordKind(StrVal), StrVal); + if (Token.isError()) + ErrorCallback(Token.location(), + "use of unknown metadata keyword '" + StrVal + "'"); return C; } @@ -181,44 +485,119 @@ static MIToken::TokenKind 
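[Note: maybeLexNumericalLiteral() above commits to a floating-point literal only when a '.' follows the integer digits, and lexFloatingPointLiteral() then accepts an optional [eE][-+]?[0-9]+ exponent; hex float literals with an H/K/L/M size prefix are handled by a separate rule. A standalone sketch of that classification:

#include <cctype>
#include <string>

enum class NumKind { None, Integer, Float };

NumKind scanNumber(const std::string &S, size_t &Len) {
  size_t I = 0;
  if (I < S.size() && S[I] == '-')
    ++I;
  size_t Digits = I;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    ++I;
  if (I == Digits)
    return NumKind::None; // '-' alone is not a literal
  if (I >= S.size() || S[I] != '.') {
    Len = I;
    return NumKind::Integer;
  }
  ++I; // consume '.', committing to a float
  while (I < S.size() && isdigit((unsigned char)S[I]))
    ++I;
  // Optional exponent: [eE][-+]?[0-9]+ (sign requires a following digit).
  if (I < S.size() && (S[I] == 'e' || S[I] == 'E')) {
    size_t J = I + 1;
    if (J < S.size() && (S[J] == '+' || S[J] == '-'))
      ++J;
    if (J < S.size() && isdigit((unsigned char)S[J])) {
      I = J;
      while (I < S.size() && isdigit((unsigned char)S[I]))
        ++I;
    }
  }
  Len = I;
  return NumKind::Float;
}
]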
symbolToken(char C) { return MIToken::equal; case ':': return MIToken::colon; + case '(': + return MIToken::lparen; + case ')': + return MIToken::rparen; + case '{': + return MIToken::lbrace; + case '}': + return MIToken::rbrace; + case '+': + return MIToken::plus; + case '-': + return MIToken::minus; default: return MIToken::Error; } } static Cursor maybeLexSymbol(Cursor C, MIToken &Token) { - auto Kind = symbolToken(C.peek()); + MIToken::TokenKind Kind; + unsigned Length = 1; + if (C.peek() == ':' && C.peek(1) == ':') { + Kind = MIToken::coloncolon; + Length = 2; + } else + Kind = symbolToken(C.peek()); if (Kind == MIToken::Error) return None; auto Range = C; + C.advance(Length); + Token.reset(Kind, Range.upto(C)); + return C; +} + +static Cursor maybeLexNewline(Cursor C, MIToken &Token) { + if (!isNewlineChar(C.peek())) + return None; + auto Range = C; + C.advance(); + Token.reset(MIToken::Newline, Range.upto(C)); + return C; +} + +static Cursor maybeLexEscapedIRValue( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '`') + return None; + auto Range = C; + C.advance(); + auto StrRange = C; + while (C.peek() != '`') { + if (C.isEOF() || isNewlineChar(C.peek())) { + ErrorCallback( + C.location(), + "end of machine instruction reached before the closing '`'"); + Token.reset(MIToken::Error, Range.remaining()); + return C; + } + C.advance(); + } + StringRef Value = StrRange.upto(C); C.advance(); - Token = MIToken(Kind, Range.upto(C)); + Token.reset(MIToken::QuotedIRValue, Range.upto(C)).setStringValue(Value); return C; } StringRef llvm::lexMIToken( StringRef Source, MIToken &Token, function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { - auto C = skipWhitespace(Cursor(Source)); + auto C = skipComment(skipWhitespace(Cursor(Source))); if (C.isEOF()) { - Token = MIToken(MIToken::Eof, C.remaining()); + Token.reset(MIToken::Eof, C.remaining()); return C.remaining(); } - if (Cursor R = maybeLexIdentifier(C, Token)) + if (Cursor R = maybeLexIntegerType(C, Token)) return R.remaining(); if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback)) return R.remaining(); + if (Cursor R = maybeLexIdentifier(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexJumpTableIndex(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexStackObject(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexFixedStackObject(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexConstantPoolItem(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexIRBlock(C, Token, ErrorCallback)) + return R.remaining(); + if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback)) + return R.remaining(); if (Cursor R = maybeLexRegister(C, Token)) return R.remaining(); - if (Cursor R = maybeLexGlobalValue(C, Token)) + if (Cursor R = maybeLexGlobalValue(C, Token, ErrorCallback)) + return R.remaining(); + if (Cursor R = maybeLexExternalSymbol(C, Token, ErrorCallback)) return R.remaining(); - if (Cursor R = maybeLexIntegerLiteral(C, Token)) + if (Cursor R = maybeLexHexFloatingPointLiteral(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexNumericalLiteral(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexExlaim(C, Token, ErrorCallback)) return R.remaining(); if (Cursor R = maybeLexSymbol(C, Token)) return R.remaining(); + if (Cursor R = maybeLexNewline(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback)) + return R.remaining(); - Token = 
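[Note: lexMIToken() above is a fixed-order chain of maybe-lexers in which the first rule that matches wins, so the ordering is load-bearing: integer types are tried before identifiers, and '::' before ':'. A compressed sketch of that dispatch shape with hypothetical rule functions, not the real driver:

#include <string>
#include <vector>

struct Tok {
  int Kind = 0;
  std::string Text;
};

// A rule consumes characters and fills the token; 0 means "no match".
using Rule = size_t (*)(const std::string &, Tok &);

size_t lexOne(const std::string &Src, const std::vector<Rule> &Rules,
              Tok &T) {
  for (Rule R : Rules)
    if (size_t N = R(Src, T)) // first matching rule wins
      return N;               // caller resumes at Src.substr(N)
  T = Tok{-1, Src};           // error token covering the rest
  return Src.size();
}
]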
MIToken(MIToken::Error, C.remaining()); + Token.reset(MIToken::Error, C.remaining()); ErrorCallback(C.location(), Twine("unexpected character '") + Twine(C.peek()) + "'"); return C.remaining(); diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h index 55460b5..ff54aa3 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -30,50 +30,119 @@ struct MIToken { // Markers Eof, Error, + Newline, // Tokens with no info. comma, equal, underscore, colon, + coloncolon, + exclaim, + lparen, + rparen, + lbrace, + rbrace, + plus, + minus, // Keywords kw_implicit, kw_implicit_define, + kw_def, kw_dead, kw_killed, kw_undef, + kw_internal, + kw_early_clobber, + kw_debug_use, + kw_tied_def, + kw_frame_setup, + kw_debug_location, + kw_cfi_same_value, + kw_cfi_offset, + kw_cfi_def_cfa_register, + kw_cfi_def_cfa_offset, + kw_cfi_def_cfa, + kw_blockaddress, + kw_target_index, + kw_half, + kw_float, + kw_double, + kw_x86_fp80, + kw_fp128, + kw_ppc_fp128, + kw_target_flags, + kw_volatile, + kw_non_temporal, + kw_invariant, + kw_align, + kw_stack, + kw_got, + kw_jump_table, + kw_constant_pool, + kw_call_entry, + kw_liveout, + kw_address_taken, + kw_landing_pad, + kw_liveins, + kw_successors, + + // Named metadata keywords + md_tbaa, + md_alias_scope, + md_noalias, + md_range, // Identifier tokens Identifier, + IntegerType, NamedRegister, + MachineBasicBlockLabel, MachineBasicBlock, + StackObject, + FixedStackObject, NamedGlobalValue, GlobalValue, + ExternalSymbol, // Other tokens IntegerLiteral, - VirtualRegister + FloatingPointLiteral, + VirtualRegister, + ConstantPoolItem, + JumpTableIndex, + NamedIRBlock, + IRBlock, + NamedIRValue, + IRValue, + QuotedIRValue // `<constant value>` }; private: TokenKind Kind; - unsigned StringOffset; StringRef Range; + StringRef StringValue; + std::string StringValueStorage; APSInt IntVal; public: - MIToken(TokenKind Kind, StringRef Range, unsigned StringOffset = 0) - : Kind(Kind), StringOffset(StringOffset), Range(Range) {} + MIToken() : Kind(Error) {} - MIToken(TokenKind Kind, StringRef Range, const APSInt &IntVal, - unsigned StringOffset = 0) - : Kind(Kind), StringOffset(StringOffset), Range(Range), IntVal(IntVal) {} + MIToken &reset(TokenKind Kind, StringRef Range); + + MIToken &setStringValue(StringRef StrVal); + MIToken &setOwnedStringValue(std::string StrVal); + MIToken &setIntegerValue(APSInt IntVal); TokenKind kind() const { return Kind; } bool isError() const { return Kind == Error; } + bool isNewlineOrEOF() const { return Kind == Newline || Kind == Eof; } + + bool isErrorOrEOF() const { return Kind == Error || Kind == Eof; } + bool isRegister() const { return Kind == NamedRegister || Kind == underscore || Kind == VirtualRegister; @@ -81,7 +150,14 @@ public: bool isRegisterFlag() const { return Kind == kw_implicit || Kind == kw_implicit_define || - Kind == kw_dead || Kind == kw_killed || Kind == kw_undef; + Kind == kw_def || Kind == kw_dead || Kind == kw_killed || + Kind == kw_undef || Kind == kw_internal || + Kind == kw_early_clobber || Kind == kw_debug_use; + } + + bool isMemoryOperandFlag() const { + return Kind == kw_volatile || Kind == kw_non_temporal || + Kind == kw_invariant; } bool is(TokenKind K) const { return Kind == K; } @@ -90,13 +166,19 @@ public: StringRef::iterator location() const { return Range.begin(); } - StringRef stringValue() const { return Range.drop_front(StringOffset); } + StringRef range() const { return Range; } + + /// Return the 
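[Note: the reworked MIToken above stores its string value as a reference into the source buffer in the common case and switches to owned storage only for values produced by unescaping (setOwnedStringValue). A sketch of that two-mode storage; a production version must also re-point the view after the object is moved or copied, which is omitted here:

#include <string>
#include <string_view>

class Token {
  int Kind = 0;
  std::string_view Range;   // always points into the original source
  std::string_view Value;   // string value; may alias Range or Storage
  std::string Storage;      // backing store for unescaped (owned) values

public:
  Token &reset(int K, std::string_view R) {
    Kind = K; Range = R; Value = {}; Storage.clear();
    return *this;
  }
  Token &setStringValue(std::string_view V) { Value = V; return *this; }
  Token &setOwnedStringValue(std::string V) {
    Storage = std::move(V);
    Value = Storage; // view into our own storage
    return *this;
  }
  std::string_view stringValue() const { return Value; }
};
]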
token's string value. + StringRef stringValue() const { return StringValue; } const APSInt &integerValue() const { return IntVal; } bool hasIntegerValue() const { return Kind == IntegerLiteral || Kind == MachineBasicBlock || - Kind == GlobalValue || Kind == VirtualRegister; + Kind == MachineBasicBlockLabel || Kind == StackObject || + Kind == FixedStackObject || Kind == GlobalValue || + Kind == VirtualRegister || Kind == ConstantPoolItem || + Kind == JumpTableIndex || Kind == IRBlock || Kind == IRValue; } }; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp index c000112..f2f6584 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -14,12 +14,20 @@ #include "MIParser.h" #include "MILexer.h" #include "llvm/ADT/StringMap.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -30,15 +38,20 @@ using namespace llvm; namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source -/// range. -struct MachineOperandWithLocation { +/// range and other attributes. +struct ParsedMachineOperand { MachineOperand Operand; StringRef::iterator Begin; StringRef::iterator End; - - MachineOperandWithLocation(const MachineOperand &Operand, - StringRef::iterator Begin, StringRef::iterator End) - : Operand(Operand), Begin(Begin), End(End) {} + Optional<unsigned> TiedDefIdx; + + ParsedMachineOperand(const MachineOperand &Operand, StringRef::iterator Begin, + StringRef::iterator End, Optional<unsigned> &TiedDefIdx) + : Operand(Operand), Begin(Begin), End(End), TiedDefIdx(TiedDefIdx) { + if (TiedDefIdx) + assert(Operand.isReg() && Operand.isUse() && + "Only used register operands can be tied"); + } }; class MIParser { @@ -58,6 +71,16 @@ class MIParser { StringMap<const uint32_t *> Names2RegMasks; /// Maps from subregister names to subregister indices. StringMap<unsigned> Names2SubRegIndices; + /// Maps from slot numbers to function's unnamed basic blocks. + DenseMap<unsigned, const BasicBlock *> Slots2BasicBlocks; + /// Maps from slot numbers to function's unnamed values. + DenseMap<unsigned, const Value *> Slots2Values; + /// Maps from target index names to target indices. + StringMap<int> Names2TargetIndices; + /// Maps from direct target flag names to the direct target flag values. + StringMap<unsigned> Names2DirectTargetFlags; + /// Maps from direct target flag names to the bitmask target flag values. + StringMap<unsigned> Names2BitmaskTargetFlags; public: MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, @@ -76,19 +99,66 @@ public: /// This function always return true. 
bool error(StringRef::iterator Loc, const Twine &Msg); + bool + parseBasicBlockDefinitions(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots); + bool parseBasicBlocks(); bool parse(MachineInstr *&MI); - bool parseMBB(MachineBasicBlock *&MBB); - bool parseNamedRegister(unsigned &Reg); + bool parseStandaloneMBB(MachineBasicBlock *&MBB); + bool parseStandaloneNamedRegister(unsigned &Reg); + bool parseStandaloneVirtualRegister(unsigned &Reg); + bool parseStandaloneStackObject(int &FI); + bool parseStandaloneMDNode(MDNode *&Node); + + bool + parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots); + bool parseBasicBlock(MachineBasicBlock &MBB); + bool parseBasicBlockLiveins(MachineBasicBlock &MBB); + bool parseBasicBlockSuccessors(MachineBasicBlock &MBB); bool parseRegister(unsigned &Reg); bool parseRegisterFlag(unsigned &Flags); bool parseSubRegisterIndex(unsigned &SubReg); - bool parseRegisterOperand(MachineOperand &Dest, bool IsDef = false); + bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx); + bool parseRegisterOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx, bool IsDef = false); bool parseImmediateOperand(MachineOperand &Dest); + bool parseIRConstant(StringRef::iterator Loc, StringRef Source, + const Constant *&C); + bool parseIRConstant(StringRef::iterator Loc, const Constant *&C); + bool parseTypedImmediateOperand(MachineOperand &Dest); + bool parseFPImmediateOperand(MachineOperand &Dest); bool parseMBBReference(MachineBasicBlock *&MBB); bool parseMBBOperand(MachineOperand &Dest); + bool parseStackFrameIndex(int &FI); + bool parseStackObjectOperand(MachineOperand &Dest); + bool parseFixedStackFrameIndex(int &FI); + bool parseFixedStackObjectOperand(MachineOperand &Dest); + bool parseGlobalValue(GlobalValue *&GV); bool parseGlobalAddressOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest); + bool parseConstantPoolIndexOperand(MachineOperand &Dest); + bool parseJumpTableIndexOperand(MachineOperand &Dest); + bool parseExternalSymbolOperand(MachineOperand &Dest); + bool parseMDNode(MDNode *&Node); + bool parseMetadataOperand(MachineOperand &Dest); + bool parseCFIOffset(int &Offset); + bool parseCFIRegister(unsigned &Reg); + bool parseCFIOperand(MachineOperand &Dest); + bool parseIRBlock(BasicBlock *&BB, const Function &F); + bool parseBlockAddressOperand(MachineOperand &Dest); + bool parseTargetIndexOperand(MachineOperand &Dest); + bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); + bool parseMachineOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx); + bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx); + bool parseOffset(int64_t &Offset); + bool parseAlignment(unsigned &Alignment); + bool parseOperandsOffset(MachineOperand &Op); + bool parseIRValue(const Value *&V); + bool parseMemoryOperandFlag(unsigned &Flags); + bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV); + bool parseMachinePointerInfo(MachinePointerInfo &Dest); + bool parseMachineMemoryOperand(MachineMemOperand *&Dest); private: /// Convert the integer literal in the current token into an unsigned integer. @@ -96,15 +166,31 @@ private: /// Return true if an error occurred. bool getUnsigned(unsigned &Result); + /// Convert the integer literal in the current token into an uint64. + /// + /// Return true if an error occurred. + bool getUint64(uint64_t &Result); + + /// If the current token is of the given kind, consume it and return false. 
+ /// Otherwise report an error and return true. + bool expectAndConsume(MIToken::TokenKind TokenKind); + + /// If the current token is of the given kind, consume it and return true. + /// Otherwise return false. + bool consumeIfPresent(MIToken::TokenKind TokenKind); + void initNames2InstrOpCodes(); /// Try to convert an instruction name to an opcode. Return true if the /// instruction name is invalid. bool parseInstrName(StringRef InstrName, unsigned &OpCode); - bool parseInstruction(unsigned &OpCode); + bool parseInstruction(unsigned &OpCode, unsigned &Flags); + + bool assignRegisterTies(MachineInstr &MI, + ArrayRef<ParsedMachineOperand> Operands); - bool verifyImplicitOperands(ArrayRef<MachineOperandWithLocation> Operands, + bool verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands, const MCInstrDesc &MCID); void initNames2Regs(); @@ -126,6 +212,34 @@ private: /// /// Return 0 if the name isn't a subregister index class. unsigned getSubRegIndex(StringRef Name); + + const BasicBlock *getIRBlock(unsigned Slot); + const BasicBlock *getIRBlock(unsigned Slot, const Function &F); + + const Value *getIRValue(unsigned Slot); + + void initNames2TargetIndices(); + + /// Try to convert a name of target index to the corresponding target index. + /// + /// Return true if the name isn't a name of a target index. + bool getTargetIndex(StringRef Name, int &Index); + + void initNames2DirectTargetFlags(); + + /// Try to convert a name of a direct target flag to the corresponding + /// target flag. + /// + /// Return true if the name isn't a name of a direct flag. + bool getDirectTargetFlag(StringRef Name, unsigned &Flag); + + void initNames2BitmaskTargetFlags(); + + /// Try to convert a name of a bitmask target flag to the corresponding + /// target flag. + /// + /// Return true if the name isn't a name of a bitmask target flag. + bool getBitmaskTargetFlag(StringRef Name, unsigned &Flag); }; } // end anonymous namespace @@ -134,7 +248,7 @@ MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, StringRef Source, const PerFunctionMIParsingState &PFS, const SlotMapping &IRSlots) : SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source), - Token(MIToken::Error, StringRef()), PFS(PFS), IRSlots(IRSlots) {} + PFS(PFS), IRSlots(IRSlots) {} void MIParser::lex() { CurrentSource = lexMIToken( @@ -146,49 +260,378 @@ bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); } bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size())); - Error = SMDiagnostic( - SM, SMLoc(), - SM.getMemoryBuffer(SM.getMainFileID())->getBufferIdentifier(), 1, - Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), Source, None, None); + const MemoryBuffer &Buffer = *SM.getMemoryBuffer(SM.getMainFileID()); + if (Loc >= Buffer.getBufferStart() && Loc <= Buffer.getBufferEnd()) { + // Create an ordinary diagnostic when the source manager's buffer is the + // source string. + Error = SM.GetMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg); + return true; + } + // Create a diagnostic for a YAML string literal. 
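[Note: expectAndConsume() and consumeIfPresent(), declared above, deliberately use opposite return conventions: the former returns true on *error* (matching the parser's error-returning style), the latter returns true on *success*. A tiny runnable sketch of the pair:

#include <string>
#include <vector>

struct MiniParser {
  std::vector<int> Toks; // token kinds; 0 reserved for EOF
  size_t Pos = 0;
  std::string Err;

  int cur() const { return Pos < Toks.size() ? Toks[Pos] : 0; }
  bool error(std::string Msg) { Err = std::move(Msg); return true; }

  // Returns true on *error*, consuming the token on success.
  bool expectAndConsume(int Kind) {
    if (cur() != Kind)
      return error("expected token kind " + std::to_string(Kind));
    ++Pos;
    return false;
  }
  // Returns true on *success*; never reports an error.
  bool consumeIfPresent(int Kind) {
    if (cur() != Kind)
      return false;
    ++Pos;
    return true;
  }
};
]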
+ Error = SMDiagnostic(SM, SMLoc(), Buffer.getBufferIdentifier(), 1, + Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), + Source, None, None); return true; } -bool MIParser::parse(MachineInstr *&MI) { +static const char *toString(MIToken::TokenKind TokenKind) { + switch (TokenKind) { + case MIToken::comma: + return "','"; + case MIToken::equal: + return "'='"; + case MIToken::colon: + return "':'"; + case MIToken::lparen: + return "'('"; + case MIToken::rparen: + return "')'"; + default: + return "<unknown token>"; + } +} + +bool MIParser::expectAndConsume(MIToken::TokenKind TokenKind) { + if (Token.isNot(TokenKind)) + return error(Twine("expected ") + toString(TokenKind)); + lex(); + return false; +} + +bool MIParser::consumeIfPresent(MIToken::TokenKind TokenKind) { + if (Token.isNot(TokenKind)) + return false; + lex(); + return true; +} + +bool MIParser::parseBasicBlockDefinition( + DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) { + assert(Token.is(MIToken::MachineBasicBlockLabel)); + unsigned ID = 0; + if (getUnsigned(ID)) + return true; + auto Loc = Token.location(); + auto Name = Token.stringValue(); + lex(); + bool HasAddressTaken = false; + bool IsLandingPad = false; + unsigned Alignment = 0; + BasicBlock *BB = nullptr; + if (consumeIfPresent(MIToken::lparen)) { + do { + // TODO: Report an error when multiple same attributes are specified. + switch (Token.kind()) { + case MIToken::kw_address_taken: + HasAddressTaken = true; + lex(); + break; + case MIToken::kw_landing_pad: + IsLandingPad = true; + lex(); + break; + case MIToken::kw_align: + if (parseAlignment(Alignment)) + return true; + break; + case MIToken::IRBlock: + // TODO: Report an error when both name and ir block are specified. + if (parseIRBlock(BB, *MF.getFunction())) + return true; + lex(); + break; + default: + break; + } + } while (consumeIfPresent(MIToken::comma)); + if (expectAndConsume(MIToken::rparen)) + return true; + } + if (expectAndConsume(MIToken::colon)) + return true; + + if (!Name.empty()) { + BB = dyn_cast_or_null<BasicBlock>( + MF.getFunction()->getValueSymbolTable().lookup(Name)); + if (!BB) + return error(Loc, Twine("basic block '") + Name + + "' is not defined in the function '" + + MF.getName() + "'"); + } + auto *MBB = MF.CreateMachineBasicBlock(BB); + MF.insert(MF.end(), MBB); + bool WasInserted = MBBSlots.insert(std::make_pair(ID, MBB)).second; + if (!WasInserted) + return error(Loc, Twine("redefinition of machine basic block with id #") + + Twine(ID)); + if (Alignment) + MBB->setAlignment(Alignment); + if (HasAddressTaken) + MBB->setHasAddressTaken(); + MBB->setIsEHPad(IsLandingPad); + return false; +} + +bool MIParser::parseBasicBlockDefinitions( + DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) { + lex(); + // Skip until the first machine basic block. + while (Token.is(MIToken::Newline)) + lex(); + if (Token.isErrorOrEOF()) + return Token.isError(); + if (Token.isNot(MIToken::MachineBasicBlockLabel)) + return error("expected a basic block definition before instructions"); + unsigned BraceDepth = 0; + do { + if (parseBasicBlockDefinition(MBBSlots)) + return true; + bool IsAfterNewline = false; + // Skip until the next machine basic block. 
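[Note: the first parsing pass above checks only structure: bundle braces must balance within the body, and a basic-block label is legal only at the start of a line. A compressed sketch of those two checks, with single-character stand-ins for tokens ('L' = MBB label, 'n' = newline):

#include <string>

bool checkStructure(const std::string &Toks, std::string &Err) {
  unsigned BraceDepth = 0;
  bool AfterNewline = true; // start of input counts as a line start
  for (char T : Toks) {
    if (T == 'L' && !AfterNewline) {
      Err = "basic block definition should be located at the start of "
            "the line";
      return false;
    }
    AfterNewline = (T == 'n');
    if (T == '{')
      ++BraceDepth;
    else if (T == '}') {
      if (!BraceDepth) { Err = "extraneous closing brace ('}')"; return false; }
      --BraceDepth;
    }
  }
  if (BraceDepth) { Err = "expected '}'"; return false; }
  return true;
}
]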
+ while (true) { + if ((Token.is(MIToken::MachineBasicBlockLabel) && IsAfterNewline) || + Token.isErrorOrEOF()) + break; + else if (Token.is(MIToken::MachineBasicBlockLabel)) + return error("basic block definition should be located at the start of " + "the line"); + else if (consumeIfPresent(MIToken::Newline)) { + IsAfterNewline = true; + continue; + } + IsAfterNewline = false; + if (Token.is(MIToken::lbrace)) + ++BraceDepth; + if (Token.is(MIToken::rbrace)) { + if (!BraceDepth) + return error("extraneous closing brace ('}')"); + --BraceDepth; + } + lex(); + } + // Verify that we closed all of the '{' at the end of a file or a block. + if (!Token.isError() && BraceDepth) + return error("expected '}'"); // FIXME: Report a note that shows '{'. + } while (!Token.isErrorOrEOF()); + return Token.isError(); +} + +bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) { + assert(Token.is(MIToken::kw_liveins)); + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNewlineOrEOF()) // Allow an empty list of liveins. + return false; + do { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg = 0; + if (parseRegister(Reg)) + return true; + MBB.addLiveIn(Reg); + lex(); + } while (consumeIfPresent(MIToken::comma)); + return false; +} + +bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { + assert(Token.is(MIToken::kw_successors)); lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNewlineOrEOF()) // Allow an empty list of successors. + return false; + do { + if (Token.isNot(MIToken::MachineBasicBlock)) + return error("expected a machine basic block reference"); + MachineBasicBlock *SuccMBB = nullptr; + if (parseMBBReference(SuccMBB)) + return true; + lex(); + unsigned Weight = 0; + if (consumeIfPresent(MIToken::lparen)) { + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal after '('"); + if (getUnsigned(Weight)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + } + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); + } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); + return false; +} +bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { + // Skip the definition. + assert(Token.is(MIToken::MachineBasicBlockLabel)); + lex(); + if (consumeIfPresent(MIToken::lparen)) { + while (Token.isNot(MIToken::rparen) && !Token.isErrorOrEOF()) + lex(); + consumeIfPresent(MIToken::rparen); + } + consumeIfPresent(MIToken::colon); + + // Parse the liveins and successors. + // N.B: Multiple lists of successors and liveins are allowed and they're + // merged into one. + // Example: + // liveins: %edi + // liveins: %esi + // + // is equivalent to + // liveins: %edi, %esi + while (true) { + if (Token.is(MIToken::kw_successors)) { + if (parseBasicBlockSuccessors(MBB)) + return true; + } else if (Token.is(MIToken::kw_liveins)) { + if (parseBasicBlockLiveins(MBB)) + return true; + } else if (consumeIfPresent(MIToken::Newline)) { + continue; + } else + break; + if (!Token.isNewlineOrEOF()) + return error("expected line break at the end of a list"); + lex(); + } + + // Parse the instructions. 
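[Note: successor weights above are read as raw integers into BranchProbability::getRaw() and then normalized via normalizeSuccProbs(). LLVM's BranchProbability is fixed-point rather than floating-point; this double-based sketch only illustrates the normalization idea, with a uniform fallback when no weights were given:

#include <cstdint>
#include <vector>

std::vector<double> normalizeWeights(const std::vector<uint64_t> &Raw) {
  uint64_t Sum = 0;
  for (uint64_t W : Raw)
    Sum += W;
  std::vector<double> Probs;
  Probs.reserve(Raw.size());
  for (uint64_t W : Raw)
    Probs.push_back(Sum ? double(W) / double(Sum)
                        : 1.0 / double(Raw.empty() ? 1 : Raw.size()));
  return Probs;
}
]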
+ bool IsInBundle = false; + MachineInstr *PrevMI = nullptr; + while (true) { + if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)) + return false; + else if (consumeIfPresent(MIToken::Newline)) + continue; + if (consumeIfPresent(MIToken::rbrace)) { + // The first parsing pass should verify that all closing '}' have an + // opening '{'. + assert(IsInBundle); + IsInBundle = false; + continue; + } + MachineInstr *MI = nullptr; + if (parse(MI)) + return true; + MBB.insert(MBB.end(), MI); + if (IsInBundle) { + PrevMI->setFlag(MachineInstr::BundledSucc); + MI->setFlag(MachineInstr::BundledPred); + } + PrevMI = MI; + if (Token.is(MIToken::lbrace)) { + if (IsInBundle) + return error("nested instruction bundles are not allowed"); + lex(); + // This instruction is the start of the bundle. + MI->setFlag(MachineInstr::BundledSucc); + IsInBundle = true; + if (!Token.is(MIToken::Newline)) + // The next instruction can be on the same line. + continue; + } + assert(Token.isNewlineOrEOF() && "MI is not fully parsed"); + lex(); + } + return false; +} + +bool MIParser::parseBasicBlocks() { + lex(); + // Skip until the first machine basic block. + while (Token.is(MIToken::Newline)) + lex(); + if (Token.isErrorOrEOF()) + return Token.isError(); + // The first parsing pass should have verified that this token is a MBB label + // in the 'parseBasicBlockDefinitions' method. + assert(Token.is(MIToken::MachineBasicBlockLabel)); + do { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB)) + return true; + if (parseBasicBlock(*MBB)) + return true; + // The method 'parseBasicBlock' should parse the whole block until the next + // block or the end of file. + assert(Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)); + } while (Token.isNot(MIToken::Eof)); + return false; +} + +bool MIParser::parse(MachineInstr *&MI) { // Parse any register operands before '=' - // TODO: Allow parsing of multiple operands before '=' MachineOperand MO = MachineOperand::CreateImm(0); - SmallVector<MachineOperandWithLocation, 8> Operands; - if (Token.isRegister() || Token.isRegisterFlag()) { + SmallVector<ParsedMachineOperand, 8> Operands; + while (Token.isRegister() || Token.isRegisterFlag()) { auto Loc = Token.location(); - if (parseRegisterOperand(MO, /*IsDef=*/true)) + Optional<unsigned> TiedDefIdx; + if (parseRegisterOperand(MO, TiedDefIdx, /*IsDef=*/true)) return true; - Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location())); - if (Token.isNot(MIToken::equal)) - return error("expected '='"); + Operands.push_back( + ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx)); + if (Token.isNot(MIToken::comma)) + break; lex(); } - - unsigned OpCode; - if (Token.isError() || parseInstruction(OpCode)) + if (!Operands.empty() && expectAndConsume(MIToken::equal)) return true; - // TODO: Parse the instruction flags and memory operands. + unsigned OpCode, Flags = 0; + if (Token.isError() || parseInstruction(OpCode, Flags)) + return true; // Parse the remaining machine operands. 
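[Note: inside a '{ ... }' bundle above, each newly parsed instruction gets the BundledPred flag and its predecessor the BundledSucc flag, chaining the bundle together. The same linking over a toy instruction type:

#include <vector>

struct Instr {
  bool BundledPred = false;
  bool BundledSucc = false;
};

void linkBundle(std::vector<Instr> &Bundle) {
  for (size_t I = 1; I < Bundle.size(); ++I) {
    Bundle[I - 1].BundledSucc = true; // previous instruction continues...
    Bundle[I].BundledPred = true;     // ...into this one
  }
}
]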
- while (Token.isNot(MIToken::Eof)) { + while (!Token.isNewlineOrEOF() && Token.isNot(MIToken::kw_debug_location) && + Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); - if (parseMachineOperand(MO)) + Optional<unsigned> TiedDefIdx; + if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) return true; - Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location())); - if (Token.is(MIToken::Eof)) + Operands.push_back( + ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx)); + if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) || + Token.is(MIToken::lbrace)) break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine operand"); lex(); } + DebugLoc DebugLocation; + if (Token.is(MIToken::kw_debug_location)) { + lex(); + if (Token.isNot(MIToken::exclaim)) + return error("expected a metadata node after 'debug-location'"); + MDNode *Node = nullptr; + if (parseMDNode(Node)) + return true; + DebugLocation = DebugLoc(Node); + } + + // Parse the machine memory operands. + SmallVector<MachineMemOperand *, 2> MemOperands; + if (Token.is(MIToken::coloncolon)) { + lex(); + while (!Token.isNewlineOrEOF()) { + MachineMemOperand *MemOp = nullptr; + if (parseMachineMemoryOperand(MemOp)) + return true; + MemOperands.push_back(MemOp); + if (Token.isNewlineOrEOF()) + break; + if (Token.isNot(MIToken::comma)) + return error("expected ',' before the next machine memory operand"); + lex(); + } + } + const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode); if (!MCID.isVariadic()) { // FIXME: Move the implicit operand verification to the machine verifier. @@ -197,13 +640,22 @@ bool MIParser::parse(MachineInstr *&MI) { } // TODO: Check for extraneous machine operands. 
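[Note: pieced together from the parsing order above, the shape of one machine instruction line is roughly the following. This is an informal reconstruction from the code, not an official MIR grammar:

MILine ::= [ RegOperand ( "," RegOperand )* "=" ]
           [ "frame-setup" ] Opcode
           [ Operand ( "," Operand )* ]
           [ "debug-location" "!" MetadataId ]
           [ "::" MemOperand ( "," MemOperand )* ]
]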
- MI = MF.CreateMachineInstr(MCID, DebugLoc(), /*NoImplicit=*/true); + MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true); + MI->setFlags(Flags); for (const auto &Operand : Operands) MI->addOperand(MF, Operand.Operand); + if (assignRegisterTies(*MI, Operands)) + return true; + if (MemOperands.empty()) + return false; + MachineInstr::mmo_iterator MemRefs = + MF.allocateMemRefsArray(MemOperands.size()); + std::copy(MemOperands.begin(), MemOperands.end(), MemRefs); + MI->setMemRefs(MemRefs, MemRefs + MemOperands.size()); return false; } -bool MIParser::parseMBB(MachineBasicBlock *&MBB) { +bool MIParser::parseStandaloneMBB(MachineBasicBlock *&MBB) { lex(); if (Token.isNot(MIToken::MachineBasicBlock)) return error("expected a machine basic block reference"); @@ -216,18 +668,52 @@ bool MIParser::parseMBB(MachineBasicBlock *&MBB) { return false; } -bool MIParser::parseNamedRegister(unsigned &Reg) { +bool MIParser::parseStandaloneNamedRegister(unsigned &Reg) { lex(); if (Token.isNot(MIToken::NamedRegister)) return error("expected a named register"); if (parseRegister(Reg)) - return 0; + return true; + lex(); + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the register reference"); + return false; +} + +bool MIParser::parseStandaloneVirtualRegister(unsigned &Reg) { + lex(); + if (Token.isNot(MIToken::VirtualRegister)) + return error("expected a virtual register"); + if (parseRegister(Reg)) + return true; lex(); if (Token.isNot(MIToken::Eof)) return error("expected end of string after the register reference"); return false; } +bool MIParser::parseStandaloneStackObject(int &FI) { + lex(); + if (Token.isNot(MIToken::StackObject)) + return error("expected a stack object"); + if (parseStackFrameIndex(FI)) + return true; + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the stack object reference"); + return false; +} + +bool MIParser::parseStandaloneMDNode(MDNode *&Node) { + lex(); + if (Token.isNot(MIToken::exclaim)) + return error("expected a metadata node"); + if (parseMDNode(Node)) + return true; + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the metadata node"); + return false; +} + static const char *printImplicitRegisterFlag(const MachineOperand &MO) { assert(MO.isImplicit()); return MO.isDef() ? "implicit-def" : "implicit"; @@ -239,8 +725,18 @@ static std::string getRegisterName(const TargetRegisterInfo *TRI, return StringRef(TRI->getName(Reg)).lower(); } -bool MIParser::verifyImplicitOperands( - ArrayRef<MachineOperandWithLocation> Operands, const MCInstrDesc &MCID) { +/// Return true if the parsed machine operands contain a given machine operand. +static bool isImplicitOperandIn(const MachineOperand &ImplicitOperand, + ArrayRef<ParsedMachineOperand> Operands) { + for (const auto &I : Operands) { + if (ImplicitOperand.isIdenticalTo(I.Operand)) + return true; + } + return false; +} + +bool MIParser::verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands, + const MCInstrDesc &MCID) { if (MCID.isCall()) // We can't verify call instructions as they can contain arbitrary implicit // register and register mask operands. @@ -249,48 +745,32 @@ bool MIParser::verifyImplicitOperands( // Gather all the expected implicit operands. 
SmallVector<MachineOperand, 4> ImplicitOperands; if (MCID.ImplicitDefs) - for (const uint16_t *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID.ImplicitUses) - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpUses, false, true)); const auto *TRI = MF.getSubtarget().getRegisterInfo(); assert(TRI && "Expected target register info"); - size_t I = ImplicitOperands.size(), J = Operands.size(); - while (I) { - --I; - if (J) { - --J; - const auto &ImplicitOperand = ImplicitOperands[I]; - const auto &Operand = Operands[J].Operand; - if (ImplicitOperand.isIdenticalTo(Operand)) - continue; - if (Operand.isReg() && Operand.isImplicit()) { - return error(Operands[J].Begin, - Twine("expected an implicit register operand '") + - printImplicitRegisterFlag(ImplicitOperand) + " %" + - getRegisterName(TRI, ImplicitOperand.getReg()) + "'"); - } - } - // TODO: Fix source location when Operands[J].end is right before '=', i.e: - // insead of reporting an error at this location: - // %eax = MOV32r0 - // ^ - // report the error at the following location: - // %eax = MOV32r0 - // ^ - return error(J < Operands.size() ? Operands[J].End : Token.location(), + for (const auto &I : ImplicitOperands) { + if (isImplicitOperandIn(I, Operands)) + continue; + return error(Operands.empty() ? Token.location() : Operands.back().End, Twine("missing implicit register operand '") + - printImplicitRegisterFlag(ImplicitOperands[I]) + " %" + - getRegisterName(TRI, ImplicitOperands[I].getReg()) + "'"); + printImplicitRegisterFlag(I) + " %" + + getRegisterName(TRI, I.getReg()) + "'"); } return false; } -bool MIParser::parseInstruction(unsigned &OpCode) { +bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { + if (Token.is(MIToken::kw_frame_setup)) { + Flags |= MachineInstr::FrameSetup; + lex(); + } if (Token.isNot(MIToken::Identifier)) return error("expected a machine instruction"); StringRef InstrName = Token.stringValue(); @@ -330,6 +810,7 @@ bool MIParser::parseRegister(unsigned &Reg) { } bool MIParser::parseRegisterFlag(unsigned &Flags) { + const unsigned OldFlags = Flags; switch (Token.kind()) { case MIToken::kw_implicit: Flags |= RegState::Implicit; @@ -337,6 +818,9 @@ bool MIParser::parseRegisterFlag(unsigned &Flags) { case MIToken::kw_implicit_define: Flags |= RegState::ImplicitDefine; break; + case MIToken::kw_def: + Flags |= RegState::Define; + break; case MIToken::kw_dead: Flags |= RegState::Dead; break; @@ -346,11 +830,22 @@ bool MIParser::parseRegisterFlag(unsigned &Flags) { case MIToken::kw_undef: Flags |= RegState::Undef; break; - // TODO: report an error when we specify the same flag more than once. - // TODO: parse the other register flags. + case MIToken::kw_internal: + Flags |= RegState::InternalRead; + break; + case MIToken::kw_early_clobber: + Flags |= RegState::EarlyClobber; + break; + case MIToken::kw_debug_use: + Flags |= RegState::Debug; + break; default: llvm_unreachable("The current token should be a register flag"); } + if (OldFlags == Flags) + // We know that the same flag is specified more than once when the flags + // weren't modified. 
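[Note: the duplicate-flag check above relies on OR being idempotent: re-adding an already-set flag leaves the mask unchanged, so comparing against the previous value detects the repetition without a separate "seen" set. A runnable sketch; the flag values are invented for illustration, and the trick assumes each keyword contributes at least one fresh bit:

#include <cstdio>

enum RegFlag : unsigned { Implicit = 1, Dead = 2, Killed = 4, Undef = 8 };

bool addFlag(unsigned &Flags, unsigned F, const char *Name) {
  unsigned Old = Flags;
  Flags |= F;
  if (Flags == Old) { // nothing changed => flag was already present
    std::fprintf(stderr, "duplicate '%s' register flag\n", Name);
    return true; // error
  }
  return false;
}
]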
+ return error("duplicate '" + Token.stringValue() + "' register flag"); lex(); return false; } @@ -368,7 +863,59 @@ bool MIParser::parseSubRegisterIndex(unsigned &SubReg) { return false; } -bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) { +bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) { + if (!consumeIfPresent(MIToken::kw_tied_def)) + return error("expected 'tied-def' after '('"); + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal after 'tied-def'"); + if (getUnsigned(TiedDefIdx)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + return false; +} + +bool MIParser::assignRegisterTies(MachineInstr &MI, + ArrayRef<ParsedMachineOperand> Operands) { + SmallVector<std::pair<unsigned, unsigned>, 4> TiedRegisterPairs; + for (unsigned I = 0, E = Operands.size(); I != E; ++I) { + if (!Operands[I].TiedDefIdx) + continue; + // The parser ensures that this operand is a register use, so we just have + // to check the tied-def operand. + unsigned DefIdx = Operands[I].TiedDefIdx.getValue(); + if (DefIdx >= E) + return error(Operands[I].Begin, + Twine("use of invalid tied-def operand index '" + + Twine(DefIdx) + "'; instruction has only ") + + Twine(E) + " operands"); + const auto &DefOperand = Operands[DefIdx].Operand; + if (!DefOperand.isReg() || !DefOperand.isDef()) + // FIXME: add note with the def operand. + return error(Operands[I].Begin, + Twine("use of invalid tied-def operand index '") + + Twine(DefIdx) + "'; the operand #" + Twine(DefIdx) + + " isn't a defined register"); + // Check that the tied-def operand wasn't tied elsewhere. + for (const auto &TiedPair : TiedRegisterPairs) { + if (TiedPair.first == DefIdx) + return error(Operands[I].Begin, + Twine("the tied-def operand #") + Twine(DefIdx) + + " is already tied with another register operand"); + } + TiedRegisterPairs.push_back(std::make_pair(DefIdx, I)); + } + // FIXME: Verify that for non INLINEASM instructions, the def and use tied + // indices must be less than tied max. + for (const auto &TiedPair : TiedRegisterPairs) + MI.tieOperands(TiedPair.first, TiedPair.second); + return false; +} + +bool MIParser::parseRegisterOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx, + bool IsDef) { unsigned Reg; unsigned Flags = IsDef ? RegState::Define : 0; while (Token.isRegisterFlag()) { @@ -385,10 +932,17 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) { if (parseSubRegisterIndex(SubReg)) return true; } + if ((Flags & RegState::Define) == 0 && consumeIfPresent(MIToken::lparen)) { + unsigned Idx; + if (parseRegisterTiedDefIndex(Idx)) + return true; + TiedDefIdx = Idx; + } Dest = MachineOperand::CreateReg( Reg, Flags & RegState::Define, Flags & RegState::Implicit, Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef, - /*isEarlyClobber=*/false, SubReg); + Flags & RegState::EarlyClobber, SubReg, Flags & RegState::Debug, + Flags & RegState::InternalRead); return false; } @@ -396,13 +950,55 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { assert(Token.is(MIToken::IntegerLiteral)); const APSInt &Int = Token.integerValue(); if (Int.getMinSignedBits() > 64) - // TODO: Replace this with an error when we can parse CIMM Machine Operands. 
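[Note: assignRegisterTies() above validates each '(tied-def N)' annotation before applying it: the index must be in range, must name a register def, and no def may be tied twice. A standalone sketch of that validation over a toy operand type:

#include <optional>
#include <string>
#include <utility>
#include <vector>

struct Operand {
  bool IsDef = false;
  std::optional<unsigned> TiedDefIdx; // only meaningful on uses
};

bool assignTies(std::vector<Operand> &Ops, std::string &Err) {
  std::vector<std::pair<unsigned, unsigned>> Pairs; // (def, use)
  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
    if (!Ops[I].TiedDefIdx)
      continue;
    unsigned Def = *Ops[I].TiedDefIdx;
    if (Def >= E || !Ops[Def].IsDef) {
      Err = "use of invalid tied-def operand index";
      return true;
    }
    for (auto &P : Pairs)
      if (P.first == Def) {
        Err = "tied-def operand is already tied with another register "
              "operand";
        return true;
      }
    Pairs.emplace_back(Def, I);
  }
  // ...apply the pairs (MI.tieOperands in the real parser).
  return false;
}
]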
- llvm_unreachable("Can't parse large integer literals yet!"); + return error("integer literal is too large to be an immediate operand"); Dest = MachineOperand::CreateImm(Int.getExtValue()); lex(); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + auto Source = StringValue.str(); // The source has to be null terminated. + SMDiagnostic Err; + C = parseConstantValue(Source.c_str(), Err, *MF.getFunction()->getParent(), + &IRSlots); + if (!C) + return error(Loc + Err.getColumnNo(), Err.getMessage()); + return false; +} + +bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { + if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) + return true; + lex(); + return false; +} + +bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::IntegerType)); + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal"); + const Constant *C = nullptr; + if (parseIRConstant(Loc, C)) + return true; + Dest = MachineOperand::CreateCImm(cast<ConstantInt>(C)); + return false; +} + +bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::FloatingPointLiteral)) + return error("expected a floating point literal"); + const Constant *C = nullptr; + if (parseIRConstant(Loc, C)) + return true; + Dest = MachineOperand::CreateFPImm(cast<ConstantFP>(C)); + return false; +} + bool MIParser::getUnsigned(unsigned &Result) { assert(Token.hasIntegerValue() && "Expected a token with an integer value"); const uint64_t Limit = uint64_t(std::numeric_limits<unsigned>::max()) + 1; @@ -414,7 +1010,8 @@ bool MIParser::getUnsigned(unsigned &Result) { } bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { - assert(Token.is(MIToken::MachineBasicBlock)); + assert(Token.is(MIToken::MachineBasicBlock) || + Token.is(MIToken::MachineBasicBlockLabel)); unsigned Number; if (getUnsigned(Number)) return true; @@ -438,16 +1035,66 @@ bool MIParser::parseMBBOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { +bool MIParser::parseStackFrameIndex(int &FI) { + assert(Token.is(MIToken::StackObject)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ObjectInfo = PFS.StackObjectSlots.find(ID); + if (ObjectInfo == PFS.StackObjectSlots.end()) + return error(Twine("use of undefined stack object '%stack.") + Twine(ID) + + "'"); + StringRef Name; + if (const auto *Alloca = + MF.getFrameInfo()->getObjectAllocation(ObjectInfo->second)) + Name = Alloca->getName(); + if (!Token.stringValue().empty() && Token.stringValue() != Name) + return error(Twine("the name of the stack object '%stack.") + Twine(ID) + + "' isn't '" + Token.stringValue() + "'"); + lex(); + FI = ObjectInfo->second; + return false; +} + +bool MIParser::parseStackObjectOperand(MachineOperand &Dest) { + int FI; + if (parseStackFrameIndex(FI)) + return true; + Dest = MachineOperand::CreateFI(FI); + return false; +} + +bool MIParser::parseFixedStackFrameIndex(int &FI) { + assert(Token.is(MIToken::FixedStackObject)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ObjectInfo = PFS.FixedStackObjectSlots.find(ID); + if (ObjectInfo == PFS.FixedStackObjectSlots.end()) + return error(Twine("use of undefined fixed stack object '%fixed-stack.") + + Twine(ID) + "'"); + lex(); + FI = ObjectInfo->second; + return false; +} + +bool 
MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { + int FI; + if (parseFixedStackFrameIndex(FI)) + return true; + Dest = MachineOperand::CreateFI(FI); + return false; +} + +bool MIParser::parseGlobalValue(GlobalValue *&GV) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - auto Name = Token.stringValue(); const Module *M = MF.getFunction()->getParent(); - if (const auto *GV = M->getNamedValue(Name)) { - Dest = MachineOperand::CreateGA(GV, /*Offset=*/0); - break; - } - return error(Twine("use of undefined global value '@") + Name + "'"); + GV = M->getNamedValue(Token.stringValue()); + if (!GV) + return error(Twine("use of undefined global value '") + Token.range() + + "'"); + break; } case MIToken::GlobalValue: { unsigned GVIdx; @@ -456,36 +1103,323 @@ bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { if (GVIdx >= IRSlots.GlobalValues.size()) return error(Twine("use of undefined global value '@") + Twine(GVIdx) + "'"); - Dest = MachineOperand::CreateGA(IRSlots.GlobalValues[GVIdx], - /*Offset=*/0); + GV = IRSlots.GlobalValues[GVIdx]; break; } default: llvm_unreachable("The current token should be a global value"); } - // TODO: Parse offset and target flags. + return false; +} + +bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + lex(); + Dest = MachineOperand::CreateGA(GV, /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseConstantPoolIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::ConstantPoolItem)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ConstantInfo = PFS.ConstantPoolSlots.find(ID); + if (ConstantInfo == PFS.ConstantPoolSlots.end()) + return error("use of undefined constant '%const." + Twine(ID) + "'"); + lex(); + Dest = MachineOperand::CreateCPI(ID, /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseJumpTableIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::JumpTableIndex)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto JumpTableEntryInfo = PFS.JumpTableSlots.find(ID); + if (JumpTableEntryInfo == PFS.JumpTableSlots.end()) + return error("use of undefined jump table '%jump-table." + Twine(ID) + "'"); + lex(); + Dest = MachineOperand::CreateJTI(JumpTableEntryInfo->second); + return false; +} + +bool MIParser::parseExternalSymbolOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::ExternalSymbol)); + const char *Symbol = MF.createExternalSymbolName(Token.stringValue()); + lex(); + Dest = MachineOperand::CreateES(Symbol); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseMDNode(MDNode *&Node) { + assert(Token.is(MIToken::exclaim)); + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned()) + return error("expected metadata id after '!'"); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto NodeInfo = IRSlots.MetadataNodes.find(ID); + if (NodeInfo == IRSlots.MetadataNodes.end()) + return error(Loc, "use of undefined metadata '!" 
+ Twine(ID) + "'"); + lex(); + Node = NodeInfo->second.get(); + return false; +} + +bool MIParser::parseMetadataOperand(MachineOperand &Dest) { + MDNode *Node = nullptr; + if (parseMDNode(Node)) + return true; + Dest = MachineOperand::CreateMetadata(Node); + return false; +} + +bool MIParser::parseCFIOffset(int &Offset) { + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected a cfi offset"); + if (Token.integerValue().getMinSignedBits() > 32) + return error("expected a 32 bit integer (the cfi offset is too large)"); + Offset = (int)Token.integerValue().getExtValue(); + lex(); + return false; +} + +bool MIParser::parseCFIRegister(unsigned &Reg) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a cfi register"); + unsigned LLVMReg; + if (parseRegister(LLVMReg)) + return true; + const auto *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + int DwarfReg = TRI->getDwarfRegNum(LLVMReg, true); + if (DwarfReg < 0) + return error("invalid DWARF register"); + Reg = (unsigned)DwarfReg; + lex(); + return false; +} + +bool MIParser::parseCFIOperand(MachineOperand &Dest) { + auto Kind = Token.kind(); + lex(); + auto &MMI = MF.getMMI(); + int Offset; + unsigned Reg; + unsigned CFIIndex; + switch (Kind) { + case MIToken::kw_cfi_same_value: + if (parseCFIRegister(Reg)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createSameValue(nullptr, Reg)); + break; + case MIToken::kw_cfi_offset: + if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) || + parseCFIOffset(Offset)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, Reg, Offset)); + break; + case MIToken::kw_cfi_def_cfa_register: + if (parseCFIRegister(Reg)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + break; + case MIToken::kw_cfi_def_cfa_offset: + if (parseCFIOffset(Offset)) + return true; + // NB: MCCFIInstruction::createDefCfaOffset negates the offset. + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -Offset)); + break; + case MIToken::kw_cfi_def_cfa: + if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) || + parseCFIOffset(Offset)) + return true; + // NB: MCCFIInstruction::createDefCfa negates the offset. + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset)); + break; + default: + // TODO: Parse the other CFI operands. 
+ llvm_unreachable("The current token should be a cfi operand"); + } + Dest = MachineOperand::CreateCFIIndex(CFIIndex); + return false; +} + +bool MIParser::parseIRBlock(BasicBlock *&BB, const Function &F) { + switch (Token.kind()) { + case MIToken::NamedIRBlock: { + BB = dyn_cast_or_null<BasicBlock>( + F.getValueSymbolTable().lookup(Token.stringValue())); + if (!BB) + return error(Twine("use of undefined IR block '") + Token.range() + "'"); + break; + } + case MIToken::IRBlock: { + unsigned SlotNumber = 0; + if (getUnsigned(SlotNumber)) + return true; + BB = const_cast<BasicBlock *>(getIRBlock(SlotNumber, F)); + if (!BB) + return error(Twine("use of undefined IR block '%ir-block.") + + Twine(SlotNumber) + "'"); + break; + } + default: + llvm_unreachable("The current token should be an IR block reference"); + } + return false; +} + +bool MIParser::parseBlockAddressOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_blockaddress)); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::GlobalValue) && + Token.isNot(MIToken::NamedGlobalValue)) + return error("expected a global value"); + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + auto *F = dyn_cast<Function>(GV); + if (!F) + return error("expected an IR function reference"); + lex(); + if (expectAndConsume(MIToken::comma)) + return true; + BasicBlock *BB = nullptr; + if (Token.isNot(MIToken::IRBlock) && Token.isNot(MIToken::NamedIRBlock)) + return error("expected an IR block reference"); + if (parseIRBlock(BB, *F)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateBA(BlockAddress::get(F, BB), /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_target_index)); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target index"); + int Index = 0; + if (getTargetIndex(Token.stringValue(), Index)) + return error("use of undefined target index '" + Token.stringValue() + "'"); lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateTargetIndex(unsigned(Index), /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_liveout)); + const auto *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + while (true) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg = 0; + if (parseRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + // TODO: Report an error if the same register is used more than once. 
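+ // The mask is an array of 32-bit words, so for, e.g., register number 37
+ // the statement above sets bit 37 % 32 = 5 in word 37 / 32 = 1, i.e.
+ // Mask[1] |= 1U << 5.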
+ if (Token.isNot(MIToken::comma)) + break; + lex(); + } + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateRegLiveOut(Mask); return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest) { +bool MIParser::parseMachineOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: case MIToken::kw_implicit_define: + case MIToken::kw_def: case MIToken::kw_dead: case MIToken::kw_killed: case MIToken::kw_undef: + case MIToken::kw_internal: + case MIToken::kw_early_clobber: + case MIToken::kw_debug_use: case MIToken::underscore: case MIToken::NamedRegister: case MIToken::VirtualRegister: - return parseRegisterOperand(Dest); + return parseRegisterOperand(Dest, TiedDefIdx); case MIToken::IntegerLiteral: return parseImmediateOperand(Dest); + case MIToken::IntegerType: + return parseTypedImmediateOperand(Dest); + case MIToken::kw_half: + case MIToken::kw_float: + case MIToken::kw_double: + case MIToken::kw_x86_fp80: + case MIToken::kw_fp128: + case MIToken::kw_ppc_fp128: + return parseFPImmediateOperand(Dest); case MIToken::MachineBasicBlock: return parseMBBOperand(Dest); + case MIToken::StackObject: + return parseStackObjectOperand(Dest); + case MIToken::FixedStackObject: + return parseFixedStackObjectOperand(Dest); case MIToken::GlobalValue: case MIToken::NamedGlobalValue: return parseGlobalAddressOperand(Dest); + case MIToken::ConstantPoolItem: + return parseConstantPoolIndexOperand(Dest); + case MIToken::JumpTableIndex: + return parseJumpTableIndexOperand(Dest); + case MIToken::ExternalSymbol: + return parseExternalSymbolOperand(Dest); + case MIToken::exclaim: + return parseMetadataOperand(Dest); + case MIToken::kw_cfi_same_value: + case MIToken::kw_cfi_offset: + case MIToken::kw_cfi_def_cfa_register: + case MIToken::kw_cfi_def_cfa_offset: + case MIToken::kw_cfi_def_cfa: + return parseCFIOperand(Dest); + case MIToken::kw_blockaddress: + return parseBlockAddressOperand(Dest); + case MIToken::kw_target_index: + return parseTargetIndexOperand(Dest); + case MIToken::kw_liveout: + return parseLiveoutRegisterMaskOperand(Dest); case MIToken::Error: return true; case MIToken::Identifier: @@ -496,12 +1430,314 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest) { } // fallthrough default: - // TODO: parse the other machine operands. + // FIXME: Parse the MCSymbol machine operand. return error("expected a machine operand"); } return false; } +bool MIParser::parseMachineOperandAndTargetFlags( + MachineOperand &Dest, Optional<unsigned> &TiedDefIdx) { + unsigned TF = 0; + bool HasTargetFlags = false; + if (Token.is(MIToken::kw_target_flags)) { + HasTargetFlags = true; + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target flag"); + if (getDirectTargetFlag(Token.stringValue(), TF)) { + if (getBitmaskTargetFlag(Token.stringValue(), TF)) + return error("use of undefined target flag '" + Token.stringValue() + + "'"); + } + lex(); + while (Token.is(MIToken::comma)) { + lex(); + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target flag"); + unsigned BitFlag = 0; + if (getBitmaskTargetFlag(Token.stringValue(), BitFlag)) + return error("use of undefined target flag '" + Token.stringValue() + + "'"); + // TODO: Report an error when using a duplicate bit target flag. 
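+ // For a hypothetical operand like "target-flags(<direct>, <bit0>, <bit1>)
+ // @g", the direct flag parsed above seeds TF and each bitmask flag is
+ // OR'd in below; the accepted flag names are supplied by the target via
+ // getSerializableDirectMachineOperandTargetFlags and its bitmask variant.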
+ TF |= BitFlag;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ auto Loc = Token.location();
+ if (parseMachineOperand(Dest, TiedDefIdx))
+ return true;
+ if (!HasTargetFlags)
+ return false;
+ if (Dest.isReg())
+ return error(Loc, "register operands can't have target flags");
+ Dest.setTargetFlags(TF);
+ return false;
+}
+
+bool MIParser::parseOffset(int64_t &Offset) {
+ if (Token.isNot(MIToken::plus) && Token.isNot(MIToken::minus))
+ return false;
+ StringRef Sign = Token.range();
+ bool IsNegative = Token.is(MIToken::minus);
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after '" + Sign + "'");
+ if (Token.integerValue().getMinSignedBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Offset = Token.integerValue().getExtValue();
+ if (IsNegative)
+ Offset = -Offset;
+ lex();
+ return false;
+}
+
+bool MIParser::parseAlignment(unsigned &Alignment) {
+ assert(Token.is(MIToken::kw_align));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'align'");
+ if (getUnsigned(Alignment))
+ return true;
+ lex();
+ return false;
+}
+
+bool MIParser::parseOperandsOffset(MachineOperand &Op) {
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Op.setOffset(Offset);
+ return false;
+}
+
+bool MIParser::parseIRValue(const Value *&V) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRValue: {
+ V = MF.getFunction()->getValueSymbolTable().lookup(Token.stringValue());
+ break;
+ }
+ case MIToken::IRValue: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(SlotNumber))
+ return true;
+ V = getIRValue(SlotNumber);
+ break;
+ }
+ case MIToken::NamedGlobalValue:
+ case MIToken::GlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ V = GV;
+ break;
+ }
+ case MIToken::QuotedIRValue: {
+ const Constant *C = nullptr;
+ if (parseIRConstant(Token.location(), Token.stringValue(), C))
+ return true;
+ V = C;
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be an IR value reference");
+ }
+ if (!V)
+ return error(Twine("use of undefined IR value '") + Token.range() + "'");
+ return false;
+}
+
+bool MIParser::getUint64(uint64_t &Result) {
+ assert(Token.hasIntegerValue());
+ if (Token.integerValue().getActiveBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Result = Token.integerValue().getZExtValue();
+ return false;
+}
+
+bool MIParser::parseMemoryOperandFlag(unsigned &Flags) {
+ const unsigned OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_volatile:
+ Flags |= MachineMemOperand::MOVolatile;
+ break;
+ case MIToken::kw_non_temporal:
+ Flags |= MachineMemOperand::MONonTemporal;
+ break;
+ case MIToken::kw_invariant:
+ Flags |= MachineMemOperand::MOInvariant;
+ break;
+ // TODO: Parse the target-specific memory operand flags.
+ default:
+ llvm_unreachable("The current token should be a memory operand flag");
+ }
+ if (OldFlags == Flags)
+ // We know that the same flag is specified more than once when the flags
+ // weren't modified.
+ return error("duplicate '" + Token.stringValue() + "' memory operand flag"); + lex(); + return false; +} + +bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { + switch (Token.kind()) { + case MIToken::kw_stack: + PSV = MF.getPSVManager().getStack(); + break; + case MIToken::kw_got: + PSV = MF.getPSVManager().getGOT(); + break; + case MIToken::kw_jump_table: + PSV = MF.getPSVManager().getJumpTable(); + break; + case MIToken::kw_constant_pool: + PSV = MF.getPSVManager().getConstantPool(); + break; + case MIToken::FixedStackObject: { + int FI; + if (parseFixedStackFrameIndex(FI)) + return true; + PSV = MF.getPSVManager().getFixedStack(FI); + // The token was already consumed, so use return here instead of break. + return false; + } + case MIToken::kw_call_entry: { + lex(); + switch (Token.kind()) { + case MIToken::GlobalValue: + case MIToken::NamedGlobalValue: { + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + PSV = MF.getPSVManager().getGlobalValueCallEntry(GV); + break; + } + case MIToken::ExternalSymbol: + PSV = MF.getPSVManager().getExternalSymbolCallEntry( + MF.createExternalSymbolName(Token.stringValue())); + break; + default: + return error( + "expected a global value or an external symbol after 'call-entry'"); + } + break; + } + default: + llvm_unreachable("The current token should be pseudo source value"); + } + lex(); + return false; +} + +bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { + if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || + Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || + Token.is(MIToken::FixedStackObject) || Token.is(MIToken::kw_call_entry)) { + const PseudoSourceValue *PSV = nullptr; + if (parseMemoryPseudoSourceValue(PSV)) + return true; + int64_t Offset = 0; + if (parseOffset(Offset)) + return true; + Dest = MachinePointerInfo(PSV, Offset); + return false; + } + if (Token.isNot(MIToken::NamedIRValue) && Token.isNot(MIToken::IRValue) && + Token.isNot(MIToken::GlobalValue) && + Token.isNot(MIToken::NamedGlobalValue) && + Token.isNot(MIToken::QuotedIRValue)) + return error("expected an IR value reference"); + const Value *V = nullptr; + if (parseIRValue(V)) + return true; + if (!V->getType()->isPointerTy()) + return error("expected a pointer IR value"); + lex(); + int64_t Offset = 0; + if (parseOffset(Offset)) + return true; + Dest = MachinePointerInfo(V, Offset); + return false; +} + +bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { + if (expectAndConsume(MIToken::lparen)) + return true; + unsigned Flags = 0; + while (Token.isMemoryOperandFlag()) { + if (parseMemoryOperandFlag(Flags)) + return true; + } + if (Token.isNot(MIToken::Identifier) || + (Token.stringValue() != "load" && Token.stringValue() != "store")) + return error("expected 'load' or 'store' memory operation"); + if (Token.stringValue() == "load") + Flags |= MachineMemOperand::MOLoad; + else + Flags |= MachineMemOperand::MOStore; + lex(); + + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected the size integer literal after memory operation"); + uint64_t Size; + if (getUint64(Size)) + return true; + lex(); + + const char *Word = Flags & MachineMemOperand::MOLoad ? 
"from" : "into"; + if (Token.isNot(MIToken::Identifier) || Token.stringValue() != Word) + return error(Twine("expected '") + Word + "'"); + lex(); + + MachinePointerInfo Ptr = MachinePointerInfo(); + if (parseMachinePointerInfo(Ptr)) + return true; + unsigned BaseAlignment = Size; + AAMDNodes AAInfo; + MDNode *Range = nullptr; + while (consumeIfPresent(MIToken::comma)) { + switch (Token.kind()) { + case MIToken::kw_align: + if (parseAlignment(BaseAlignment)) + return true; + break; + case MIToken::md_tbaa: + lex(); + if (parseMDNode(AAInfo.TBAA)) + return true; + break; + case MIToken::md_alias_scope: + lex(); + if (parseMDNode(AAInfo.Scope)) + return true; + break; + case MIToken::md_noalias: + lex(); + if (parseMDNode(AAInfo.NoAlias)) + return true; + break; + case MIToken::md_range: + lex(); + if (parseMDNode(Range)) + return true; + break; + // TODO: Report an error on duplicate metadata nodes. + default: + return error("expected 'align' or '!tbaa' or '!alias.scope' or " + "'!noalias' or '!range'"); + } + } + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = + MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); + return false; +} + void MIParser::initNames2InstrOpCodes() { if (!Names2InstrOpCodes.empty()) return; @@ -583,18 +1819,162 @@ unsigned MIParser::getSubRegIndex(StringRef Name) { return SubRegInfo->getValue(); } -bool llvm::parseMachineInstr(MachineInstr *&MI, SourceMgr &SM, - MachineFunction &MF, StringRef Src, - const PerFunctionMIParsingState &PFS, - const SlotMapping &IRSlots, SMDiagnostic &Error) { - return MIParser(SM, MF, Error, Src, PFS, IRSlots).parse(MI); +static void initSlots2BasicBlocks( + const Function &F, + DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (auto &BB : F) { + if (BB.hasName()) + continue; + int Slot = MST.getLocalSlot(&BB); + if (Slot == -1) + continue; + Slots2BasicBlocks.insert(std::make_pair(unsigned(Slot), &BB)); + } +} + +static const BasicBlock *getIRBlockFromSlot( + unsigned Slot, + const DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) { + auto BlockInfo = Slots2BasicBlocks.find(Slot); + if (BlockInfo == Slots2BasicBlocks.end()) + return nullptr; + return BlockInfo->second; +} + +const BasicBlock *MIParser::getIRBlock(unsigned Slot) { + if (Slots2BasicBlocks.empty()) + initSlots2BasicBlocks(*MF.getFunction(), Slots2BasicBlocks); + return getIRBlockFromSlot(Slot, Slots2BasicBlocks); +} + +const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { + if (&F == MF.getFunction()) + return getIRBlock(Slot); + DenseMap<unsigned, const BasicBlock *> CustomSlots2BasicBlocks; + initSlots2BasicBlocks(F, CustomSlots2BasicBlocks); + return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); +} + +static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, + DenseMap<unsigned, const Value *> &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. 
+static void initSlots2Values(const Function &F, + DenseMap<unsigned, const Value *> &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value *MIParser::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(*MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + +void MIParser::initNames2TargetIndices() { + if (!Names2TargetIndices.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Indices = TII->getSerializableTargetIndices(); + for (const auto &I : Indices) + Names2TargetIndices.insert(std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getTargetIndex(StringRef Name, int &Index) { + initNames2TargetIndices(); + auto IndexInfo = Names2TargetIndices.find(Name); + if (IndexInfo == Names2TargetIndices.end()) + return true; + Index = IndexInfo->second; + return false; +} + +void MIParser::initNames2DirectTargetFlags() { + if (!Names2DirectTargetFlags.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Flags = TII->getSerializableDirectMachineOperandTargetFlags(); + for (const auto &I : Flags) + Names2DirectTargetFlags.insert( + std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getDirectTargetFlag(StringRef Name, unsigned &Flag) { + initNames2DirectTargetFlags(); + auto FlagInfo = Names2DirectTargetFlags.find(Name); + if (FlagInfo == Names2DirectTargetFlags.end()) + return true; + Flag = FlagInfo->second; + return false; +} + +void MIParser::initNames2BitmaskTargetFlags() { + if (!Names2BitmaskTargetFlags.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Flags = TII->getSerializableBitmaskMachineOperandTargetFlags(); + for (const auto &I : Flags) + Names2BitmaskTargetFlags.insert( + std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) { + initNames2BitmaskTargetFlags(); + auto FlagInfo = Names2BitmaskTargetFlags.find(Name); + if (FlagInfo == Names2BitmaskTargetFlags.end()) + return true; + Flag = FlagInfo->second; + return false; +} + +bool llvm::parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src, + PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error) { + SourceMgr SM; + SM.AddNewSourceBuffer( + MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false), + SMLoc()); + return MIParser(SM, MF, Error, Src, PFS, IRSlots) + .parseBasicBlockDefinitions(PFS.MBBSlots); +} + +bool llvm::parseMachineInstructions(MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error) { + SourceMgr SM; + SM.AddNewSourceBuffer( + MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false), + SMLoc()); + return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseBasicBlocks(); } bool llvm::parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM, MachineFunction &MF, StringRef Src, const 
PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseMBB(MBB);
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMBB(MBB);
}

bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
@@ -602,5 +1982,30 @@ bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots,
SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseNamedRegister(Reg);
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneNamedRegister(Reg);
+}
+
+bool llvm::parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneVirtualRegister(Reg);
+}
+
+bool llvm::parseStackObjectReference(int &FI, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneStackObject(FI);
+}
+
+bool llvm::parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF,
+ StringRef Src, const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMDNode(Node);
}
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
index fca4c4e..8aef704 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
@@ -19,9 +19,11 @@
namespace llvm {

+class BasicBlock;
class MachineBasicBlock;
class MachineInstr;
class MachineFunction;
+class MDNode;
struct SlotMapping;
class SMDiagnostic;
class SourceMgr;
@@ -29,11 +31,42 @@ class SourceMgr;
struct PerFunctionMIParsingState {
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, unsigned> VirtualRegisterSlots;
+ DenseMap<unsigned, int> FixedStackObjectSlots;
+ DenseMap<unsigned, int> StackObjectSlots;
+ DenseMap<unsigned, unsigned> ConstantPoolSlots;
+ DenseMap<unsigned, unsigned> JumpTableSlots;
};

-bool parseMachineInstr(MachineInstr *&MI, SourceMgr &SM, MachineFunction &MF,
- StringRef Src, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
+/// Parse the machine basic block definitions, and skip the machine
+/// instructions.
+///
+/// This function runs the first parsing pass on the machine function's body.
+/// It parses only the machine basic block definitions and creates the machine
+/// basic blocks in the given machine function.
+///
+/// The machine instructions aren't parsed during the first pass because not
+/// all of the machine basic blocks are defined yet, which makes it impossible
+/// to resolve the machine basic block references.
+///
+/// Return true if an error occurred.
+bool parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src,
+ PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error);
+
+/// Parse the machine instructions.
+///
+/// This function runs the second parsing pass on the machine function's body.
+/// It skips the machine basic block definitions and parses only the machine
+/// instructions and basic block attributes like liveins and successors.
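+///
+/// For example, an instruction in bb.0 may reference %bb.3 before that
+/// block's definition has been seen; deferring instruction parsing to the
+/// second pass guarantees that every such forward reference can be resolved.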
+/// +/// The second parsing pass assumes that the first parsing pass already ran +/// on the given source string. +/// +/// Return true if an error occurred. +bool parseMachineInstructions(MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); bool parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM, MachineFunction &MF, StringRef Src, @@ -46,6 +79,21 @@ bool parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM, const SlotMapping &IRSlots, SMDiagnostic &Error); +bool parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM, + MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error); + +bool parseStackObjectReference(int &FI, SourceMgr &SM, MachineFunction &MF, + StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); + +bool parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF, + StringRef Src, const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 16b0e16..422efbc 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -20,8 +20,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/IR/BasicBlock.h" @@ -95,30 +97,53 @@ public: /// Return true if error occurred. bool initializeMachineFunction(MachineFunction &MF); - /// Initialize the machine basic block using it's YAML representation. - /// - /// Return true if an error occurred. 
- bool initializeMachineBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, - const yaml::MachineBasicBlock &YamlMBB, - const PerFunctionMIParsingState &PFS); + bool initializeRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS); + + void inferRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF); + + bool initializeFrameInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS); + + bool parseCalleeSavedRegister(MachineFunction &MF, + PerFunctionMIParsingState &PFS, + std::vector<CalleeSavedInfo> &CSIInfo, + const yaml::StringValue &RegisterSource, + int FrameIdx); + + bool parseStackObjectsDebugInfo(MachineFunction &MF, + PerFunctionMIParsingState &PFS, + const yaml::MachineStackObject &Object, + int FrameIdx); - bool - initializeRegisterInfo(const MachineFunction &MF, - MachineRegisterInfo &RegInfo, - const yaml::MachineFunction &YamlMF, - DenseMap<unsigned, unsigned> &VirtualRegisterSlots); + bool initializeConstantPool(MachineConstantPool &ConstantPool, + const yaml::MachineFunction &YamlMF, + const MachineFunction &MF, + DenseMap<unsigned, unsigned> &ConstantPoolSlots); - bool initializeFrameInfo(MachineFrameInfo &MFI, - const yaml::MachineFunction &YamlMF); + bool initializeJumpTableInfo(MachineFunction &MF, + const yaml::MachineJumpTable &YamlJTI, + PerFunctionMIParsingState &PFS); private: + bool parseMDNode(MDNode *&Node, const yaml::StringValue &Source, + MachineFunction &MF, const PerFunctionMIParsingState &PFS); + + bool parseMBBReference(MachineBasicBlock *&MBB, + const yaml::StringValue &Source, MachineFunction &MF, + const PerFunctionMIParsingState &PFS); + /// Return a MIR diagnostic converted from an MI string diagnostic. SMDiagnostic diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - /// Return a MIR diagnostic converted from an LLVM assembly diagnostic. - SMDiagnostic diagFromLLVMAssemblyDiag(const SMDiagnostic &Error, - SMRange SourceRange); + /// Return a MIR diagnostic converted from a diagnostic located in a YAML + /// block scalar string. + SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, + SMRange SourceRange); /// Create an empty function with the given name. 
void createDummyFunction(StringRef Name, Module &M); @@ -200,7 +225,7 @@ std::unique_ptr<Module> MIRParserImpl::parse() { M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error, Context, &IRSlots); if (!M) { - reportDiagnostic(diagFromLLVMAssemblyDiag(Error, BSN->getSourceRange())); + reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange())); return M; } In.nextDocument(); @@ -261,88 +286,56 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); MF.setHasInlineAsm(YamlMF.HasInlineAsm); PerFunctionMIParsingState PFS; - if (initializeRegisterInfo(MF, MF.getRegInfo(), YamlMF, - PFS.VirtualRegisterSlots)) - return true; - if (initializeFrameInfo(*MF.getFrameInfo(), YamlMF)) + if (initializeRegisterInfo(MF, YamlMF, PFS)) return true; - - const auto &F = *MF.getFunction(); - for (const auto &YamlMBB : YamlMF.BasicBlocks) { - const BasicBlock *BB = nullptr; - const yaml::StringValue &Name = YamlMBB.Name; - if (!Name.Value.empty()) { - BB = dyn_cast_or_null<BasicBlock>( - F.getValueSymbolTable().lookup(Name.Value)); - if (!BB) - return error(Name.SourceRange.Start, - Twine("basic block '") + Name.Value + - "' is not defined in the function '" + MF.getName() + - "'"); - } - auto *MBB = MF.CreateMachineBasicBlock(BB); - MF.insert(MF.end(), MBB); - bool WasInserted = - PFS.MBBSlots.insert(std::make_pair(YamlMBB.ID, MBB)).second; - if (!WasInserted) - return error(Twine("redefinition of machine basic block with id #") + - Twine(YamlMBB.ID)); - } - - if (YamlMF.BasicBlocks.empty()) - return error(Twine("machine function '") + Twine(MF.getName()) + - "' requires at least one machine basic block in its body"); - // Initialize the machine basic blocks after creating them all so that the - // machine instructions parser can resolve the MBB references. - unsigned I = 0; - for (const auto &YamlMBB : YamlMF.BasicBlocks) { - if (initializeMachineBasicBlock(MF, *MF.getBlockNumbered(I++), YamlMBB, - PFS)) + if (!YamlMF.Constants.empty()) { + auto *ConstantPool = MF.getConstantPool(); + assert(ConstantPool && "Constant pool must be created"); + if (initializeConstantPool(*ConstantPool, YamlMF, MF, + PFS.ConstantPoolSlots)) return true; } - return false; -} -bool MIRParserImpl::initializeMachineBasicBlock( - MachineFunction &MF, MachineBasicBlock &MBB, - const yaml::MachineBasicBlock &YamlMBB, - const PerFunctionMIParsingState &PFS) { - MBB.setAlignment(YamlMBB.Alignment); - if (YamlMBB.AddressTaken) - MBB.setHasAddressTaken(); - MBB.setIsLandingPad(YamlMBB.IsLandingPad); SMDiagnostic Error; - // Parse the successors. - for (const auto &MBBSource : YamlMBB.Successors) { - MachineBasicBlock *SuccMBB = nullptr; - if (parseMBBReference(SuccMBB, SM, MF, MBBSource.Value, PFS, IRSlots, - Error)) - return error(Error, MBBSource.SourceRange); - // TODO: Report an error when adding the same successor more than once. - MBB.addSuccessor(SuccMBB); - } - // Parse the liveins. - for (const auto &LiveInSource : YamlMBB.LiveIns) { - unsigned Reg = 0; - if (parseNamedRegisterReference(Reg, SM, MF, LiveInSource.Value, PFS, - IRSlots, Error)) - return error(Error, LiveInSource.SourceRange); - MBB.addLiveIn(Reg); + if (parseMachineBasicBlockDefinitions(MF, YamlMF.Body.Value.Value, PFS, + IRSlots, Error)) { + reportDiagnostic( + diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange)); + return true; } - // Parse the instructions. 
- for (const auto &MISource : YamlMBB.Instructions) { - MachineInstr *MI = nullptr; - if (parseMachineInstr(MI, SM, MF, MISource.Value, PFS, IRSlots, Error)) - return error(Error, MISource.SourceRange); - MBB.insert(MBB.end(), MI); + + if (MF.empty()) + return error(Twine("machine function '") + Twine(MF.getName()) + + "' requires at least one machine basic block in its body"); + // Initialize the frame information after creating all the MBBs so that the + // MBB references in the frame information can be resolved. + if (initializeFrameInfo(MF, YamlMF, PFS)) + return true; + // Initialize the jump table after creating all the MBBs so that the MBB + // references can be resolved. + if (!YamlMF.JumpTableInfo.Entries.empty() && + initializeJumpTableInfo(MF, YamlMF.JumpTableInfo, PFS)) + return true; + // Parse the machine instructions after creating all of the MBBs so that the + // parser can resolve the MBB references. + if (parseMachineInstructions(MF, YamlMF.Body.Value.Value, PFS, IRSlots, + Error)) { + reportDiagnostic( + diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange)); + return true; } + inferRegisterInfo(MF, YamlMF); + // FIXME: This is a temporary workaround until the reserved registers can be + // serialized. + MF.getRegInfo().freezeReservedRegs(MF); + MF.verify(); return false; } -bool MIRParserImpl::initializeRegisterInfo( - const MachineFunction &MF, MachineRegisterInfo &RegInfo, - const yaml::MachineFunction &YamlMF, - DenseMap<unsigned, unsigned> &VirtualRegisterSlots) { +bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS) { + MachineRegisterInfo &RegInfo = MF.getRegInfo(); assert(RegInfo.isSSA()); if (!YamlMF.IsSSA) RegInfo.leaveSSA(); @@ -351,6 +344,7 @@ bool MIRParserImpl::initializeRegisterInfo( RegInfo.invalidateLiveness(); RegInfo.enableSubRegLiveness(YamlMF.TracksSubRegLiveness); + SMDiagnostic Error; // Parse the virtual register information. for (const auto &VReg : YamlMF.VirtualRegisters) { const auto *RC = getRegClass(MF, VReg.Class.Value); @@ -359,15 +353,71 @@ bool MIRParserImpl::initializeRegisterInfo( Twine("use of undefined register class '") + VReg.Class.Value + "'"); unsigned Reg = RegInfo.createVirtualRegister(RC); - // TODO: Report an error when the same virtual register with the same ID is - // redefined. - VirtualRegisterSlots.insert(std::make_pair(VReg.ID, Reg)); + if (!PFS.VirtualRegisterSlots.insert(std::make_pair(VReg.ID.Value, Reg)) + .second) + return error(VReg.ID.SourceRange.Start, + Twine("redefinition of virtual register '%") + + Twine(VReg.ID.Value) + "'"); + if (!VReg.PreferredRegister.Value.empty()) { + unsigned PreferredReg = 0; + if (parseNamedRegisterReference(PreferredReg, SM, MF, + VReg.PreferredRegister.Value, PFS, + IRSlots, Error)) + return error(Error, VReg.PreferredRegister.SourceRange); + RegInfo.setSimpleHint(Reg, PreferredReg); + } } + + // Parse the liveins. + for (const auto &LiveIn : YamlMF.LiveIns) { + unsigned Reg = 0; + if (parseNamedRegisterReference(Reg, SM, MF, LiveIn.Register.Value, PFS, + IRSlots, Error)) + return error(Error, LiveIn.Register.SourceRange); + unsigned VReg = 0; + if (!LiveIn.VirtualRegister.Value.empty()) { + if (parseVirtualRegisterReference( + VReg, SM, MF, LiveIn.VirtualRegister.Value, PFS, IRSlots, Error)) + return error(Error, LiveIn.VirtualRegister.SourceRange); + } + RegInfo.addLiveIn(Reg, VReg); + } + + // Parse the callee saved register mask. 
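+ // The serialized list names the registers that are preserved, while
+ // MachineRegisterInfo tracks the inverse (the used physical register
+ // mask), hence the flip() below: listing, e.g., only '%rbx' as callee
+ // saved would mark every other register in the mask as used.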
+ BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size()); + if (!YamlMF.CalleeSavedRegisters) + return false; + for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + unsigned Reg = 0; + if (parseNamedRegisterReference(Reg, SM, MF, RegSource.Value, PFS, IRSlots, + Error)) + return error(Error, RegSource.SourceRange); + CalleeSavedRegisterMask[Reg] = true; + } + RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); return false; } -bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, - const yaml::MachineFunction &YamlMF) { +void MIRParserImpl::inferRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF) { + if (YamlMF.CalleeSavedRegisters) + return; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + MF.getRegInfo().addPhysRegsUsedFromRegMask(MO.getRegMask()); + } + } + } +} + +bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS) { + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const Function &F = *MF.getFunction(); const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo; MFI.setFrameAddressIsTaken(YamlMFI.IsFrameAddressTaken); MFI.setReturnAddressIsTaken(YamlMFI.IsReturnAddressTaken); @@ -383,7 +433,20 @@ bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment); MFI.setHasVAStart(YamlMFI.HasVAStart); MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc); + if (!YamlMFI.SavePoint.Value.empty()) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, YamlMFI.SavePoint, MF, PFS)) + return true; + MFI.setSavePoint(MBB); + } + if (!YamlMFI.RestorePoint.Value.empty()) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, YamlMFI.RestorePoint, MF, PFS)) + return true; + MFI.setRestorePoint(MBB); + } + std::vector<CalleeSavedInfo> CSIInfo; // Initialize the fixed frame objects. for (const auto &Object : YamlMF.FixedStackObjects) { int ObjectIdx; @@ -393,27 +456,190 @@ bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, else ObjectIdx = MFI.CreateFixedSpillStackObject(Object.Size, Object.Offset); MFI.setObjectAlignment(ObjectIdx, Object.Alignment); - // TODO: Store the mapping between fixed object IDs and object indices to - // parse fixed stack object references correctly. + if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value, + ObjectIdx)) + .second) + return error(Object.ID.SourceRange.Start, + Twine("redefinition of fixed stack object '%fixed-stack.") + + Twine(Object.ID.Value) + "'"); + if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister, + ObjectIdx)) + return true; } // Initialize the ordinary frame objects. 
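+ // Each serialized object looks roughly like
+ //   { id: 0, name: buf, type: default, offset: -12, size: 4, alignment: 4 }
+ // (exact key spelling per MIRYamlMapping); the loop below recreates the
+ // frame index and records the id -> index mapping in StackObjectSlots.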
for (const auto &Object : YamlMF.StackObjects) { int ObjectIdx; + const AllocaInst *Alloca = nullptr; + const yaml::StringValue &Name = Object.Name; + if (!Name.Value.empty()) { + Alloca = dyn_cast_or_null<AllocaInst>( + F.getValueSymbolTable().lookup(Name.Value)); + if (!Alloca) + return error(Name.SourceRange.Start, + "alloca instruction named '" + Name.Value + + "' isn't defined in the function '" + F.getName() + + "'"); + } if (Object.Type == yaml::MachineStackObject::VariableSized) - ObjectIdx = - MFI.CreateVariableSizedObject(Object.Alignment, /*Alloca=*/nullptr); + ObjectIdx = MFI.CreateVariableSizedObject(Object.Alignment, Alloca); else ObjectIdx = MFI.CreateStackObject( Object.Size, Object.Alignment, - Object.Type == yaml::MachineStackObject::SpillSlot); + Object.Type == yaml::MachineStackObject::SpillSlot, Alloca); MFI.setObjectOffset(ObjectIdx, Object.Offset); - // TODO: Store the mapping between object IDs and object indices to parse - // stack object references correctly. + if (!PFS.StackObjectSlots.insert(std::make_pair(Object.ID.Value, ObjectIdx)) + .second) + return error(Object.ID.SourceRange.Start, + Twine("redefinition of stack object '%stack.") + + Twine(Object.ID.Value) + "'"); + if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister, + ObjectIdx)) + return true; + if (Object.LocalOffset) + MFI.mapLocalFrameObject(ObjectIdx, Object.LocalOffset.getValue()); + if (parseStackObjectsDebugInfo(MF, PFS, Object, ObjectIdx)) + return true; + } + MFI.setCalleeSavedInfo(CSIInfo); + if (!CSIInfo.empty()) + MFI.setCalleeSavedInfoValid(true); + + // Initialize the various stack object references after initializing the + // stack objects. + if (!YamlMFI.StackProtector.Value.empty()) { + SMDiagnostic Error; + int FI; + if (parseStackObjectReference(FI, SM, MF, YamlMFI.StackProtector.Value, PFS, + IRSlots, Error)) + return error(Error, YamlMFI.StackProtector.SourceRange); + MFI.setStackProtectorIndex(FI); + } + return false; +} + +bool MIRParserImpl::parseCalleeSavedRegister( + MachineFunction &MF, PerFunctionMIParsingState &PFS, + std::vector<CalleeSavedInfo> &CSIInfo, + const yaml::StringValue &RegisterSource, int FrameIdx) { + if (RegisterSource.Value.empty()) + return false; + unsigned Reg = 0; + SMDiagnostic Error; + if (parseNamedRegisterReference(Reg, SM, MF, RegisterSource.Value, PFS, + IRSlots, Error)) + return error(Error, RegisterSource.SourceRange); + CSIInfo.push_back(CalleeSavedInfo(Reg, FrameIdx)); + return false; +} + +/// Verify that given node is of a certain type. Return true on error. +template <typename T> +static bool typecheckMDNode(T *&Result, MDNode *Node, + const yaml::StringValue &Source, + StringRef TypeString, MIRParserImpl &Parser) { + if (!Node) + return false; + Result = dyn_cast<T>(Node); + if (!Result) + return Parser.error(Source.SourceRange.Start, + "expected a reference to a '" + TypeString + + "' metadata node"); + return false; +} + +bool MIRParserImpl::parseStackObjectsDebugInfo( + MachineFunction &MF, PerFunctionMIParsingState &PFS, + const yaml::MachineStackObject &Object, int FrameIdx) { + // Debug information can only be attached to stack objects; Fixed stack + // objects aren't supported. 
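+ // (Fixed objects occupy the negative frame indices, ordinary stack
+ // objects the non-negative ones, which is what the assertion below
+ // checks.)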
+ assert(FrameIdx >= 0 && "Expected a stack object frame index"); + MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr; + if (parseMDNode(Var, Object.DebugVar, MF, PFS) || + parseMDNode(Expr, Object.DebugExpr, MF, PFS) || + parseMDNode(Loc, Object.DebugLoc, MF, PFS)) + return true; + if (!Var && !Expr && !Loc) + return false; + DILocalVariable *DIVar = nullptr; + DIExpression *DIExpr = nullptr; + DILocation *DILoc = nullptr; + if (typecheckMDNode(DIVar, Var, Object.DebugVar, "DILocalVariable", *this) || + typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) || + typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this)) + return true; + MF.getMMI().setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc); + return false; +} + +bool MIRParserImpl::parseMDNode(MDNode *&Node, const yaml::StringValue &Source, + MachineFunction &MF, + const PerFunctionMIParsingState &PFS) { + if (Source.Value.empty()) + return false; + SMDiagnostic Error; + if (llvm::parseMDNode(Node, SM, MF, Source.Value, PFS, IRSlots, Error)) + return error(Error, Source.SourceRange); + return false; +} + +bool MIRParserImpl::initializeConstantPool( + MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF, + const MachineFunction &MF, + DenseMap<unsigned, unsigned> &ConstantPoolSlots) { + const auto &M = *MF.getFunction()->getParent(); + SMDiagnostic Error; + for (const auto &YamlConstant : YamlMF.Constants) { + const Constant *Value = dyn_cast_or_null<Constant>( + parseConstantValue(YamlConstant.Value.Value, Error, M)); + if (!Value) + return error(Error, YamlConstant.Value.SourceRange); + unsigned Alignment = + YamlConstant.Alignment + ? YamlConstant.Alignment + : M.getDataLayout().getPrefTypeAlignment(Value->getType()); + unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment); + if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index)) + .second) + return error(YamlConstant.ID.SourceRange.Start, + Twine("redefinition of constant pool item '%const.") + + Twine(YamlConstant.ID.Value) + "'"); } return false; } +bool MIRParserImpl::initializeJumpTableInfo( + MachineFunction &MF, const yaml::MachineJumpTable &YamlJTI, + PerFunctionMIParsingState &PFS) { + MachineJumpTableInfo *JTI = MF.getOrCreateJumpTableInfo(YamlJTI.Kind); + for (const auto &Entry : YamlJTI.Entries) { + std::vector<MachineBasicBlock *> Blocks; + for (const auto &MBBSource : Entry.Blocks) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, MBBSource.Value, MF, PFS)) + return true; + Blocks.push_back(MBB); + } + unsigned Index = JTI->createJumpTableIndex(Blocks); + if (!PFS.JumpTableSlots.insert(std::make_pair(Entry.ID.Value, Index)) + .second) + return error(Entry.ID.SourceRange.Start, + Twine("redefinition of jump table entry '%jump-table.") + + Twine(Entry.ID.Value) + "'"); + } + return false; +} + +bool MIRParserImpl::parseMBBReference(MachineBasicBlock *&MBB, + const yaml::StringValue &Source, + MachineFunction &MF, + const PerFunctionMIParsingState &PFS) { + SMDiagnostic Error; + if (llvm::parseMBBReference(MBB, SM, MF, Source.Value, PFS, IRSlots, Error)) + return error(Error, Source.SourceRange); + return false; +} + SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); @@ -430,8 +656,8 @@ SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, Error.getFixIts()); } -SMDiagnostic MIRParserImpl::diagFromLLVMAssemblyDiag(const SMDiagnostic 
&Error, - SMRange SourceRange) { +SMDiagnostic MIRParserImpl::diagFromBlockStringDiag(const SMDiagnostic &Error, + SMRange SourceRange) { assert(SourceRange.isValid()); // Translate the location of the error from the location in the llvm IR string diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp index d5cf924..175cb0d 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp @@ -14,13 +14,20 @@ #include "MIRPrinter.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/YAMLTraits.h" @@ -31,11 +38,38 @@ using namespace llvm; namespace { +/// This structure describes how to print out stack object references. +struct FrameIndexOperand { + std::string Name; + unsigned ID; + bool IsFixed; + + FrameIndexOperand(StringRef Name, unsigned ID, bool IsFixed) + : Name(Name.str()), ID(ID), IsFixed(IsFixed) {} + + /// Return an ordinary stack object reference. + static FrameIndexOperand create(StringRef Name, unsigned ID) { + return FrameIndexOperand(Name, ID, /*IsFixed=*/false); + } + + /// Return a fixed stack object reference. + static FrameIndexOperand createFixed(unsigned ID) { + return FrameIndexOperand("", ID, /*IsFixed=*/true); + } +}; + +} // end anonymous namespace + +namespace llvm { + /// This class prints out the machine functions using the MIR serialization /// format. class MIRPrinter { raw_ostream &OS; DenseMap<const uint32_t *, unsigned> RegisterMaskIds; + /// Maps from stack object indices to operand indices which will be used when + /// printing frame index machine operands. 
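+ /// For example, a fixed object at frame index -1 may print as
+ /// '%fixed-stack.0', while an ordinary object at index 0 named 'x' prints
+ /// as '%stack.0.x' (see printStackObjectReference).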
+ DenseMap<int, FrameIndexOperand> StackObjectOperandMapping; public: MIRPrinter(raw_ostream &OS) : OS(OS) {} @@ -44,11 +78,16 @@ public: void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo, const TargetRegisterInfo *TRI); - void convert(yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI); - void convert(ModuleSlotTracker &MST, yaml::MachineBasicBlock &YamlMBB, - const MachineBasicBlock &MBB); + void convert(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, + const MachineFrameInfo &MFI); + void convert(yaml::MachineFunction &MF, + const MachineConstantPool &ConstantPool); + void convert(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, + const MachineJumpTableInfo &JTI); void convertStackObjects(yaml::MachineFunction &MF, - const MachineFrameInfo &MFI); + const MachineFrameInfo &MFI, MachineModuleInfo &MMI, + ModuleSlotTracker &MST, + const TargetRegisterInfo *TRI); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -60,18 +99,32 @@ class MIPrinter { raw_ostream &OS; ModuleSlotTracker &MST; const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds; + const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping; public: MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST, - const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds) - : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds) {} + const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds, + const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping) + : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds), + StackObjectOperandMapping(StackObjectOperandMapping) {} + + void print(const MachineBasicBlock &MBB); void print(const MachineInstr &MI); void printMBBReference(const MachineBasicBlock &MBB); - void print(const MachineOperand &Op, const TargetRegisterInfo *TRI); + void printIRBlockReference(const BasicBlock &BB); + void printIRValueReference(const Value &V); + void printStackObjectReference(int FrameIndex); + void printOffset(int64_t Offset); + void printTargetFlags(const MachineOperand &Op); + void print(const MachineOperand &Op, const TargetRegisterInfo *TRI, + unsigned I, bool ShouldPrintRegisterTies, bool IsDef = false); + void print(const MachineMemOperand &Op); + + void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI); }; -} // end anonymous namespace +} // end namespace llvm namespace llvm { namespace yaml { @@ -103,6 +156,12 @@ static void printReg(unsigned Reg, raw_ostream &OS, llvm_unreachable("Can't print this kind of register yet"); } +static void printReg(unsigned Reg, yaml::StringValue &Dest, + const TargetRegisterInfo *TRI) { + raw_string_ostream OS(Dest.Value); + printReg(Reg, OS, TRI); +} + void MIRPrinter::print(const MachineFunction &MF) { initRegisterMaskIds(MF); @@ -112,23 +171,25 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); YamlMF.HasInlineAsm = MF.hasInlineAsm(); convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); - convert(YamlMF.FrameInfo, *MF.getFrameInfo()); - convertStackObjects(YamlMF, *MF.getFrameInfo()); - - int I = 0; ModuleSlotTracker MST(MF.getFunction()->getParent()); + MST.incorporateFunction(*MF.getFunction()); + convert(MST, YamlMF.FrameInfo, *MF.getFrameInfo()); + convertStackObjects(YamlMF, *MF.getFrameInfo(), MF.getMMI(), MST, + MF.getSubtarget().getRegisterInfo()); + if (const auto *ConstantPool = MF.getConstantPool()) + convert(YamlMF, *ConstantPool); + if (const auto *JumpTableInfo = MF.getJumpTableInfo()) + 
convert(MST, YamlMF.JumpTableInfo, *JumpTableInfo); + raw_string_ostream StrOS(YamlMF.Body.Value.Value); + bool IsNewlineNeeded = false; for (const auto &MBB : MF) { - // TODO: Allow printing of non sequentially numbered MBBs. - // This is currently needed as the basic block references get their index - // from MBB.getNumber(), thus it should be sequential so that the parser can - // map back to the correct MBBs when parsing the output. - assert(MBB.getNumber() == I++ && - "Can't print MBBs that aren't sequentially numbered"); - (void)I; - yaml::MachineBasicBlock YamlMBB; - convert(MST, YamlMBB, MBB); - YamlMF.BasicBlocks.push_back(YamlMBB); + if (IsNewlineNeeded) + StrOS << "\n"; + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .print(MBB); + IsNewlineNeeded = true; } + StrOS.flush(); yaml::Output Out(OS); Out << YamlMF; } @@ -147,11 +208,38 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, VReg.ID = I; VReg.Class = StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower(); + unsigned PreferredReg = RegInfo.getSimpleHint(Reg); + if (PreferredReg) + printReg(PreferredReg, VReg.PreferredRegister, TRI); MF.VirtualRegisters.push_back(VReg); } + + // Print the live ins. + for (auto I = RegInfo.livein_begin(), E = RegInfo.livein_end(); I != E; ++I) { + yaml::MachineFunctionLiveIn LiveIn; + printReg(I->first, LiveIn.Register, TRI); + if (I->second) + printReg(I->second, LiveIn.VirtualRegister, TRI); + MF.LiveIns.push_back(LiveIn); + } + // The used physical register mask is printed as an inverted callee saved + // register mask. + const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); + if (UsedPhysRegMask.none()) + return; + std::vector<yaml::FlowStringValue> CalleeSavedRegisters; + for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { + if (!UsedPhysRegMask[I]) { + yaml::FlowStringValue Reg; + printReg(I, Reg, TRI); + CalleeSavedRegisters.push_back(Reg); + } + } + MF.CalleeSavedRegisters = CalleeSavedRegisters; } -void MIRPrinter::convert(yaml::MachineFrameInfo &YamlMFI, +void MIRPrinter::convert(ModuleSlotTracker &MST, + yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI) { YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken(); YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken(); @@ -166,10 +254,23 @@ void MIRPrinter::convert(yaml::MachineFrameInfo &YamlMFI, YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment(); YamlMFI.HasVAStart = MFI.hasVAStart(); YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc(); + if (MFI.getSavePoint()) { + raw_string_ostream StrOS(YamlMFI.SavePoint.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printMBBReference(*MFI.getSavePoint()); + } + if (MFI.getRestorePoint()) { + raw_string_ostream StrOS(YamlMFI.RestorePoint.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printMBBReference(*MFI.getRestorePoint()); + } } void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, - const MachineFrameInfo &MFI) { + const MachineFrameInfo &MFI, + MachineModuleInfo &MMI, + ModuleSlotTracker &MST, + const TargetRegisterInfo *TRI) { // Process fixed stack objects. unsigned ID = 0; for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { @@ -177,7 +278,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, continue; yaml::FixedMachineStackObject YamlObject; - YamlObject.ID = ID++; + YamlObject.ID = ID; YamlObject.Type = MFI.isSpillSlotObjectIndex(I) ? 
yaml::FixedMachineStackObject::SpillSlot : yaml::FixedMachineStackObject::DefaultType; @@ -187,8 +288,8 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I); YamlObject.IsAliased = MFI.isAliasedObjectIndex(I); MF.FixedStackObjects.push_back(YamlObject); - // TODO: Store the mapping between fixed object IDs and object indices to - // print the fixed stack object references correctly. + StackObjectOperandMapping.insert( + std::make_pair(I, FrameIndexOperand::createFixed(ID++))); } // Process ordinary stack objects. @@ -198,7 +299,10 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, continue; yaml::MachineStackObject YamlObject; - YamlObject.ID = ID++; + YamlObject.ID = ID; + if (const auto *Alloca = MFI.getObjectAllocation(I)) + YamlObject.Name.Value = + Alloca->hasName() ? Alloca->getName() : "<unnamed alloca>"; YamlObject.Type = MFI.isSpillSlotObjectIndex(I) ? yaml::MachineStackObject::SpillSlot : MFI.isVariableSizedObjectIndex(I) @@ -209,47 +313,100 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, YamlObject.Alignment = MFI.getObjectAlignment(I); MF.StackObjects.push_back(YamlObject); - // TODO: Store the mapping between object IDs and object indices to print - // the stack object references correctly. + StackObjectOperandMapping.insert(std::make_pair( + I, FrameIndexOperand::create(YamlObject.Name.Value, ID++))); + } + + for (const auto &CSInfo : MFI.getCalleeSavedInfo()) { + yaml::StringValue Reg; + printReg(CSInfo.getReg(), Reg, TRI); + auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx()); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + if (StackObject.IsFixed) + MF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg; + else + MF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg; + } + for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) { + auto LocalObject = MFI.getLocalFrameObjectMap(I); + auto StackObjectInfo = StackObjectOperandMapping.find(LocalObject.first); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + assert(!StackObject.IsFixed && "Expected a locally mapped stack object"); + MF.StackObjects[StackObject.ID].LocalOffset = LocalObject.second; + } + + // Print the stack object references in the frame information class after + // converting the stack objects. + if (MFI.hasStackProtectorIndex()) { + raw_string_ostream StrOS(MF.FrameInfo.StackProtector.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printStackObjectReference(MFI.getStackProtectorIndex()); + } + + // Print the debug variable information. 
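+ // Each entry ties a DILocalVariable, DIExpression and DILocation to a
+ // stack slot; they end up on the serialized stack object as metadata
+ // references (roughly "di-variable: '!42', di-expression: ...,
+ // di-location: ..."), resolved against the embedded LLVM IR module.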
+ for (MachineModuleInfo::VariableDbgInfo &DebugVar : + MMI.getVariableDbgInfo()) { + auto StackObjectInfo = StackObjectOperandMapping.find(DebugVar.Slot); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + assert(!StackObject.IsFixed && "Expected a non-fixed stack object"); + auto &Object = MF.StackObjects[StackObject.ID]; + { + raw_string_ostream StrOS(Object.DebugVar.Value); + DebugVar.Var->printAsOperand(StrOS, MST); + } + { + raw_string_ostream StrOS(Object.DebugExpr.Value); + DebugVar.Expr->printAsOperand(StrOS, MST); + } + { + raw_string_ostream StrOS(Object.DebugLoc.Value); + DebugVar.Loc->printAsOperand(StrOS, MST); + } } } -void MIRPrinter::convert(ModuleSlotTracker &MST, - yaml::MachineBasicBlock &YamlMBB, - const MachineBasicBlock &MBB) { - assert(MBB.getNumber() >= 0 && "Invalid MBB number"); - YamlMBB.ID = (unsigned)MBB.getNumber(); - // TODO: Serialize unnamed BB references. - if (const auto *BB = MBB.getBasicBlock()) - YamlMBB.Name.Value = BB->hasName() ? BB->getName() : "<unnamed bb>"; - else - YamlMBB.Name.Value = ""; - YamlMBB.Alignment = MBB.getAlignment(); - YamlMBB.AddressTaken = MBB.hasAddressTaken(); - YamlMBB.IsLandingPad = MBB.isLandingPad(); - for (const auto *SuccMBB : MBB.successors()) { +void MIRPrinter::convert(yaml::MachineFunction &MF, + const MachineConstantPool &ConstantPool) { + unsigned ID = 0; + for (const MachineConstantPoolEntry &Constant : ConstantPool.getConstants()) { + // TODO: Serialize target specific constant pool entries. + if (Constant.isMachineConstantPoolEntry()) + llvm_unreachable("Can't print target specific constant pool entries yet"); + + yaml::MachineConstantPoolValue YamlConstant; std::string Str; raw_string_ostream StrOS(Str); - MIPrinter(StrOS, MST, RegisterMaskIds).printMBBReference(*SuccMBB); - YamlMBB.Successors.push_back(StrOS.str()); + Constant.Val.ConstVal->printAsOperand(StrOS); + YamlConstant.ID = ID++; + YamlConstant.Value = StrOS.str(); + YamlConstant.Alignment = Constant.getAlignment(); + MF.Constants.push_back(YamlConstant); } - // Print the live in registers. - const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(TRI && "Expected target register info"); - for (auto I = MBB.livein_begin(), E = MBB.livein_end(); I != E; ++I) { +} + +void MIRPrinter::convert(ModuleSlotTracker &MST, + yaml::MachineJumpTable &YamlJTI, + const MachineJumpTableInfo &JTI) { + YamlJTI.Kind = JTI.getEntryKind(); + unsigned ID = 0; + for (const auto &Table : JTI.getJumpTables()) { std::string Str; - raw_string_ostream StrOS(Str); - printReg(*I, StrOS, TRI); - YamlMBB.LiveIns.push_back(StrOS.str()); - } - // Print the machine instructions. 
- YamlMBB.Instructions.reserve(MBB.size());
- std::string Str;
- for (const auto &MI : MBB) {
- raw_string_ostream StrOS(Str);
- MIPrinter(StrOS, MST, RegisterMaskIds).print(MI);
- YamlMBB.Instructions.push_back(StrOS.str());
- Str.clear();
+ yaml::MachineJumpTable::Entry Entry;
+ Entry.ID = ID++;
+ for (const auto *MBB : Table.MBBs) {
+ raw_string_ostream StrOS(Str);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printMBBReference(*MBB);
+ Entry.Blocks.push_back(StrOS.str());
+ Str.clear();
+ }
+ YamlJTI.Entries.push_back(Entry);
 }
}

@@ -260,26 +417,137 @@ void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) {
 RegisterMaskIds.insert(std::make_pair(Mask, I++));
}

+void MIPrinter::print(const MachineBasicBlock &MBB) {
+ assert(MBB.getNumber() >= 0 && "Invalid MBB number");
+ OS << "bb." << MBB.getNumber();
+ bool HasAttributes = false;
+ if (const auto *BB = MBB.getBasicBlock()) {
+ if (BB->hasName()) {
+ OS << "." << BB->getName();
+ } else {
+ HasAttributes = true;
+ OS << " (";
+ int Slot = MST.getLocalSlot(BB);
+ if (Slot == -1)
+ OS << "<ir-block badref>";
+ else
+ OS << (Twine("%ir-block.") + Twine(Slot)).str();
+ }
+ }
+ if (MBB.hasAddressTaken()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "address-taken";
+ HasAttributes = true;
+ }
+ if (MBB.isEHPad()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "landing-pad";
+ HasAttributes = true;
+ }
+ if (MBB.getAlignment()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "align " << MBB.getAlignment();
+ HasAttributes = true;
+ }
+ if (HasAttributes)
+ OS << ")";
+ OS << ":\n";
+
+ bool HasLineAttributes = false;
+ // Print the successors
+ if (!MBB.succ_empty()) {
+ OS.indent(2) << "successors: ";
+ for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
+ if (I != MBB.succ_begin())
+ OS << ", ";
+ printMBBReference(**I);
+ if (MBB.hasSuccessorProbabilities())
+ OS << '(' << MBB.getSuccProbability(I) << ')';
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ // Print the live in registers.
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ if (!MBB.livein_empty()) {
+ OS.indent(2) << "liveins: ";
+ bool First = true;
+ for (const auto &LI : MBB.liveins()) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ printReg(LI.PhysReg, OS, TRI);
+ if (LI.LaneMask != ~0u)
+ OS << ':' << PrintLaneMask(LI.LaneMask);
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << "\n";
+ bool IsInBundle = false;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; ++I) {
+ const MachineInstr &MI = *I;
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+ OS.indent(IsInBundle ? 4 : 2);
+ print(MI);
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
+ }
+ OS << "\n";
+ }
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+}
+
+/// Return true when an instruction has a tied register that can't be
+/// determined by the instruction's descriptor.
+static bool hasComplexRegisterTies(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const auto &Operand = MI.getOperand(I);
+ if (!Operand.isReg() || Operand.isDef())
+ // Ignore the defined registers as MCID marks only the uses as tied.
+ continue;
+ int ExpectedTiedIdx = MCID.getOperandConstraint(I, MCOI::TIED_TO);
+ int TiedIdx = Operand.isTied() ?
int(MI.findTiedOperandIdx(I)) : -1; + if (ExpectedTiedIdx != TiedIdx) + return true; + } + return false; +} + void MIPrinter::print(const MachineInstr &MI) { const auto &SubTarget = MI.getParent()->getParent()->getSubtarget(); const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); assert(TII && "Expected target instruction info"); + if (MI.isCFIInstruction()) + assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); + bool ShouldPrintRegisterTies = hasComplexRegisterTies(MI); unsigned I = 0, E = MI.getNumOperands(); for (; I < E && MI.getOperand(I).isReg() && MI.getOperand(I).isDef() && !MI.getOperand(I).isImplicit(); ++I) { if (I) OS << ", "; - print(MI.getOperand(I), TRI); + print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies, /*IsDef=*/true); } if (I) OS << " = "; + if (MI.getFlag(MachineInstr::FrameSetup)) + OS << "frame-setup "; OS << TII->getName(MI.getOpcode()); - // TODO: Print the instruction flags, machine mem operands. if (I < E) OS << ' '; @@ -287,9 +555,27 @@ void MIPrinter::print(const MachineInstr &MI) { for (; I < E; ++I) { if (NeedComma) OS << ", "; - print(MI.getOperand(I), TRI); + print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies); NeedComma = true; } + + if (MI.getDebugLoc()) { + if (NeedComma) + OS << ','; + OS << " debug-location "; + MI.getDebugLoc()->printAsOperand(OS, MST); + } + + if (!MI.memoperands_empty()) { + OS << " :: "; + bool NeedComma = false; + for (const auto *Op : MI.memoperands()) { + if (NeedComma) + OS << ", "; + print(*Op); + NeedComma = true; + } + } } void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) { @@ -300,32 +586,225 @@ void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) { } } -void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) { +static void printIRSlotNumber(raw_ostream &OS, int Slot) { + if (Slot == -1) + OS << "<badref>"; + else + OS << Slot; +} + +void MIPrinter::printIRBlockReference(const BasicBlock &BB) { + OS << "%ir-block."; + if (BB.hasName()) { + printLLVMNameWithoutPrefix(OS, BB.getName()); + return; + } + const Function *F = BB.getParent(); + int Slot; + if (F == MST.getCurrentFunction()) { + Slot = MST.getLocalSlot(&BB); + } else { + ModuleSlotTracker CustomMST(F->getParent(), + /*ShouldInitializeAllMetadata=*/false); + CustomMST.incorporateFunction(*F); + Slot = CustomMST.getLocalSlot(&BB); + } + printIRSlotNumber(OS, Slot); +} + +void MIPrinter::printIRValueReference(const Value &V) { + if (isa<GlobalValue>(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa<Constant>(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + printIRSlotNumber(OS, MST.getLocalSlot(&V)); +} + +void MIPrinter::printStackObjectReference(int FrameIndex) { + auto ObjectInfo = StackObjectOperandMapping.find(FrameIndex); + assert(ObjectInfo != StackObjectOperandMapping.end() && + "Invalid frame index"); + const FrameIndexOperand &Operand = ObjectInfo->second; + if (Operand.IsFixed) { + OS << "%fixed-stack." << Operand.ID; + return; + } + OS << "%stack." << Operand.ID; + if (!Operand.Name.empty()) + OS << '.' 
<< Operand.Name; +} + +void MIPrinter::printOffset(int64_t Offset) { + if (Offset == 0) + return; + if (Offset < 0) { + OS << " - " << -Offset; + return; + } + OS << " + " << Offset; +} + +static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) { + auto Flags = TII->getSerializableDirectMachineOperandTargetFlags(); + for (const auto &I : Flags) { + if (I.first == TF) { + return I.second; + } + } + return nullptr; +} + +void MIPrinter::printTargetFlags(const MachineOperand &Op) { + if (!Op.getTargetFlags()) + return; + const auto *TII = + Op.getParent()->getParent()->getParent()->getSubtarget().getInstrInfo(); + assert(TII && "expected instruction info"); + auto Flags = TII->decomposeMachineOperandsTargetFlags(Op.getTargetFlags()); + OS << "target-flags("; + const bool HasDirectFlags = Flags.first; + const bool HasBitmaskFlags = Flags.second; + if (!HasDirectFlags && !HasBitmaskFlags) { + OS << "<unknown>) "; + return; + } + if (HasDirectFlags) { + if (const auto *Name = getTargetFlagName(TII, Flags.first)) + OS << Name; + else + OS << "<unknown target flag>"; + } + if (!HasBitmaskFlags) { + OS << ") "; + return; + } + bool IsCommaNeeded = HasDirectFlags; + unsigned BitMask = Flags.second; + auto BitMasks = TII->getSerializableBitmaskMachineOperandTargetFlags(); + for (const auto &Mask : BitMasks) { + // Check if the flag's bitmask has the bits of the current mask set. + if ((BitMask & Mask.first) == Mask.first) { + if (IsCommaNeeded) + OS << ", "; + IsCommaNeeded = true; + OS << Mask.second; + // Clear the bits which were serialized from the flag's bitmask. + BitMask &= ~(Mask.first); + } + } + if (BitMask) { + // When the resulting flag's bitmask isn't zero, we know that we didn't + // serialize all of the bit flags. + if (IsCommaNeeded) + OS << ", "; + OS << "<unknown bitmask target flag>"; + } + OS << ") "; +} + +static const char *getTargetIndexName(const MachineFunction &MF, int Index) { + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "expected instruction info"); + auto Indices = TII->getSerializableTargetIndices(); + for (const auto &I : Indices) { + if (I.first == Index) { + return I.second; + } + } + return nullptr; +} + +void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, + unsigned I, bool ShouldPrintRegisterTies, bool IsDef) { + printTargetFlags(Op); switch (Op.getType()) { case MachineOperand::MO_Register: - // TODO: Print the other register flags. if (Op.isImplicit()) OS << (Op.isDef() ? "implicit-def " : "implicit "); + else if (!IsDef && Op.isDef()) + // Print the 'def' flag only when the operand is defined after '='. + OS << "def "; + if (Op.isInternalRead()) + OS << "internal "; if (Op.isDead()) OS << "dead "; if (Op.isKill()) OS << "killed "; if (Op.isUndef()) OS << "undef "; + if (Op.isEarlyClobber()) + OS << "early-clobber "; + if (Op.isDebug()) + OS << "debug-use "; printReg(Op.getReg(), OS, TRI); // Print the sub register. 
if (Op.getSubReg() != 0) OS << ':' << TRI->getSubRegIndexName(Op.getSubReg()); + if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef()) + OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")"; break; case MachineOperand::MO_Immediate: OS << Op.getImm(); break; + case MachineOperand::MO_CImmediate: + Op.getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); + break; + case MachineOperand::MO_FPImmediate: + Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST); + break; case MachineOperand::MO_MachineBasicBlock: printMBBReference(*Op.getMBB()); break; + case MachineOperand::MO_FrameIndex: + printStackObjectReference(Op.getIndex()); + break; + case MachineOperand::MO_ConstantPoolIndex: + OS << "%const." << Op.getIndex(); + printOffset(Op.getOffset()); + break; + case MachineOperand::MO_TargetIndex: { + OS << "target-index("; + if (const auto *Name = getTargetIndexName( + *Op.getParent()->getParent()->getParent(), Op.getIndex())) + OS << Name; + else + OS << "<unknown>"; + OS << ')'; + printOffset(Op.getOffset()); + break; + } + case MachineOperand::MO_JumpTableIndex: + OS << "%jump-table." << Op.getIndex(); + break; + case MachineOperand::MO_ExternalSymbol: + OS << '$'; + printLLVMNameWithoutPrefix(OS, Op.getSymbolName()); + printOffset(Op.getOffset()); + break; case MachineOperand::MO_GlobalAddress: Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); - // TODO: Print offset and target flags. + printOffset(Op.getOffset()); + break; + case MachineOperand::MO_BlockAddress: + OS << "blockaddress("; + Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, + MST); + OS << ", "; + printIRBlockReference(*Op.getBlockAddress()->getBasicBlock()); + OS << ')'; + printOffset(Op.getOffset()); break; case MachineOperand::MO_RegisterMask: { auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask()); @@ -335,9 +814,157 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) { llvm_unreachable("Can't print this machine register mask yet."); break; } + case MachineOperand::MO_RegisterLiveOut: { + const uint32_t *RegMask = Op.getRegLiveOut(); + OS << "liveout("; + bool IsCommaNeeded = false; + for (unsigned Reg = 0, E = TRI->getNumRegs(); Reg < E; ++Reg) { + if (RegMask[Reg / 32] & (1U << (Reg % 32))) { + if (IsCommaNeeded) + OS << ", "; + printReg(Reg, OS, TRI); + IsCommaNeeded = true; + } + } + OS << ")"; + break; + } + case MachineOperand::MO_Metadata: + Op.getMetadata()->printAsOperand(OS, MST); + break; + case MachineOperand::MO_MCSymbol: + OS << "<mcsymbol " << *Op.getMCSymbol() << ">"; + break; + case MachineOperand::MO_CFIIndex: { + const auto &MMI = Op.getParent()->getParent()->getParent()->getMMI(); + print(MMI.getFrameInstructions()[Op.getCFIIndex()], TRI); + break; + } + } +} + +void MIPrinter::print(const MachineMemOperand &Op) { + OS << '('; + // TODO: Print operand's target specific flags. + if (Op.isVolatile()) + OS << "volatile "; + if (Op.isNonTemporal()) + OS << "non-temporal "; + if (Op.isInvariant()) + OS << "invariant "; + if (Op.isLoad()) + OS << "load "; + else { + assert(Op.isStore() && "Non load machine operand must be a store"); + OS << "store "; + } + OS << Op.getSize() << (Op.isLoad() ? 
" from " : " into "); + if (const Value *Val = Op.getValue()) { + printIRValueReference(*Val); + } else { + const PseudoSourceValue *PVal = Op.getPseudoValue(); + assert(PVal && "Expected a pseudo source value"); + switch (PVal->kind()) { + case PseudoSourceValue::Stack: + OS << "stack"; + break; + case PseudoSourceValue::GOT: + OS << "got"; + break; + case PseudoSourceValue::JumpTable: + OS << "jump-table"; + break; + case PseudoSourceValue::ConstantPool: + OS << "constant-pool"; + break; + case PseudoSourceValue::FixedStack: + printStackObjectReference( + cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex()); + break; + case PseudoSourceValue::GlobalValueCallEntry: + OS << "call-entry "; + cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand( + OS, /*PrintType=*/false, MST); + break; + case PseudoSourceValue::ExternalSymbolCallEntry: + OS << "call-entry $"; + printLLVMNameWithoutPrefix( + OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol()); + break; + } + } + printOffset(Op.getOffset()); + if (Op.getBaseAlignment() != Op.getSize()) + OS << ", align " << Op.getBaseAlignment(); + auto AAInfo = Op.getAAInfo(); + if (AAInfo.TBAA) { + OS << ", !tbaa "; + AAInfo.TBAA->printAsOperand(OS, MST); + } + if (AAInfo.Scope) { + OS << ", !alias.scope "; + AAInfo.Scope->printAsOperand(OS, MST); + } + if (AAInfo.NoAlias) { + OS << ", !noalias "; + AAInfo.NoAlias->printAsOperand(OS, MST); + } + if (Op.getRanges()) { + OS << ", !range "; + Op.getRanges()->printAsOperand(OS, MST); + } + OS << ')'; +} + +static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + int Reg = TRI->getLLVMRegNum(DwarfReg, true); + if (Reg == -1) { + OS << "<badreg>"; + return; + } + printReg(Reg, OS, TRI); +} + +void MIPrinter::print(const MCCFIInstruction &CFI, + const TargetRegisterInfo *TRI) { + switch (CFI.getOperation()) { + case MCCFIInstruction::OpSameValue: + OS << ".cfi_same_value "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpOffset: + OS << ".cfi_offset "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << ".cfi_def_cfa_register "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << ".cfi_def_cfa_offset "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + OS << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << ".cfi_def_cfa "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; default: - // TODO: Print the other machine operands. - llvm_unreachable("Can't print this machine operand at the moment"); + // TODO: Print the other CFI Operations. 
+ OS << "<unserializable cfi operation>"; + break; } } diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp index 13d61e6..8e7566a 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp @@ -40,7 +40,7 @@ struct MIRPrintingPass : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } - virtual bool runOnMachineFunction(MachineFunction &MF) override { + bool runOnMachineFunction(MachineFunction &MF) override { std::string Str; raw_string_ostream StrOS(Str); printMIR(StrOS, MF); @@ -48,7 +48,7 @@ struct MIRPrintingPass : public MachineFunctionPass { return false; } - virtual bool doFinalization(Module &M) override { + bool doFinalization(Module &M) override { printMIR(OS, M); OS << MachineFunctions; return false; diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp index 5d3f7eb..85d544d 100644 --- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -38,22 +39,21 @@ using namespace llvm; #define DEBUG_TYPE "codegen" -MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb) - : BB(bb), Number(-1), xParent(&mf), Alignment(0), IsLandingPad(false), - AddressTaken(false), CachedMCSymbol(nullptr) { +MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) + : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; } MachineBasicBlock::~MachineBasicBlock() { } -/// getSymbol - Return the MCSymbol for this basic block. -/// +/// Return the MCSymbol for this basic block. MCSymbol *MachineBasicBlock::getSymbol() const { if (!CachedMCSymbol) { const MachineFunction *MF = getParent(); MCContext &Ctx = MF->getContext(); const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix(); + assert(getNumber() >= 0 && "cannot get label for unreachable MBB"); CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(getNumber())); @@ -68,9 +68,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) { return OS; } -/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the -/// parent pointer of the MBB, the MBB numbering, and any instructions in the -/// MBB to be on the right operand list for registers. +/// When an MBB is added to an MF, we need to update the parent pointer of the +/// MBB, the MBB numbering, and any instructions in the MBB to be on the right +/// operand list for registers. /// /// MBBs start out as #-1. When a MBB is added to a MachineFunction, it /// gets the next available unique MBB number. If it is removed from a @@ -91,10 +91,8 @@ void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) { N->Number = -1; } - -/// addNodeToList (MI) - When we add an instruction to a basic block -/// list, we update its parent pointer and add its operands from reg use/def -/// lists if appropriate. +/// When we add an instruction to a basic block list, we update its parent +/// pointer and add its operands from reg use/def lists if appropriate. 
void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { assert(!N->getParent() && "machine instruction already in a basic block"); N->setParent(Parent); @@ -105,9 +103,8 @@ void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { N->AddRegOperandsToUseLists(MF->getRegInfo()); } -/// removeNodeFromList (MI) - When we remove an instruction from a basic block -/// list, we update its parent pointer and remove its operands from reg use/def -/// lists if appropriate. +/// When we remove an instruction from a basic block list, we update its parent +/// pointer and remove its operands from reg use/def lists if appropriate. void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { assert(N->getParent() && "machine instruction not in a basic block"); @@ -118,23 +115,22 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { N->setParent(nullptr); } -/// transferNodesFromList (MI) - When moving a range of instructions from one -/// MBB list to another, we need to update the parent pointers and the use/def -/// lists. +/// When moving a range of instructions from one MBB list to another, we need to +/// update the parent pointers and the use/def lists. void ilist_traits<MachineInstr>:: -transferNodesFromList(ilist_traits<MachineInstr> &fromList, - ilist_iterator<MachineInstr> first, - ilist_iterator<MachineInstr> last) { - assert(Parent->getParent() == fromList.Parent->getParent() && +transferNodesFromList(ilist_traits<MachineInstr> &FromList, + ilist_iterator<MachineInstr> First, + ilist_iterator<MachineInstr> Last) { + assert(Parent->getParent() == FromList.Parent->getParent() && "MachineInstr parent mismatch!"); // Splice within the same MBB -> no change. - if (Parent == fromList.Parent) return; + if (Parent == FromList.Parent) return; // If splicing between two blocks within the same function, just update the // parent pointers. 
- for (; first != last; ++first) - first->setParent(Parent); + for (; First != Last; ++First) + First->setParent(Parent); } void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) { @@ -208,11 +204,18 @@ const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const { if (succ_size() > 2) return nullptr; for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I) - if ((*I)->isLandingPad()) + if ((*I)->isEHPad()) return *I; return nullptr; } +bool MachineBasicBlock::hasEHPadSuccessor() const { + for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I) + if ((*I)->isEHPad()) + return true; + return false; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineBasicBlock::dump() const { print(dbgs()); @@ -271,7 +274,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, LBB->printAsOperand(OS, /*PrintType=*/false, MST); Comma = ", "; } - if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } + if (isEHPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; } if (Alignment) OS << Comma << "Align " << Alignment << " (" << (1u << Alignment) @@ -283,8 +286,11 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; - for (livein_iterator I = livein_begin(),E = livein_end(); I != E; ++I) - OS << ' ' << PrintReg(*I, TRI); + for (const auto &LI : make_range(livein_begin(), livein_end())) { + OS << ' ' << PrintReg(LI.PhysReg, TRI); + if (LI.LaneMask != ~0u) + OS << ':' << PrintLaneMask(LI.LaneMask); + } OS << '\n'; } // Print the preds of this block according to the CFG. @@ -298,8 +304,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const_instr_iterator I = instr_begin(); I != instr_end(); ++I) { if (Indexes) { - if (Indexes->hasIndex(I)) - OS << Indexes->getInstructionIndex(I); + if (Indexes->hasIndex(&*I)) + OS << Indexes->getInstructionIndex(&*I); OS << '\t'; } OS << '\t'; @@ -314,35 +320,63 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } } -void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const { +void MachineBasicBlock::printAsOperand(raw_ostream &OS, + bool /*PrintType*/) const { OS << "BB#" << getNumber(); } -void MachineBasicBlock::removeLiveIn(unsigned Reg) { - std::vector<unsigned>::iterator I = - std::find(LiveIns.begin(), LiveIns.end(), Reg); - if (I != LiveIns.end()) +void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { + LiveInVector::iterator I = std::find_if( + LiveIns.begin(), LiveIns.end(), + [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); + if (I == LiveIns.end()) + return; + + I->LaneMask &= ~LaneMask; + if (I->LaneMask == 0) LiveIns.erase(I); } -bool MachineBasicBlock::isLiveIn(unsigned Reg) const { - livein_iterator I = std::find(livein_begin(), livein_end(), Reg); - return I != livein_end(); +bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { + livein_iterator I = std::find_if( + LiveIns.begin(), LiveIns.end(), + [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == 
Reg; });
+ return I != livein_end() && (I->LaneMask & LaneMask) != 0;
+}
+
+void MachineBasicBlock::sortUniqueLiveIns() {
+ std::sort(LiveIns.begin(), LiveIns.end(),
+ [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
+ return LI0.PhysReg < LI1.PhysReg;
+ });
+ // Liveins are sorted by physreg; now we can merge their lanemasks.
+ LiveInVector::const_iterator I = LiveIns.begin();
+ LiveInVector::const_iterator J;
+ LiveInVector::iterator Out = LiveIns.begin();
+ for (; I != LiveIns.end(); ++Out, I = J) {
+ unsigned PhysReg = I->PhysReg;
+ LaneBitmask LaneMask = I->LaneMask;
+ for (J = std::next(I); J != LiveIns.end() && J->PhysReg == PhysReg; ++J)
+ LaneMask |= J->LaneMask;
+ Out->PhysReg = PhysReg;
+ Out->LaneMask = LaneMask;
+ }
+ LiveIns.erase(Out, LiveIns.end());
+}

unsigned
-MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) {
+MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) {
 assert(getParent() && "MBB must be inserted in function");
 assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg");
 assert(RC && "Register class is required");
- assert((isLandingPad() || this == &getParent()->front()) &&
+ assert((isEHPad() || this == &getParent()->front()) &&
 "Only the entry block and landing pads can have physreg live ins");

 bool LiveIn = isLiveIn(PhysReg);
@@ -370,12 +404,11 @@ MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) {
}

void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
- getParent()->splice(NewAfter, this);
+ getParent()->splice(NewAfter->getIterator(), getIterator());
}

void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
- MachineFunction::iterator BBI = NewBefore;
- getParent()->splice(++BBI, this);
+ getParent()->splice(++NewBefore->getIterator(), getIterator());
}

void MachineBasicBlock::updateTerminator() {
@@ -385,7 +418,7 @@ void MachineBasicBlock::updateTerminator() {
 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
 SmallVector<MachineOperand, 4> Cond;
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc DL; // FIXME: this is nowhere
 bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond);
 (void) B;
 assert(!B && "UpdateTerminators requires analyzable predecessors!");
@@ -400,7 +433,7 @@ void MachineBasicBlock::updateTerminator() {
 // its layout successor, insert a branch. First we have to locate the
 // only non-landing-pad successor, as that is the fallthrough block.
 for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isLandingPad())
+ if ((*SI)->isEHPad())
 continue;
 assert(!TBB && "Found more than one non-landing-pad successor!");
 TBB = *SI;
@@ -414,7 +447,7 @@ void MachineBasicBlock::updateTerminator() {
 // Finally update the unconditional successor to be reached via a branch
 // if it would not be reached by fallthrough.
 if (!isLayoutSuccessor(TBB))
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 }
 } else {
 if (FBB) {
@@ -425,10 +458,10 @@ void MachineBasicBlock::updateTerminator() {
 if (TII->ReverseBranchCondition(Cond))
 return;
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FBB, nullptr, Cond, DL);
 } else if (isLayoutSuccessor(FBB)) {
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 }
 } else {
 // Walk through the successors and find the successor which is not
@@ -436,7 +469,7 @@ void MachineBasicBlock::updateTerminator() {
 // as the fallthrough successor.
 MachineBasicBlock *FallthroughBB = nullptr;
 for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isLandingPad() || *SI == TBB)
+ if ((*SI)->isEHPad() || *SI == TBB)
 continue;
 assert(!FallthroughBB && "Found more than one fallthrough successor.");
 FallthroughBB = *SI;
@@ -445,14 +478,14 @@ void MachineBasicBlock::updateTerminator() {
 // We fallthrough to the same basic block as the conditional jump
 // targets. Remove the conditional jump, leaving unconditional
 // fallthrough.
- // FIXME: This does not seem like a reasonable pattern to support, but it
- // has been seen in the wild coming out of degenerate ARM test cases.
+ // FIXME: This does not seem like a reasonable pattern to support, but
+ // it has been seen in the wild coming out of degenerate ARM test cases.
 TII->RemoveBranch(*this);

 // Finally update the unconditional successor to be reached via a branch
 // if it would not be reached by fallthrough.
 if (!isLayoutSuccessor(TBB))
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 return;
 }
@@ -461,55 +494,69 @@ void MachineBasicBlock::updateTerminator() {
 if (TII->ReverseBranchCondition(Cond)) {
 // We can't reverse the condition, add an unconditional branch.
 Cond.clear();
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
 return;
 }
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
 } else if (!isLayoutSuccessor(FallthroughBB)) {
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, FallthroughBB, Cond, dl);
+ TII->InsertBranch(*this, TBB, FallthroughBB, Cond, DL);
 }
 }
 }
}

-void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ, uint32_t weight) {
-
- // If we see non-zero value for the first time it means we actually use Weight
- // list, so we fill all Weights with 0's.
- if (weight != 0 && Weights.empty())
- Weights.resize(Successors.size());
-
- if (weight != 0 || !Weights.empty())
- Weights.push_back(weight);
-
- Successors.push_back(succ);
- succ->addPredecessor(this);
- }
+void MachineBasicBlock::validateSuccProbs() const {
+#ifndef NDEBUG
+ int64_t Sum = 0;
+ for (auto Prob : Probs)
+ Sum += Prob.getNumerator();
+ // Due to precision issues, we assume that the sum of probabilities is one if
+ // the difference between the sum of their numerators and the denominator is
+ // no greater than the number of successors.
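To see the tolerance concretely: with fixed-point numerators over a 2^31 denominator (the representation BranchProbability uses in this revision), three successors of probability 1/3 each round to numerator 715827883, and the numerators sum to 2^31 + 1 — off by exactly one unit, within the one-unit-per-successor slack the assertion below allows. A small self-contained check of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t D = UINT64_C(1) << 31; // the fixed-point denominator
      const uint64_t Third = (D + 1) / 3;   // round(2^31 / 3) = 715827883
      uint64_t Sum = 3 * Third;             // 2147483649 == D + 1

      // Off by one unit, inside the "number of successors" slack granted
      // for rounding error.
      uint64_t Diff = Sum > D ? Sum - D : D - Sum;
      assert(Diff <= 3);
      return 0;
    }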
+ assert((uint64_t)std::abs(Sum - BranchProbability::getDenominator()) <=
+ Probs.size() &&
+ "The sum of successors' probabilities exceeds one.");
+#endif // NDEBUG
+}

-void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
- succ->removePredecessor(this);
- succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
- assert(I != Successors.end() && "Not a current successor!");
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
+ BranchProbability Prob) {
+ // Probability list is either empty (if successor list isn't empty, this means
+ // disabled optimization) or has the same size as successor list.
+ if (!(Probs.empty() && !Successors.empty()))
+ Probs.push_back(Prob);
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}

- // If Weight list is empty it means we don't use it (disabled optimization).
- if (!Weights.empty()) {
- weight_iterator WI = getWeightIterator(I);
- Weights.erase(WI);
- }
+void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
+ // We need to make sure the probability list is either empty or has the same
+ // size as the successor list. When this function is called, we can safely
+ // delete all probabilities in the list.
+ Probs.clear();
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}

- Successors.erase(I);
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
+ bool NormalizeSuccProbs) {
+ succ_iterator I = std::find(Successors.begin(), Successors.end(), Succ);
+ removeSuccessor(I, NormalizeSuccProbs);
}

MachineBasicBlock::succ_iterator
-MachineBasicBlock::removeSuccessor(succ_iterator I) {
+MachineBasicBlock::removeSuccessor(succ_iterator I, bool NormalizeSuccProbs) {
 assert(I != Successors.end() && "Not a current successor!");

- // If Weight list is empty it means we don't use it (disabled optimization).
- if (!Weights.empty()) {
- weight_iterator WI = getWeightIterator(I);
- Weights.erase(WI);
+ // If probability list is empty it means we don't use it (disabled
+ // optimization).
+ if (!Probs.empty()) {
+ probability_iterator WI = getProbabilityIterator(I);
+ Probs.erase(WI);
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
 }

 (*I)->removePredecessor(this);
@@ -537,74 +584,77 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
 }
 }
 assert(OldI != E && "Old is not a successor of this block");
- Old->removePredecessor(this);

 // If New isn't already a successor, let it take Old's place.
 if (NewI == E) {
+ Old->removePredecessor(this);
 New->addPredecessor(this);
 *OldI = New;
 return;
 }

 // New is already a successor.
- // Update its weight instead of adding a duplicate edge.
+ // Update its probability instead of adding a duplicate edge.
+ if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); } - Successors.erase(OldI); + removeSuccessor(OldI); } -void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) { - Predecessors.push_back(pred); +void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) { + Predecessors.push_back(Pred); } -void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) { - pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), pred); +void MachineBasicBlock::removePredecessor(MachineBasicBlock *Pred) { + pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), Pred); assert(I != Predecessors.end() && "Pred is not a predecessor of this block!"); Predecessors.erase(I); } -void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { - if (this == fromMBB) +void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { + if (this == FromMBB) return; - while (!fromMBB->succ_empty()) { - MachineBasicBlock *Succ = *fromMBB->succ_begin(); - uint32_t Weight = 0; + while (!FromMBB->succ_empty()) { + MachineBasicBlock *Succ = *FromMBB->succ_begin(); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!fromMBB->Weights.empty()) - Weight = *fromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); - fromMBB->removeSuccessor(Succ); + FromMBB->removeSuccessor(Succ); } } void -MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { - if (this == fromMBB) +MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { + if (this == FromMBB) return; - while (!fromMBB->succ_empty()) { - MachineBasicBlock *Succ = *fromMBB->succ_begin(); - uint32_t Weight = 0; - if (!fromMBB->Weights.empty()) - Weight = *fromMBB->Weights.begin(); - addSuccessor(Succ, Weight); - fromMBB->removeSuccessor(Succ); + while (!FromMBB->succ_empty()) { + MachineBasicBlock *Succ = *FromMBB->succ_begin(); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); + FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(), ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI) for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) { MachineOperand &MO = MI->getOperand(i); - if (MO.getMBB() == fromMBB) + if (MO.getMBB() == FromMBB) MO.setMBB(this); } } + normalizeSuccProbs(); } bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const { @@ -621,14 +671,14 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { } bool MachineBasicBlock::canFallThrough() { - MachineFunction::iterator Fallthrough = this; + MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; // If FallthroughBlock is off the end of the function, it can't fall through. if (Fallthrough == getParent()->end()) return false; // If FallthroughBlock isn't a successor, no fallthrough is possible. - if (!isSuccessor(Fallthrough)) + if (!isSuccessor(&*Fallthrough)) return false; // Analyze the branches, if any, at the end of the block. 
@@ -666,11 +716,11 @@ MachineBasicBlock * MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Splitting the critical edge to a landing pad block is non-trivial. Don't do // it in this generic function. - if (Succ->isLandingPad()) + if (Succ->isEHPad()) return nullptr; MachineFunction *MF = getParent(); - DebugLoc dl; // FIXME: this is nowhere + DebugLoc DL; // FIXME: this is nowhere // Performance might be harmed on HW that implements branching using exec mask // where both sides of the branches are always executed. @@ -719,7 +769,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LV) for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) { - MachineInstr *MI = I; + MachineInstr *MI = &*I; for (MachineInstr::mop_iterator OI = MI->operands_begin(), OE = MI->operands_end(); OI != OE; ++OI) { if (!OI->isReg() || OI->getReg() == 0 || @@ -739,7 +789,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LIS) { for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) { - MachineInstr *MI = I; + MachineInstr *MI = &*I; for (MachineInstr::mop_iterator OI = MI->operands_begin(), OE = MI->operands_end(); OI != OE; ++OI) { @@ -761,7 +811,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (Indexes) { for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) - Terminators.push_back(I); + Terminators.push_back(&*I); } updateTerminator(); @@ -770,7 +820,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { SmallVector<MachineInstr*, 4> NewTerminators; for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) - NewTerminators.push_back(I); + NewTerminators.push_back(&*I); for (SmallVectorImpl<MachineInstr*>::iterator I = Terminators.begin(), E = Terminators.end(); I != E; ++I) { @@ -784,17 +834,16 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { NMBB->addSuccessor(Succ); if (!NMBB->isLayoutSuccessor(Succ)) { Cond.clear(); - MF->getSubtarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, - dl); + TII->InsertBranch(*NMBB, Succ, nullptr, Cond, DL); if (Indexes) { for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end(); I != E; ++I) { // Some instructions may have been moved to NMBB by updateTerminator(), // so we first remove any instruction that already has an index. - if (Indexes->hasIndex(I)) - Indexes->removeMachineInstrFromMaps(I); - Indexes->insertMachineInstrInMaps(I); + if (Indexes->hasIndex(&*I)) + Indexes->removeMachineInstrFromMaps(&*I); + Indexes->insertMachineInstrInMaps(&*I); } } } @@ -808,9 +857,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { i->getOperand(ni+1).setMBB(NMBB); // Inherit live-ins from the successor - for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(), - E = Succ->livein_end(); I != E; ++I) - NMBB->addLiveIn(*I); + for (const auto &LI : Succ->liveins()) + NMBB->addLiveIn(LI); // Update LiveVariables. 
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -822,7 +870,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (!(--I)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) - LV->getVarInfo(Reg).Kills.push_back(I); + LV->getVarInfo(Reg).Kills.push_back(&*I); DEBUG(dbgs() << "Restored terminator kill: " << *I); break; } @@ -834,10 +882,10 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LIS) { // After splitting the edge and updating SlotIndexes, live intervals may be // in one of two situations, depending on whether this block was the last in - // the function. If the original block was the last in the function, all live - // intervals will end prior to the beginning of the new split block. If the - // original block was not at the end of the function, all live intervals will - // extend to the end of the new split block. + // the function. If the original block was the last in the function, all + // live intervals will end prior to the beginning of the new split block. If + // the original block was not at the end of the function, all live intervals + // will extend to the end of the new split block. bool isLastMBB = std::next(MachineFunction::iterator(NMBB)) == getParent()->end(); @@ -861,7 +909,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { LiveInterval &LI = LIS->getInterval(Reg); VNInfo *VNI = LI.getVNInfoAt(PrevIndex); - assert(VNI && "PHI sources should be live out of their predecessors."); + assert(VNI && + "PHI sources should be live out of their predecessors."); LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI)); } } @@ -941,7 +990,7 @@ static void unbundleSingleMI(MachineInstr *MI) { MachineBasicBlock::instr_iterator MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) { - unbundleSingleMI(I); + unbundleSingleMI(&*I); return Insts.erase(I); } @@ -964,25 +1013,22 @@ MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) { return Insts.insert(I, MI); } -/// removeFromParent - This method unlinks 'this' from the containing function, -/// and returns it, but does not delete it. +/// This method unlinks 'this' from the containing function, and returns it, but +/// does not delete it. MachineBasicBlock *MachineBasicBlock::removeFromParent() { assert(getParent() && "Not embedded in a function!"); getParent()->remove(this); return this; } - -/// eraseFromParent - This method unlinks 'this' from the containing function, -/// and deletes it. +/// This method unlinks 'this' from the containing function, and deletes it. void MachineBasicBlock::eraseFromParent() { assert(getParent() && "Not embedded in a function!"); getParent()->erase(this); } - -/// ReplaceUsesOfBlockWith - Given a machine basic block that branched to -/// 'Old', change the code and CFG so that it branches to 'New' instead. +/// Given a machine basic block that branched to 'Old', change the code and CFG +/// so that it branches to 'New' instead. void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, MachineBasicBlock *New) { assert(Old != New && "Cannot replace self with self!"); @@ -1004,46 +1050,44 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, replaceSuccessor(Old, New); } -/// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the -/// CFG to be inserted. 
If we have proven that MBB can only branch to DestA and -/// DestB, remove any other MBB successors from the CFG. DestA and DestB can be -/// null. +/// Various pieces of code can cause excess edges in the CFG to be inserted. If +/// we have proven that MBB can only branch to DestA and DestB, remove any other +/// MBB successors from the CFG. DestA and DestB can be null. /// /// Besides DestA and DestB, retain other edges leading to LandingPads /// (currently there can be only one; we don't check or require that here). /// Note it is possible that DestA and/or DestB are LandingPads. bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, MachineBasicBlock *DestB, - bool isCond) { + bool IsCond) { // The values of DestA and DestB frequently come from a call to the // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial // values from there. // // 1. If both DestA and DestB are null, then the block ends with no branches // (it falls through to its successor). - // 2. If DestA is set, DestB is null, and isCond is false, then the block ends + // 2. If DestA is set, DestB is null, and IsCond is false, then the block ends // with only an unconditional branch. - // 3. If DestA is set, DestB is null, and isCond is true, then the block ends + // 3. If DestA is set, DestB is null, and IsCond is true, then the block ends // with a conditional branch that falls through to a successor (DestB). - // 4. If DestA and DestB is set and isCond is true, then the block ends with a + // 4. If DestA and DestB is set and IsCond is true, then the block ends with a // conditional branch followed by an unconditional branch. DestA is the // 'true' destination and DestB is the 'false' destination. bool Changed = false; - MachineFunction::iterator FallThru = - std::next(MachineFunction::iterator(this)); + MachineFunction::iterator FallThru = std::next(getIterator()); if (!DestA && !DestB) { // Block falls through to successor. - DestA = FallThru; - DestB = FallThru; + DestA = &*FallThru; + DestB = &*FallThru; } else if (DestA && !DestB) { - if (isCond) + if (IsCond) // Block ends in conditional jump that falls through to successor. - DestB = FallThru; + DestB = &*FallThru; } else { - assert(DestA && DestB && isCond && + assert(DestA && DestB && IsCond && "CFG in a bad state. Cannot correct CFG edges"); } @@ -1054,7 +1098,7 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, while (SI != succ_end()) { const MachineBasicBlock *MBB = *SI; if (!SeenMBBs.insert(MBB).second || - (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) { + (MBB != DestA && MBB != DestB && !MBB->isEHPad())) { // This is a superfluous edge, remove it. SI = removeSuccessor(SI); Changed = true; @@ -1063,11 +1107,13 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, } } + if (Changed) + normalizeSuccProbs(); return Changed; } -/// findDebugLoc - find the next valid DebugLoc starting at MBBI, skipping -/// any DBG_VALUE instructions. Return UnknownLoc if there is none. +/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE +/// instructions. Return UnknownLoc if there is none. DebugLoc MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { DebugLoc DL; @@ -1083,45 +1129,60 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// getSuccWeight - Return weight of the edge from this block to MBB. 
-///
-uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
- if (Weights.empty())
- return 0;
-
- return *getWeightIterator(Succ);
+/// Return probability of the edge from this block to MBB.
+BranchProbability
+MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
+ if (Probs.empty())
+ return BranchProbability(1, succ_size());
+
+ const auto &Prob = *getProbabilityIterator(Succ);
+ if (Prob.isUnknown()) {
+ // For unknown probabilities, collect the sum of all known ones, and evenly
+ // distribute the complement of the sum to each unknown probability.
+ unsigned KnownProbNum = 0;
+ auto Sum = BranchProbability::getZero();
+ for (auto &P : Probs) {
+ if (!P.isUnknown()) {
+ Sum += P;
+ KnownProbNum++;
+ }
+ }
+ return Sum.getCompl() / (Probs.size() - KnownProbNum);
+ } else
+ return Prob;
}

-/// Set successor weight of a given iterator.
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t weight) {
- if (Weights.empty())
+/// Set successor probability of a given iterator.
+void MachineBasicBlock::setSuccProbability(succ_iterator I,
+ BranchProbability Prob) {
+ assert(!Prob.isUnknown());
+ if (Probs.empty())
 return;
- *getWeightIterator(I) = weight;
+ *getProbabilityIterator(I) = Prob;
}

-/// getWeightIterator - Return wight iterator corresonding to the I successor
-/// iterator
-MachineBasicBlock::weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::succ_iterator I) {
- assert(Weights.size() == Successors.size() && "Async weight list!");
- size_t index = std::distance(Successors.begin(), I);
- assert(index < Weights.size() && "Not a current successor!");
- return Weights.begin() + index;
+/// Return probability iterator corresponding to the I successor iterator
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+ MachineBasicBlock::const_succ_iterator I) const {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
}

-/// getWeightIterator - Return wight iterator corresonding to the I successor
-/// iterator
-MachineBasicBlock::const_weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
- assert(Weights.size() == Successors.size() && "Async weight list!");
+/// Return probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
 const size_t index = std::distance(Successors.begin(), I);
- assert(index < Weights.size() && "Not a current successor!");
- return Weights.begin() + index;
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
}

/// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
/// as of just before "MI".
-///
+///
/// Search is localised to a neighborhood of
/// Neighborhood instructions before (searching for defs or kills) and N
/// instructions after (searching just for defs) MI.
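The neighborhood search this comment describes — and which the hunk below rewrites in terms of the new PhysRegInfo fields — scans backwards first and stops at the first event that decides the question: a dead def or a kill proves the register dead, any other def or a read proves it live, and exhausting the budget leaves the answer unknown. A simplified standalone model of the backward scan (the types here are invented stand-ins for analyzePhysReg's summary):

    #include <vector>

    enum LivenessQueryResult { LQR_Live, LQR_Dead, LQR_Unknown };

    // Invented per-instruction summary of the queried register's fate.
    struct RegEffect {
      bool DeadDef = false; // full def that is marked dead
      bool Defined = false; // writes at least part of the register
      bool Killed = false;  // full kill (or clobber)
      bool Read = false;    // reads the register
    };

    // Scan backwards over at most Neighborhood instructions before Pos,
    // mirroring the order of checks in the rewritten loop.
    LivenessQueryResult scanBack(const std::vector<RegEffect> &Insts,
                                 unsigned Pos, unsigned Neighborhood) {
      while (Pos > 0 && Neighborhood-- > 0) {
        const RegEffect &E = Insts[--Pos];
        if (E.DeadDef)
          return LQR_Dead;
        if (E.Defined)
          return LQR_Live;
        if (E.Killed)
          return LQR_Dead;
        if (E.Read)
          return LQR_Live;
      }
      return LQR_Unknown;
    }

    int main() {
      std::vector<RegEffect> Insts(4);
      Insts[1].Defined = true; // the register is defined...
      Insts[2].Killed = true;  // ...and killed before instruction 3
      return scanBack(Insts, 3, 10) == LQR_Dead ? 0 : 1;
    }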
@@ -1138,33 +1199,33 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 do {
 --I;

- MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MachineOperandIteratorBase::PhysRegInfo Info =
 ConstMIOperands(I).analyzePhysReg(Reg, TRI);

- if (Analysis.Defines)
- // Outputs happen after inputs so they take precedence if both are
- // present.
- return Analysis.DefinesDead ? LQR_Dead : LQR_Live;
+ // Defs happen after uses so they take precedence if both are present.

- if (Analysis.Kills || Analysis.Clobbers)
- // Register killed, so isn't live.
+ // Register is dead after a dead def of the full register.
+ if (Info.DeadDef)
 return LQR_Dead;
-
- else if (Analysis.ReadsOverlap)
- // Defined or read without a previous kill - live.
- return Analysis.Reads ? LQR_Live : LQR_OverlappingLive;
-
+ // Register is (at least partially) live after a def.
+ if (Info.Defined)
+ return LQR_Live;
+ // Register is dead after a full kill or clobber and no def.
+ if (Info.Killed || Info.Clobbered)
+ return LQR_Dead;
+ // Register must be live if we read it.
+ if (Info.Read)
+ return LQR_Live;
 } while (I != begin() && --N > 0);
 }

 // Did we get to the start of the block?
 if (I == begin()) {
 // If so, the register's state is definitely defined by the live-in state.
- for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true);
- RAI.isValid(); ++RAI) {
+ for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true); RAI.isValid();
+ ++RAI)
 if (isLiveIn(*RAI))
- return (*RAI == Reg) ? LQR_Live : LQR_OverlappingLive;
- }
+ return LQR_Live;

 return LQR_Dead;
 }

@@ -1176,16 +1237,14 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 // If this is the last insn in the block, don't search forwards.
 if (I != end()) {
 for (++I; I != end() && N > 0; ++I, --N) {
- MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MachineOperandIteratorBase::PhysRegInfo Info =
 ConstMIOperands(I).analyzePhysReg(Reg, TRI);

- if (Analysis.ReadsOverlap)
- // Used, therefore must have been live.
- return (Analysis.Reads) ?
- LQR_Live : LQR_OverlappingLive;
-
- else if (Analysis.Clobbers || Analysis.Defines)
- // Defined (but not read) therefore cannot have been live.
+ // Register is live when we read it here.
+ if (Info.Read)
+ return LQR_Live;
+ // Register is dead if we can fully overwrite or clobber it here.
+ if (Info.FullyDefined || Info.Clobbered)
 return LQR_Dead;
 }
 }
@@ -1193,3 +1252,17 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 // At this point we have no idea of the liveness of the register.
 return LQR_Unknown;
}
+
+const uint32_t *
+MachineBasicBlock::getBeginClobberMask(const TargetRegisterInfo *TRI) const {
+ // EH funclet entry does not preserve any registers.
+ return isEHFuncletEntry() ? TRI->getNoPreservedMask() : nullptr;
+}
+
+const uint32_t *
+MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const {
+ // If we see a return block with successors, this must be a funclet return,
+ // which does not preserve any registers. If there are no successors, we don't
+ // care what kind of return it is; putting a mask after it is a no-op.
+ return isReturnBlock() && !succ_empty() ?
TRI->getNoPreservedMask() : nullptr;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 9151d99..9119e31 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -57,7 +57,7 @@ struct GraphTraits<MachineBlockFrequencyInfo *> {
 static inline const NodeType *getEntryNode(const MachineBlockFrequencyInfo *G) {
- return G->getFunction()->begin();
+ return &G->getFunction()->front();
 }

 static ChildIteratorType child_begin(const NodeType *N) {
@@ -143,7 +143,7 @@ bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
 if (!MBFI)
 MBFI.reset(new ImplType);
- MBFI->doFunction(&F, &MBPI, &MLI);
+ MBFI->calculate(F, MBPI, MLI);
#ifndef NDEBUG
 if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
 view();
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 2969bad..f5e3056 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -51,7 +51,7 @@ using namespace llvm;
#define DEBUG_TYPE "block-placement"

STATISTIC(NumCondBranches, "Number of conditional branches");
-STATISTIC(NumUncondBranches, "Number of uncondittional branches");
+STATISTIC(NumUncondBranches, "Number of unconditional branches");
STATISTIC(CondBranchTakenFreq,
 "Potential frequency of taking conditional branches");
STATISTIC(UncondBranchTakenFreq,
@@ -62,6 +62,11 @@ static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
 "blocks in the function."),
 cl::init(0), cl::Hidden);

+static cl::opt<unsigned>
+ AlignAllLoops("align-all-loops",
+ cl::desc("Force the alignment of all loops in the function."),
+ cl::init(0), cl::Hidden);
+
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned> ExitBlockBias(
 "block-placement-exit-block-bias",
@@ -81,6 +86,29 @@ static cl::opt<unsigned> OutlineOptionalThreshold(
 "instruction count below this threshold"),
 cl::init(4), cl::Hidden);

+static cl::opt<unsigned> LoopToColdBlockRatio(
+ "loop-to-cold-block-ratio",
+ cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+ "(frequency of block) is greater than this ratio"),
+ cl::init(5), cl::Hidden);
+
+static cl::opt<bool>
+ PreciseRotationCost("precise-rotation-cost",
+ cl::desc("Model the cost of loop rotation more "
+ "precisely by using profile data."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> MisfetchCost(
+ "misfetch-cost",
+ cl::desc("Cost that models the probabilistic risk of an instruction "
+ "misfetch due to a jump compared to falling through, whose cost "
+ "is zero."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
+ cl::desc("Cost of jump instructions."),
+ cl::init(1), cl::Hidden);
+
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
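The two cost flags combine into a simple linear trade-off: keeping a loop unrotated costs a taken jump on every back-edge execution, while rotating it exposes a fall-through whose fetch can still miss with some probability. A toy calculation with invented frequencies (only the shape of the comparison, not rotateLoopWithProfile's actual formula; the constants mirror the flag defaults above):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t MisfetchCost = 1; // default of -misfetch-cost
      const uint64_t JumpInstCost = 1; // default of -jump-inst-cost

      uint64_t BackEdgeFreq = 1000; // executions that take the extra jump
      uint64_t FallThruFreq = 900;  // executions that fall through but may
                                    // still misfetch

      uint64_t JumpCost = BackEdgeFreq * JumpInstCost;
      uint64_t MisfetchRisk = FallThruFreq * MisfetchCost;

      std::cout << (JumpCost > MisfetchRisk ? "prefer rotation" : "keep layout")
                << "\n";
      return 0;
    }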
@@ -246,9 +274,12 @@ class MachineBlockPlacement : public MachineFunctionPass { const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L); void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); public: @@ -354,31 +385,56 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. - uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that are + // either not in BlockFilter or are already in the current chain. Consider the + // following CFG: + // + // --->A + // | / \ + // | B C + // | \ / \ + // ----D E + // + // Assume A->C is very hot (>90%) and C->D has a 50% probability; then, after + // A->C is chosen as a fall-through, D won't be selected as a successor of C + // due to the CFG constraint (the probability of C->D is not greater than + // HotProb). If we exclude E (which is not in BlockFilter) when calculating the + // probability of C->D, D will be selected and we will get A C D B as the + // layout of this loop.
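A rough standalone illustration of the renormalization described in the comment above: the probability mass of skipped successors is removed from the denominator, so the surviving edges are compared against each other rather than against the full distribution. Plain doubles stand in for llvm::BranchProbability, and the numbers are invented:

#include <iostream>
#include <vector>

struct Edge {
  const char *Dest;
  double Prob;  // raw branch probability out of the block
  bool Skipped; // filtered out or already placed in the current chain
};

int main() {
  // Successors of C in the CFG sketched above: D and E at 50% each,
  // with E outside the block filter.
  std::vector<Edge> Succs = {{"D", 0.5, false}, {"E", 0.5, true}};

  double AdjustedSum = 1.0;
  for (const Edge &E : Succs)
    if (E.Skipped)
      AdjustedSum -= E.Prob; // mirrors AdjustedSumProb -= getEdgeProbability

  for (const Edge &E : Succs)
    if (!E.Skipped)
      std::cout << E.Dest << ": " << E.Prob / AdjustedSum << "\n"; // D: 1
}

After renormalization C->D looks certain, so it clears the 80% HotProb threshold even though its raw probability was only 50%.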
+ auto AdjustedSumProb = BranchProbability::getOne(); + SmallVector<MachineBasicBlock *, 4> Successors; for (MachineBasicBlock *Succ : BB->successors()) { - if (BlockFilter && !BlockFilter->count(Succ)) - continue; - BlockChain &SuccChain = *BlockToChain[Succ]; - if (&SuccChain == &Chain) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Already merged!\n"); - continue; - } - if (Succ != *SuccChain.begin()) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); - continue; + bool SkipSucc = false; + if (BlockFilter && !BlockFilter->count(Succ)) { + SkipSucc = true; + } else { + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain == &Chain) { + DEBUG(dbgs() << " " << getBlockName(Succ) + << " -> Already merged!\n"); + SkipSucc = true; + } else if (Succ != *SuccChain->begin()) { + DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); + continue; + } } + if (SkipSucc) + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); + else + Successors.push_back(Succ); + } - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); + DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + for (MachineBasicBlock *Succ : Successors) { + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -406,6 +462,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Only consider successors which are either "hot", or wouldn't violate // any CFG constraints. + BlockChain &SuccChain = *BlockToChain[Succ]; if (SuccChain.LoopPredecessors != 0) { if (SuccProb < HotProb) { DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb @@ -415,8 +472,9 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = - MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl(); + MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; for (MachineBasicBlock *Pred : Succ->predecessors()) { if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || @@ -440,10 +498,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -505,14 +563,14 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( const BlockFilterSet *BlockFilter) { for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E; ++I) { - if (BlockFilter && !BlockFilter->count(I)) + if (BlockFilter && !BlockFilter->count(&*I)) continue; - if (BlockToChain[I] != &PlacedChain) { + if (BlockToChain[&*I] != &PlacedChain) { PrevUnplacedBlockIt = I; // Now select the head of the chain to which the unplaced block belongs // as the block to place. 
This will force the entire chain to be placed, // and satisfies the requirements of merging chains. - return *BlockToChain[I]->begin(); + return *BlockToChain[&*I]->begin(); } } return nullptr; @@ -672,13 +730,8 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { - if (Succ->isLandingPad()) + if (Succ->isEHPad()) continue; if (Succ == MBB) continue; @@ -690,10 +743,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -705,7 +758,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -791,6 +843,188 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we can determine the cost in terms of missed fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of cost to consider for each rotation: +/// 1. The possibly missed fall through edge (if it exists) from BB out of +/// the loop to the loop header. +/// 2. The possibly missed fall through edges (if they exist) from the loop +/// exits to BB out of the loop. +/// 3. The missed fall through edge (if it exists) from the last BB to the +/// first BB in the loop chain. +/// Therefore, the cost for a given rotation is the sum of costs listed above. +/// We select the best rotation with the smallest cost. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + auto RotationPos = LoopChain.end(); + + BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency(); + + // A utility lambda that scales up a block frequency by dividing it by a + // branch probability which is the reciprocal of the scale. + auto ScaleBlockFrequency = [](BlockFrequency Freq, + unsigned Scale) -> BlockFrequency { + if (Scale == 0) + return 0; + // Use operator / between BlockFrequency and BranchProbability to implement + // saturating multiplication. + return Freq / BranchProbability(1, Scale); + }; + + // Compute the cost of the missed fall-through edge to the loop header if the + // chain head is not the loop header. 
As we only consider natural loops with + a single header, this computation can be done only once. + BlockFrequency HeaderFallThroughCost(0); + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + auto EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); + auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); + HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); + } + } + + // Here we collect all exit blocks in the loop, and for each exit we find out + // its hottest exit edge. For each loop rotation, we define the loop exit cost + // as the sum of frequencies of exit edges we collect here, excluding the exit + // edge from the tail of the loop chain. + SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq; + for (auto BB : LoopChain) { + auto LargestExitEdgeProb = BranchProbability::getZero(); + for (auto *Succ : BB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); + } + } + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; + ExitsWithFreq.emplace_back(BB, ExitFreq); + } + } + + // In this loop we iterate over every block in the loop chain and calculate the + // cost assuming the block is the head of the loop chain. When the loop ends, + // we should have found the best candidate as the loop chain's head. + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + // TailIter is used to track the tail of the loop chain if the block we are + // checking (pointed to by Iter) is the head of the chain. + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB at the top. + BlockFrequency Cost = 0; + + // If the current BB is not the loop header, we need to take into account the + // cost of the missed fall-through edge from outside of the loop to the + // header. + if (Iter != HeaderIter) + Cost += HeaderFallThroughCost; + + // Collect the loop exit cost by summing up frequencies of all exit edges + // except the one from the chain tail. + for (auto &ExitWithFreq : ExitsWithFreq) + if (TailBB != ExitWithFreq.first) + Cost += ExitWithFreq.second; + + // The cost of breaking the fall-through edge from the tail to the top + // of the loop chain. Here we need to consider three cases: + // 1. If the tail node has only one successor, then we will get an + // additional jmp instruction. So the cost here is (MisfetchCost + + // JumpInstCost) * tail node frequency. + // 2. If the tail node has two successors, then we may still get an + // additional jmp instruction if the layout successor after the loop + // chain is not its CFG successor. Note that the more frequently executed + // jmp instruction will be put ahead of the other one.
Assume the + frequencies of those two branches are x and y, where x is the frequency + of the edge to the chain head; then the cost will be + (x * MisfetchCost + min(x, y) * JumpInstCost) * tail node frequency. + // 3. If the tail node has more than two successors (this rarely happens), + // we won't consider any additional cost. + if (TailBB->isSuccessor(*Iter)) { + auto TailBBFreq = MBFI->getBlockFreq(TailBB); + if (TailBB->succ_size() == 1) + Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(), + MisfetchCost + JumpInstCost); + else if (TailBB->succ_size() == 2) { + auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); + auto TailToHeadFreq = TailBBFreq * TailToHeadProb; + auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) + ? TailBBFreq * TailToHeadProb.getCompl() + : TailToHeadFreq; + Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + + ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); + } + } + + DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) + << " to the top: " << Cost.getFrequency() << "\n"); + + if (Cost < SmallestRotationCost) { + SmallestRotationCost = Cost; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + +/// \brief Collect blocks in the given loop that are to be placed. +/// +/// When profile data is available, exclude cold blocks from the returned set; +/// otherwise, collect all blocks in the loop. +MachineBlockPlacement::BlockFilterSet +MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { + BlockFilterSet LoopBlockSet; + + // Filter cold blocks off from LoopBlockSet when profile data is available. + // Collect the sum of frequencies of incoming edges to the loop header from + // outside. If we treat the loop as a super block, this is the frequency of + // the loop. Then for each block in the loop, we calculate the ratio between + // its frequency and the frequency of the loop. When this ratio is too small, + // don't add the block to the loop chain. If there are outer loops, then this block + // will be merged into the first outer loop chain for which this block is not + // cold anymore. This needs precise profile data and we only do this when + // profile data is available. + if (F.getFunction()->getEntryCount()) { + BlockFrequency LoopFreq(0); + for (auto LoopPred : L.getHeader()->predecessors()) + if (!L.contains(LoopPred)) + LoopFreq += MBFI->getBlockFreq(LoopPred) * + MBPI->getEdgeProbability(LoopPred, L.getHeader()); + + for (MachineBasicBlock *LoopBB : L.getBlocks()) { + auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); + if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) + continue; + LoopBlockSet.insert(LoopBB); + } + } else + LoopBlockSet.insert(L.block_begin(), L.block_end()); + + return LoopBlockSet; +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -805,19 +1039,27 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, buildLoopChains(F, *InnerLoop); SmallVector<MachineBasicBlock *, 16> BlockWorkList; - BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); + + // Check if we have profile data for this function.
If so, we will rotate + this loop by modeling costs more precisely, which requires profile data + for a better layout. + bool RotateLoopWithProfile = + PreciseRotationCost && F.getFunction()->getEntryCount(); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the // predecessors to the header if there is one that will result in strictly // fewer branches in the loop body. - MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); + // When we use profile data to rotate the loop, this is unnecessary. + MachineBasicBlock *LoopTop = + RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet); // If we selected just the header for the loop top, look for a potentially // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (!RotateLoopWithProfile && LoopTop == L.getHeader()) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -828,7 +1070,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, SmallPtrSet<BlockChain *, 4> UpdatedPreds; assert(LoopChain.LoopPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : L.getBlocks()) { + + for (MachineBasicBlock *LoopBB : LoopBlockSet) { BlockChain &Chain = *BlockToChain[LoopBB]; if (!UpdatedPreds.insert(&Chain).second) continue; @@ -848,7 +1091,11 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + if (RotateLoopWithProfile) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. @@ -889,7 +1136,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // the assumptions of the remaining algorithm. SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - MachineBasicBlock *BB = FI; + MachineBasicBlock *BB = &*FI; BlockChain *Chain = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); // Also, merge any blocks which we cannot reason about and must preserve @@ -900,8 +1147,8 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) break; - MachineFunction::iterator NextFI(std::next(FI)); - MachineBasicBlock *NextBB = NextFI; + MachineFunction::iterator NextFI = std::next(FI); + MachineBasicBlock *NextBB = &*NextFI; // Ensure that the layout successor is a viable block, as we know that // fallthrough is a possibility. assert(NextFI != FE && "Can't fallthrough past the last block."); @@ -1004,7 +1251,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Update the terminator of the previous block.
if (ChainBB == *FunctionChain.begin()) continue; - MachineBasicBlock *PrevBB = std::prev(MachineFunction::iterator(ChainBB)); + MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB)); // FIXME: It would be awesome if updateTerminator would just return rather // than assert when the branch cannot be analyzed in order to remove this @@ -1035,14 +1282,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); @@ -1064,13 +1313,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // exclusively on the loop info here so that we can align backedges in // unnatural CFGs and backedges that were introduced purely because of the // loop rotations done during this layout pass. + // FIXME: Use Function::optForSize(). if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) return; if (FunctionChain.begin() == FunctionChain.end()) return; // Empty chain. const BranchProbability ColdProb(1, 5); // 20% - BlockFrequency EntryFreq = MBFI->getBlockFreq(F.begin()); + BlockFrequency EntryFreq = MBFI->getBlockFreq(&F.front()); BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; for (MachineBasicBlock *ChainBB : FunctionChain) { if (ChainBB == *FunctionChain.begin()) @@ -1084,6 +1334,11 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!L) continue; + if (AlignAllLoops) { + ChainBB->setAlignment(AlignAllLoops); + continue; + } + unsigned Align = TLI->getPrefLoopAlignment(L); if (!Align) continue; // Don't care about loop alignment. @@ -1224,4 +1479,3 @@ bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) { return false; } - diff --git a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be..cf6d401 100644 --- a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,48 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors.
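For context, the getSumForBlock machinery being deleted here worked roughly as the comment says: sum the 32-bit edge weights in 64 bits, and if the total overflows 32 bits, divide every weight by a common scale chosen so the sum fits. A simplified standalone sketch of that retired scheme (a model, not the LLVM code itself):

#include <cstdint>
#include <iostream>
#include <vector>

// Sum 32-bit weights; if the 64-bit total exceeds UINT32_MAX, pick a divisor
// (Scale) that brings the rescaled sum back into 32 bits.
static std::uint32_t sumWithScale(const std::vector<std::uint32_t> &Weights,
                                  std::uint32_t &Scale) {
  std::uint64_t Sum = 0;
  for (std::uint32_t W : Weights)
    Sum += W;
  Scale = 1;
  if (Sum <= UINT32_MAX)
    return static_cast<std::uint32_t>(Sum);
  Scale = static_cast<std::uint32_t>(Sum / UINT32_MAX) + 1;
  Sum = 0;
  for (std::uint32_t W : Weights)
    Sum += W / Scale;
  return static_cast<std::uint32_t>(Sum);
}

int main() {
  std::uint32_t Scale;
  std::vector<std::uint32_t> Weights = {3000000000u, 3000000000u};
  std::cout << "sum=" << sumWithScale(Weights, Scale)
            << " scale=" << Scale << "\n"; // sum=3000000000 scale=2
}

Every query paid this per-block summation, which is why callers went quadratic and why the interface below now hands out a BranchProbability directly.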
- assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. - if (Sum <= UINT32_MAX) - return Sum; - - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. - assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; -} - -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. - return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. 
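The 4/5 threshold mentioned above is an ordinary rational comparison; a tiny standalone model of the check, with a plain fraction type standing in for llvm::BranchProbability:

#include <cstdint>
#include <iostream>

struct Fraction {
  unsigned N, D; // numerator / denominator
  bool operator>(const Fraction &O) const {
    // Cross-multiply in 64 bits, as a fixed-point probability type would,
    // to avoid overflow.
    return std::uint64_t(N) * O.D > std::uint64_t(O.N) * D;
  }
};

int main() {
  Fraction HotProb{4, 5}; // the 80% hot threshold
  Fraction Edge{9, 10};   // a made-up 90% edge
  std::cout << (Edge > HotProb ? "hot" : "cold") << "\n"; // hot
}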
- return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp index 87aaaa0..aad376c 100644 --- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp @@ -57,7 +57,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreservedID(MachineLoopInfoID); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -111,7 +111,7 @@ char &llvm::MachineCSEID = MachineCSE::ID; INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse", "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineCSE, "machine-cse", "Machine Common Subexpression Elimination", false, false) @@ -122,8 +122,7 @@ INITIALIZE_PASS_END(MachineCSE, "machine-cse", bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB) { bool Changed = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; unsigned Reg = MO.getReg(); @@ -186,8 +185,7 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, return true; bool SeenDef = false; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) SeenDef = true; if (!MO.isReg() || !MO.getReg()) @@ -220,8 +218,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, SmallVectorImpl<unsigned> &PhysDefs, bool &PhysUseDef) const{ // First, add all uses to PhysRefs. 
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -239,8 +236,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, // (which currently contains only uses), set the PhysUseDef flag. PhysUseDef = false; MachineBasicBlock::const_iterator I = MI; I = std::next(I); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -311,8 +307,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, if (I == E) return true; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { // RegMasks go on instructions like calls that clobber lots of physregs. // Don't attempt to CSE across such an instruction. if (MO.isRegMask()) @@ -398,8 +393,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, // Heuristics #2: If the expression doesn't use a vr and the only uses // of the redundant computation are copies, do not CSE. bool HasVRegUse = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.isUse() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { HasVRegUse = true; @@ -580,9 +574,9 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Actually perform the elimination. if (DoCSE) { - for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) { - unsigned OldReg = CSEPairs[i].first; - unsigned NewReg = CSEPairs[i].second; + for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) { + unsigned OldReg = CSEPair.first; + unsigned NewReg = CSEPair.second; // OldReg may have been unused but is used now; clear the Dead flag MachineInstr *Def = MRI->getUniqueVRegDef(NewReg); assert(Def != nullptr && "CSEd register has no unique definition?"); @@ -594,8 +588,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Go through implicit defs of CSMI and MI; if a def is not dead at MI, // we should make sure it is not dead at CSMI. - for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i) - CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false); + for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate) + CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false); // Go through implicit defs of CSMI and MI, and clear the kill flags on // their uses in all the instructions between CSMI and MI. @@ -685,18 +679,14 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { Node = WorkList.pop_back_val(); Scopes.push_back(Node); const std::vector<MachineDomTreeNode*> &Children = Node->getChildren(); - unsigned NumChildren = Children.size(); - OpenChildren[Node] = NumChildren; - for (unsigned i = 0; i != NumChildren; ++i) { - MachineDomTreeNode *Child = Children[i]; + OpenChildren[Node] = Children.size(); + for (MachineDomTreeNode *Child : Children) WorkList.push_back(Child); - } } while (!WorkList.empty()); // Now perform CSE.
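The Scopes walk above is a standard iterative preorder over the dominator tree: each node records how many children remain open, so the pass knows when a scope can be exited. The same shape, sketched over a plain tree with a hypothetical Node type in place of MachineDomTreeNode:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  std::vector<Node *> Children;
};

int main() {
  Node C{"C", {}}, B{"B", {&C}}, D{"D", {}}, A{"A", {&B, &D}};

  std::vector<Node *> WorkList{&A}, Scopes;
  std::map<Node *, unsigned> OpenChildren;

  // Same shape as the MachineCSE walk: collect nodes in preorder, keeping a
  // per-node counter that a real pass would decrement to know when to
  // ExitScope.
  do {
    Node *N = WorkList.back();
    WorkList.pop_back();
    Scopes.push_back(N);
    OpenChildren[N] = N->Children.size();
    for (Node *Child : N->Children)
      WorkList.push_back(Child);
  } while (!WorkList.empty());

  for (Node *N : Scopes)
    std::cout << N->Name << " "; // A D B C: preorder, siblings in stack order
  std::cout << "\n";
}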
bool Changed = false; - for (unsigned i = 0, e = Scopes.size(); i != e; ++i) { - MachineDomTreeNode *Node = Scopes[i]; + for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); EnterScope(MBB); Changed |= ProcessBlock(MBB); @@ -714,7 +704,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<MachineDominatorTree>(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); return PerformCSE(DT->getRootNode()); diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp index f33d0e6..fa43c4d 100644 --- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp @@ -10,6 +10,7 @@ // The machine combiner pass uses machine trace metrics to ensure the combined // instructions do not lengthen the critical path or the resource depth. //===----------------------------------------------------------------------===// + #define DEBUG_TYPE "machine-combiner" #include "llvm/ADT/Statistic.h" @@ -68,10 +69,10 @@ private: MachineTraceMetrics::Trace BlockTrace); bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, - MachineTraceMetrics::Trace BlockTrace, - SmallVectorImpl<MachineInstr *> &InsInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - bool NewCodeHasLessInsts); + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, + MachineCombinerPattern Pattern); bool preservesResourceLen(MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -122,9 +123,9 @@ unsigned MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, MachineTraceMetrics::Trace BlockTrace) { - SmallVector<unsigned, 16> InstrDepth; - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // For each instruction in the new sequence compute the depth based on the // operands. Use the trace information when possible. For new operands which @@ -180,8 +181,8 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, /// \returns Latency of \p NewRoot unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, MachineTraceMetrics::Trace BlockTrace) { - - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // Check each definition in NewRoot and compute the latency unsigned NewRootLatency = 0; @@ -202,62 +203,86 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, UseMO->findRegisterUseOperandIdx(MO.getReg())); } else { - LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); + LatencyOp = TSchedModel.computeInstrLatency(NewRoot); } NewRootLatency = std::max(NewRootLatency, LatencyOp); } return NewRootLatency; } -/// True when the new instruction sequence does not lengthen the critical path -/// and the new sequence has less instructions or the new sequence improves the -/// critical path.
+/// The combiner's goal may differ based on which pattern it is attempting +/// to optimize. +enum class CombinerObjective { + MustReduceDepth, // The data dependency chain must be improved. + Default // The critical path must not be lengthened. +}; + +static CombinerObjective getCombinerObjective(MachineCombinerPattern P) { + // TODO: If C++ ever gets a real enum class, make this part of the + // MachineCombinerPattern class. + switch (P) { + case MachineCombinerPattern::REASSOC_AX_BY: + case MachineCombinerPattern::REASSOC_AX_YB: + case MachineCombinerPattern::REASSOC_XA_BY: + case MachineCombinerPattern::REASSOC_XA_YB: + return CombinerObjective::MustReduceDepth; + default: + return CombinerObjective::Default; + } +} + /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. /// The new code sequence ends in MI NewRoot. A necessary condition for the new /// sequence to replace the old sequence is that it cannot lengthen the critical -/// path. This is decided by the formula: -/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). -/// If the new sequence has an equal length critical path but does not reduce -/// the number of instructions (NewCodeHasLessInsts is false), then it is not -/// considered an improvement. The slack is the number of cycles Root can be -/// delayed before the critical patch becomes longer. +/// path. The definition of "improve" may be restricted by specifying that the +/// new path improves the data dependency chain (MustReduceDepth). bool MachineCombiner::improvesCriticalPathLen( MachineBasicBlock *MBB, MachineInstr *Root, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - bool NewCodeHasLessInsts) { - - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + MachineCombinerPattern Pattern) { + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // NewRoot is the last instruction in the \p InsInstrs vector. - // Get depth and latency of NewRoot. unsigned NewRootIdx = InsInstrs.size() - 1; MachineInstr *NewRoot = InsInstrs[NewRootIdx]; - unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); - unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); - // Get depth, latency and slack of Root. + // Get depth and latency of NewRoot and Root. + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; + dbgs() << " RootDepth: " << RootDepth << "\n"); + + // For a transform such as reassociation, the cost equation is + // conservatively calculated so that we must improve the depth (data + // dependency cycles) in the critical path to proceed with the transform. + // Being conservative also protects against inaccuracies in the underlying + // machine trace metrics and CPU models. + if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) + return NewRootDepth < RootDepth; + + // A more flexible cost calculation for the critical path includes the slack + // of the original code sequence. This may allow the transform to proceed + // even if the instruction depths (data dependency cycles) become worse. 
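To make the two objectives concrete, here is a toy version of the comparison with invented cycle counts; the real numbers come from MachineTraceMetrics:

#include <iostream>

int main() {
  // Hypothetical numbers: the new sequence is one cycle deeper, but the old
  // root had two cycles of slack before the critical path would move, so the
  // transform is still allowed under NewCycle <= OldCycle.
  unsigned NewRootDepth = 5, NewRootLatency = 3;          // NewCycleCount = 8
  unsigned RootDepth = 4, RootLatency = 3, RootSlack = 2; // OldCycleCount = 9
  bool Improves = (NewRootDepth + NewRootLatency) <=
                  (RootDepth + RootLatency + RootSlack);
  std::cout << (Improves ? "substitute" : "keep original") << "\n";
}

Under the MustReduceDepth objective the same numbers would be rejected, since NewRootDepth (5) is not strictly below RootDepth (4); only the slack-based Default objective lets this transform through.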
+ unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); unsigned RootLatency = TSchedModel.computeInstrLatency(Root); unsigned RootSlack = BlockTrace.getInstrSlack(Root); - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; - dbgs() << " NewRootDepth: " << NewRootDepth - << " NewRootLatency: " << NewRootLatency << "\n"; - dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency - << " RootSlack: " << RootSlack << "\n"; - dbgs() << " NewRootDepth + NewRootLatency " + DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootLatency: " << RootLatency << "\n"; + dbgs() << " RootSlack: " << RootSlack << "\n"; + dbgs() << " NewRootDepth + NewRootLatency = " << NewRootDepth + NewRootLatency << "\n"; - dbgs() << " RootDepth + RootLatency + RootSlack " + dbgs() << " RootDepth + RootLatency + RootSlack = " << RootDepth + RootLatency + RootSlack << "\n";); unsigned NewCycleCount = NewRootDepth + NewRootLatency; unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; - if (NewCodeHasLessInsts) - return NewCycleCount <= OldCycleCount; - else - return NewCycleCount < OldCycleCount; + return NewCycleCount <= OldCycleCount; } /// helper routine to convert instructions into SC @@ -271,11 +296,14 @@ void MachineCombiner::instr2instrSC( InstrsSC.push_back(SC); } } + /// True when the new instructions do not increase resource length bool MachineCombiner::preservesResourceLen( MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs) { + if (!TSchedModel.hasInstrSchedModel()) + return true; // Compute current resource length @@ -310,7 +338,7 @@ bool MachineCombiner::preservesResourceLen( bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { if (OptSize && (NewSize < OldSize)) return true; - if (!TSchedModel.hasInstrSchedModel()) + if (!TSchedModel.hasInstrSchedModelOrItineraries()) return true; return false; } @@ -332,7 +360,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { auto &MI = *BlockIter++; DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); - SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns; + SmallVector<MachineCombinerPattern, 16> Patterns; // The motivating example is: // // MUL Other MUL_op1 MUL_op2 Other @@ -358,54 +386,55 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // mostly one pattern, and getMachineCombinerPatterns() can order patterns // based on an internal cost heuristic. 
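A hedged arithmetic sketch of why the motivating MUL+ADD pattern pays off on a target with a fused multiply-add; the latencies below are invented stand-ins for whatever the scheduling model would report:

#include <iostream>

int main() {
  // Invented latencies: MUL = 4 cycles, ADD = 1 cycle, fused MADD = 4 cycles.
  unsigned MulLat = 4, AddLat = 1, MaddLat = 4;

  // Original chain: the ADD must wait for the MUL's result.
  unsigned OldPath = MulLat + AddLat; // 5 cycles

  // Combined: one MADD consumes all three operands directly.
  unsigned NewPath = MaddLat; // 4 cycles

  std::cout << "old=" << OldPath << " new=" << NewPath
            << (NewPath < OldPath ? " -> combine" : " -> keep") << "\n";
}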
- if (TII->getMachineCombinerPatterns(MI, Patterns)) { - for (auto P : Patterns) { - SmallVector<MachineInstr *, 16> InsInstrs; - SmallVector<MachineInstr *, 16> DelInstrs; - DenseMap<unsigned, unsigned> InstrIdxForVirtReg; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + if (!TII->getMachineCombinerPatterns(MI, Patterns)) + continue; + + for (auto P : Patterns) { + SmallVector<MachineInstr *, 16> InsInstrs; + SmallVector<MachineInstr *, 16> DelInstrs; + DenseMap<unsigned, unsigned> InstrIdxForVirtReg; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); + TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + unsigned NewInstCount = InsInstrs.size(); + unsigned OldInstCount = DelInstrs.size(); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (!NewInstCount) + continue; + + // Substitute when we optimize for codesize and the new sequence has + // fewer instructions OR + // the new sequence neither lengthens the critical path nor increases + // resource pressure. + if (doSubstitute(NewInstCount, OldInstCount) || + (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, + InstrIdxForVirtReg, P) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + + Changed = true; + ++NumInstCombined; + + Traces->invalidate(MBB); Traces->verifyAnalysis(); - TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, - InstrIdxForVirtReg); - unsigned NewInstCount = InsInstrs.size(); - unsigned OldInstCount = DelInstrs.size(); - // Found pattern, but did not generate alternative sequence. - // This can happen e.g. when an immediate could not be materialized - // in a single instruction. - if (!NewInstCount) - continue; - // Substitute when we optimize for codesize and the new sequence has - // fewer instructions OR - // the new sequence neither lengthens the critical path nor increases - // resource pressure. - if (doSubstitute(NewInstCount, OldInstCount) || - (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - InstrIdxForVirtReg, - NewInstCount < OldInstCount) && - preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { - for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - - Changed = true; - ++NumInstCombined; - - Traces->invalidate(MBB); - Traces->verifyAnalysis(); - // Eagerly stop after the first pattern fires. - break; - } else { - // Cleanup instructions of the alternative code sequence. There is no - // use for them. - MachineFunction *MF = MBB->getParent(); - for (auto *InstrPtr : InsInstrs) - MF->DeleteMachineInstr(InstrPtr); - } - InstrIdxForVirtReg.clear(); + // Eagerly stop after the first pattern fires. + break; + } else { + // Cleanup instructions of the alternative code sequence. There is no + // use for them. 
+ MachineFunction *MF = MBB->getParent(); + for (auto *InstrPtr : InsInstrs) + MF->DeleteMachineInstr(InstrPtr); } + InstrIdxForVirtReg.clear(); } } @@ -420,9 +449,8 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { TSchedModel.init(SchedModel, &STI, TII); MRI = &MF.getRegInfo(); Traces = &getAnalysis<MachineTraceMetrics>(); - MinInstr = 0; - - OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + MinInstr = nullptr; + OptSize = MF.getFunction()->optForSize(); DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); if (!TII->useMachineCombiner()) { diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp index 9856e70..ca4bb1c 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" @@ -26,6 +27,8 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" @@ -44,6 +47,11 @@ using namespace llvm; #define DEBUG_TYPE "codegen" +static cl::opt<unsigned> + AlignAllFunctions("align-all-functions", + cl::desc("Force the alignment of all functions."), + cl::init(0), cl::Hidden); + void MachineFunctionInitializer::anchor() {} //===----------------------------------------------------------------------===// @@ -79,12 +87,27 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn. + // FIXME: Use Function::optForSize(). if (!Fn->hasFnAttribute(Attribute::OptimizeForSize)) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); + if (AlignAllFunctions) + Alignment = AlignAllFunctions; + FunctionNumber = FunctionNum; JumpTableInfo = nullptr; + + if (isFuncletEHPersonality(classifyEHPersonality( + F->hasPersonalityFn() ? F->getPersonalityFn() : nullptr))) { + WinEHInfo = new (Allocator) WinEHFuncInfo(); + } + + assert(TM.isCompatibleDataLayout(getDataLayout()) && + "Can't create a MachineFunction using a Module with a " + "Target-incompatible DataLayout attached\n"); + + PSVManager = llvm::make_unique<PseudoSourceValueManager>(); } MachineFunction::~MachineFunction() { @@ -117,6 +140,11 @@ MachineFunction::~MachineFunction() { JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo); } + + if (WinEHInfo) { + WinEHInfo->~WinEHFuncInfo(); + Allocator.Deallocate(WinEHInfo); + } } const DataLayout &MachineFunction::getDataLayout() const { @@ -149,7 +177,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (MBB == nullptr) MBBI = begin(); else - MBBI = MBB; + MBBI = MBB->getIterator(); // Figure out the block number this should have. 
unsigned BlockNo = 0; @@ -169,7 +197,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (MBBNumbering[BlockNo]) MBBNumbering[BlockNo]->setNumber(-1); - MBBNumbering[BlockNo] = MBBI; + MBBNumbering[BlockNo] = &*MBBI; MBBI->setNumber(BlockNo); } } @@ -322,6 +350,13 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin, return std::make_pair(Result, Result + Num); } +const char *MachineFunction::createExternalSymbolName(StringRef Name) { + char *Dest = Allocator.Allocate<char>(Name.size() + 1); + std::copy(Name.begin(), Name.end(), Dest); + Dest[Name.size()] = 0; + return Dest; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineFunction::dump() const { print(dbgs()); @@ -593,10 +628,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { BV.set(*CSR); // Saved CSRs are not pristine. - const std::vector<CalleeSavedInfo> &CSI = getCalleeSavedInfo(); - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) - BV.reset(I->getReg()); + for (auto &I : getCalleeSavedInfo()) + for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) + BV.reset(*S); return BV; } @@ -801,42 +835,26 @@ Type *MachineConstantPoolEntry::getType() const { return Val.ConstVal->getType(); } - -unsigned MachineConstantPoolEntry::getRelocationInfo() const { +bool MachineConstantPoolEntry::needsRelocation() const { if (isMachineConstantPoolEntry()) - return Val.MachineCPVal->getRelocationInfo(); - return Val.ConstVal->getRelocationInfo(); + return true; + return Val.ConstVal->needsRelocation(); } SectionKind MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const { - SectionKind Kind; - switch (getRelocationInfo()) { + if (needsRelocation()) + return SectionKind::getReadOnlyWithRel(); + switch (DL->getTypeAllocSize(getType())) { + case 4: + return SectionKind::getMergeableConst4(); + case 8: + return SectionKind::getMergeableConst8(); + case 16: + return SectionKind::getMergeableConst16(); default: - llvm_unreachable("Unknown section kind"); - case Constant::GlobalRelocations: - Kind = SectionKind::getReadOnlyWithRel(); - break; - case Constant::LocalRelocation: - Kind = SectionKind::getReadOnlyWithRelLocal(); - break; - case Constant::NoRelocation: - switch (DL->getTypeAllocSize(getType())) { - case 4: - Kind = SectionKind::getMergeableConst4(); - break; - case 8: - Kind = SectionKind::getMergeableConst8(); - break; - case 16: - Kind = SectionKind::getMergeableConst16(); - break; - default: - Kind = SectionKind::getReadOnly(); - break; - } + return SectionKind::getReadOnly(); } - return Kind; } MachineConstantPool::~MachineConstantPool() { diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp index aaf06a7..05463fc 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp @@ -13,11 +13,14 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/StackProtector.h" @@ -49,13 +52,16 @@ 
void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const { // passes explicitly. This does not include setPreservesCFG, // because CodeGen overloads that to mean preserving the MachineBasicBlock // CFG in addition to the LLVM IR CFG. - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<BasicAAWrapperPass>(); AU.addPreserved<DominanceFrontier>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<IVUsers>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addPreserved<StackProtector>(); FunctionPass::getAnalysisUsage(AU); diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp index 790f5ac..4f424ff 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -31,7 +31,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { const std::string Banner; MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { } - MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) + MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) : MachineFunctionPass(ID), OS(os), Banner(banner) {} const char *getPassName() const override { return "MachineFunction Printer"; } @@ -42,6 +42,8 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &MF) override { + if (!llvm::isFunctionInPrintList(MF.getName())) + return false; OS << "# " << Banner << ":\n"; MF.print(OS, getAnalysisIfAvailable<SlotIndexes>()); return false; diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp index fdc4226..6dca74d 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -43,6 +44,11 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +static cl::opt<bool> PrintWholeRegMask( + "print-whole-regmask", + cl::desc("Print the full contents of regmask operands in IR dumps"), + cl::init(true), cl::Hidden); + //===----------------------------------------------------------------------===// // MachineOperand Implementation //===----------------------------------------------------------------------===// @@ -407,9 +413,26 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (getOffset()) OS << "+" << getOffset(); OS << '>'; break; - case MachineOperand::MO_RegisterMask: - OS << "<regmask>"; + case MachineOperand::MO_RegisterMask: { + unsigned NumRegsInMask = 0; + unsigned NumRegsEmitted = 0; + OS << "<regmask"; + for (unsigned i = 0; i < TRI->getNumRegs(); ++i) { + unsigned MaskWord = i / 32; + unsigned MaskBit = i % 32; + if (getRegMask()[MaskWord] & (1 << MaskBit)) { + if (PrintWholeRegMask || NumRegsEmitted <= 10) { + OS << " " << PrintReg(i, TRI); + NumRegsEmitted++; + } + NumRegsInMask++; + } + } + if (NumRegsEmitted != NumRegsInMask) + OS << " and " << (NumRegsInMask - NumRegsEmitted) << " more..."; + OS << ">"; break; + } case 
MachineOperand::MO_RegisterLiveOut: OS << "<regliveout>"; break; @@ -443,26 +466,28 @@ unsigned MachinePointerInfo::getAddrSpace() const { /// getConstantPool - Return a MachinePointerInfo record that refers to the /// constant pool. -MachinePointerInfo MachinePointerInfo::getConstantPool() { - return MachinePointerInfo(PseudoSourceValue::getConstantPool()); +MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getConstantPool()); } /// getFixedStack - Return a MachinePointerInfo record that refers to /// the specified FrameIndex. -MachinePointerInfo MachinePointerInfo::getFixedStack(int FI, int64_t offset) { - return MachinePointerInfo(PseudoSourceValue::getFixedStack(FI), offset); +MachinePointerInfo MachinePointerInfo::getFixedStack(MachineFunction &MF, + int FI, int64_t Offset) { + return MachinePointerInfo(MF.getPSVManager().getFixedStack(FI), Offset); } -MachinePointerInfo MachinePointerInfo::getJumpTable() { - return MachinePointerInfo(PseudoSourceValue::getJumpTable()); +MachinePointerInfo MachinePointerInfo::getJumpTable(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getJumpTable()); } -MachinePointerInfo MachinePointerInfo::getGOT() { - return MachinePointerInfo(PseudoSourceValue::getGOT()); +MachinePointerInfo MachinePointerInfo::getGOT(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getGOT()); } -MachinePointerInfo MachinePointerInfo::getStack(int64_t Offset) { - return MachinePointerInfo(PseudoSourceValue::getStack(), Offset); +MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF, + int64_t Offset) { + return MachinePointerInfo(MF.getPSVManager().getStack(), Offset); } MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f, @@ -606,10 +631,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { if (MCID->ImplicitDefs) - for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; + ++ImpDefs) addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID->ImplicitUses) - for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses; + ++ImpUses) addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true)); } @@ -839,9 +866,60 @@ void MachineInstr::addMemOperand(MachineFunction &MF, setMemRefs(NewMemRefs, NewMemRefs + NewNum); } +/// Check to see if the MMOs pointed to by the two MemRefs arrays are +/// identical. +static bool hasIdenticalMMOs(const MachineInstr &MI1, const MachineInstr &MI2) { + auto I1 = MI1.memoperands_begin(), E1 = MI1.memoperands_end(); + auto I2 = MI2.memoperands_begin(), E2 = MI2.memoperands_end(); + if ((E1 - I1) != (E2 - I2)) + return false; + for (; I1 != E1; ++I1, ++I2) { + if (**I1 != **I2) + return false; + } + return true; +} + +std::pair<MachineInstr::mmo_iterator, unsigned> +MachineInstr::mergeMemRefsWith(const MachineInstr& Other) { + + // If either of the incoming memrefs is empty, we must be conservative and + // treat this as if we've exhausted our space for memrefs and dropped them. + if (memoperands_empty() || Other.memoperands_empty()) + return std::make_pair(nullptr, 0); + + // If both instructions have identical memrefs, we don't need to merge them.
+ // Since many instructions have a single memref, and we tend to merge things + // like pairs of loads from the same location, this catches a large number of + // cases in practice. + if (hasIdenticalMMOs(*this, Other)) + return std::make_pair(MemRefs, NumMemRefs); + + // TODO: consider uniquing elements within the operand lists to reduce + // space usage and fall back to conservative information less often. + size_t CombinedNumMemRefs = NumMemRefs + Other.NumMemRefs; + + // If we don't have enough room to store this many memrefs, be conservative + // and drop them. Otherwise, we'd fail asserts when trying to add them to + // the new instruction. + if (CombinedNumMemRefs != uint8_t(CombinedNumMemRefs)) + return std::make_pair(nullptr, 0); + + MachineFunction *MF = getParent()->getParent(); + mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs); + mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(), + MemBegin); + MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(), + MemEnd); + assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs && + "missing memrefs"); + + return std::make_pair(MemBegin, CombinedNumMemRefs); +} + bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const { assert(!isBundledWithPred() && "Must be called on bundle header"); - for (MachineBasicBlock::const_instr_iterator MII = this;; ++MII) { + for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) { if (MII->getDesc().getFlags() & Mask) { if (Type == AnyInBundle) return true; @@ -865,13 +943,13 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other, if (isBundle()) { // Both instructions are bundles, compare MIs inside the bundle. - MachineBasicBlock::const_instr_iterator I1 = *this; + MachineBasicBlock::const_instr_iterator I1 = getIterator(); MachineBasicBlock::const_instr_iterator E1 = getParent()->instr_end(); - MachineBasicBlock::const_instr_iterator I2 = *Other; + MachineBasicBlock::const_instr_iterator I2 = Other->getIterator(); MachineBasicBlock::const_instr_iterator E2= Other->getParent()->instr_end(); while (++I1 != E1 && I1->isInsideBundle()) { ++I2; - if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(I2, Check)) + if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(&*I2, Check)) return false; } } @@ -976,7 +1054,7 @@ unsigned MachineInstr::getNumExplicitOperands() const { void MachineInstr::bundleWithPred() { assert(!isBundledWithPred() && "MI is already bundled with its predecessor"); setFlag(BundledPred); - MachineBasicBlock::instr_iterator Pred = this; + MachineBasicBlock::instr_iterator Pred = getIterator(); --Pred; assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags"); Pred->setFlag(BundledSucc); @@ -985,7 +1063,7 @@ void MachineInstr::bundleWithPred() { void MachineInstr::bundleWithSucc() { assert(!isBundledWithSucc() && "MI is already bundled with its successor"); setFlag(BundledSucc); - MachineBasicBlock::instr_iterator Succ = this; + MachineBasicBlock::instr_iterator Succ = getIterator(); ++Succ; assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags"); Succ->setFlag(BundledPred); @@ -994,7 +1072,7 @@ void MachineInstr::bundleWithSucc() { void MachineInstr::unbundleFromPred() { assert(isBundledWithPred() && "MI isn't bundled with its predecessor"); clearFlag(BundledPred); - MachineBasicBlock::instr_iterator Pred = this; + MachineBasicBlock::instr_iterator Pred = getIterator(); --Pred; assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags"); 
Pred->clearFlag(BundledSucc); @@ -1003,7 +1081,7 @@ void MachineInstr::unbundleFromPred() { void MachineInstr::unbundleFromSucc() { assert(isBundledWithSucc() && "MI isn't bundled with its successor"); clearFlag(BundledSucc); - MachineBasicBlock::instr_iterator Succ = this; + MachineBasicBlock::instr_iterator Succ = getIterator(); ++Succ; assert(Succ->isBundledWithPred() && "Inconsistent bundle flags"); Succ->clearFlag(BundledPred); @@ -1139,7 +1217,7 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect( /// Return the number of instructions inside the MI bundle, not counting the /// header instruction. unsigned MachineInstr::getBundleSize() const { - MachineBasicBlock::const_instr_iterator I = this; + MachineBasicBlock::const_instr_iterator I = getIterator(); unsigned Size = 0; while (I->isBundledWithSucc()) ++Size, ++I; @@ -1501,6 +1579,10 @@ bool MachineInstr::hasUnmodeledSideEffects() const { return false; } +bool MachineInstr::isLoadFoldBarrier() const { + return mayStore() || isCall() || hasUnmodeledSideEffects(); +} + /// allDefsAreDead - Return true if all the defs of this instruction are dead. /// bool MachineInstr::allDefsAreDead() const { @@ -1615,7 +1697,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, FirstOp = false; } - for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); @@ -1706,17 +1787,26 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } bool HaveSemi = false; - const unsigned PrintableFlags = FrameSetup; + const unsigned PrintableFlags = FrameSetup | FrameDestroy; if (Flags & PrintableFlags) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } OS << " flags: "; if (Flags & FrameSetup) OS << "FrameSetup"; + + if (Flags & FrameDestroy) + OS << "FrameDestroy"; } if (!memoperands_empty()) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } OS << " mem:"; for (mmo_iterator i = memoperands_begin(), e = memoperands_end(); @@ -1729,7 +1819,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, // Print the regclass of any virtual registers encountered. if (MRI && !VirtRegs.empty()) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } for (unsigned i = 0; i != VirtRegs.size(); ++i) { const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]); OS << " " << TRI->getRegClassName(RC) @@ -1748,21 +1841,23 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, // Print debug location information. 
if (isDebugValue() && getOperand(e - 2).isMetadata()) { - if (!HaveSemi) OS << ";"; + if (!HaveSemi) + OS << ";"; auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata()); OS << " line no:" << DV->getLine(); if (auto *InlinedAt = debugLoc->getInlinedAt()) { DebugLoc InlinedAtDL(InlinedAt); if (InlinedAtDL && MF) { OS << " inlined @[ "; - InlinedAtDL.print(OS); + InlinedAtDL.print(OS); OS << " ]"; } } if (isIndirectDebugValue()) OS << " indirect"; } else if (debugLoc && MF) { - if (!HaveSemi) OS << ";"; + if (!HaveSemi) + OS << ";"; OS << " dbg:"; debugLoc.print(OS); } @@ -1902,11 +1997,11 @@ void MachineInstr::clearRegisterDeads(unsigned Reg) { } } -void MachineInstr::addRegisterDefReadUndef(unsigned Reg) { +void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) { for (MachineOperand &MO : operands()) { if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0) continue; - MO.setIsUndef(); + MO.setIsUndef(IsUndef); } } diff --git a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp index cd820ee..4619daf 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -293,15 +293,17 @@ MachineOperandIteratorBase::PhysRegInfo MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, const TargetRegisterInfo *TRI) { bool AllDefsDead = true; - PhysRegInfo PRI = {false, false, false, false, false, false}; + PhysRegInfo PRI = {false, false, false, false, false, false, false}; assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "analyzePhysReg not given a physical register!"); for (; isValid(); ++*this) { MachineOperand &MO = deref(); - if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) - PRI.Clobbers = true; // Regmask clobbers Reg. + if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) { + PRI.Clobbered = true; + continue; + } if (!MO.isReg()) continue; @@ -310,33 +312,28 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg)) continue; - bool IsRegOrSuperReg = MOReg == Reg || TRI->isSubRegister(MOReg, Reg); - bool IsRegOrOverlapping = MOReg == Reg || TRI->regsOverlap(MOReg, Reg); - - if (IsRegOrSuperReg && MO.readsReg()) { - // Reg or a super-reg is read, and perhaps killed also. - PRI.Reads = true; - PRI.Kills = MO.isKill(); - } - - if (IsRegOrOverlapping && MO.readsReg()) { - PRI.ReadsOverlap = true;// Reg or an overlapping register is read. - } - - if (!MO.isDef()) + if (!TRI->regsOverlap(MOReg, Reg)) continue; - if (IsRegOrSuperReg) { - PRI.Defines = true; // Reg or a super-register is defined. + bool Covered = TRI->isSuperRegisterEq(Reg, MOReg); + if (MO.readsReg()) { + PRI.Read = true; + if (Covered) { + PRI.FullyRead = true; + if (MO.isKill()) + PRI.Killed = true; + } + } else if (MO.isDef()) { + PRI.Defined = true; + if (Covered) + PRI.FullyDefined = true; if (!MO.isDead()) AllDefsDead = false; } - if (IsRegOrOverlapping) - PRI.Clobbers = true; // Reg or an overlapping reg is defined. } - if (AllDefsDead && PRI.Defines) - PRI.DefinesDead = true; // Reg or super-register was defined and was dead. 
+ if (AllDefsDead && PRI.FullyDefined) + PRI.DeadDef = true; return PRI; } diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp index e9ea5ed..99a97d2 100644 --- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp @@ -138,7 +138,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AU.addRequired<MachineDominatorTree>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<MachineLoopInfo>(); AU.addPreserved<MachineDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -153,7 +153,7 @@ namespace { } private: - /// CandidateInfo - Keep track of information about hoisting candidates. + /// Keep track of information about hoisting candidates. struct CandidateInfo { MachineInstr *MI; unsigned Def; @@ -162,149 +162,76 @@ namespace { : MI(mi), Def(def), FI(fi) {} }; - /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop - /// invariants out to the preheader. void HoistRegionPostRA(); - /// HoistPostRA - When an instruction is found to only use loop invariant - /// operands that is safe to hoist, this instruction is called to do the - /// dirty work. void HoistPostRA(MachineInstr *MI, unsigned Def); - /// ProcessMI - Examine the instruction for potentai LICM candidate. Also - /// gather register def and frame object update information. - void ProcessMI(MachineInstr *MI, - BitVector &PhysRegDefs, - BitVector &PhysRegClobbers, - SmallSet<int, 32> &StoredFIs, + void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, + BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs, SmallVectorImpl<CandidateInfo> &Candidates); - /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the - /// current loop. void AddToLiveIns(unsigned Reg); - /// IsLICMCandidate - Returns true if the instruction may be a suitable - /// candidate for LICM. e.g. If the instruction is a call, then it's - /// obviously not safe to hoist it. bool IsLICMCandidate(MachineInstr &I); - /// IsLoopInvariantInst - Returns true if the instruction is loop - /// invariant. I.e., all virtual register operands are defined outside of - /// the loop, physical registers aren't accessed (explicitly or implicitly), - /// and the instruction is hoistable. - /// bool IsLoopInvariantInst(MachineInstr &I); - /// HasLoopPHIUse - Return true if the specified instruction is used by any - /// phi node in the current loop. bool HasLoopPHIUse(const MachineInstr *MI) const; - /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' - /// and an use in the current loop, return true if the target considered - /// it 'high'. bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const; bool IsCheapInstruction(MachineInstr &MI) const; - /// CanCauseHighRegPressure - Visit BBs from header to current BB, - /// check if hoisting an instruction of the given cost matrix can cause high - /// register pressure. bool CanCauseHighRegPressure(const DenseMap<unsigned, int> &Cost, bool Cheap); - /// UpdateBackTraceRegPressure - Traverse the back trace from header to - /// the current block and update their register pressures to reflect the - /// effect of hoisting MI from the current block to the preheader. void UpdateBackTraceRegPressure(const MachineInstr *MI); - /// IsProfitableToHoist - Return true if it is potentially profitable to - /// hoist the given loop invariant. 
bool IsProfitableToHoist(MachineInstr &MI); - /// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. - /// If not then a load from this mbb may not be safe to hoist. bool IsGuaranteedToExecute(MachineBasicBlock *BB); void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); - /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to given - /// dominator tree node if its a leaf or all of its children are done. Walk - /// up the dominator tree to destroy ancestors which are now done. - void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap); - - /// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all - /// blocks dominated by the specified header block, and that are in the - /// current loop) in depth first order w.r.t the DominatorTree. This allows - /// us to visit definitions before uses, allowing us to hoist a loop body in - /// one pass without iteration. - /// + void ExitScopeIfDone( + MachineDomTreeNode *Node, + DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren, + DenseMap<MachineDomTreeNode *, MachineDomTreeNode *> &ParentMap); + void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode); + void HoistRegion(MachineDomTreeNode *N, bool IsHeader); - /// SinkIntoLoop - Sink instructions into loops if profitable. This - /// especially tries to prevent register spills caused by register pressure - /// if there is little to no overhead moving instructions into loops. void SinkIntoLoop(); - /// InitRegPressure - Find all virtual register references that are liveout - /// of the preheader to initialize the starting "register pressure". Note - /// this does not count live through (livein but not used) registers. void InitRegPressure(MachineBasicBlock *BB); - /// calcRegisterCost - Calculate the additional register pressure that the - /// registers used in MI cause. - /// - /// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to - /// figure out which usages are live-ins. - /// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap<unsigned, int> calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef); - /// UpdateRegPressure - Update estimate of register pressure after the - /// specified instruction. void UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef = false); - /// ExtractHoistableLoad - Unfold a load from the given machineinstr if - /// the load itself could be hoisted. Return the unfolded and hoistable - /// load, or null if the load couldn't be unfolded or if it wouldn't - /// be hoistable. MachineInstr *ExtractHoistableLoad(MachineInstr *MI); - /// LookForDuplicate - Find an instruction amount PrevMIs that is a - /// duplicate of MI. Return this instruction if it's found. - const MachineInstr *LookForDuplicate(const MachineInstr *MI, - std::vector<const MachineInstr*> &PrevMIs); + const MachineInstr * + LookForDuplicate(const MachineInstr *MI, + std::vector<const MachineInstr *> &PrevMIs); - /// EliminateCSE - Given a LICM'ed instruction, look for an instruction on - /// the preheader that compute the same value. If it's found, do a RAU on - /// with the definition of the existing instruction rather than hoisting - /// the instruction to the preheader. 
- bool EliminateCSE(MachineInstr *MI, - DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI); + bool EliminateCSE( + MachineInstr *MI, + DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator &CI); - /// MayCSE - Return true if the given instruction will be CSE'd if it's - /// hoisted out of the loop. bool MayCSE(MachineInstr *MI); - /// Hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// It returns true if the instruction is hoisted. bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader); - /// InitCSEMap - Initialize the CSE map with instructions that are in the - /// current loop preheader that may become duplicates of instructions that - /// are hoisted out of the loop. void InitCSEMap(MachineBasicBlock *BB); - /// getCurPreheader - Get the preheader for the current loop, splitting - /// a critical edge if needed. MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace @@ -315,12 +242,11 @@ INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) -/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most -/// loop that has a unique predecessor. +/// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { // Check whether this loop even has a unique predecessor. if (!CurLoop->getLoopPredecessor()) @@ -367,7 +293,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { // Get our Loop information... MLI = &getAnalysis<MachineLoopInfo>(); DT = &getAnalysis<MachineDominatorTree>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end()); while (!Worklist.empty()) { @@ -402,15 +328,17 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { return Changed; } -/// InstructionStoresToFI - Return true if instruction stores to the -/// specified frame. +/// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { - for (MachineInstr::mmo_iterator o = MI->memoperands_begin(), - oe = MI->memoperands_end(); o != oe; ++o) { - if (!(*o)->isStore() || !(*o)->getPseudoValue()) + // If we lost memory operands, conservatively assume that the instruction + // writes to all slots. + if (MI->memoperands_empty()) + return true; + for (const MachineMemOperand *MemOp : MI->memoperands()) { + if (!MemOp->isStore() || !MemOp->getPseudoValue()) continue; if (const FixedStackPseudoSourceValue *Value = - dyn_cast<FixedStackPseudoSourceValue>((*o)->getPseudoValue())) { + dyn_cast<FixedStackPseudoSourceValue>(MemOp->getPseudoValue())) { if (Value->getFrameIndex() == FI) return true; } @@ -418,7 +346,7 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { return false; } -/// ProcessMI - Examine the instruction for potentai LICM candidate. Also +/// Examine the instruction for a potential LICM candidate. Also /// gather register def and frame object update information.
void MachineLICM::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, @@ -428,8 +356,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI, bool RuledOut = false; bool HasNonInvariantUse = false; unsigned Def = 0; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isFI()) { // Remember if the instruction stores to the frame index. int FI = MO.getIndex(); @@ -506,8 +433,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI, } } -/// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop -/// invariants out to the preheader. +/// Walk the specified region of the CFG and hoist loop invariants out to the +/// preheader. void MachineLICM::HoistRegionPostRA() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -523,38 +450,30 @@ void MachineLICM::HoistRegionPostRA() { // Walk the entire region, count number of defs for each register, and // collect potential LICM candidates. const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks(); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - MachineBasicBlock *BB = Blocks[i]; - + for (MachineBasicBlock *BB : Blocks) { // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); - if (ML && ML->getHeader()->isLandingPad()) continue; + if (ML && ML->getHeader()->isEHPad()) continue; // Conservatively treat live-in's as an external def. // FIXME: That means a reload that're reused in successor block(s) will not // be LICM'ed. - for (MachineBasicBlock::livein_iterator I = BB->livein_begin(), - E = BB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + for (const auto &LI : BB->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) PhysRegDefs.set(*AI); } SpeculationState = SpeculateUnknown; - for (MachineBasicBlock::iterator - MII = BB->begin(), E = BB->end(); MII != E; ++MII) { - MachineInstr *MI = &*MII; - ProcessMI(MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); - } + for (MachineInstr &MI : *BB) + ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); } // Gather the registers read / clobbered by the terminator. BitVector TermRegs(NumRegs); MachineBasicBlock::iterator TI = Preheader->getFirstTerminator(); if (TI != Preheader->end()) { - for (unsigned i = 0, e = TI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = TI->getOperand(i); + for (const MachineOperand &MO : TI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); @@ -573,17 +492,16 @@ void MachineLICM::HoistRegionPostRA() { // 3. Make sure candidate def should not clobber // registers read by the terminator. Similarly its def should not be // clobbered by the terminator. 
- for (unsigned i = 0, e = Candidates.size(); i != e; ++i) { - if (Candidates[i].FI != INT_MIN && - StoredFIs.count(Candidates[i].FI)) + for (CandidateInfo &Candidate : Candidates) { + if (Candidate.FI != INT_MIN && + StoredFIs.count(Candidate.FI)) continue; - unsigned Def = Candidates[i].Def; + unsigned Def = Candidate.Def; if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { bool Safe = true; - MachineInstr *MI = Candidates[i].MI; - for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) { - const MachineOperand &MO = MI->getOperand(j); + MachineInstr *MI = Candidate.MI; + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef() || !MO.getReg()) continue; unsigned Reg = MO.getReg(); @@ -596,24 +514,20 @@ } } if (Safe) - HoistPostRA(MI, Candidates[i].Def); + HoistPostRA(MI, Candidate.Def); } } } -/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current -/// loop, and make sure it is not killed by any instructions in the loop. +/// Add register 'Reg' to the livein sets of BBs in the current loop, and make +/// sure it is not killed by any instructions in the loop. void MachineLICM::AddToLiveIns(unsigned Reg) { const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks(); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - MachineBasicBlock *BB = Blocks[i]; + for (MachineBasicBlock *BB : Blocks) { if (!BB->isLiveIn(Reg)) BB->addLiveIn(Reg); - for (MachineBasicBlock::iterator - MII = BB->begin(), E = BB->end(); MII != E; ++MII) { - MachineInstr *MI = &*MII; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineInstr &MI : *BB) { + for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue; if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg())) MO.setIsKill(false); @@ -622,9 +536,8 @@ } } } -/// HoistPostRA - When an instruction is found to only use loop invariant -/// operands that is safe to hoist, this instruction is called to do the -/// dirty work. +/// When an instruction is found to only use loop invariant operands that are +/// safe to hoist, this instruction is called to do the dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -646,8 +559,8 @@ Changed = true; } -// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. -// If not then a load from this mbb may not be safe to hoist. +/// Check if this mbb is guaranteed to execute. If not, then a load from this mbb +/// may not be safe to hoist. bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) { if (SpeculationState != SpeculateUnknown) return SpeculationState == SpeculateFalse; @@ -656,8 +569,8 @@ // Check loop exiting blocks.
SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks; CurLoop->getExitingBlocks(CurrentLoopExitingBlocks); - for (unsigned i = 0, e = CurrentLoopExitingBlocks.size(); i != e; ++i) - if (!DT->dominates(BB, CurrentLoopExitingBlocks[i])) { + for (MachineBasicBlock *CurrentLoopExitingBlock : CurrentLoopExitingBlocks) + if (!DT->dominates(BB, CurrentLoopExitingBlock)) { SpeculationState = SpeculateTrue; return false; } @@ -679,9 +592,9 @@ void MachineLICM::ExitScope(MachineBasicBlock *MBB) { BackTrace.pop_back(); } -/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given -/// dominator tree node if its a leaf or all of its children are done. Walk -/// up the dominator tree to destroy ancestors which are now done. +/// Destroy scope for the MBB that corresponds to the given dominator tree node +/// if it's a leaf or all of its children are done. Walk up the dominator tree to +/// destroy ancestors which are now done. void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) { @@ -701,11 +614,10 @@ void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, } } -/// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all -/// blocks dominated by the specified header block, and that are in the -/// current loop) in depth first order w.r.t the DominatorTree. This allows -/// us to visit definitions before uses, allowing us to hoist a loop body in -/// one pass without iteration. +/// Walk the specified loop in the CFG (defined by all blocks dominated by the +/// specified header block, and that are in the current loop) in depth first +/// order w.r.t. the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. /// void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -727,7 +639,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); - if (ML && ML->getHeader()->isLandingPad()) + if (ML && ML->getHeader()->isEHPad()) continue; // If this subregion is not in the top level loop at all, exit. @@ -764,8 +676,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { InitRegPressure(Preheader); // Now perform LICM. - for (unsigned i = 0, e = Scopes.size(); i != e; ++i) { - MachineDomTreeNode *Node = Scopes[i]; + for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); EnterScope(MBB); @@ -786,6 +697,9 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { } } +/// Sink instructions into loops if profitable. This especially tries to prevent +/// register spills caused by register pressure if there is little to no +/// overhead moving instructions into loops. void MachineLICM::SinkIntoLoop() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -796,8 +710,8 @@ void MachineLICM::SinkIntoLoop() { I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. // As such, it must not have side-effects, e.g. such as a call has.
- if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(I)) - Candidates.push_back(I); + if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) + Candidates.push_back(&*I); } for (MachineInstr *I : Candidates) { @@ -837,9 +751,9 @@ static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) { return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg()); } -/// InitRegPressure - Find all virtual register references that are liveout of -/// the preheader to initialize the starting "register pressure". Note this -/// does not count live through (livein but not used) registers. +/// Find all virtual register references that are liveout of the preheader to +/// initialize the starting "register pressure". Note this does not count live +/// through (livein but not used) registers. void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { std::fill(RegPressure.begin(), RegPressure.end(), 0); @@ -858,8 +772,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true); } -/// UpdateRegPressure - Update estimate of register pressure after the -/// specified instruction. +/// Update estimate of register pressure after the specified instruction. void MachineLICM::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); @@ -872,6 +785,12 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI, } } +/// Calculate the additional register pressure that the registers used in MI +/// cause. +/// +/// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to +/// figure out which usages are live-ins. +/// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap<unsigned, int> MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef) { @@ -915,23 +834,26 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, return Cost; } -/// isLoadFromGOTOrConstantPool - Return true if this machine instruction -/// loads from global offset table or constant pool. -static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) { +/// Return true if this machine instruction loads from the global offset table +/// or the constant pool. +static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); - for (MachineInstr::mmo_iterator I = MI.memoperands_begin(), - E = MI.memoperands_end(); I != E; ++I) { - if (const PseudoSourceValue *PSV = (*I)->getPseudoValue()) { - if (PSV == PSV->getGOT() || PSV == PSV->getConstantPool()) + + // If we lost memory operands, conservatively assume that the instruction + // reads from everything. + if (MI.memoperands_empty()) + return true; + + for (MachineMemOperand *MemOp : MI.memoperands()) + if (const PseudoSourceValue *PSV = MemOp->getPseudoValue()) + if (PSV->isGOT() || PSV->isConstantPool()) return true; - } - } + return false; } -/// IsLICMCandidate - Returns true if the instruction may be a suitable -/// candidate for LICM. e.g. If the instruction is a call, then it's obviously -/// not safe to hoist it. +/// Returns true if the instruction may be a suitable candidate for LICM. +/// E.g., if the instruction is a call, then it's obviously not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // Check if it's safe to move the instruction.
bool DontMoveAcrossStore = true; @@ -944,16 +866,16 @@ bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // from constant memory are not safe to speculate all the time, for example // indexed load from a jump table. // Stores and side effects are already checked by isSafeToMove. - if (I.mayLoad() && !isLoadFromGOTOrConstantPool(I) && + if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) && !IsGuaranteedToExecute(I.getParent())) return false; return true; } -/// IsLoopInvariantInst - Returns true if the instruction is loop -/// invariant. I.e., all virtual register operands are defined outside of the -/// loop, physical registers aren't accessed explicitly, and there are no side +/// Returns true if the instruction is loop invariant. +/// I.e., all virtual register operands are defined outside of the loop, +/// physical registers aren't accessed explicitly, and there are no side /// effects that aren't captured by the operands or other flags. /// bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { @@ -961,9 +883,7 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { return false; // The instruction is loop invariant if all of its operands are. - for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I.getOperand(i); - + for (const MachineOperand &MO : I.operands()) { if (!MO.isReg()) continue; @@ -1007,8 +927,8 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { } -/// HasLoopPHIUse - Return true if the specified instruction is used by a -/// phi node and hoisting it could cause a copy to be inserted. +/// Return true if the specified instruction is used by a phi node and hoisting +/// it could cause a copy to be inserted. bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { SmallVector<const MachineInstr*, 8> Work(1, MI); do { @@ -1042,9 +962,8 @@ bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { return false; } -/// HasHighOperandLatency - Compute operand latency between a def of 'Reg' -/// and an use in the current loop, return true if the target considered -/// it 'high'. +/// Compute operand latency between a def of 'Reg' and a use in the current +/// loop; return true if the target considers it high. bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const { if (MRI->use_nodbg_empty(Reg)) @@ -1074,8 +993,8 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, return false; } -/// IsCheapInstruction - Return true if the instruction is marked "cheap" or -/// the operand latency between its def and a use is one or less. +/// Return true if the instruction is marked "cheap" or the operand latency +/// between its def and a use is one or less. bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { if (TII->isAsCheapAsAMove(&MI) || MI.isCopyLike()) return true; @@ -1099,9 +1018,8 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { return isCheap; } -/// CanCauseHighRegPressure - Visit BBs from header to current BB, check -/// if hoisting an instruction of the given cost matrix can cause high -/// register pressure. +/// Visit BBs from header to current BB; check if hoisting an instruction with +/// the given cost matrix can cause high register pressure.
bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost, bool CheapInstr) { for (const auto &RPIdAndCost : Cost) { @@ -1124,9 +1042,9 @@ bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost, return false; } -/// UpdateBackTraceRegPressure - Traverse the back trace from header to the -/// current block and update their register pressures to reflect the effect -/// of hoisting MI from the current block to the preheader. +/// Traverse the back trace from header to the current block and update their +/// register pressures to reflect the effect of hoisting MI from the current +/// block to the preheader. void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { // First compute the 'cost' of the instruction, i.e. its contribution // to register pressure. @@ -1139,8 +1057,8 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { RP[RPIdAndCost.first] += RPIdAndCost.second; } -/// IsProfitableToHoist - Return true if it is potentially profitable to hoist -/// the given loop invariant. +/// Return true if it is potentially profitable to hoist the given loop +/// invariant. bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; @@ -1230,6 +1148,9 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { return true; } +/// Unfold a load from the given machine instruction if the load itself could be +/// hoisted. Return the unfolded and hoistable load, or null if the load +/// couldn't be unfolded or if it wouldn't be hoistable. MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { // Don't unfold simple loads. if (MI->canFoldAsLoad()) @@ -1287,25 +1208,30 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { return NewMIs[0]; } +/// Initialize the CSE map with instructions that are in the current loop +/// preheader that may become duplicates of instructions that are hoisted +/// out of the loop. void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { - for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) { - const MachineInstr *MI = &*I; - unsigned Opcode = MI->getOpcode(); - CSEMap[Opcode].push_back(MI); - } + for (MachineInstr &MI : *BB) + CSEMap[MI.getOpcode()].push_back(&MI); } +/// Find an instruction among PrevMIs that is a duplicate of MI. +/// Return this instruction if it's found. const MachineInstr* MachineLICM::LookForDuplicate(const MachineInstr *MI, std::vector<const MachineInstr*> &PrevMIs) { - for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) { - const MachineInstr *PrevMI = PrevMIs[i]; + for (const MachineInstr *PrevMI : PrevMIs) if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : nullptr))) return PrevMI; - } + return nullptr; } +/// Given a LICM'ed instruction, look for an instruction in the preheader that +/// computes the same value. If one is found, replace all uses of MI with the +/// definition of the existing instruction (RAUW) rather than hoisting the +/// instruction to the preheader.
bool MachineLICM::EliminateCSE(MachineInstr *MI, DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI) { // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate @@ -1348,8 +1274,7 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, } } - for (unsigned i = 0, e = Defs.size(); i != e; ++i) { - unsigned Idx = Defs[i]; + for (unsigned Idx : Defs) { unsigned Reg = MI->getOperand(Idx).getReg(); unsigned DupReg = Dup->getOperand(Idx).getReg(); MRI->replaceRegWith(Reg, DupReg); @@ -1363,8 +1288,8 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, return false; } -/// MayCSE - Return true if the given instruction will be CSE'd if it's -/// hoisted out of the loop. +/// Return true if the given instruction will be CSE'd if it's hoisted out of +/// the loop. bool MachineLICM::MayCSE(MachineInstr *MI) { unsigned Opcode = MI->getOpcode(); DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator @@ -1377,9 +1302,9 @@ bool MachineLICM::MayCSE(MachineInstr *MI) { return LookForDuplicate(MI, CI->second) != nullptr; } -/// Hoist - When an instruction is found to use only loop invariant operands +/// When an instruction is found to use only loop invariant operands /// that are safe to hoist, this instruction is called to do the dirty work. -/// +/// It returns true if the instruction is hoisted. bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { @@ -1422,11 +1347,9 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop // rather than just live for part of it. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) if (MO.isReg() && MO.isDef() && !MO.isDead()) MRI->clearKillFlags(MO.getReg()); - } // Add to the CSE map. if (CI != CSEMap.end()) @@ -1441,6 +1364,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { return true; } +/// Get the preheader for the current loop, splitting a critical edge if needed. MachineBasicBlock *MachineLICM::getCurPreheader() { // Determine the block to which to hoist instructions. If we can't find a // suitable loop predecessor, we can't do any hoisting. 
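The MachineLICM hunks above show the recipe this change applies to every CodeGen pass that used the old AliasAnalysis analysis group: require AAResultsWrapperPass instead, and pull the aggregated AAResults out of the wrapper. A minimal sketch of that recipe for a hypothetical legacy pass (MyMachinePass is invented for illustration and is not part of this change):

#include "llvm/Analysis/AliasAnalysis.h"      // AAResults, AAResultsWrapperPass
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

namespace {
// Hypothetical pass showing the post-refactor alias-analysis plumbing.
struct MyMachinePass : public MachineFunctionPass {
  static char ID;
  AAResults *AA = nullptr;

  MyMachinePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Previously: AU.addRequired<AliasAnalysis>();
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Previously: AA = &getAnalysis<AliasAnalysis>();
    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    return false; // The sketch makes no changes.
  }
};
} // end anonymous namespace

char MyMachinePass::ID = 0;

Pass registration moves the same way, as the hunks above show: INITIALIZE_AG_DEPENDENCY(AliasAnalysis) becomes INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass), because the wrapper is an ordinary pass rather than an analysis group.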
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp index ce6abdd..2f5c9e0 100644 --- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -37,7 +37,7 @@ char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { releaseMemory(); - LI.Analyze(getAnalysis<MachineDominatorTree>().getBase()); + LI.analyze(getAnalysis<MachineDominatorTree>().getBase()); return false; } @@ -51,11 +51,11 @@ MachineBasicBlock *MachineLoop::getTopBlock() { MachineBasicBlock *TopMBB = getHeader(); MachineFunction::iterator Begin = TopMBB->getParent()->begin(); if (TopMBB != Begin) { - MachineBasicBlock *PriorMBB = std::prev(MachineFunction::iterator(TopMBB)); + MachineBasicBlock *PriorMBB = &*std::prev(TopMBB->getIterator()); while (contains(PriorMBB)) { TopMBB = PriorMBB; if (TopMBB == Begin) break; - PriorMBB = std::prev(MachineFunction::iterator(TopMBB)); + PriorMBB = &*std::prev(TopMBB->getIterator()); } } return TopMBB; @@ -65,11 +65,12 @@ MachineBasicBlock *MachineLoop::getBottomBlock() { MachineBasicBlock *BotMBB = getHeader(); MachineFunction::iterator End = BotMBB->getParent()->end(); if (BotMBB != std::prev(End)) { - MachineBasicBlock *NextMBB = std::next(MachineFunction::iterator(BotMBB)); + MachineBasicBlock *NextMBB = &*std::next(BotMBB->getIterator()); while (contains(NextMBB)) { BotMBB = NextMBB; - if (BotMBB == std::next(MachineFunction::iterator(BotMBB))) break; - NextMBB = std::next(MachineFunction::iterator(BotMBB)); + if (BotMBB == &*std::next(BotMBB->getIterator())) + break; + NextMBB = &*std::next(BotMBB->getIterator()); } } return BotMBB; diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp index 6a20624..1956a70 100644 --- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -9,12 +9,12 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/PointerUnion.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" @@ -35,7 +35,7 @@ char MachineModuleInfo::ID = 0; MachineModuleInfoImpl::~MachineModuleInfoImpl() {} namespace llvm { -class MMIAddrLabelMapCallbackPtr : CallbackVH { +class MMIAddrLabelMapCallbackPtr final : CallbackVH { MMIAddrLabelMap *Map; public: MMIAddrLabelMapCallbackPtr() : Map(nullptr) {} @@ -209,9 +209,8 @@ bool MachineModuleInfo::doInitialization(Module &M) { CurCallSite = 0; CallsEHReturn = false; CallsUnwindInit = false; + HasEHFunclets = false; DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; - // Always emit some info, by default "no personality" info. 
- Personalities.push_back(nullptr); PersonalityTypeCache = EHPersonality::Unknown; AddrLabelSymbols = nullptr; TheModule = nullptr; @@ -249,6 +248,7 @@ void MachineModuleInfo::EndFunction() { FilterEnds.clear(); CallsEHReturn = false; CallsUnwindInit = false; + HasEHFunclets = false; VariableDbgInfos.clear(); } @@ -314,32 +314,11 @@ MCSymbol *MachineModuleInfo::addLandingPad(MachineBasicBlock *LandingPad) { return LandingPadLabel; } -/// addPersonality - Provide the personality function for the exception -/// information. -void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad, - const Function *Personality) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - LP.Personality = Personality; - addPersonality(Personality); -} - void MachineModuleInfo::addPersonality(const Function *Personality) { for (unsigned i = 0; i < Personalities.size(); ++i) if (Personalities[i] == Personality) return; - - // If this is the first personality we're adding go - // ahead and add it at the beginning. - if (!Personalities[0]) - Personalities[0] = Personality; - else - Personalities.push_back(Personality); -} - -void MachineModuleInfo::addWinEHState(MachineBasicBlock *LandingPad, - int State) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - LP.WinEHState = State; + Personalities.push_back(Personality); } /// addCatchTypeInfo - Provide the catch typeinfo for a landing pad. @@ -481,56 +460,3 @@ try_next:; FilterIds.push_back(0); // terminator return FilterID; } - -/// getPersonality - Return the personality function for the current function. -const Function *MachineModuleInfo::getPersonality() const { - for (const LandingPadInfo &LPI : LandingPads) - if (LPI.Personality) - return LPI.Personality; - return nullptr; -} - -EHPersonality MachineModuleInfo::getPersonalityType() { - if (PersonalityTypeCache == EHPersonality::Unknown) { - if (const Function *F = getPersonality()) - PersonalityTypeCache = classifyEHPersonality(F); - } - return PersonalityTypeCache; -} - -/// getPersonalityIndex - Return unique index for current personality -/// function. NULL/first personality function should always get zero index. -unsigned MachineModuleInfo::getPersonalityIndex() const { - const Function* Personality = nullptr; - - // Scan landing pads. If there is at least one non-NULL personality - use it. - for (unsigned i = 0, e = LandingPads.size(); i != e; ++i) - if (LandingPads[i].Personality) { - Personality = LandingPads[i].Personality; - break; - } - - for (unsigned i = 0, e = Personalities.size(); i < e; ++i) { - if (Personalities[i] == Personality) - return i; - } - - // This will happen if the current personality function is - // in the zero index. 
- return 0; -} - -const Function *MachineModuleInfo::getWinEHParent(const Function *F) const { - StringRef WinEHParentName = - F->getFnAttribute("wineh-parent").getValueAsString(); - if (WinEHParentName.empty() || WinEHParentName == F->getName()) - return F; - return F->getParent()->getFunction(WinEHParentName); -} - -WinEHFuncInfo &MachineModuleInfo::getWinEHFuncInfo(const Function *F) { - auto &Ptr = FuncInfoMap[getWinEHParent(F)]; - if (!Ptr) - Ptr.reset(new WinEHFuncInfo); - return *Ptr; -} diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp index e883ce5..03c82f4 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -27,13 +27,11 @@ void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true), TracksSubRegLiveness(false) { + unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); - UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); - UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); - - // Create the physreg use/def lists. - PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr); + UsedPhysRegMask.resize(NumRegs); + PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]()); } /// setRegClass - Set the register class of the specified virtual register. @@ -117,6 +115,8 @@ void MachineRegisterInfo::clearVirtRegs() { } #endif VRegInfo.clear(); + for (auto &I : LiveIns) + I.second = 0; } void MachineRegisterInfo::verifyUseList(unsigned Reg) const { @@ -394,8 +394,7 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, } } -unsigned MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const -{ +LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { // Lane masks are only defined for vregs. 
assert(TargetRegisterInfo::isVirtualRegister(Reg)); const TargetRegisterClass &TRC = *getRegClass(Reg); @@ -468,11 +467,8 @@ static bool isNoReturnDef(const MachineOperand &MO) { if (MF.getFunction()->hasFnAttribute(Attribute::UWTable)) return false; const Function *Called = getCalledFunction(MI); - if (Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn) - || !Called->hasFnAttribute(Attribute::NoUnwind)) - return false; - - return true; + return !(Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn) || + !Called->hasFnAttribute(Attribute::NoUnwind)); } bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg) const { @@ -488,3 +484,15 @@ } return false; } + +bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const { + if (UsedPhysRegMask.test(PhysReg)) + return true; + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + for (MCRegAliasIterator AliasReg(PhysReg, TRI, true); AliasReg.isValid(); + ++AliasReg) { + if (!reg_nodbg_empty(*AliasReg)) + return true; + } + return false; +} diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index a48e54c..bcee15c 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -49,6 +49,11 @@ DumpCriticalPathLength("misched-dcpl", cl::Hidden, static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, cl::desc("Pop up a window to show MISched dags after they are processed")); +/// In some situations a few uninteresting nodes depend on nearly all other +/// nodes in the graph; provide a cutoff to hide them. +static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden, + cl::desc("Hide nodes with more predecessors/successors than the cutoff")); + static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); @@ -106,7 +111,7 @@ public: void print(raw_ostream &O, const Module* = nullptr) const override; protected: - void scheduleRegions(ScheduleDAGInstrs &Scheduler); + void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); }; /// MachineScheduler runs after coalescing and before register allocation. @@ -146,7 +151,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID; INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", "Machine Instruction Scheduler", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", @@ -161,7 +166,7 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequiredID(MachineDominatorsID); AU.addRequired<MachineLoopInfo>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetPassConfig>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); @@ -315,14 +320,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { } else if (!mf.getSubtarget().enableMachineScheduler()) return false; - DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs())); + DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs())); // Initialize the context of the pass.
MF = &mf; MLI = &getAnalysis<MachineLoopInfo>(); MDT = &getAnalysis<MachineDominatorTree>(); PassConfig = &getAnalysis<TargetPassConfig>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LIS = &getAnalysis<LiveIntervals>(); @@ -335,7 +340,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler()); - scheduleRegions(*Scheduler); + scheduleRegions(*Scheduler, false); DEBUG(LIS->dump()); if (VerifyScheduling) @@ -363,7 +368,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler()); - scheduleRegions(*Scheduler); + scheduleRegions(*Scheduler, true); if (VerifyScheduling) MF->verify(this, "After post machine scheduling."); @@ -383,15 +388,14 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { static bool isSchedBoundary(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB, MachineFunction *MF, - const TargetInstrInfo *TII, - bool IsPostRA) { + const TargetInstrInfo *TII) { return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF); } /// Main driver for both MachineScheduler and PostMachineScheduler. -void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { +void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, + bool FixKillFlags) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - bool IsPostRA = Scheduler.isPostRA(); // Visit all machine basic blocks. // @@ -400,7 +404,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); MBB != MBBEnd; ++MBB) { - Scheduler.startBlock(MBB); + Scheduler.startBlock(&*MBB); #ifndef NDEBUG if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName()) @@ -429,7 +433,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { // Avoid decrementing RegionEnd for blocks with no terminator. if (RegionEnd != MBB->end() || - isSchedBoundary(std::prev(RegionEnd), MBB, MF, TII, IsPostRA)) { + isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) { --RegionEnd; // Count the boundary instruction. --RemainingInstrs; @@ -440,14 +444,14 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingInstrs) { - if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA)) + if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII)) break; if (!I->isDebugValue()) ++NumRegionInstrs; } // Notify the scheduler of the region, even if we may skip scheduling // it. Perhaps it still needs to be bundled. - Scheduler.enterRegion(MBB, I, RegionEnd, NumRegionInstrs); + Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs); // Skip empty scheduling regions (0 or 1 schedulable instructions). if (I == RegionEnd || I == std::prev(RegionEnd)) { @@ -456,8 +460,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { Scheduler.exitRegion(); continue; } - DEBUG(dbgs() << "********** " << ((Scheduler.isPostRA()) ? 
"PostRA " : "") - << "MI Scheduling **********\n"); + DEBUG(dbgs() << "********** MI Scheduling **********\n"); DEBUG(dbgs() << MF->getName() << ":BB#" << MBB->getNumber() << " " << MBB->getName() << "\n From: " << *I << " To: "; @@ -484,11 +487,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { } assert(RemainingInstrs == 0 && "Instruction count mismatch!"); Scheduler.finishBlock(); - if (Scheduler.isPostRA()) { - // FIXME: Ideally, no further passes should rely on kill flags. However, - // thumb2 size reduction is currently an exception. - Scheduler.fixupKills(MBB); - } + // FIXME: Ideally, no further passes should rely on kill flags. However, + // thumb2 size reduction is currently an exception, so the PostMIScheduler + // needs to do this. + if (FixKillFlags) + Scheduler.fixupKills(&*MBB); } Scheduler.finalizeSchedule(); } @@ -499,7 +502,7 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { LLVM_DUMP_METHOD void ReadyQueue::dump() { - dbgs() << Name << ": "; + dbgs() << "Queue " << Name << ": "; for (unsigned i = 0, e = Queue.size(); i < e; ++i) dbgs() << Queue[i]->NodeNum << " "; dbgs() << "\n"; @@ -660,6 +663,9 @@ bool ScheduleDAGMI::checkSchedLimit() { /// does not consider liveness or register pressure. It is useful for PostRA /// scheduling and potentially other custom schedulers. void ScheduleDAGMI::schedule() { + DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n"); + DEBUG(SchedImpl->dumpPolicy()); + // Build the DAG. buildSchedGraph(AA); @@ -682,7 +688,11 @@ void ScheduleDAGMI::schedule() { initQueues(TopRoots, BotRoots); bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + assert(!SU->isScheduled && "Node already scheduled"); if (!checkSchedLimit()) break; @@ -900,6 +910,13 @@ void ScheduleDAGMILive::initRegPressure() { updatePressureDiffs(LiveUses); } + DEBUG( + dbgs() << "Top Pressure:\n"; + dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); + dbgs() << "Bottom Pressure:\n"; + dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI); + ); + assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom"); // Cache the list of excess pressure sets in this region. This will also track @@ -976,18 +993,24 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { } // RegisterPressureTracker guarantees that readsReg is true for LiveUses. assert(VNI && "No live value at use."); - for (VReg2UseMap::iterator - UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { - SUnit *SU = UI->SU; - DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " - << *SU->getInstr()); + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit *SU = V2SU.SU; // If this use comes before the reaching def, it cannot be a last use, so // descrease its pressure change. 
if (!SU->isScheduled && SU != &ExitSU) { LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(SU->getInstr())); - if (LRQ.valueIn() == VNI) - getPressureDiff(SU).addPressureChange(Reg, true, &MRI); + if (LRQ.valueIn() == VNI) { + PressureDiff &PDiff = getPressureDiff(SU); + PDiff.addPressureChange(Reg, true, &MRI); + DEBUG( + dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " + << *SU->getInstr(); + dbgs() << " to "; + PDiff.dump(*TRI); + ); + } } } } @@ -998,12 +1021,14 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { /// only includes instructions that have DAG nodes, not scheduling boundaries. /// /// This is a skeletal driver, with all the functionality pushed into helpers, -/// so that it can be easilly extended by experimental schedulers. Generally, +/// so that it can be easily extended by experimental schedulers. Generally, /// implementing MachineSchedStrategy should be sufficient to implement a new /// scheduling algorithm. However, if a scheduler further subclasses /// ScheduleDAGMILive then it will want to override this virtual method in order /// to update any specialized state. void ScheduleDAGMILive::schedule() { + DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n"); + DEBUG(SchedImpl->dumpPolicy()); buildDAGWithRegPressure(); Topo.InitDAGTopologicalSorting(); @@ -1017,8 +1042,16 @@ void ScheduleDAGMILive::schedule() { // This may initialize a DFSResult to be used for queue priority. SchedImpl->initialize(this); - DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this)); + DEBUG( + for (const SUnit &SU : SUnits) { + SU.dumpAll(this); + if (ShouldTrackPressure) { + dbgs() << " Pressure Diff : "; + getPressureDiff(&SU).dump(*TRI); + } + dbgs() << '\n'; + } + ); if (ViewMISchedDAGs) viewGraph(); // Initialize ready queues now that the DAG and priority data are finalized. @@ -1030,7 +1063,11 @@ void ScheduleDAGMILive::schedule() { } bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + assert(!SU->isScheduled && "Node already scheduled"); if (!checkSchedLimit()) break; @@ -1149,14 +1186,15 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { unsigned LiveOutHeight = DefSU->getHeight(); unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency; // Visit all local users of the vreg def. - for (VReg2UseMap::iterator - UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { - if (UI->SU == &ExitSU) + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit *SU = V2SU.SU; + if (SU == &ExitSU) continue; // Only consider uses of the phi. LiveQueryResult LRQ = - LI.Query(LIS->getInstructionIndex(UI->SU->getInstr())); + LI.Query(LIS->getInstructionIndex(SU->getInstr())); if (!LRQ.valueIn()->isPHIDef()) continue; @@ -1164,10 +1202,10 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { // overestimate in strange cases. This allows cyclic latency to be // estimated as the minimum slack of the vreg's depth or height. 
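// As a hedged illustration of the bound computed below (numbers invented,
// not taken from any real schedule): suppose DefSU has depth 10 and
// latency 4, the use SU has depth 6 and height 3, and LiveOutHeight is 5.
// Then
//   CyclicLatency = LiveOutDepth - SU->getDepth() = (10 + 4) - 6 = 8
//   LiveInHeight  = SU->getHeight() + DefSU->Latency = 3 + 4 = 7
// and since LiveInHeight > LiveOutHeight with 7 - 5 = 2 < 8, the estimate
// is tightened to CyclicLatency = 2.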
unsigned CyclicLatency = 0; - if (LiveOutDepth > UI->SU->getDepth()) - CyclicLatency = LiveOutDepth - UI->SU->getDepth(); + if (LiveOutDepth > SU->getDepth()) + CyclicLatency = LiveOutDepth - SU->getDepth(); - unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency; + unsigned LiveInHeight = SU->getHeight() + DefSU->Latency; if (LiveInHeight > LiveOutHeight) { if (LiveInHeight - LiveOutHeight < CyclicLatency) CyclicLatency = LiveInHeight - LiveOutHeight; @@ -1176,7 +1214,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { CyclicLatency = 0; DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" - << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n"); + << SU->NodeNum << ") = " << CyclicLatency << "c\n"); if (CyclicLatency > MaxCyclicLatency) MaxCyclicLatency = CyclicLatency; } @@ -1203,6 +1241,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { // Update top scheduled pressure. TopRPTracker.advance(); assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); + DEBUG( + dbgs() << "Top Pressure:\n"; + dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); + ); + updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure); } } @@ -1225,6 +1268,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { SmallVector<unsigned, 8> LiveUses; BotRPTracker.recede(&LiveUses); assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); + DEBUG( + dbgs() << "Bottom Pressure:\n"; + dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI); + ); + updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure); updatePressureDiffs(LiveUses); } @@ -1349,25 +1397,49 @@ namespace { /// \brief Post-process the DAG to create cluster edges between instructions /// that may be fused by the processor into a single operation. class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo *TII; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; public: - MacroFusion(const TargetInstrInfo *tii): TII(tii) {} + MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) + : TII(TII), TRI(TRI) {} void apply(ScheduleDAGMI *DAG) override; }; } // anonymous +/// Returns true if \p MI reads a register written by \p Other. +static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI, + const MachineInstr &Other) { + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + + unsigned Reg = MO.getReg(); + if (Other.modifiesRegister(Reg, &TRI)) + return true; + } + return false; +} + /// \brief Callback from DAG postProcessing to create cluster edges to encourage /// fused operations. void MacroFusion::apply(ScheduleDAGMI *DAG) { // For now, assume targets can only fuse with the branch. - MachineInstr *Branch = DAG->ExitSU.getInstr(); + SUnit &ExitSU = DAG->ExitSU; + MachineInstr *Branch = ExitSU.getInstr(); if (!Branch) return; - for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) { - SUnit *SU = &DAG->SUnits[--Idx]; - if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch)) + for (SUnit &SU : DAG->SUnits) { + // SUnits with successors can't be scheduled in front of the ExitSU. + if (!SU.Succs.empty()) + continue; + // We only care if the node writes to a register that the branch reads. + MachineInstr *Pred = SU.getInstr(); + if (!HasDataDep(TRI, *Branch, *Pred)) + continue; + + if (!TII.shouldScheduleAdjacent(Pred, Branch)) continue; // Create a single weak edge from SU to ExitSU.
The only effect is to cause @@ -1376,11 +1448,11 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) { // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling // of SU, we could create an artificial edge from the deepest root, but it // hasn't been needed yet. - bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster)); + bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); (void)Success; assert(Success && "No DAG nodes should be reachable from ExitSU"); - DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n"); + DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); break; } } @@ -2277,7 +2349,7 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { Latency = Cand.SU->getDepth(); break; } - dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); if (P.isValid()) dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) << ":" << P.getUnitInc() << " "; @@ -2438,6 +2510,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } } +void GenericScheduler::dumpPolicy() { + dbgs() << "GenericScheduler RegionPolicy: " + << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure + << " OnlyTopDown=" << RegionPolicy.OnlyTopDown + << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp + << "\n"; +} + /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic /// critical path by more cycles than it takes to drain the instruction buffer. /// We estimate an upper bounds on in-flight instructions as: @@ -2499,11 +2579,13 @@ static bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - int TryRank = TryP.getPSetOrMax(); - int CandRank = CandP.getPSetOrMax(); + GenericSchedulerBase::CandReason Reason, + const TargetRegisterInfo *TRI, + const MachineFunction &MF) { + unsigned TryPSet = TryP.getPSetOrMax(); + unsigned CandPSet = CandP.getPSetOrMax(); // If both candidates affect the same set, go with the smallest increase. - if (TryRank == CandRank) { + if (TryPSet == CandPSet) { return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, Reason); } @@ -2513,6 +2595,13 @@ static bool tryPressure(const PressureChange &TryP, Reason)) { return true; } + + int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : + std::numeric_limits<int>::max(); + + int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) : + std::numeric_limits<int>::max(); + // If the candidates are decreasing pressure, reverse priority. if (TryP.getUnitInc() < 0) std::swap(TryRank, CandRank); @@ -2597,7 +2686,7 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, } } DEBUG(if (TryCand.RPDelta.Excess.isValid()) - dbgs() << " SU(" << TryCand.SU->NodeNum << ") " + dbgs() << " Try SU(" << TryCand.SU->NodeNum << ") " << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet()) << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n"); @@ -2615,13 +2704,15 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid exceeding the target's limit. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, - TryCand, Cand, RegExcess)) + TryCand, Cand, RegExcess, TRI, + DAG->MF)) return; // Avoid increasing the max critical pressure in the scheduled region. 
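// (Illustrative note on the tryPressure change above, using invented sets:
// when both candidates touch the same pressure set, the smaller unit
// increment wins, e.g. GPR32 +1 beats GPR32 +3; when the sets differ, the
// comparison now falls back to the target-provided
// TRI->getRegPressureSetScore(MF, PSet) ranking, with an invalid
// PressureChange assigned std::numeric_limits<int>::max() so it compares
// as touching no set at all.)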
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, - TryCand, Cand, RegCritical)) + TryCand, Cand, RegCritical, TRI, + DAG->MF)) return; // For loops that are acyclic path limited, aggressively schedule for latency. @@ -2657,7 +2748,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid increasing the max pressure of the entire region. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, - TryCand, Cand, RegMax)) + TryCand, Cand, RegMax, TRI, + DAG->MF)) return; // Avoid critical resource consumption and balance the schedule. @@ -2672,8 +2764,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid serializing long latency dependence chains. // For acyclic path limited loops, latency was already checked above. - if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited - && tryLatency(TryCand, Cand, Zone)) { + if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) { return; } @@ -2727,12 +2819,12 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; - DEBUG(dbgs() << "Pick Bot NOCAND\n"); + DEBUG(dbgs() << "Pick Bot ONLY1\n"); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; - DEBUG(dbgs() << "Pick Top NOCAND\n"); + DEBUG(dbgs() << "Pick Top ONLY1\n"); return SU; } CandPolicy NoPolicy; @@ -2887,7 +2979,7 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) { if (EnableLoadCluster && DAG->TII->enableClusterLoads()) DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); if (EnableMacroFusion) - DAG->addMutation(make_unique<MacroFusion>(DAG->TII)); + DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI)); return DAG; } @@ -3254,12 +3346,10 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { } static bool isNodeHidden(const SUnit *Node) { - return (Node->Preds.size() > 10 || Node->Succs.size() > 10); - } - - static bool hasNodeAddressLabel(const SUnit *Node, - const ScheduleDAG *Graph) { - return false; + if (ViewMISchedCutoff == 0) + return false; + return (Node->Preds.size() > ViewMISchedCutoff + || Node->Succs.size() > ViewMISchedCutoff); } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp index 1b9be50..5e6d619 100644 --- a/contrib/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp @@ -87,7 +87,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -150,7 +150,7 @@ INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineSinking, "machine-sink", "Machine code sinking", false, false) @@ -268,7 +268,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { PDT = 
&getAnalysis<MachinePostDominatorTree>(); LI = &getAnalysis<MachineLoopInfo>(); MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverMadeChange = false; @@ -667,7 +667,7 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // It's not safe to sink instructions to EH landing pad. Control flow into // landing pad is implicitly defined. - if (SuccToSinkTo && SuccToSinkTo->isLandingPad()) + if (SuccToSinkTo && SuccToSinkTo->isEHPad()) return nullptr; return SuccToSinkTo; @@ -686,7 +686,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore, if (!MI->isSafeToMove(AA, SawStore)) return false; - // Convergent operations may only be moved to control equivalent locations. + // Convergent operations may not be made control-dependent on additional + // values. if (MI->isConvergent()) return false; diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp index d9a6b684..f7edacd 100644 --- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -724,13 +724,12 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, // Update RegUnits to reflect live registers after UseMI. // First kills. - for (unsigned i = 0, e = Kills.size(); i != e; ++i) - for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units) + for (unsigned Kill : Kills) + for (MCRegUnitIterator Units(Kill, TRI); Units.isValid(); ++Units) RegUnits.erase(*Units); // Second, live defs. - for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) { - unsigned DefOp = LiveDefOps[i]; + for (unsigned DefOp : LiveDefOps) { for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI); Units.isValid(); ++Units) { LiveRegUnit &LRU = RegUnits[*Units]; @@ -756,8 +755,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { assert(TBI.HasValidInstrDepths && "Missing depth info"); assert(TBI.HasValidInstrHeights && "Missing height info"); unsigned MaxLen = 0; - for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { - const LiveInReg &LIR = TBI.LiveIns[i]; + for (const LiveInReg &LIR : TBI.LiveIns) { if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg)) continue; const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp index ca35ec5..428295e 100644 --- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" @@ -42,7 +43,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -204,18 +204,19 @@ namespace { void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); void visitMachineFunctionAfter(); + template <typename T> void report(const char *msg, ilist_iterator<T> I) { + report(msg, &*I); + } void report(const char *msg, const MachineFunction *MF); void report(const char *msg, const 
MachineBasicBlock *MBB); void report(const char *msg, const MachineInstr *MI); void report(const char *msg, const MachineOperand *MO, unsigned MONum); - void report(const char *msg, const MachineFunction *MF, - const LiveInterval &LI); - void report(const char *msg, const MachineBasicBlock *MBB, - const LiveInterval &LI); - void report(const char *msg, const MachineFunction *MF, - const LiveRange &LR, unsigned Reg, unsigned LaneMask); - void report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR, unsigned Reg, unsigned LaneMask); + + void report_context(const LiveInterval &LI) const; + void report_context(const LiveRange &LR, unsigned Reg, + LaneBitmask LaneMask) const; + void report_context(const LiveRange::Segment &S) const; + void report_context(const VNInfo &VNI) const; void verifyInlineAsm(const MachineInstr *MI); @@ -233,9 +234,11 @@ namespace { void verifyLiveRangeSegment(const LiveRange&, const LiveRange::const_iterator I, unsigned, unsigned); - void verifyLiveRange(const LiveRange&, unsigned, unsigned LaneMask = 0); + void verifyLiveRange(const LiveRange&, unsigned, LaneBitmask LaneMask = 0); void verifyStackFrame(); + + void verifySlotIndexes() const; }; struct MachineVerifierPass : public MachineFunctionPass { @@ -273,6 +276,19 @@ void MachineFunction::verify(Pass *p, const char *Banner) const { .runOnMachineFunction(const_cast<MachineFunction&>(*this)); } +void MachineVerifier::verifySlotIndexes() const { + if (Indexes == nullptr) + return; + + // Ensure the IdxMBB list is sorted by slot indexes. + SlotIndex Last; + for (SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(), + E = Indexes->MBBIndexEnd(); I != E; ++I) { + assert(!Last.isValid() || I->first > Last); + Last = I->first; + } +} + bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { foundErrors = 0; @@ -295,10 +311,12 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>(); } + verifySlotIndexes(); + visitMachineFunctionBefore(); for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end(); MFI!=MFE; ++MFI) { - visitMachineBasicBlockBefore(MFI); + visitMachineBasicBlockBefore(&*MFI); // Keep track of the current bundle header. const MachineInstr *CurBundle = nullptr; // Do we expect the next instruction to be part of the same bundle? @@ -306,7 +324,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) { - if (MBBI->getParent() != MFI) { + if (MBBI->getParent() != &*MFI) { report("Bad instruction parent pointer", MFI); errs() << "Instruction: " << *MBBI; continue; @@ -315,20 +333,22 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { // Check for consistent bundle flags. if (InBundle && !MBBI->isBundledWithPred()) report("Missing BundledPred flag, " - "BundledSucc was set on predecessor", MBBI); + "BundledSucc was set on predecessor", + &*MBBI); if (!InBundle && MBBI->isBundledWithPred()) report("BundledPred flag is set, " - "but BundledSucc not set on predecessor", MBBI); + "but BundledSucc not set on predecessor", + &*MBBI); // Is this a bundle header? 
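// Schematic of the bundle flags checked below (a three-instruction bundle;
// made-up shape, for orientation only):
//   header: isInsideBundle() == false, isBundledWithSucc() == true
//   middle: isBundledWithPred() == true, isBundledWithSucc() == true
//   last:   isBundledWithPred() == true, isBundledWithSucc() == false
// The InBundle flag tracked by the loop must agree with these pairwise
// flags at every step.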
if (!MBBI->isInsideBundle()) { if (CurBundle) visitMachineBundleAfter(CurBundle); - CurBundle = MBBI; + CurBundle = &*MBBI; visitMachineBundleBefore(CurBundle); } else if (!CurBundle) report("No bundle header", MBBI); - visitMachineInstrBefore(MBBI); + visitMachineInstrBefore(&*MBBI); for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { const MachineInstr &MI = *MBBI; const MachineOperand &Op = MI.getOperand(I); @@ -341,7 +361,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { visitMachineOperand(&Op, I); } - visitMachineInstrAfter(MBBI); + visitMachineInstrAfter(&*MBBI); // Was this the last bundled instruction? InBundle = MBBI->isBundledWithSucc(); @@ -350,7 +370,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { visitMachineBundleAfter(CurBundle); if (InBundle) report("BundledSucc flag set on last instruction in block", &MFI->back()); - visitMachineBasicBlockAfter(MFI); + visitMachineBasicBlockAfter(&*MFI); } visitMachineFunctionAfter(); @@ -375,7 +395,10 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) { if (!foundErrors++) { if (Banner) errs() << "# " << Banner << '\n'; - MF->print(errs(), Indexes); + if (LiveInts != nullptr) + LiveInts->print(errs()); + else + MF->print(errs(), Indexes); } errs() << "*** Bad machine code: " << msg << " ***\n" << "- function: " << MF->getName() << "\n"; @@ -399,7 +422,8 @@ void MachineVerifier::report(const char *msg, const MachineInstr *MI) { errs() << "- instruction: "; if (Indexes && Indexes->hasIndex(MI)) errs() << Indexes->getInstructionIndex(MI) << '\t'; - MI->print(errs(), TM); + MI->print(errs(), /*SkipOpers=*/true); + errs() << '\n'; } void MachineVerifier::report(const char *msg, @@ -411,36 +435,24 @@ void MachineVerifier::report(const char *msg, errs() << "\n"; } -void MachineVerifier::report(const char *msg, const MachineFunction *MF, - const LiveInterval &LI) { - report(msg, MF); - errs() << "- interval: " << LI << '\n'; -} - -void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, - const LiveInterval &LI) { - report(msg, MBB); +void MachineVerifier::report_context(const LiveInterval &LI) const { errs() << "- interval: " << LI << '\n'; } -void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { - report(msg, MBB); - errs() << "- liverange: " << LR << '\n'; +void MachineVerifier::report_context(const LiveRange &LR, unsigned Reg, + LaneBitmask LaneMask) const { errs() << "- register: " << PrintReg(Reg, TRI) << '\n'; if (LaneMask != 0) - errs() << "- lanemask: " << format("%04X\n", LaneMask); + errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; + errs() << "- liverange: " << LR << '\n'; } -void MachineVerifier::report(const char *msg, const MachineFunction *MF, - const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { - report(msg, MF); - errs() << "- liverange: " << LR << '\n'; - errs() << "- register: " << PrintReg(Reg, TRI) << '\n'; - if (LaneMask != 0) - errs() << "- lanemask: " << format("%04X\n", LaneMask); +void MachineVerifier::report_context(const LiveRange::Segment &S) const { + errs() << "- segment: " << S << '\n'; +} + +void MachineVerifier::report_context(const VNInfo &VNI) const { + errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; } void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { @@ -507,11 +519,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MRI->isSSA()) { // If 
this block has allocatable physical registers live-in, check that // it is an entry block or landing pad. - for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(), - LE = MBB->livein_end(); - LI != LE; ++LI) { - unsigned reg = *LI; - if (isAllocatable(reg) && !MBB->isLandingPad() && + for (const auto &LI : MBB->liveins()) { + if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && MBB != MBB->getParent()->begin()) { report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); } @@ -522,7 +531,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - if ((*I)->isLandingPad()) + if ((*I)->isEHPad()) LandingPadSuccs.insert(*I); if (!FunctionBlocks.count(*I)) report("MBB has successor that isn't part of the function.", MBB); @@ -547,10 +556,12 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { const MCAsmInfo *AsmInfo = TM->getMCAsmInfo(); const BasicBlock *BB = MBB->getBasicBlock(); + const Function *Fn = MF->getFunction(); if (LandingPadSuccs.size() > 1 && !(AsmInfo && AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj && - BB && isa<SwitchInst>(BB->getTerminator()))) + BB && isa<SwitchInst>(BB->getTerminator())) && + !isFuncletEHPersonality(classifyEHPersonality(Fn->getPersonalityFn()))) report("MBB has more than one landing pad successor", MBB); // Call AnalyzeBranch. If it succeeds, there are several more conditions to check. @@ -562,7 +573,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { // check whether its answers match up with reality. if (!TBB && !FBB) { // Block falls through to its successor. - MachineFunction::const_iterator MBBI = MBB; + MachineFunction::const_iterator MBBI = MBB->getIterator(); ++MBBI; if (MBBI == MF->end()) { // It's possible that the block legitimately ends with a noreturn @@ -575,7 +586,7 @@ } else if (MBB->succ_size() != 1+LandingPadSuccs.size()) { report("MBB exits via unconditional fall-through but doesn't have " "exactly one CFG successor!", MBB); - } else if (!MBB->isSuccessor(MBBI)) { + } else if (!MBB->isSuccessor(&*MBBI)) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } @@ -613,7 +624,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } } else if (TBB && !FBB && !Cond.empty()) { // Block conditionally branches somewhere, otherwise falls through.
- MachineFunction::const_iterator MBBI = MBB; + MachineFunction::const_iterator MBBI = MBB->getIterator(); ++MBBI; if (MBBI == MF->end()) { report("MBB conditionally falls through out of function!", MBB); @@ -628,7 +639,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } else if (MBB->succ_size() != 2) { report("MBB exits via conditional branch/fall-through but doesn't have " "exactly two CFG successors!", MBB); - } else if (!matchPair(MBB->succ_begin(), TBB, MBBI)) { + } else if (!matchPair(MBB->succ_begin(), TBB, &*MBBI)) { report("MBB exits via conditional branch/fall-through but the CFG " "successors don't match the actual successors!", MBB); } @@ -680,13 +691,12 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } regsLive.clear(); - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - if (!TargetRegisterInfo::isPhysicalRegister(*I)) { + for (const auto &LI : MBB->liveins()) { + if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { report("MBB live-in list contains non-physical register", MBB); continue; } - for (MCSubRegIterator SubRegs(*I, TRI, /*IncludeSelf=*/true); + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) regsLive.insert(*SubRegs); } @@ -822,9 +832,12 @@ void MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { const MachineInstr *MI = MO->getParent(); const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumDefs = MCID.getNumDefs(); + if (MCID.getOpcode() == TargetOpcode::PATCHPOINT) + NumDefs = (MONum == 0 && MO->isReg()) ? NumDefs : 0; // The first MCID.NumDefs operands must be explicit register defines - if (MONum < MCID.getNumDefs()) { + if (MONum < NumDefs) { const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; if (!MO->isReg()) report("Explicit definition must be a register", MO, MONum); @@ -972,13 +985,38 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { case MachineOperand::MO_FrameIndex: if (LiveStks && LiveStks->hasInterval(MO->getIndex()) && LiveInts && !LiveInts->isNotInMIMap(MI)) { - LiveInterval &LI = LiveStks->getInterval(MO->getIndex()); + int FI = MO->getIndex(); + LiveInterval &LI = LiveStks->getInterval(FI); SlotIndex Idx = LiveInts->getInstructionIndex(MI); - if (MI->mayLoad() && !LI.liveAt(Idx.getRegSlot(true))) { + + bool stores = MI->mayStore(); + bool loads = MI->mayLoad(); + // For a memory-to-memory move, we need to check if the frame + // index is used for storing or loading, by inspecting the + // memory operands. 
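// Concretely (hypothetical memoperands, not from the patch): a
// memory-to-memory move through frame indexes could carry both
// mem:LD4[FixedStack3] and mem:ST4[FixedStack2]. Only the
// FixedStackPseudoSourceValue attached to each memoperand reveals whether
// the frame index FI under inspection is the load source or the store
// destination, so the loop below narrows the `loads`/`stores` flags
// accordingly.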
+ if (stores && loads) { + for (auto *MMO : MI->memoperands()) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV == nullptr) continue; + const FixedStackPseudoSourceValue *Value = + dyn_cast<FixedStackPseudoSourceValue>(PSV); + if (Value == nullptr) continue; + if (Value->getFrameIndex() != FI) continue; + + if (MMO->isStore()) + loads = false; + else + stores = false; + break; + } + if (loads == stores) + report("Missing fixed stack memoperand.", MI); + } + if (loads && !LI.liveAt(Idx.getRegSlot(true))) { report("Instruction loads from dead spill slot", MO, MONum); errs() << "Live stack: " << LI << '\n'; } - if (MI->mayStore() && !LI.liveAt(Idx.getRegSlot())) { + if (stores && !LI.liveAt(Idx.getRegSlot())) { report("Instruction stores to dead spill slot", MO, MONum); errs() << "Live stack: " << LI << '\n'; } @@ -1387,40 +1425,39 @@ void MachineVerifier::verifyLiveIntervals() { void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const VNInfo *VNI, unsigned Reg, - unsigned LaneMask) { + LaneBitmask LaneMask) { if (VNI->isUnused()) return; const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def); if (!DefVNI) { - report("Valno not live at def and not marked unused", MF, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << '\n'; + report("Value not live at VNInfo def and not marked unused", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } if (DefVNI != VNI) { - report("Live segment at def has different valno", MF, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << " where valno #" << DefVNI->id << " is live\n"; + report("Live segment at def has different VNInfo", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); if (!MBB) { - report("Invalid definition index", MF, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LR << '\n'; + report("Invalid VNInfo definition index", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } if (VNI->isPHIDef()) { if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { - report("PHIDef value is not defined at MBB start", MBB, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << ", not at the beginning of BB#" << MBB->getNumber() << '\n'; + report("PHIDef VNInfo is not defined at MBB start", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } return; } @@ -1428,8 +1465,9 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, // Non-PHI def. const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); if (!MI) { - report("No instruction at def index", MBB, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("No instruction at VNInfo def index", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } @@ -1457,60 +1495,67 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (!hasDef) { report("Defining instruction does not modify register", MI); - errs() << "Valno #" << VNI->id << " in " << LR << '\n'; + report_context(LR, Reg, LaneMask); + report_context(*VNI); } // Early clobber defs begin at USE slots, but other defs must begin at // DEF slots. 
if (isEarlyClobber) { if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MBB, LR, - Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("Early clobber def must be at an early-clobber slot", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } } else if (!VNI->def.isRegister()) { - report("Non-PHI, non-early clobber def must be at a register slot", - MBB, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("Non-PHI, non-early clobber def must be at a register slot", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } } } void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const LiveRange::const_iterator I, - unsigned Reg, unsigned LaneMask) { + unsigned Reg, LaneBitmask LaneMask) +{ const LiveRange::Segment &S = *I; const VNInfo *VNI = S.valno; assert(VNI && "Live segment has no valno"); if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) { - report("Foreign valno in live segment", MF, LR, Reg, LaneMask); - errs() << S << " has a bad valno\n"; + report("Foreign valno in live segment", MF); + report_context(LR, Reg, LaneMask); + report_context(S); + report_context(*VNI); } if (VNI->isUnused()) { - report("Live segment valno is marked unused", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Live segment valno is marked unused", MF); + report_context(LR, Reg, LaneMask); + report_context(S); } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start); if (!MBB) { - report("Bad start of live segment, no basic block", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Bad start of live segment, no basic block", MF); + report_context(LR, Reg, LaneMask); + report_context(S); return; } SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); if (S.start != MBBStartIdx && S.start != VNI->def) { - report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment must begin at MBB entry or valno def", MBB); + report_context(LR, Reg, LaneMask); + report_context(S); } const MachineBasicBlock *EndMBB = LiveInts->getMBBFromIndex(S.end.getPrevSlot()); if (!EndMBB) { - report("Bad end of live segment, no basic block", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Bad end of live segment, no basic block", MF); + report_context(LR, Reg, LaneMask); + report_context(S); return; } @@ -1527,26 +1572,26 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const MachineInstr *MI = LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); if (!MI) { - report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment doesn't end at a valid instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); return; } // The block slot must refer to a basic block boundary. if (S.end.isBlock()) { - report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment ends at B slot of an instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } if (S.end.isDead()) { // Segment ends on the dead slot. // That means there must be a dead def. 
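// For example (invented slot indexes): a dead def produces a segment
// printed as [16r,16d:0), starting at the register slot and ending at the
// dead slot of the same instruction, so SlotIndex::isSameInstr(S.start,
// S.end) holds; a segment such as [16r,32d:0) would span two instructions
// and is reported below.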
if (!SlotIndex::isSameInstr(S.start, S.end)) { - report("Live segment ending at dead slot spans instructions", EndMBB, LR, - Reg, LaneMask); - errs() << S << '\n'; + report("Live segment ending at dead slot spans instructions", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } } @@ -1555,9 +1600,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (S.end.isEarlyClobber()) { if (I+1 == LR.end() || (I+1)->start != S.end) { report("Live segment ending at early clobber slot must be " - "redefined by an EC def in the same instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + "redefined by an EC def in the same instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } } @@ -1587,14 +1632,15 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, !hasSubRegDef) { report("Instruction ending live segment doesn't read the register", MI); - errs() << S << " in " << LR << '\n'; + report_context(LR, Reg, LaneMask); + report_context(S); } } } } // Now check all the basic blocks in this live segment. - MachineFunction::const_iterator MFI = MBB; + MachineFunction::const_iterator MFI = MBB->getIterator(); // Is this live segment the beginning of a non-PHIDef VN? if (S.start == VNI->def && !VNI->isPHIDef()) { // Not live-in to any blocks. @@ -1604,10 +1650,10 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, ++MFI; } for (;;) { - assert(LiveInts->isLiveInToMBB(LR, MFI)); + assert(LiveInts->isLiveInToMBB(LR, &*MFI)); // We don't know how to track physregs into a landing pad. if (!TargetRegisterInfo::isVirtualRegister(Reg) && - MFI->isLandingPad()) { + MFI->isEHPad()) { if (&*MFI == EndMBB) break; ++MFI; @@ -1616,7 +1662,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Is VNI a PHI-def in the current block? bool IsPHI = VNI->isPHIDef() && - VNI->def == LiveInts->getMBBStartIdx(MFI); + VNI->def == LiveInts->getMBBStartIdx(&*MFI); // Check that VNI is live-out of all predecessors. for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), @@ -1626,22 +1672,23 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // All predecessors must have a live-out value. if (!PVNI) { - report("Register not marked live out of predecessor", *PI, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before " - << PEnd << '\n'; + report("Register not marked live out of predecessor", *PI); + report_context(LR, Reg, LaneMask); + report_context(*VNI); + errs() << " live into BB#" << MFI->getNumber() + << '@' << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " + << PEnd << '\n'; continue; } // Only PHI-defs can take different predecessor values. 
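// Concretely (invented numbering): if a vreg is live into BB#3 with VNI #2
// while predecessor BB#1 has VNI #4 live out at its end slot, that is only
// legal when #2 is a PHI-def at the start of BB#3 merging #4 with the
// other incoming values; any non-PHI mismatch is reported below.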
if (!IsPHI && PVNI != VNI) { - report("Different value live out of predecessor", *PI, LR, Reg, - LaneMask); + report("Different value live out of predecessor", *PI); + report_context(LR, Reg, LaneMask); errs() << "Valno #" << PVNI->id << " live out of BB#" - << (*PI)->getNumber() << '@' << PEnd - << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << '\n'; + << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id + << " live into BB#" << MFI->getNumber() << '@' + << LiveInts->getMBBStartIdx(&*MFI) << '\n'; } } if (&*MFI == EndMBB) @@ -1651,7 +1698,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, } void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { + LaneBitmask LaneMask) { for (const VNInfo *VNI : LR.valnos) verifyLiveRangeValue(LR, VNI, Reg, LaneMask); @@ -1664,24 +1711,35 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { assert(TargetRegisterInfo::isVirtualRegister(Reg)); verifyLiveRange(LI, Reg); - unsigned Mask = 0; - unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask Mask = 0; + LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg); for (const LiveInterval::SubRange &SR : LI.subranges()) { - if ((Mask & SR.LaneMask) != 0) - report("Lane masks of sub ranges overlap in live interval", MF, LI); - if ((SR.LaneMask & ~MaxMask) != 0) - report("Subrange lanemask is invalid", MF, LI); + if ((Mask & SR.LaneMask) != 0) { + report("Lane masks of sub ranges overlap in live interval", MF); + report_context(LI); + } + if ((SR.LaneMask & ~MaxMask) != 0) { + report("Subrange lanemask is invalid", MF); + report_context(LI); + } + if (SR.empty()) { + report("Subrange must not be empty", MF); + report_context(SR, LI.reg, SR.LaneMask); + } Mask |= SR.LaneMask; verifyLiveRange(SR, LI.reg, SR.LaneMask); - if (!LI.covers(SR)) - report("A Subrange is not covered by the main range", MF, LI); + if (!LI.covers(SR)) { + report("A Subrange is not covered by the main range", MF); + report_context(LI); + } } // Check the LI only has one connected component. ConnectedVNInfoEqClasses ConEQ(*LiveInts); - unsigned NumComp = ConEQ.Classify(&LI); + unsigned NumComp = ConEQ.Classify(LI); if (NumComp > 1) { - report("Multiple connected components in live interval", MF, LI); + report("Multiple connected components in live interval", MF); + report_context(LI); for (unsigned comp = 0; comp != NumComp; ++comp) { errs() << comp << ": valnos"; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp index d343301..2c93792 100644 --- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp +++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp @@ -548,7 +548,7 @@ void PHIElimination::analyzePHINodes(const MachineFunction& MF) { bool PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, MachineLoopInfo *MLI) { - if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) + if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad()) return false; // Quick exit for basic blocks without PHIs. const MachineLoop *CurLoop = MLI ? 
MLI->getLoopFor(&MBB) : nullptr; diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp index 99bbad1..4cabc3a 100644 --- a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp +++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp @@ -28,7 +28,7 @@ llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB, // Usually, we just want to insert the copy before the first terminator // instruction. However, for the edge going to a landing pad, we must insert // the copy before the call/invoke instruction. - if (!SuccMBB->isLandingPad()) + if (!SuccMBB->isEHPad()) return MBB->getFirstTerminator(); // Discover any defs/uses in this basic block. diff --git a/contrib/llvm/lib/CodeGen/ParallelCG.cpp b/contrib/llvm/lib/CodeGen/ParallelCG.cpp new file mode 100644 index 0000000..e73ba02 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/ParallelCG.cpp @@ -0,0 +1,96 @@ +//===-- ParallelCG.cpp ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines functions that can be used for parallel code generation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ParallelCG.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/thread.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SplitModule.h" + +using namespace llvm; + +static void codegen(Module *M, llvm::raw_pwrite_stream &OS, + const Target *TheTarget, StringRef CPU, StringRef Features, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, + TargetMachine::CodeGenFileType FileType) { + std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine( + M->getTargetTriple(), CPU, Features, Options, RM, CM, OL)); + + legacy::PassManager CodeGenPasses; + if (TM->addPassesToEmitFile(CodeGenPasses, OS, FileType)) + report_fatal_error("Failed to setup codegen"); + CodeGenPasses.run(*M); +} + +std::unique_ptr<Module> +llvm::splitCodeGen(std::unique_ptr<Module> M, + ArrayRef<llvm::raw_pwrite_stream *> OSs, StringRef CPU, + StringRef Features, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, + TargetMachine::CodeGenFileType FileType) { + StringRef TripleStr = M->getTargetTriple(); + std::string ErrMsg; + const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg); + if (!TheTarget) + report_fatal_error(Twine("Target not found: ") + ErrMsg); + + if (OSs.size() == 1) { + codegen(M.get(), *OSs[0], TheTarget, CPU, Features, Options, RM, CM, + OL, FileType); + return M; + } + + std::vector<thread> Threads; + SplitModule(std::move(M), OSs.size(), [&](std::unique_ptr<Module> MPart) { + // We want to clone the module in a new context to multi-thread the codegen. + // We do it by serializing partition modules to bitcode (while still on the + // main thread, in order to avoid data races) and spinning up new threads + // which deserialize the partitions into separate contexts. 
+ // FIXME: Provide a more direct way to do this in LLVM. + SmallVector<char, 0> BC; + raw_svector_ostream BCOS(BC); + WriteBitcodeToFile(MPart.get(), BCOS); + + llvm::raw_pwrite_stream *ThreadOS = OSs[Threads.size()]; + Threads.emplace_back( + [TheTarget, CPU, Features, Options, RM, CM, OL, FileType, + ThreadOS](const SmallVector<char, 0> &BC) { + LLVMContext Ctx; + ErrorOr<std::unique_ptr<Module>> MOrErr = + parseBitcodeFile(MemoryBufferRef(StringRef(BC.data(), BC.size()), + "<split-module>"), + Ctx); + if (!MOrErr) + report_fatal_error("Failed to read bitcode"); + std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get()); + + codegen(MPartInCtx.get(), *ThreadOS, TheTarget, CPU, Features, + Options, RM, CM, OL, FileType); + }, + // Pass BC using std::move to ensure that it gets moved rather than + // copied into the thread's context. + std::move(BC)); + }); + + for (thread &T : Threads) + T.join(); + + return {}; +} diff --git a/contrib/llvm/lib/CodeGen/Passes.cpp b/contrib/llvm/lib/CodeGen/Passes.cpp index 024d166..873f712 100644 --- a/contrib/llvm/lib/CodeGen/Passes.cpp +++ b/contrib/llvm/lib/CodeGen/Passes.cpp @@ -13,7 +13,11 @@ //===---------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/IR/IRPrintingPasses.h" @@ -52,9 +56,6 @@ static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden, cl::desc("Disable Machine LICM")); static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden, cl::desc("Disable Machine Common Subexpression Elimination")); -static cl::opt<cl::boolOrDefault> - EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, - cl::desc("enable the shrink-wrapping pass")); static cl::opt<cl::boolOrDefault> OptimizeRegAlloc( "optimize-regalloc", cl::Hidden, cl::desc("Enable optimized register allocation compilation path.")); @@ -95,10 +96,10 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with -// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); Ideally it -// wouldn't be part of the standard pass pipeline, and the target would just add -// a PostRA scheduling pass wherever it wants. -static cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden, +// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID). +// Targets can return true in targetSchedulesPostRAScheduling() and +// insert a PostRA scheduling pass wherever they want. +cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden, cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)")); // Experimental option to run live interval analysis early.
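As a usage sketch for the new llvm::splitCodeGen entry point added in ParallelCG.cpp above (hypothetical caller code: `M`, `Options`, and the module's target triple are assumed to come from the host tool, and error handling is elided):

    // Split M into three partitions and run codegen for each on its own thread.
    SmallVector<char, 0> Buf0, Buf1, Buf2;
    raw_svector_ostream OS0(Buf0), OS1(Buf1), OS2(Buf2);
    llvm::raw_pwrite_stream *Streams[] = {&OS0, &OS1, &OS2};

    std::unique_ptr<Module> Ret = llvm::splitCodeGen(
        std::move(M), Streams, /*CPU=*/"", /*Features=*/"", Options,
        Reloc::Default, CodeModel::Default, CodeGenOpt::Default,
        TargetMachine::CGFT_ObjectFile);
    // With more than one output stream, splitCodeGen hands the module off to
    // the worker threads and returns a null pointer after joining them.
    assert(!Ret && "ownership was transferred to the workers");
    // Buf0..Buf2 each now hold one object-file partition.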
@@ -188,6 +189,29 @@ char TargetPassConfig::ID = 0; char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; +namespace { +struct InsertedPass { + AnalysisID TargetPassID; + IdentifyingPassPtr InsertedPassID; + bool VerifyAfter; + bool PrintAfter; + + InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID, + bool VerifyAfter, bool PrintAfter) + : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID), + VerifyAfter(VerifyAfter), PrintAfter(PrintAfter) {} + + Pass *getInsertedPass() const { + assert(InsertedPassID.isValid() && "Illegal Pass ID!"); + if (InsertedPassID.isInstance()) + return InsertedPassID.getInstance(); + Pass *NP = Pass::createPass(InsertedPassID.getID()); + assert(NP && "Pass ID not registered"); + return NP; + } +}; +} + namespace llvm { class PassConfigImpl { public: @@ -202,7 +226,7 @@ public: /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass /// is inserted after each instance of the first one. - SmallVector<std::pair<AnalysisID, IdentifyingPassPtr>, 4> InsertedPasses; + SmallVector<InsertedPass, 4> InsertedPasses; }; } // namespace llvm @@ -217,7 +241,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), StartBefore(nullptr), StartAfter(nullptr), StopAfter(nullptr), Started(true), Stopped(false), AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true), EnableShrinkWrap(false) { + DisableVerify(false), EnableTailMerge(true) { Impl = new PassConfigImpl(); @@ -225,6 +249,10 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) // including this pass itself. initializeCodeGen(*PassRegistry::getPassRegistry()); + // Also register alias analysis passes required by codegen passes. + initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry()); + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); + // Substitute Pseudo Pass IDs for real ones. substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); substitutePass(&PostRAMachineLICMID, &MachineLICMID); @@ -232,14 +260,15 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) /// Insert InsertedPassID pass after TargetPassID. void TargetPassConfig::insertPass(AnalysisID TargetPassID, - IdentifyingPassPtr InsertedPassID) { + IdentifyingPassPtr InsertedPassID, + bool VerifyAfter, bool PrintAfter) { assert(((!InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getID()) || (InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getInstance()->getPassID())) && "Insert a pass after itself!"); - std::pair<AnalysisID, IdentifyingPassPtr> P(TargetPassID, InsertedPassID); - Impl->InsertedPasses.push_back(P); + Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter, + PrintAfter); } /// createPassConfig - Create a pass configuration object to be used by @@ -304,21 +333,9 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) { } // Add the passes after the pass P if there is any. 
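// (Illustrative, hypothetical target code: with VerifyAfter/PrintAfter now
// threaded through InsertedPass, a backend's TargetPassConfig subclass can
// request e.g.
//   insertPass(&RegisterCoalescerID, &MyTargetPeepholeID,
//              /*VerifyAfter=*/false, /*PrintAfter=*/false);
// where MyTargetPeepholeID is an invented pass ID, and the loop below will
// schedule the inserted pass with exactly those verification/printing
// settings instead of the old hard-coded false/false.)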
- for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator - I = Impl->InsertedPasses.begin(), - E = Impl->InsertedPasses.end(); - I != E; ++I) { - if ((*I).first == PassID) { - assert((*I).second.isValid() && "Illegal Pass ID!"); - Pass *NP; - if ((*I).second.isInstance()) - NP = (*I).second.getInstance(); - else { - NP = Pass::createPass((*I).second.getID()); - assert(NP && "Pass ID not registered"); - } - addPass(NP, false, false); - } + for (auto IP : Impl->InsertedPasses) { + if (IP.TargetPassID == PassID) + addPass(IP.getInsertedPass(), IP.VerifyAfter, IP.PrintAfter); } } else { delete P; @@ -380,10 +397,10 @@ void TargetPassConfig::addIRPasses() { // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. if (UseCFLAA) - addPass(createCFLAliasAnalysisPass()); - addPass(createTypeBasedAliasAnalysisPass()); - addPass(createScopedNoAliasAAPass()); - addPass(createBasicAliasAnalysisPass()); + addPass(createCFLAAWrapperPass()); + addPass(createTypeBasedAAWrapperPass()); + addPass(createScopedNoAliasAAWrapperPass()); + addPass(createBasicAAWrapperPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. @@ -461,7 +478,7 @@ void TargetPassConfig::addISelPrepare() { // Add both the safe stack and the stack protection passes: each of them will // only protect functions that have corresponding attributes. - addPass(createSafeStackPass()); + addPass(createSafeStackPass(TM)); addPass(createStackProtectorPass(TM)); if (PrintISelInput) @@ -539,8 +556,9 @@ void TargetPassConfig::addMachinePasses() { addPostRegAlloc(); // Insert prolog/epilog code. Eliminate abstract frame index references... - if (getEnableShrinkWrap()) + if (getOptLevel() != CodeGenOpt::None) addPass(&ShrinkWrapID); + addPass(&PrologEpilogCodeInserterID); /// Add passes that optimize machine instructions after register allocation. @@ -557,7 +575,10 @@ void TargetPassConfig::addMachinePasses() { addPass(&ImplicitNullChecksID); // Second pass scheduler. - if (getOptLevel() != CodeGenOpt::None) { + // Let Target optionally insert this pass by itself at some other + // point. + if (getOptLevel() != CodeGenOpt::None && + !TM->targetSchedulesPostRAScheduling()) { if (MISchedPostRA) addPass(&PostMachineSchedulerID); else @@ -576,7 +597,10 @@ void TargetPassConfig::addMachinePasses() { addPreEmitPass(); + addPass(&FuncletLayoutID, false); + addPass(&StackMapLivenessID, false); + addPass(&LiveDebugValuesID, false); AddingMachinePasses = false; } @@ -613,27 +637,12 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&MachineCSEID, false); addPass(&MachineSinkingID); - addPass(&PeepholeOptimizerID, false); + addPass(&PeepholeOptimizerID); // Clean-up the dead code that may have been generated by peephole // rewriting. addPass(&DeadMachineInstructionElimID); } -bool TargetPassConfig::getEnableShrinkWrap() const { - switch (EnableShrinkWrapOpt) { - case cl::BOU_UNSET: - return EnableShrinkWrap && getOptLevel() != CodeGenOpt::None; - // If EnableShrinkWrap is set, it takes precedence on whatever the - // target sets. The rational is that we assume we want to test - // something related to shrink-wrapping. 
- case cl::BOU_TRUE: - return true; - case cl::BOU_FALSE: - return false; - } - llvm_unreachable("Invalid shrink-wrapping state"); -} - //===---------------------------------------------------------------------===// /// Register Allocation Pass Configuration //===---------------------------------------------------------------------===// @@ -717,7 +726,8 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { addPass(&PHIEliminationID, false); addPass(&TwoAddressInstructionPassID, false); - addPass(RegAllocPass); + if (RegAllocPass) + addPass(RegAllocPass); } /// Add standard target-independent passes that are tightly coupled with @@ -748,25 +758,27 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // PreRA instruction scheduling. addPass(&MachineSchedulerID); - // Add the selected register allocation pass. - addPass(RegAllocPass); + if (RegAllocPass) { + // Add the selected register allocation pass. + addPass(RegAllocPass); - // Allow targets to change the register assignments before rewriting. - addPreRewrite(); + // Allow targets to change the register assignments before rewriting. + addPreRewrite(); - // Finally rewrite virtual registers. - addPass(&VirtRegRewriterID); + // Finally rewrite virtual registers. + addPass(&VirtRegRewriterID); - // Perform stack slot coloring and post-ra machine LICM. - // - // FIXME: Re-enable coloring with register when it's capable of adding - // kill markers. - addPass(&StackSlotColoringID); + // Perform stack slot coloring and post-ra machine LICM. + // + // FIXME: Re-enable coloring with register when it's capable of adding + // kill markers. + addPass(&StackSlotColoringID); - // Run post-ra machine LICM to hoist reloads / remats. - // - // FIXME: can this move into MachineLateOptimization? - addPass(&PostRAMachineLICMID); + // Run post-ra machine LICM to hoist reloads / remats. + // + // FIXME: can this move into MachineLateOptimization? + addPass(&PostRAMachineLICMID); + } } //===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp index ebe05e3..52b42b6 100644 --- a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -43,7 +43,7 @@ // - Optimize Loads: // // Loads that can be folded into a later instruction. A load is foldable -// if it loads to virtual registers and the virtual register defined has +// if it loads to virtual registers and the virtual register defined has // a single use. // // - Optimize Copies and Bitcast (more generally, target specific copies): @@ -98,6 +98,16 @@ static cl::opt<bool> DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); +static cl::opt<bool> DisableNAPhysCopyOpt( + "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false), + cl::desc("Disable non-allocatable physical register copy optimization")); + +// Limit the number of PHI instructions to process +// in PeepholeOptimizer::getNextSource. 
+static cl::opt<unsigned> RewritePHILimit( + "rewrite-phi-limit", cl::Hidden, cl::init(10), + cl::desc("Limit the length of PHI chains to lookup")); + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -105,8 +115,11 @@ STATISTIC(NumLoadFold, "Number of loads folded"); STATISTIC(NumSelects, "Number of selects optimized"); STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized"); STATISTIC(NumRewrittenCopies, "Number of copies rewritten"); +STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { + class ValueTrackerResult; + class PeepholeOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -130,6 +143,10 @@ namespace { } } + /// \brief Track Def -> Use info used for rewriting copies. + typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult> + RewriteMapTy; + private: bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, @@ -137,17 +154,38 @@ namespace { bool optimizeSelect(MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs); bool optimizeCondBranch(MachineInstr *MI); - bool optimizeCopyOrBitcast(MachineInstr *MI); bool optimizeCoalescableCopy(MachineInstr *MI); bool optimizeUncoalescableCopy(MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs); - bool findNextSource(unsigned &Reg, unsigned &SubReg); + bool findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); + + /// \brief If copy instruction \p MI is a virtual register copy, track it in + /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was + /// previously seen as a copy, replace the uses of this copy with the + /// previously seen copy's destination register. + bool foldRedundantCopy(MachineInstr *MI, + SmallSet<unsigned, 4> &CopySrcRegs, + DenseMap<unsigned, MachineInstr *> &CopyMIs); + + /// \brief Is the register \p Reg a non-allocatable physical register? + bool isNAPhysCopy(unsigned Reg); + + /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical + /// register copy, track it in the \p NAPhysToVirtMIs map. If this + /// non-allocatable physical register was previously copied to a virtual + /// registered and hasn't been clobbered, the virt->phys copy can be + /// deleted. + bool foldRedundantNAPhysCopy( + MachineInstr *MI, + DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs); + bool isLoadFoldable(MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates); @@ -171,6 +209,69 @@ namespace { } }; + /// \brief Helper class to hold a reply for ValueTracker queries. Contains the + /// returned sources for a given search and the instructions where the sources + /// were tracked from. + class ValueTrackerResult { + private: + /// Track all sources found by one ValueTracker query. + SmallVector<TargetInstrInfo::RegSubRegPair, 2> RegSrcs; + + /// Instruction using the sources in 'RegSrcs'. 
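The RewriteMapTy introduced above maps each (Reg, SubReg) definition to the sources it was traced to; findNextSource fills it and the rewriters later walk it back. A toy analogue with standard containers (the real code uses a SmallDenseMap keyed by TargetInstrInfo::RegSubRegPair) shows the resolution walk for single-source chains:

#include <map>
#include <vector>

struct RegSub {
  unsigned Reg = 0, Sub = 0;
  bool operator<(const RegSub &O) const {
    return Reg != O.Reg ? Reg < O.Reg : Sub < O.Sub;
  }
};

using ToyRewriteMap = std::map<RegSub, std::vector<RegSub>>;

// Follow the map from Def until a pair with no entry remains: that pair is
// the "next source" a rewriter can use (single-source chains only; PHIs,
// which map to several sources, need the recursive handling shown later).
static RegSub resolve(const ToyRewriteMap &M, RegSub Def) {
  auto It = M.find(Def);
  while (It != M.end() && It->second.size() == 1) {
    Def = It->second.front();
    It = M.find(Def);
  }
  return Def;
}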
+ const MachineInstr *Inst; + + public: + ValueTrackerResult() : Inst(nullptr) {} + ValueTrackerResult(unsigned Reg, unsigned SubReg) : Inst(nullptr) { + addSource(Reg, SubReg); + } + + bool isValid() const { return getNumSources() > 0; } + + void setInst(const MachineInstr *I) { Inst = I; } + const MachineInstr *getInst() const { return Inst; } + + void clear() { + RegSrcs.clear(); + Inst = nullptr; + } + + void addSource(unsigned SrcReg, unsigned SrcSubReg) { + RegSrcs.push_back(TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg)); + } + + void setSource(int Idx, unsigned SrcReg, unsigned SrcSubReg) { + assert(Idx < getNumSources() && "Reg pair source out of index"); + RegSrcs[Idx] = TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg); + } + + int getNumSources() const { return RegSrcs.size(); } + + unsigned getSrcReg(int Idx) const { + assert(Idx < getNumSources() && "Reg source out of index"); + return RegSrcs[Idx].Reg; + } + + unsigned getSrcSubReg(int Idx) const { + assert(Idx < getNumSources() && "SubReg source out of index"); + return RegSrcs[Idx].SubReg; + } + + bool operator==(const ValueTrackerResult &Other) { + if (Other.getInst() != getInst()) + return false; + + if (Other.getNumSources() != getNumSources()) + return false; + + for (int i = 0, e = Other.getNumSources(); i != e; ++i) + if (Other.getSrcReg(i) != getSrcReg(i) || + Other.getSrcSubReg(i) != getSrcSubReg(i)) + return false; + return true; + } + }; + /// \brief Helper class to track the possible sources of a value defined by /// a (chain of) copy related instructions. /// Given a definition (instruction and definition index), this class @@ -213,23 +314,25 @@ namespace { /// \brief Dispatcher to the right underlying implementation of /// getNextSource. - bool getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceImpl(); /// \brief Specialized version of getNextSource for Copy instructions. - bool getNextSourceFromCopy(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromCopy(); /// \brief Specialized version of getNextSource for Bitcast instructions. - bool getNextSourceFromBitcast(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromBitcast(); /// \brief Specialized version of getNextSource for RegSequence /// instructions. - bool getNextSourceFromRegSequence(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromRegSequence(); /// \brief Specialized version of getNextSource for InsertSubreg /// instructions. - bool getNextSourceFromInsertSubreg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromInsertSubreg(); /// \brief Specialized version of getNextSource for ExtractSubreg /// instructions. - bool getNextSourceFromExtractSubreg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromExtractSubreg(); /// \brief Specialized version of getNextSource for SubregToReg /// instructions. - bool getNextSourceFromSubregToReg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromSubregToReg(); + /// \brief Specialized version of getNextSource for PHI instructions. + ValueTrackerResult getNextSourceFromPHI(); public: /// \brief Create a ValueTracker instance for the value defined by \p Reg. @@ -276,16 +379,10 @@ namespace { /// \brief Following the use-def chain, get the next available source /// for the tracked value. - /// When the returned value is not nullptr, \p SrcReg gives the register - /// that contain the tracked value. 
- /// \note The sub register index returned in \p SrcSubReg must be used - /// on \p SrcReg to access the actual value. - /// \return Unless the returned value is nullptr (i.e., no source found), - /// \p SrcReg gives the register of the next source used in the returned - /// instruction and \p SrcSubReg the sub-register index to be used on that - /// source to get the tracked value. When nullptr is returned, no - /// alternative source has been found. - const MachineInstr *getNextSource(unsigned &SrcReg, unsigned &SrcSubReg); + /// \return A ValueTrackerResult containing a set of registers + /// and sub registers with tracked values. A ValueTrackerResult with + /// an empty set of registers means no source was found. + ValueTrackerResult getNextSource(); /// \brief Get the last register where the initial value can be found. /// Initially this is the register of the definition. @@ -303,11 +400,10 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", "Peephole Optimizations", false, false) -/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads -/// a single register and writes a single register and it does not modify the -/// source, and if the source value is preserved as a sub-register of the -/// result, then replace all reachable uses of the source with the subreg of the -/// result. +/// If instruction is a copy-like instruction, i.e. it reads a single register +/// and writes a single register and it does not modify the source, and if the +/// source value is preserved as a sub-register of the result, then replace all +/// reachable uses of the source with the subreg of the result. /// /// Do not generate an EXTRACT that is used only in a debug use, as this changes /// the code. Since this code does not currently share EXTRACTs, just ignore all @@ -458,10 +554,10 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, return Changed; } -/// optimizeCmpInstr - If the instruction is a compare and the previous -/// instruction it's comparing against all ready sets (or could be modified to -/// set) the same flag as the compare, then we can remove the comparison and use -/// the flag from the previous instruction. +/// If the instruction is a compare and the previous instruction it's comparing +/// against already sets (or could be modified to set) the same flag as the +/// compare, then we can remove the comparison and use the flag from the +/// previous instruction. bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB) { // If this instruction is a comparison against zero and isn't comparing a @@ -506,88 +602,138 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) { return TII->optimizeCondBranch(MI); } -/// \brief Check if the registers defined by the pair (RegisterClass, SubReg) -/// share the same register file. -static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) { - // Same register class. - if (DefRC == SrcRC) - return true; - - // Both operands are sub registers. Check if they share a register class. - unsigned SrcIdx, DefIdx; - if (SrcSubReg && DefSubReg) - return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != nullptr; - // At most one of the register is a sub register, make it Src to avoid - // duplicating the test. 
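The helper being deleted above used a small canonicalization trick worth preserving as a note: when at most one of two symmetric operands can carry a subregister index, swap it onto a fixed side so the one-sided case is written and tested exactly once. A sketch with toy stand-ins for the TargetRegisterInfo queries:

#include <utility>

// Toy compatibility checks standing in for the TargetRegisterInfo calls.
static bool commonSuperClass(int SrcRC, int DefRC, unsigned SrcSub,
                             unsigned DefSub) {
  return SrcRC == DefRC && SrcSub == DefSub;
}
static bool matchingSuperClass(int SrcRC, int DefRC, unsigned SrcSub) {
  return SrcRC == DefRC && SrcSub != 0;
}
static bool commonSubClass(int DefRC, int SrcRC) { return DefRC == SrcRC; }

static bool shareFile(int DefRC, unsigned DefSub, int SrcRC, unsigned SrcSub) {
  if (DefRC == SrcRC)
    return true;
  if (SrcSub && DefSub)
    return commonSuperClass(SrcRC, DefRC, SrcSub, DefSub);
  // At most one side has a subregister index from here on; swap it onto
  // Src so the one-sided case is handled by a single code path.
  if (!SrcSub) {
    std::swap(DefSub, SrcSub);
    std::swap(DefRC, SrcRC);
  }
  if (SrcSub)
    return matchingSuperClass(SrcRC, DefRC, SrcSub);
  return commonSubClass(DefRC, SrcRC);
}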
- if (!SrcSubReg) { - std::swap(DefSubReg, SrcSubReg); - std::swap(DefRC, SrcRC); - } - - // One of the register is a sub register, check if we can get a superclass. - if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; - // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; -} - /// \brief Try to find the next source that share the same register file /// for the value defined by \p Reg and \p SubReg. -/// When true is returned, \p Reg and \p SubReg are updated with the -/// register number and sub-register index of the new source. +/// When true is returned, the \p RewriteMap can be used by the client to +/// retrieve all Def -> Use along the way up to the next source. Any found +/// Use that is not itself a key for another entry, is the next source to +/// use. During the search for the next source, multiple sources can be found +/// given multiple incoming sources of a PHI instruction. In this case, we +/// look in each PHI source for the next source; all found next sources must +/// share the same register file as \p Reg and \p SubReg. The client should +/// then be capable to rewrite all intermediate PHIs to get the next source. /// \return False if no alternative sources are available. True otherwise. -bool PeepholeOptimizer::findNextSource(unsigned &Reg, unsigned &SubReg) { +bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap) { // Do not try to find a new source for a physical register. // So far we do not have any motivating example for doing that. // Thus, instead of maintaining untested code, we will revisit that if // that changes at some point. if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); - unsigned DefSubReg = SubReg; - - unsigned Src; - unsigned SrcSubReg; - bool ShouldRewrite = false; - - // Follow the chain of copies until we reach the top of the use-def chain - // or find a more suitable source. - ValueTracker ValTracker(Reg, DefSubReg, *MRI, !DisableAdvCopyOpt, TII); - do { - unsigned CopySrcReg, CopySrcSubReg; - if (!ValTracker.getNextSource(CopySrcReg, CopySrcSubReg)) - break; - Src = CopySrcReg; - SrcSubReg = CopySrcSubReg; - - // Do not extend the live-ranges of physical registers as they add - // constraints to the register allocator. - // Moreover, if we want to extend the live-range of a physical register, - // unlike SSA virtual register, we will have to check that they are not - // redefine before the related use. - if (TargetRegisterInfo::isPhysicalRegister(Src)) - break; - const TargetRegisterClass *SrcRC = MRI->getRegClass(Src); + SmallVector<TargetInstrInfo::RegSubRegPair, 4> SrcToLook; + TargetInstrInfo::RegSubRegPair CurSrcPair(Reg, SubReg); + SrcToLook.push_back(CurSrcPair); + + unsigned PHICount = 0; + while (!SrcToLook.empty() && PHICount < RewritePHILimit) { + TargetInstrInfo::RegSubRegPair Pair = SrcToLook.pop_back_val(); + // As explained above, do not handle physical registers + if (TargetRegisterInfo::isPhysicalRegister(Pair.Reg)) + return false; - // If this source does not incur a cross register bank copy, use it. 
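The rewritten findNextSource trades the old single linear walk for an explicit worklist: single-source chains are still walked upward, but a PHI fans out into one worklist entry per incoming value, bounded by the PHI limit. A self-contained sketch of that shape, with toy types; the real pass additionally detects PHI cycles through the RewriteMap, which is omitted here.

#include <vector>

struct ToyDef {
  std::vector<ToyDef *> Sources; // More than one models a PHI's incoming values.
  bool SameRegFile = false;      // Models "shares the register file".
};

// Returns true if every chain reaches a usable source before the PHI cap.
static bool findUsable(ToyDef *Start, unsigned PHILimit) {
  std::vector<ToyDef *> Worklist{Start};
  unsigned PHICount = 0;
  while (!Worklist.empty()) {
    ToyDef *D = Worklist.back();
    Worklist.pop_back();
    // Walk single-source chains until a usable def, a PHI, or a dead end.
    while (!D->SameRegFile) {
      if (D->Sources.size() > 1) { // PHI: expand each incoming source.
        if (++PHICount >= PHILimit)
          return false;
        Worklist.insert(Worklist.end(), D->Sources.begin(), D->Sources.end());
        break;
      }
      if (D->Sources.empty())
        return false; // One unrewritable chain sinks the whole query.
      D = D->Sources.front();
    }
  }
  return true;
}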
-    ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, DefSubReg, SrcRC,
-                                          SrcSubReg);
-  } while (!ShouldRewrite);
+    CurSrcPair = Pair;
+    ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
+                            !DisableAdvCopyOpt, TII);
+    ValueTrackerResult Res;
+    bool ShouldRewrite = false;
+
+    do {
+      // Follow the chain of copies until we reach the top of the use-def chain
+      // or find a more suitable source.
+      Res = ValTracker.getNextSource();
+      if (!Res.isValid())
+        break;
+
+      // Insert the Def -> Use entry for the recently found source.
+      ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
+      if (CurSrcRes.isValid()) {
+        assert(CurSrcRes == Res && "ValueTrackerResult found must match");
+        // An existing entry with multiple sources is a PHI cycle we must avoid.
+        // Otherwise it's an entry with a valid next source we already found.
+        if (CurSrcRes.getNumSources() > 1) {
+          DEBUG(dbgs() << "findNextSource: found PHI cycle, aborting...\n");
+          return false;
+        }
+        break;
+      }
+      RewriteMap.insert(std::make_pair(CurSrcPair, Res));
+
+      // ValueTrackerResult usually has one source unless it's the result from
+      // a PHI instruction. Add the found PHI edges to be looked up further.
+      unsigned NumSrcs = Res.getNumSources();
+      if (NumSrcs > 1) {
+        PHICount++;
+        for (unsigned i = 0; i < NumSrcs; ++i)
+          SrcToLook.push_back(TargetInstrInfo::RegSubRegPair(
+              Res.getSrcReg(i), Res.getSrcSubReg(i)));
+        break;
+      }
 
-  // If we did not find a more suitable source, there is nothing to optimize.
-  if (!ShouldRewrite || Src == Reg)
+      CurSrcPair.Reg = Res.getSrcReg(0);
+      CurSrcPair.SubReg = Res.getSrcSubReg(0);
+      // Do not extend the live-ranges of physical registers as they add
+      // constraints to the register allocator. Moreover, if we want to extend
+      // the live-range of a physical register, unlike SSA virtual register,
+      // we will have to check that they aren't redefined before the related
+      // use.
+      if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
+        return false;
+
+      const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
+      ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
+                                                CurSrcPair.SubReg);
+    } while (!ShouldRewrite);
+
+    // Continue looking for new sources...
+    if (Res.isValid())
+      continue;
+
+    // Do not continue searching for a new source if there's at least
+    // one use-def which cannot be rewritten.
+    if (!ShouldRewrite)
+      return false;
+  }
+
+  if (PHICount >= RewritePHILimit) {
+    DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
     return false;
+  }
 
-  Reg = Src;
-  SubReg = SrcSubReg;
-  return true;
+  // If we did not find a more suitable source, there is nothing to optimize.
+  return CurSrcPair.Reg != Reg;
+}
+
+/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are
+/// guaranteed to have the same register class. This is necessary whenever we
+/// successfully traverse a PHI instruction and find suitable sources coming
+/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
+/// suitable to be used in a new COPY instruction.
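insertPHI below steps MBBOpIdx by two because a MIR PHI lays out its operands as the def followed by (value, predecessor-block) pairs; getNextSourceFromPHI later in this file walks the same layout. A toy sketch of iterating such pairs, with plain ints standing in for operands:

#include <cassert>
#include <cstddef>
#include <vector>

// Flat toy operand list mirroring the MIR PHI layout:
//   [def, value0, block0, value1, block1, ...]
static void forEachIncoming(const std::vector<int> &Ops,
                            void (*Visit)(int Value, int Block)) {
  assert(Ops.size() % 2 == 1 && "def plus (value, block) pairs");
  for (std::size_t I = 1; I + 1 < Ops.size(); I += 2)
    Visit(Ops[I], Ops[I + 1]); // Value operand, then its predecessor block.
}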
+static MachineInstr *
+insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+          const SmallVectorImpl<TargetInstrInfo::RegSubRegPair> &SrcRegs,
+          MachineInstr *OrigPHI) {
+  assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");
+
+  const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+  unsigned NewVR = MRI->createVirtualRegister(NewRC);
+  MachineBasicBlock *MBB = OrigPHI->getParent();
+  MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
+                                    TII->get(TargetOpcode::PHI), NewVR);
+
+  unsigned MBBOpIdx = 2;
+  for (auto RegPair : SrcRegs) {
+    MIB.addReg(RegPair.Reg, 0, RegPair.SubReg);
+    MIB.addMBB(OrigPHI->getOperand(MBBOpIdx).getMBB());
+    // Since we're extending the lifetime of RegPair.Reg, clear the
+    // kill flags to account for that and make sure RegPair.Reg reaches
+    // the new PHI.
+    MRI->clearKillFlags(RegPair.Reg);
+    MBBOpIdx += 2;
+  }
+
+  return MIB;
 }
 
 namespace {
@@ -624,7 +770,7 @@ public:
   /// This source defines the whole definition, i.e.,
   /// (TrackReg, TrackSubReg) = (dst, dstSubIdx).
   ///
-  /// The second and subsequent calls will return false, has there is only one
+  /// The second and subsequent calls will return false, as there is only one
   /// rewritable source.
   ///
   /// \return True if a rewritable source has been found, false otherwise.
@@ -632,9 +778,9 @@ public:
   virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
                                        unsigned &TrackReg,
                                        unsigned &TrackSubReg) {
-    // If CurrentSrcIdx == 1, this means this function has already been
-    // called once. CopyLike has one defintiion and one argument, thus,
-    // there is nothing else to rewrite.
+    // If CurrentSrcIdx == 1, this means this function has already been called
+    // once. CopyLike has one definition and one argument, thus, there is
+    // nothing else to rewrite.
     if (!CopyLike.isCopy() || CurrentSrcIdx == 1)
       return false;
     // This is the first call to getNextRewritableSource.
@@ -653,7 +799,7 @@ public:
 
   /// \brief Rewrite the current source with \p NewReg and \p NewSubReg
   /// if possible.
-  /// \return True if the rewritting was possible, false otherwise.
+  /// \return True if the rewriting was possible, false otherwise.
   virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) {
     if (!CopyLike.isCopy() || CurrentSrcIdx != 1)
       return false;
@@ -662,6 +808,157 @@ public:
     MOSrc.setSubReg(NewSubReg);
     return true;
   }
+
+  /// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
+  /// the new source to use for rewrite. If \p HandleMultipleSources is true and
+  /// multiple sources for a given \p Def are found along the way, we found a
+  /// PHI instruction that needs to be rewritten.
+  /// TODO: HandleMultipleSources should be removed once we test PHI handling
+  /// with coalescable copies.
+  TargetInstrInfo::RegSubRegPair
+  getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+               TargetInstrInfo::RegSubRegPair Def,
+               PeepholeOptimizer::RewriteMapTy &RewriteMap,
+               bool HandleMultipleSources = true) {
+
+    TargetInstrInfo::RegSubRegPair LookupSrc(Def.Reg, Def.SubReg);
+    do {
+      ValueTrackerResult Res = RewriteMap.lookup(LookupSrc);
+      // If there are no entries on the map, LookupSrc is the new source.
+      if (!Res.isValid())
+        return LookupSrc;
+
+      // There's only one source for this definition, keep searching...
+ unsigned NumSrcs = Res.getNumSources(); + if (NumSrcs == 1) { + LookupSrc.Reg = Res.getSrcReg(0); + LookupSrc.SubReg = Res.getSrcSubReg(0); + continue; + } + + // TODO: Remove once multiple srcs w/ coalescable copies are supported. + if (!HandleMultipleSources) + break; + + // Multiple sources, recurse into each source to find a new source + // for it. Then, rewrite the PHI accordingly to its new edges. + SmallVector<TargetInstrInfo::RegSubRegPair, 4> NewPHISrcs; + for (unsigned i = 0; i < NumSrcs; ++i) { + TargetInstrInfo::RegSubRegPair PHISrc(Res.getSrcReg(i), + Res.getSrcSubReg(i)); + NewPHISrcs.push_back( + getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources)); + } + + // Build the new PHI node and return its def register as the new source. + MachineInstr *OrigPHI = const_cast<MachineInstr *>(Res.getInst()); + MachineInstr *NewPHI = insertPHI(MRI, TII, NewPHISrcs, OrigPHI); + DEBUG(dbgs() << "-- getNewSource\n"); + DEBUG(dbgs() << " Replacing: " << *OrigPHI); + DEBUG(dbgs() << " With: " << *NewPHI); + const MachineOperand &MODef = NewPHI->getOperand(0); + return TargetInstrInfo::RegSubRegPair(MODef.getReg(), MODef.getSubReg()); + + } while (1); + + return TargetInstrInfo::RegSubRegPair(0, 0); + } + + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. + virtual MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) { + return nullptr; + } +}; + +/// \brief Helper class to rewrite uncoalescable copy like instructions +/// into new COPY (coalescable friendly) instructions. +class UncoalescableRewriter : public CopyRewriter { +protected: + const TargetInstrInfo &TII; + MachineRegisterInfo &MRI; + /// The number of defs in the bitcast + unsigned NumDefs; + +public: + UncoalescableRewriter(MachineInstr &MI, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI) + : CopyRewriter(MI), TII(TII), MRI(MRI) { + NumDefs = MI.getDesc().getNumDefs(); + } + + /// \brief Get the next rewritable def source (TrackReg, TrackSubReg) + /// All such sources need to be considered rewritable in order to + /// rewrite a uncoalescable copy-like instruction. This method return + /// each definition that must be checked if rewritable. + /// + bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) override { + // Find the next non-dead definition and continue from there. + if (CurrentSrcIdx == NumDefs) + return false; + + while (CopyLike.getOperand(CurrentSrcIdx).isDead()) { + ++CurrentSrcIdx; + if (CurrentSrcIdx == NumDefs) + return false; + } + + // What we track are the alternative sources of the definition. + const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx); + TrackReg = MODef.getReg(); + TrackSubReg = MODef.getSubReg(); + + CurrentSrcIdx++; + return true; + } + + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. 
+ MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) override { + assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && + "We do not rewrite physical registers"); + + // Find the new source to use in the COPY rewrite. + TargetInstrInfo::RegSubRegPair NewSrc = + getNewSource(&MRI, &TII, Def, RewriteMap); + + // Insert the COPY. + const TargetRegisterClass *DefRC = MRI.getRegClass(Def.Reg); + unsigned NewVR = MRI.createVirtualRegister(DefRC); + + MachineInstr *NewCopy = + BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(), + TII.get(TargetOpcode::COPY), NewVR) + .addReg(NewSrc.Reg, 0, NewSrc.SubReg); + + NewCopy->getOperand(0).setSubReg(Def.SubReg); + if (Def.SubReg) + NewCopy->getOperand(0).setIsUndef(); + + DEBUG(dbgs() << "-- RewriteSource\n"); + DEBUG(dbgs() << " Replacing: " << CopyLike); + DEBUG(dbgs() << " With: " << *NewCopy); + MRI.replaceRegWith(Def.Reg, NewVR); + MRI.clearKillFlags(NewVR); + + // We extended the lifetime of NewSrc.Reg, clear the kill flags to + // account for that. + MRI.clearKillFlags(NewSrc.Reg); + + return NewCopy; + } }; /// \brief Specialized rewriter for INSERT_SUBREG instruction. @@ -699,7 +996,7 @@ public: // partial definition. TrackReg = MODef.getReg(); if (MODef.getSubReg()) - // Bails if we have to compose sub-register indices. + // Bail if we have to compose sub-register indices. return false; TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm(); return true; @@ -740,7 +1037,7 @@ public: CurrentSrcIdx = 1; const MachineOperand &MOExtractedReg = CopyLike.getOperand(1); SrcReg = MOExtractedReg.getReg(); - // If we have to compose sub-register indices, bails out. + // If we have to compose sub-register indices, bail out. if (MOExtractedReg.getSubReg()) return false; @@ -818,7 +1115,7 @@ public: } const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); SrcReg = MOInsertedReg.getReg(); - // If we have to compose sub-register indices, bails out. + // If we have to compose sub-register indices, bail out. if ((SrcSubReg = MOInsertedReg.getSubReg())) return false; @@ -828,7 +1125,7 @@ public: const MachineOperand &MODef = CopyLike.getOperand(0); TrackReg = MODef.getReg(); - // If we have to compose sub-registers, bails. + // If we have to compose sub-registers, bail. return MODef.getSubReg() == 0; } @@ -850,7 +1147,13 @@ public: /// \return A pointer to a dynamically allocated CopyRewriter or nullptr /// if no rewriter works for \p MI. static CopyRewriter *getCopyRewriter(MachineInstr &MI, - const TargetInstrInfo &TII) { + const TargetInstrInfo &TII, + MachineRegisterInfo &MRI) { + // Handle uncoalescable copy-like instructions. + if (MI.isBitcast() || (MI.isRegSequenceLike() || MI.isInsertSubregLike() || + MI.isExtractSubregLike())) + return new UncoalescableRewriter(MI, TII, MRI); + switch (MI.getOpcode()) { default: return nullptr; @@ -874,7 +1177,7 @@ static CopyRewriter *getCopyRewriter(MachineInstr &MI, /// the same register bank. /// New copies issued by this optimization are register allocator /// friendly. This optimization does not remove any copy as it may -/// overconstraint the register allocator, but replaces some operands +/// overconstrain the register allocator, but replaces some operands /// when possible. /// \pre isCoalescableCopy(*MI) is true. /// \return True, when \p MI has been rewritten. False otherwise. 
@@ -889,25 +1192,33 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) { bool Changed = false; // Get the right rewriter for the current copy. - std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII)); - // If none exists, bails out. + std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI)); + // If none exists, bail out. if (!CpyRewriter) return false; // Rewrite each rewritable source. unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg; while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg, TrackSubReg)) { - unsigned NewSrc = TrackReg; - unsigned NewSubReg = TrackSubReg; - // Try to find a more suitable source. - // If we failed to do so, or get the actual source, - // move to the next source. - if (!findNextSource(NewSrc, NewSubReg) || SrcReg == NewSrc) + // Keep track of PHI nodes and its incoming edges when looking for sources. + RewriteMapTy RewriteMap; + // Try to find a more suitable source. If we failed to do so, or get the + // actual source, move to the next source. + if (!findNextSource(TrackReg, TrackSubReg, RewriteMap)) + continue; + + // Get the new source to rewrite. TODO: Only enable handling of multiple + // sources (PHIs) once we have a motivating example and testcases for it. + TargetInstrInfo::RegSubRegPair TrackPair(TrackReg, TrackSubReg); + TargetInstrInfo::RegSubRegPair NewSrc = CpyRewriter->getNewSource( + MRI, TII, TrackPair, RewriteMap, false /* multiple sources */); + if (SrcReg == NewSrc.Reg || NewSrc.Reg == 0) continue; + // Rewrite source. - if (CpyRewriter->RewriteCurrentSource(NewSrc, NewSubReg)) { + if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { // We may have extended the live-range of NewSrc, account for that. - MRI->clearKillFlags(NewSrc); + MRI->clearKillFlags(NewSrc.Reg); Changed = true; } } @@ -936,61 +1247,53 @@ bool PeepholeOptimizer::optimizeUncoalescableCopy( assert(MI && isUncoalescableCopy(*MI) && "Invalid argument"); // Check if we can rewrite all the values defined by this instruction. - SmallVector< - std::pair<TargetInstrInfo::RegSubRegPair, TargetInstrInfo::RegSubRegPair>, - 4> RewritePairs; - for (const MachineOperand &MODef : MI->defs()) { - if (MODef.isDead()) - // We can ignore those. - continue; + SmallVector<TargetInstrInfo::RegSubRegPair, 4> RewritePairs; + // Get the right rewriter for the current copy. + std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI)); + // If none exists, bail out. + if (!CpyRewriter) + return false; + // Rewrite each rewritable source by generating new COPYs. This works + // differently from optimizeCoalescableCopy since it first makes sure that all + // definitions can be rewritten. + RewriteMapTy RewriteMap; + unsigned Reg, SubReg, CopyDefReg, CopyDefSubReg; + while (CpyRewriter->getNextRewritableSource(Reg, SubReg, CopyDefReg, + CopyDefSubReg)) { // If a physical register is here, this is probably for a good reason. // Do not rewrite that. - if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg())) + if (TargetRegisterInfo::isPhysicalRegister(CopyDefReg)) return false; // If we do not know how to rewrite this definition, there is no point // in trying to kill this instruction. 
- TargetInstrInfo::RegSubRegPair Def(MODef.getReg(), MODef.getSubReg()); - TargetInstrInfo::RegSubRegPair Src = Def; - if (!findNextSource(Src.Reg, Src.SubReg)) + TargetInstrInfo::RegSubRegPair Def(CopyDefReg, CopyDefSubReg); + if (!findNextSource(Def.Reg, Def.SubReg, RewriteMap)) return false; - RewritePairs.push_back(std::make_pair(Def, Src)); + + RewritePairs.push_back(Def); } + // The change is possible for all defs, do it. - for (const auto &PairDefSrc : RewritePairs) { - const auto &Def = PairDefSrc.first; - const auto &Src = PairDefSrc.second; + for (const auto &Def : RewritePairs) { // Rewrite the "copy" in a way the register coalescer understands. - assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && - "We do not rewrite physical registers"); - const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg); - unsigned NewVR = MRI->createVirtualRegister(DefRC); - MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(TargetOpcode::COPY), - NewVR).addReg(Src.Reg, 0, Src.SubReg); - NewCopy->getOperand(0).setSubReg(Def.SubReg); - if (Def.SubReg) - NewCopy->getOperand(0).setIsUndef(); + MachineInstr *NewCopy = CpyRewriter->RewriteSource(Def, RewriteMap); + assert(NewCopy && "Should be able to always generate a new copy"); LocalMIs.insert(NewCopy); - MRI->replaceRegWith(Def.Reg, NewVR); - MRI->clearKillFlags(NewVR); - // We extended the lifetime of Src. - // Clear the kill flags to account for that. - MRI->clearKillFlags(Src.Reg); } + // MI is now dead. MI->eraseFromParent(); ++NumUncoalescableCopies; return true; } -/// isLoadFoldable - Check whether MI is a candidate for folding into a later -/// instruction. We only fold loads to virtual registers and the virtual -/// register defined has a single use. +/// Check whether MI is a candidate for folding into a later instruction. +/// We only fold loads to virtual registers and the virtual register defined +/// has a single use. bool PeepholeOptimizer::isLoadFoldable( - MachineInstr *MI, - SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) { + MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) { if (!MI->canFoldAsLoad() || !MI->mayLoad()) return false; const MCInstrDesc &MCID = MI->getDesc(); @@ -1010,9 +1313,9 @@ bool PeepholeOptimizer::isLoadFoldable( return false; } -bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, - SmallSet<unsigned, 4> &ImmDefRegs, - DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { +bool PeepholeOptimizer::isMoveImmediate( + MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, + DenseMap<unsigned, MachineInstr *> &ImmDefMIs) { const MCInstrDesc &MCID = MI->getDesc(); if (!MI->isMoveImmediate()) return false; @@ -1028,23 +1331,26 @@ bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, return false; } -/// foldImmediate - Try folding register operands that are defined by move -/// immediate instructions, i.e. a trivial constant folding optimization, if +/// Try folding register operands that are defined by move immediate +/// instructions, i.e. a trivial constant folding optimization, if /// and only if the def and use are in the same BB. 
-bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, - SmallSet<unsigned, 4> &ImmDefRegs, - DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { +bool PeepholeOptimizer::foldImmediate( + MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, + DenseMap<unsigned, MachineInstr *> &ImmDefMIs) { for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; if (ImmDefRegs.count(Reg) == 0) continue; DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg); - assert(II != ImmDefMIs.end()); + assert(II != ImmDefMIs.end() && "couldn't find immediate definition"); if (TII->FoldImmediate(MI, II->second, Reg, MRI)) { ++NumImmFold; return true; @@ -1053,6 +1359,117 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, return false; } +// FIXME: This is very simple and misses some cases which should be handled when +// motivating examples are found. +// +// The copy rewriting logic should look at uses as well as defs and be able to +// eliminate copies across blocks. +// +// Later copies that are subregister extracts will also not be eliminated since +// only the first copy is considered. +// +// e.g. +// %vreg1 = COPY %vreg0 +// %vreg2 = COPY %vreg0:sub1 +// +// Should replace %vreg2 uses with %vreg1:sub1 +bool PeepholeOptimizer::foldRedundantCopy( + MachineInstr *MI, SmallSet<unsigned, 4> &CopySrcRegs, + DenseMap<unsigned, MachineInstr *> &CopyMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + unsigned SrcReg = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + if (CopySrcRegs.insert(SrcReg).second) { + // First copy of this reg seen. + CopyMIs.insert(std::make_pair(SrcReg, MI)); + return false; + } + + MachineInstr *PrevCopy = CopyMIs.find(SrcReg)->second; + + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + unsigned PrevSrcSubReg = PrevCopy->getOperand(1).getSubReg(); + + // Can't replace different subregister extracts. + if (SrcSubReg != PrevSrcSubReg) + return false; + + unsigned PrevDstReg = PrevCopy->getOperand(0).getReg(); + + // Only replace if the copy register class is the same. + // + // TODO: If we have multiple copies to different register classes, we may want + // to track multiple copies of the same source register. + if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg)) + return false; + + MRI->replaceRegWith(DstReg, PrevDstReg); + + // Lifetime of the previous copy has been extended. 
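foldRedundantCopy above is a first-copy-wins scheme: the first COPY from each virtual source is remembered, and a later identical copy is folded by redirecting its uses to the earlier destination. A simplified standalone model with toy types; the real pass also checks subregister indices and register classes before folding.

#include <unordered_map>

struct ToyCopy { unsigned Dst, Src; };

// Returns true if C is redundant; ReplaceWith then holds the register that
// should take over C.Dst's uses.
static bool foldCopyToy(const ToyCopy &C,
                        std::unordered_map<unsigned, unsigned> &FirstDst,
                        unsigned &ReplaceWith) {
  auto [It, New] = FirstDst.try_emplace(C.Src, C.Dst);
  if (New)
    return false;           // First copy of this source: just record it.
  ReplaceWith = It->second; // Later copy: reuse the earlier destination.
  return true;
}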
+ MRI->clearKillFlags(PrevDstReg); + return true; +} + +bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) { + return TargetRegisterInfo::isPhysicalRegister(Reg) && + !MRI->isAllocatable(Reg); +} + +bool PeepholeOptimizer::foldRedundantNAPhysCopy( + MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + if (DisableNAPhysCopyOpt) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) { + // %vreg = COPY %PHYSREG + // Avoid using a datastructure which can track multiple live non-allocatable + // phys->virt copies since LLVM doesn't seem to do this. + NAPhysToVirtMIs.insert({SrcReg, MI}); + return false; + } + + if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) + return false; + + // %PHYSREG = COPY %vreg + auto PrevCopy = NAPhysToVirtMIs.find(DstReg); + if (PrevCopy == NAPhysToVirtMIs.end()) { + // We can't remove the copy: there was an intervening clobber of the + // non-allocatable physical register after the copy to virtual. + DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI + << '\n'); + return false; + } + + unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg(); + if (PrevDstReg == SrcReg) { + // Remove the virt->phys copy: we saw the virtual register definition, and + // the non-allocatable physical register's state hasn't changed since then. + DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n'); + ++NumNAPhysCopies; + return true; + } + + // Potential missed optimization opportunity: we saw a different virtual + // register get a copy of the non-allocatable physical register, and we only + // track one such copy. Avoid getting confused by this new non-allocatable + // physical register definition, and remove it from the tracked copies. + DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n'); + NAPhysToVirtMIs.erase(PrevCopy); + return false; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; @@ -1070,9 +1487,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = &*I; - + for (MachineBasicBlock &MBB : MF) { bool SeenMoveImm = false; // During this forward scan, at some point it needs to answer the question @@ -1086,8 +1501,19 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { DenseMap<unsigned, MachineInstr*> ImmDefMIs; SmallSet<unsigned, 16> FoldAsLoadDefCandidates; - for (MachineBasicBlock::iterator - MII = I->begin(), MIE = I->end(); MII != MIE; ) { + // Track when a non-allocatable physical register is copied to a virtual + // register so that useless moves can be removed. + // + // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG` + // without any intervening re-definition of %PHYSREG. + DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs; + + // Set of virtual registers that are copied from. + SmallSet<unsigned, 4> CopySrcRegs; + DenseMap<unsigned, MachineInstr *> CopySrcMIs; + + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); + MII != MIE; ) { MachineInstr *MI = &*MII; // We may be erasing MI below, increment MII now. 
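The non-allocatable physical copy folding above boils down to a small state machine: remember each `%vreg = COPY %physreg`, invalidate the entry on any clobber of the physical register, and delete a later `%physreg = COPY %vreg` when it would restore a value the register still holds. A toy model of that state, with plain unsigneds standing in for registers:

#include <unordered_map>

struct NAState {
  std::unordered_map<unsigned, unsigned> PhysToVirt; // physreg -> vreg

  void clobber(unsigned Phys) { PhysToVirt.erase(Phys); }
  void recordPhysToVirt(unsigned Phys, unsigned Virt) {
    PhysToVirt[Phys] = Virt;
  }
  // True when a virt->phys copy restores the tracked value and can
  // therefore be deleted.
  bool isRedundantVirtToPhys(unsigned Virt, unsigned Phys) const {
    auto It = PhysToVirt.find(Phys);
    return It != PhysToVirt.end() && It->second == Virt;
  }
};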
++MII; @@ -1097,20 +1523,60 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isDebugValue()) continue; - // If there exists an instruction which belongs to the following - // categories, we will discard the load candidates. - if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || - MI->isKill() || MI->isInlineAsm() || - MI->hasUnmodeledSideEffects()) { + // If we run into an instruction we can't fold across, discard + // the load candidates. + if (MI->isLoadFoldBarrier()) FoldAsLoadDefCandidates.clear(); + + if (MI->isPosition() || MI->isPHI()) + continue; + + if (!MI->isCopy()) { + for (const auto &Op : MI->operands()) { + // Visit all operands: definitions can be implicit or explicit. + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Op.isDef() && isNAPhysCopy(Reg)) { + const auto &Def = NAPhysToVirtMIs.find(Reg); + if (Def != NAPhysToVirtMIs.end()) { + // A new definition of the non-allocatable physical register + // invalidates previous copies. + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } else if (Op.isRegMask()) { + const uint32_t *RegMask = Op.getRegMask(); + for (auto &RegMI : NAPhysToVirtMIs) { + unsigned Def = RegMI.first; + if (MachineOperand::clobbersPhysReg(RegMask, Def)) { + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } + } + } + + if (MI->isImplicitDef() || MI->isKill()) + continue; + + if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) { + // Blow away all non-allocatable physical registers knowledge since we + // don't know what's correct anymore. + // + // FIXME: handle explicit asm clobbers. + DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI + << '\n'); + NAPhysToVirtMIs.clear(); continue; } - if (MI->mayStore() || MI->isCall()) - FoldAsLoadDefCandidates.clear(); if ((isUncoalescableCopy(*MI) && optimizeUncoalescableCopy(MI, LocalMIs)) || - (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || + (MI->isCompare() && optimizeCmpInstr(MI, &MBB)) || (MI->isSelect() && optimizeSelect(MI, LocalMIs))) { // MI is deleted. LocalMIs.erase(MI); @@ -1129,17 +1595,26 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } + if (MI->isCopy() && + (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) || + foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) { + LocalMIs.erase(MI); + MI->eraseFromParent(); + Changed = true; + continue; + } + if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { SeenMoveImm = true; } else { - Changed |= optimizeExtInstr(MI, MBB, LocalMIs); + Changed |= optimizeExtInstr(MI, &MBB, LocalMIs); // optimizeExtInstr might have created new instructions after MI // and before the already incremented MII. Adjust MII so that the // next iteration sees the new instructions. MII = MI; ++MII; if (SeenMoveImm) - Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); + Changed |= foldImmediate(MI, &MBB, ImmDefRegs, ImmDefMIs); } // Check whether MI is a load candidate for folding into a later @@ -1190,8 +1665,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { return Changed; } -bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromCopy() { assert(Def->isCopy() && "Invalid definition"); // Copy instruction are supposed to be: Def = Src. // If someone breaks this assumption, bad things will happen everywhere. 
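One caution about the regmask loop above: it erases from NAPhysToVirtMIs while range-iterating it, and DenseMap::erase invalidates iterators, so a safer shape gathers the doomed keys first and erases after the walk. A generic sketch over standard containers; LLVM code would typically collect into a SmallVector instead.

#include <unordered_map>
#include <vector>

// Two-phase erase: collect the keys first, erase after the iteration, so
// the walk never touches a container it is mutating.
template <typename Map, typename Pred>
static void erasePhysClobbers(Map &M, Pred Clobbered) {
  std::vector<typename Map::key_type> ToErase;
  for (const auto &KV : M)
    if (Clobbered(KV.first))
      ToErase.push_back(KV.first);
  for (const auto &K : ToErase)
    M.erase(K);
}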
@@ -1199,30 +1673,27 @@ bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg, if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of src. - // Bails as we do not support composing subreg yet. - return false; + // Bails as we do not support composing subregs yet. + return ValueTrackerResult(); // Otherwise, we want the whole source. const MachineOperand &Src = Def->getOperand(1); - SrcReg = Src.getReg(); - SrcSubReg = Src.getSubReg(); - return true; + return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } -bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { assert(Def->isBitcast() && "Invalid definition"); // Bail if there are effects that a plain copy will not expose. if (Def->hasUnmodeledSideEffects()) - return false; + return ValueTrackerResult(); // Bitcasts with more than one def are not supported. if (Def->getDesc().getNumDefs() != 1) - return false; + return ValueTrackerResult(); if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of the src. - // Bails as we do not support composing subreg yet. - return false; + // Bails as we do not support composing subregs yet. + return ValueTrackerResult(); unsigned SrcIdx = Def->getNumOperands(); for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; @@ -1230,25 +1701,25 @@ bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg, const MachineOperand &MO = Def->getOperand(OpIdx); if (!MO.isReg() || !MO.getReg()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; assert(!MO.isDef() && "We should have skipped all the definitions by now"); if (SrcIdx != EndOpIdx) // Multiple sources? - return false; + return ValueTrackerResult(); SrcIdx = OpIdx; } const MachineOperand &Src = Def->getOperand(SrcIdx); - SrcReg = Src.getReg(); - SrcSubReg = Src.getSubReg(); - return true; + return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } -bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { assert((Def->isRegSequence() || Def->isRegSequenceLike()) && "Invalid definition"); if (Def->getOperand(DefIdx).getSubReg()) - // If we are composing subreg, bails out. + // If we are composing subregs, bail out. // The case we are checking is Def.<subreg> = REG_SEQUENCE. // This should almost never happen as the SSA property is tracked at // the register level (as opposed to the subreg level). @@ -1262,16 +1733,16 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, // have this case. // If we can ascertain (or force) that this never happens, we could // turn that into an assertion. - return false; + return ValueTrackerResult(); if (!TII) // We could handle the REG_SEQUENCE here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs; if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs)) - return false; + return ValueTrackerResult(); // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... 
@@ -1279,41 +1750,38 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, for (auto &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) { if (RegSeqInput.SubReg) - // Bails if we have to compose sub registers. - return false; + // Bail if we have to compose sub registers. + return ValueTrackerResult(); - SrcReg = RegSeqInput.Reg; - SrcSubReg = RegSeqInput.SubReg; - return true; + return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } } // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. - return false; + return ValueTrackerResult(); } -bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() { assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) && "Invalid definition"); if (Def->getOperand(DefIdx).getSubReg()) - // If we are composing subreg, bails out. + // If we are composing subreg, bail out. // Same remark as getNextSourceFromRegSequence. // I.e., this may be turned into an assert. - return false; + return ValueTrackerResult(); if (!TII) // We could handle the REG_SEQUENCE here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); TargetInstrInfo::RegSubRegPair BaseReg; TargetInstrInfo::RegSubRegPairAndIdx InsertedReg; if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg)) - return false; + return ValueTrackerResult(); // We are looking at: // Def = INSERT_SUBREG v0, v1, sub1 @@ -1323,9 +1791,7 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, // #1 Check if the inserted register matches the required sub index. if (InsertedReg.SubIdx == DefSubReg) { - SrcReg = InsertedReg.Reg; - SrcSubReg = InsertedReg.SubReg; - return true; + return ValueTrackerResult(InsertedReg.Reg, InsertedReg.SubReg); } // #2 Otherwise, if the sub register we are looking for is not partial // defined by the inserted element, we can look through the main @@ -1333,10 +1799,10 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, const MachineOperand &MODef = Def->getOperand(DefIdx); // If the result register (Def) and the base register (v0) do not // have the same register class or if we have to compose - // subregisters, bails out. + // subregisters, bail out. if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) || BaseReg.SubReg) - return false; + return ValueTrackerResult(); // Get the TRI and check if the inserted sub-register overlaps with the // sub-register we are tracking. @@ -1344,121 +1810,138 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, if (!TRI || (TRI->getSubRegIndexLaneMask(DefSubReg) & TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) != 0) - return false; + return ValueTrackerResult(); // At this point, the value is available in v0 via the same subreg // we used for Def. - SrcReg = BaseReg.Reg; - SrcSubReg = DefSubReg; - return true; + return ValueTrackerResult(BaseReg.Reg, DefSubReg); } -bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() { assert((Def->isExtractSubreg() || Def->isExtractSubregLike()) && "Invalid definition"); // We are looking at: // Def = EXTRACT_SUBREG v0, sub0 - // Bails if we have to compose sub registers. 
+ // Bail if we have to compose sub registers. // Indeed, if DefSubReg != 0, we would have to compose it with sub0. if (DefSubReg) - return false; + return ValueTrackerResult(); if (!TII) // We could handle the EXTRACT_SUBREG here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg; if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg)) - return false; + return ValueTrackerResult(); - // Bails if we have to compose sub registers. + // Bail if we have to compose sub registers. // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0. if (ExtractSubregInputReg.SubReg) - return false; + return ValueTrackerResult(); // Otherwise, the value is available in the v0.sub0. - SrcReg = ExtractSubregInputReg.Reg; - SrcSubReg = ExtractSubregInputReg.SubIdx; - return true; + return ValueTrackerResult(ExtractSubregInputReg.Reg, + ExtractSubregInputReg.SubIdx); } -bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromSubregToReg() { assert(Def->isSubregToReg() && "Invalid definition"); // We are looking at: // Def = SUBREG_TO_REG Imm, v0, sub0 - // Bails if we have to compose sub registers. + // Bail if we have to compose sub registers. // If DefSubReg != sub0, we would have to check that all the bits // we track are included in sub0 and if yes, we would have to // determine the right subreg in v0. if (DefSubReg != Def->getOperand(3).getImm()) - return false; - // Bails if we have to compose sub registers. + return ValueTrackerResult(); + // Bail if we have to compose sub registers. // Likewise, if v0.subreg != 0, we would have to compose it with sub0. if (Def->getOperand(2).getSubReg()) - return false; + return ValueTrackerResult(); - SrcReg = Def->getOperand(2).getReg(); - SrcSubReg = Def->getOperand(3).getImm(); - return true; + return ValueTrackerResult(Def->getOperand(2).getReg(), + Def->getOperand(3).getImm()); +} + +/// \brief Explore each PHI incoming operand and return its sources +ValueTrackerResult ValueTracker::getNextSourceFromPHI() { + assert(Def->isPHI() && "Invalid definition"); + ValueTrackerResult Res; + + // If we look for a different subreg, bail as we do not support composing + // subregs yet. + if (Def->getOperand(0).getSubReg() != DefSubReg) + return ValueTrackerResult(); + + // Return all register sources for PHI instructions. + for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) { + auto &MO = Def->getOperand(i); + assert(MO.isReg() && "Invalid PHI instruction"); + Res.addSource(MO.getReg(), MO.getSubReg()); + } + + return Res; } -bool ValueTracker::getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceImpl() { assert(Def && "This method needs a valid definition"); assert( (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) && Def->getOperand(DefIdx).isDef() && "Invalid DefIdx"); if (Def->isCopy()) - return getNextSourceFromCopy(SrcReg, SrcSubReg); + return getNextSourceFromCopy(); if (Def->isBitcast()) - return getNextSourceFromBitcast(SrcReg, SrcSubReg); + return getNextSourceFromBitcast(); // All the remaining cases involve "complex" instructions. - // Bails if we did not ask for the advanced tracking. + // Bail if we did not ask for the advanced tracking. 
if (!UseAdvancedTracking) - return false; + return ValueTrackerResult(); if (Def->isRegSequence() || Def->isRegSequenceLike()) - return getNextSourceFromRegSequence(SrcReg, SrcSubReg); + return getNextSourceFromRegSequence(); if (Def->isInsertSubreg() || Def->isInsertSubregLike()) - return getNextSourceFromInsertSubreg(SrcReg, SrcSubReg); + return getNextSourceFromInsertSubreg(); if (Def->isExtractSubreg() || Def->isExtractSubregLike()) - return getNextSourceFromExtractSubreg(SrcReg, SrcSubReg); + return getNextSourceFromExtractSubreg(); if (Def->isSubregToReg()) - return getNextSourceFromSubregToReg(SrcReg, SrcSubReg); - return false; + return getNextSourceFromSubregToReg(); + if (Def->isPHI()) + return getNextSourceFromPHI(); + return ValueTrackerResult(); } -const MachineInstr *ValueTracker::getNextSource(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSource() { // If we reach a point where we cannot move up in the use-def chain, // there is nothing we can get. if (!Def) - return nullptr; + return ValueTrackerResult(); - const MachineInstr *PrevDef = nullptr; - // Try to find the next source. - if (getNextSourceImpl(SrcReg, SrcSubReg)) { + ValueTrackerResult Res = getNextSourceImpl(); + if (Res.isValid()) { // Update definition, definition index, and subregister for the // next call of getNextSource. // Update the current register. - Reg = SrcReg; - // Update the return value before moving up in the use-def chain. - PrevDef = Def; + bool OneRegSrc = Res.getNumSources() == 1; + if (OneRegSrc) + Reg = Res.getSrcReg(0); + // Update the result before moving up in the use-def chain + // with the instruction containing the last found sources. + Res.setInst(Def); + // If we can still move up in the use-def chain, move to the next - // defintion. - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { + // definition. + if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) { Def = MRI.getVRegDef(Reg); DefIdx = MRI.def_begin(Reg).getOperandNo(); - DefSubReg = SrcSubReg; - return PrevDef; + DefSubReg = Res.getSrcSubReg(0); + return Res; } } // If we end up here, this means we will not be able to find another source - // for the next iteration. - // Make sure any new call to getNextSource bails out early by cutting the - // use-def chain. + // for the next iteration. Make sure any new call to getNextSource bails out + // early by cutting the use-def chain. 
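The contract implemented here is that an invalid result does double duty: it tells the caller "no further source" and guarantees that every later call bails out early, because the tracker cuts its own chain. A toy model of that caller-facing behavior, with std::optional standing in for ValueTrackerResult:

#include <optional>

// Toy chain: each def optionally points at the def above it.
struct ToyDef { const ToyDef *Prev = nullptr; unsigned Reg = 0; };

struct ToyTracker {
  const ToyDef *Def; // nullptr means the chain has been cut.

  std::optional<unsigned> getNextSource() {
    if (!Def)
      return std::nullopt;      // Chain already cut: bail out early.
    const ToyDef *Prev = Def->Prev;
    Def = Prev;                 // Either move up or cut the chain.
    return Prev ? std::optional<unsigned>(Prev->Reg) : std::nullopt;
  }
};

static unsigned lastReachableReg(ToyTracker T, unsigned Start) {
  unsigned Last = Start;
  while (auto Reg = T.getNextSource()) // Stop at the first invalid result.
    Last = *Reg;
  return Last;
}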
Def = nullptr; - return PrevDef; + return Res; } diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp index 6f76116..b95dffd 100644 --- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -87,7 +87,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetPassConfig>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -196,7 +196,7 @@ SchedulePostRATDList::SchedulePostRATDList( const RegisterClassInfo &RCI, TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs) - : ScheduleDAGInstrs(MF, &MLI, /*IsPostRA=*/true), AA(AA), EndIndex(0) { + : ScheduleDAGInstrs(MF, &MLI), AA(AA), EndIndex(0) { const InstrItineraryData *InstrItins = MF.getSubtarget().getInstrItineraryData(); @@ -267,7 +267,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { TII = Fn.getSubtarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); RegClassInfo.runOnMachineFunction(Fn); @@ -302,8 +302,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { CriticalPathRCs); // Loop over all of the basic blocks - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { + for (auto &MBB : Fn) { #ifndef NDEBUG // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod if (DebugDiv > 0) { @@ -311,25 +310,25 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { if (bbcnt++ % DebugDiv != DebugMod) continue; dbgs() << "*** DEBUG scheduling " << Fn.getName() - << ":BB#" << MBB->getNumber() << " ***\n"; + << ":BB#" << MBB.getNumber() << " ***\n"; } #endif // Initialize register live-range state for scheduling in this block. - Scheduler.startBlock(MBB); + Scheduler.startBlock(&MBB); // Schedule each sequence of instructions not interrupted by a label // or anything else that effectively needs to shut down scheduling. - MachineBasicBlock::iterator Current = MBB->end(); - unsigned Count = MBB->size(), CurrentCount = Count; - for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) { + MachineBasicBlock::iterator Current = MBB.end(); + unsigned Count = MBB.size(), CurrentCount = Count; + for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) { MachineInstr *MI = std::prev(I); --Count; // Calls are not scheduling boundaries before register allocation, but // post-ra we don't gain anything by scheduling across calls since we // don't need to worry about register pressure. 
- if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) { - Scheduler.enterRegion(MBB, I, Current, CurrentCount - Count); + if (MI->isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) { + Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count); Scheduler.setEndIndex(CurrentCount); Scheduler.schedule(); Scheduler.exitRegion(); @@ -343,9 +342,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Count -= MI->getBundleSize(); } assert(Count == 0 && "Instruction count mismatch!"); - assert((MBB->begin() == Current || CurrentCount != 0) && + assert((MBB.begin() == Current || CurrentCount != 0) && "Instruction count mismatch!"); - Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount); + Scheduler.enterRegion(&MBB, MBB.begin(), Current, CurrentCount); Scheduler.setEndIndex(CurrentCount); Scheduler.schedule(); Scheduler.exitRegion(); @@ -355,7 +354,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(MBB); + Scheduler.fixupKills(&MBB); } return true; @@ -400,8 +399,12 @@ void SchedulePostRATDList::schedule() { } DEBUG(dbgs() << "********** List Scheduling **********\n"); - DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this)); + DEBUG( + for (const SUnit &SU : SUnits) { + SU.dumpAll(this); + dbgs() << '\n'; + } + ); AvailableQueue.initNodes(SUnits); ListScheduleTopDown(); diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index 5f81949..d27ea2f 100644 --- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -58,7 +58,7 @@ INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -96,7 +96,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { // This is a physreg implicit-def. // Look for the first instruction to use or define an alias. - MachineBasicBlock::instr_iterator UserMI = MI; + MachineBasicBlock::instr_iterator UserMI = MI->getIterator(); MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end(); bool Found = false; for (++UserMI; UserMI != UserE; ++UserMI) { @@ -151,7 +151,7 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) if (MBBI->isImplicitDef()) - WorkList.insert(MBBI); + WorkList.insert(&*MBBI); if (WorkList.empty()) continue; diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 6ca69a1..939c500 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -71,8 +71,9 @@ private: // stack frame indexes. unsigned MinCSFrameIndex, MaxCSFrameIndex; - // Save and Restore blocks of the current function. - MachineBasicBlock *SaveBlock; + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. 
+ SmallVector<MachineBasicBlock *, 1> SaveBlocks; SmallVector<MachineBasicBlock *, 4> RestoreBlocks; // Flag to control whether to use the register scavenger to resolve @@ -91,9 +92,6 @@ private: int &SPAdj); void scavengeFrameVirtualRegs(MachineFunction &Fn); void insertPrologEpilogCode(MachineFunction &Fn); - - // Convenience for recognizing return blocks. - bool isReturnBlock(const MachineBasicBlock *MBB) const; }; } // namespace @@ -128,10 +126,6 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool PEI::isReturnBlock(const MachineBasicBlock* MBB) const { - return (MBB && !MBB->empty() && MBB->back().isReturn()); -} - /// Compute the set of return blocks void PEI::calculateSets(MachineFunction &Fn) { const MachineFrameInfo *MFI = Fn.getFrameInfo(); @@ -142,25 +136,25 @@ void PEI::calculateSets(MachineFunction &Fn) { // Use the points found by shrink-wrapping, if any. if (MFI->getSavePoint()) { - SaveBlock = MFI->getSavePoint(); + SaveBlocks.push_back(MFI->getSavePoint()); assert(MFI->getRestorePoint() && "Both restore and save must be set"); MachineBasicBlock *RestoreBlock = MFI->getRestorePoint(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. - if (!RestoreBlock->succ_empty() || isReturnBlock(RestoreBlock)) + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) RestoreBlocks.push_back(RestoreBlock); return; } // Save refs to entry and return blocks. - SaveBlock = Fn.begin(); - for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end(); - MBB != E; ++MBB) - if (isReturnBlock(MBB)) - RestoreBlocks.push_back(MBB); - - return; + SaveBlocks.push_back(&Fn.front()); + for (MachineBasicBlock &MBB : Fn) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } } /// StackObjSet - A set of stack object indexes @@ -195,7 +189,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // place all spills in the entry block, all restores in return blocks. calculateSets(Fn); - // Add the code to save and restore the callee saved registers + // Add the code to save and restore the callee saved registers. if (!F->hasFnAttribute(Attribute::Naked)) insertCSRSpillsAndRestores(Fn); @@ -237,6 +231,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { } delete RS; + SaveBlocks.clear(); RestoreBlocks.clear(); return true; } @@ -407,7 +402,7 @@ static void updateLiveness(MachineFunction &MF) { const MachineBasicBlock *CurBB = WorkList.pop_back_val(); // By construction, the region that is after the save point is // dominated by the Save and post-dominated by the Restore. - if (CurBB == Save) + if (CurBB == Save && Save != Restore) continue; // Enqueue all the successors not already visited. // Those are by construction either before Save or after Restore. @@ -419,10 +414,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - for (MachineBasicBlock *MBB : Visited) + for (MachineBasicBlock *MBB : Visited) { + MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. 
- MBB->addLiveIn(CSI[i].getReg()); + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + } } } @@ -446,18 +444,20 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { MachineBasicBlock::iterator I; // Spill using target interface. - I = SaveBlock->begin(); - if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - // Insert the spill to the stack frame. - unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), - RC, TRI); + for (MachineBasicBlock *SaveBlock : SaveBlocks) { + I = SaveBlock->begin(); + if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Insert the spill to the stack frame. + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), + RC, TRI); + } } + // Update the live-in information of all the blocks up to the save point. + updateLiveness(Fn); } - // Update the live-in information of all the blocks up to the save point. - updateLiveness(Fn); // Restore using target interface. for (MachineBasicBlock *MBB : RestoreBlocks) { @@ -500,7 +500,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { static inline void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, bool StackGrowsDown, int64_t &Offset, - unsigned &MaxAlign) { + unsigned &MaxAlign, unsigned Skew) { // If the stack grows down, add the object size to find the lowest address. if (StackGrowsDown) Offset += MFI->getObjectSize(FrameIdx); @@ -512,7 +512,7 @@ AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, MaxAlign = std::max(MaxAlign, Align); // Adjust to alignment boundary. - Offset = (Offset + Align - 1) / Align * Align; + Offset = RoundUpToAlignment(Offset, Align, Skew); if (StackGrowsDown) { DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); @@ -530,12 +530,12 @@ static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs, MachineFrameInfo *MFI, bool StackGrowsDown, - int64_t &Offset, unsigned &MaxAlign) { + int64_t &Offset, unsigned &MaxAlign, unsigned Skew) { for (StackObjSet::const_iterator I = UnassignedObjs.begin(), E = UnassignedObjs.end(); I != E; ++I) { int i = *I; - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); ProtectedObjs.insert(i); } } @@ -563,6 +563,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { && "Local area offset should be in direction of stack growth"); int64_t Offset = LocalAreaOffset; + // Skew to be applied to alignment. + unsigned Skew = TFI.getStackAlignmentSkew(Fn); + // If there are fixed sized objects that are preallocated in the local area, // non-fixed objects can't be allocated right at the start of local area. 
// We currently don't support filling in holes in between fixed sized @@ -593,7 +596,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); MFI->setObjectOffset(i, -Offset); // Set the computed offset } @@ -602,7 +605,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { for (int i = MaxCSFI; i >= MinCSFI ; --i) { unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); MFI->setObjectOffset(i, Offset); Offset += MFI->getObjectSize(i); @@ -624,7 +627,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl<int>::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) - AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); } // FIXME: Once this is working, then enable flag will change to a target @@ -635,7 +638,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { unsigned Align = MFI->getLocalFrameMaxAlign(); // Adjust to alignment boundary. - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); @@ -662,7 +665,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { StackObjSet AddrOfObjs; AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); // Assign large stack objects first. for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { @@ -695,11 +698,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { } AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); } // Then assign frame offsets to stack objects that are not used to spill @@ -719,7 +722,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { if (ProtectedObjs.count(i)) continue; - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); } // Make sure the special register scavenging spill slot is closest to the @@ -729,7 +732,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl<int>::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) - AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); } if (!TFI.targetHandlesStackFrameRounding()) { @@ -754,7 +757,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { // If the frame pointer is eliminated, all frame offsets will be relative to // SP not FP. Align to MaxAlign so this works. StackAlign = std::max(StackAlign, MaxAlign); - Offset = RoundUpToAlignment(Offset, StackAlign); + Offset = RoundUpToAlignment(Offset, StackAlign, Skew); } // Update frame info to pretend that this is part of the stack... 
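The PrologEpilogInserter hunks above thread a new Skew argument through every alignment round-up so frame offsets can be kept congruent to a target-chosen skew rather than to zero. As a rough sketch of the arithmetic these calls rely on (illustrative helper, not the in-tree declaration):

  #include <cassert>
  #include <cstdint>

  // Smallest value >= Value that is congruent to Skew modulo Align.
  // With Skew == 0 this reduces to the usual round-up-to-alignment.
  uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align, uint64_t Skew) {
    assert(Align != 0 && "alignment must be non-zero");
    Skew %= Align;
    return (Value + Align - 1 - Skew) / Align * Align + Skew;
  }

For example, with Align = 16 and Skew = 8 the produced offsets land on 8, 24, 40, and so on, which is the shape a target needs when its stack pointer is biased at function entry.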
@@ -771,18 +774,24 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); // Add prologue to the function... - TFI.emitPrologue(Fn, *SaveBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.emitPrologue(Fn, *SaveBlock); // Add epilogue to restore the callee-save registers in each exiting block. for (MachineBasicBlock *RestoreBlock : RestoreBlocks) TFI.emitEpilogue(Fn, *RestoreBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.inlineStackProbe(Fn, *SaveBlock); + // Emit additional code that is required to support segmented stacks, if // we've been asked for it. This, when linked with a runtime with support // for segmented stacks (libgcc is one), will result in allocating stack // space in small chunks instead of one large contiguous block. - if (Fn.shouldSplitStack()) - TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + if (Fn.shouldSplitStack()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + } // Emit additional code that is required to explicitly handle the stack in // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The @@ -790,7 +799,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { // different conditional check and another BIF for allocating more stack // space. if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) - TFI.adjustForHiPEPrologue(Fn, *SaveBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForHiPEPrologue(Fn, *SaveBlock); } /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical @@ -800,25 +810,6 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); if (!TFI.needsFrameIndexResolution(Fn)) return; - MachineModuleInfo &MMI = Fn.getMMI(); - const Function *F = Fn.getFunction(); - const Function *ParentF = MMI.getWinEHParent(F); - unsigned FrameReg; - if (F == ParentF) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(Fn.getFunction()); - // FIXME: This should be unconditional but we have bugs in the preparation - // pass. - if (FuncInfo.UnwindHelpFrameIdx != INT_MAX) - FuncInfo.UnwindHelpFrameOffset = TFI.getFrameIndexReferenceFromSP( - Fn, FuncInfo.UnwindHelpFrameIdx, FrameReg); - } else if (MMI.hasWinEHFuncInfo(F)) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(Fn.getFunction()); - auto I = FuncInfo.CatchHandlerParentFrameObjIdx.find(F); - if (I != FuncInfo.CatchHandlerParentFrameObjIdx.end()) - FuncInfo.CatchHandlerParentFrameObjOffset[F] = - TFI.getFrameIndexReferenceFromSP(Fn, I->second, FrameReg); - } - // Store SPAdj at exit of a basic block. SmallVector<int, 8> SPState; SPState.resize(Fn.getNumBlockIDs()); @@ -841,12 +832,12 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { } // Handle the unreachable blocks. - for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (Reachable.count(BB)) + for (auto &BB : Fn) { + if (Reachable.count(&BB)) // Already handled in DFS traversal. 
continue; int SPAdj = 0; - replaceFrameIndices(BB, Fn, SPAdj); + replaceFrameIndices(&BB, Fn, SPAdj); } } @@ -889,11 +880,11 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, if (!MI->getOperand(i).isFI()) continue; - // Frame indicies in debug values are encoded in a target independent + // Frame indices in debug values are encoded in a target independent // way with simply the frame index and offset rather than any // target-specific addressing mode. if (MI->isDebugValue()) { - assert(i == 0 && "Frame indicies can only appear as the first " + assert(i == 0 && "Frame indices can only appear as the first " "operand of a DBG_VALUE machine instruction"); unsigned Reg; MachineOperand &Offset = MI->getOperand(1); @@ -979,7 +970,7 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Run through the instructions and find any virtual registers. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - RS->enterBasicBlock(BB); + RS->enterBasicBlock(&*BB); int SPAdj = 0; @@ -1026,12 +1017,8 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); - MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); - // Make sure MRI now accounts this register as used. - MRI.setPhysRegUsed(ScratchReg); - // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. @@ -1044,7 +1031,7 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // problem because we need the spill code before I: Move I to just // prior to J. if (I != std::prev(J)) { - BB->splice(J, BB, I); + BB->splice(J, &*BB, I); // Before we move I, we need to prepare the RS to visit I again. // Specifically, RS will assert if it sees uses of registers that diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp index b1c341d..1f46417 100644 --- a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp +++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -22,87 +23,38 @@ #include <map> using namespace llvm; -namespace { -struct PSVGlobalsTy { - // PseudoSourceValues are immutable so don't need locking. - const PseudoSourceValue PSVs[4]; - sys::Mutex Lock; // Guards FSValues, but not the values inside it. 
- std::map<int, const PseudoSourceValue *> FSValues; - - PSVGlobalsTy() : PSVs() {} - ~PSVGlobalsTy() { - for (std::map<int, const PseudoSourceValue *>::iterator - I = FSValues.begin(), E = FSValues.end(); I != E; ++I) { - delete I->second; - } - } -}; - -static ManagedStatic<PSVGlobalsTy> PSVGlobals; - -} // anonymous namespace - -const PseudoSourceValue *PseudoSourceValue::getStack() -{ return &PSVGlobals->PSVs[0]; } -const PseudoSourceValue *PseudoSourceValue::getGOT() -{ return &PSVGlobals->PSVs[1]; } -const PseudoSourceValue *PseudoSourceValue::getJumpTable() -{ return &PSVGlobals->PSVs[2]; } -const PseudoSourceValue *PseudoSourceValue::getConstantPool() -{ return &PSVGlobals->PSVs[3]; } - static const char *const PSVNames[] = { - "Stack", - "GOT", - "JumpTable", - "ConstantPool" -}; + "Stack", "GOT", "JumpTable", "ConstantPool", "FixedStack", + "GlobalValueCallEntry", "ExternalSymbolCallEntry"}; -PseudoSourceValue::PseudoSourceValue(bool isFixed) : isFixed(isFixed) {} +PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {} PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { - O << PSVNames[this - PSVGlobals->PSVs]; -} - -const PseudoSourceValue *PseudoSourceValue::getFixedStack(int FI) { - PSVGlobalsTy &PG = *PSVGlobals; - sys::ScopedLock locked(PG.Lock); - const PseudoSourceValue *&V = PG.FSValues[FI]; - if (!V) - V = new FixedStackPseudoSourceValue(FI); - return V; + O << PSVNames[Kind]; } bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { - if (this == getStack()) + if (isStack()) return false; - if (this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) + if (isGOT() || isConstantPool() || isJumpTable()) return true; llvm_unreachable("Unknown PseudoSourceValue!"); } -bool PseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const { - if (this == getStack() || - this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) +bool PseudoSourceValue::isAliased(const MachineFrameInfo *) const { + if (isStack() || isGOT() || isConstantPool() || isJumpTable()) return false; llvm_unreachable("Unknown PseudoSourceValue!"); } -bool PseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const { - if (this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) - return false; - return true; +bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const { + return !(isGOT() || isConstantPool() || isJumpTable()); } -bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{ +bool FixedStackPseudoSourceValue::isConstant( + const MachineFrameInfo *MFI) const { return MFI && MFI->isImmutableObjectIndex(FI); } @@ -122,3 +74,69 @@ bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const { void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const { OS << "FixedStack" << FI; } + +CallEntryPseudoSourceValue::CallEntryPseudoSourceValue(PSVKind Kind) + : PseudoSourceValue(Kind) {} + +bool CallEntryPseudoSourceValue::isConstant(const MachineFrameInfo *) const { + return false; +} + +bool CallEntryPseudoSourceValue::isAliased(const MachineFrameInfo *) const { + return false; +} + +bool CallEntryPseudoSourceValue::mayAlias(const MachineFrameInfo *) const { + return false; +} + +GlobalValuePseudoSourceValue::GlobalValuePseudoSourceValue( + const GlobalValue *GV) + : CallEntryPseudoSourceValue(GlobalValueCallEntry), GV(GV) {} + +ExternalSymbolPseudoSourceValue::ExternalSymbolPseudoSourceValue(const char *ES) + : 
CallEntryPseudoSourceValue(ExternalSymbolCallEntry), ES(ES) {} + +PseudoSourceValueManager::PseudoSourceValueManager() + : StackPSV(PseudoSourceValue::Stack), GOTPSV(PseudoSourceValue::GOT), + JumpTablePSV(PseudoSourceValue::JumpTable), + ConstantPoolPSV(PseudoSourceValue::ConstantPool) {} + +const PseudoSourceValue *PseudoSourceValueManager::getStack() { + return &StackPSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getGOT() { return &GOTPSV; } + +const PseudoSourceValue *PseudoSourceValueManager::getConstantPool() { + return &ConstantPoolPSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getJumpTable() { + return &JumpTablePSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getFixedStack(int FI) { + std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI]; + if (!V) + V = llvm::make_unique<FixedStackPseudoSourceValue>(FI); + return V.get(); +} + +const PseudoSourceValue * +PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) { + std::unique_ptr<const GlobalValuePseudoSourceValue> &E = + GlobalCallEntries[GV]; + if (!E) + E = llvm::make_unique<GlobalValuePseudoSourceValue>(GV); + return E.get(); +} + +const PseudoSourceValue * +PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) { + std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E = + ExternalCallEntries[ES]; + if (!E) + E = llvm::make_unique<ExternalSymbolPseudoSourceValue>(ES); + return E.get(); +} diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp index 0090332..cfe367d 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -133,8 +133,8 @@ RABasic::RABasic(): MachineFunctionPass(ID) { void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); @@ -223,7 +223,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVector<unsigned, 8> PhysRegSpillCands; // Check for an available register in this class. - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); while (unsigned PhysReg = Order.next()) { // Check for interference in PhysReg switch (Matrix->checkInterference(VirtReg, PhysReg)) { @@ -276,7 +276,7 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { getAnalysis<LiveIntervals>(), getAnalysis<LiveRegMatrix>()); - calculateSpillWeightsAndHints(*LIS, *MF, + calculateSpillWeightsAndHints(*LIS, *MF, VRM, getAnalysis<MachineLoopInfo>(), getAnalysis<MachineBlockFrequencyInfo>()); diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp index fd3d4d7..f4c076f 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp @@ -799,10 +799,9 @@ void RAFast::AllocateBasicBlock() { MachineBasicBlock::iterator MII = MBB->begin(); // Add live-in registers as live. 
- for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) - if (MRI->isAllocatable(*I)) - definePhysReg(MII, *I, regReserved); + for (const auto &LI : MBB->liveins()) + if (MRI->isAllocatable(LI.PhysReg)) + definePhysReg(MII, LI.PhysReg, regReserved); SmallVector<unsigned, 8> VirtDead; SmallVector<MachineInstr*, 32> Coalesced; @@ -986,10 +985,6 @@ void RAFast::AllocateBasicBlock() { } } - for (UsedInInstrSet::iterator - I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setRegUnitUsed(*I); - // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); @@ -1050,10 +1045,6 @@ void RAFast::AllocateBasicBlock() { killVirtReg(VirtDead[i]); VirtDead.clear(); - for (UsedInInstrSet::iterator - I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setRegUnitUsed(*I); - if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) { DEBUG(dbgs() << "-- coalescing: " << *MI); Coalesced.push_back(MI); @@ -1103,12 +1094,6 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { AllocateBasicBlock(); } - // Add the clobber lists for all the instructions we skipped earlier. - for (const MCInstrDesc *Desc : SkippedInstrs) - if (const uint16_t *Defs = Desc->getImplicitDefs()) - while (*Defs) - MRI->setPhysRegUsed(*Defs++); - // All machine operands and other references to virtual registers have been // replaced. Remove the virtual registers. MRI->clearVirtRegs(); diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp index 7ebcf7f..945cb9e 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -86,6 +86,14 @@ static cl::opt<bool> EnableLocalReassignment( "may be compile time intensive"), cl::init(false)); +static cl::opt<bool> EnableDeferredSpilling( + "enable-deferred-spilling", cl::Hidden, + cl::desc("Instead of spilling a variable right away, defer the actual " + "code insertion to the end of the allocation. That way the " + "allocator might still find a suitable coloring for this " + "variable because of other evicted variables."), + cl::init(false)); + // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> CSRFirstTimeCost("regalloc-csr-first-time-cost", @@ -157,6 +165,11 @@ class RAGreedy : public MachineFunctionPass, /// Live range will be spilled. No more splitting will be attempted. RS_Spill, + + /// Live range is in memory. Because of other evictions, it might get moved + /// into a register in the end. + RS_Memory, + /// There is nothing more we can do to this live range. Abort compilation /// if it can't be assigned. RS_Done @@ -414,6 +427,7 @@ const char *const RAGreedy::StageName[] = { "RS_Split", "RS_Split2", "RS_Spill", + "RS_Memory", "RS_Done" }; #endif @@ -447,8 +461,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<MachineBlockFrequencyInfo>(); AU.addPreserved<MachineBlockFrequencyInfo>(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); @@ -536,6 +550,13 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Unsplit ranges that couldn't be allocated immediately are deferred until // everything else has been allocated.
Prio = Size; + } else if (ExtraRegInfo[Reg].Stage == RS_Memory) { + // Memory operands should be considered last. + // Change the priority such that memory operands are assigned in + // the reverse order that they came in. + // TODO: Make this a member variable and probably do something about hints. + static unsigned MemOp = 0; + Prio = MemOp++; } else { // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. @@ -637,7 +658,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, //===----------------------------------------------------------------------===// unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) { - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); unsigned PhysReg; while ((PhysReg = Order.next())) { if (PhysReg == PrevReg) @@ -2450,7 +2471,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, unsigned Depth) { unsigned CostPerUseLimit = ~0u; // First try assigning a free register. - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) { // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical @@ -2512,13 +2533,23 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return PhysReg; // Finally spill VirtReg itself. - NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); - LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); - spiller().spill(LRE); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); + if (EnableDeferredSpilling && getStage(VirtReg) < RS_Memory) { + // TODO: This is experimental and in particular, we do not model + // the live range splitting done by spilling correctly. + // We would need a deep integration with the spiller to do the + // right thing here. Anyway, that is still good for early testing. + setStage(VirtReg, RS_Memory); + DEBUG(dbgs() << "Do as if this register is in memory\n"); + NewVRegs.push_back(VirtReg.reg); + } else { + NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); + LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); + spiller().spill(LRE); + setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); - if (VerifyEnabled) - MF->verify(this, "After spilling"); + if (VerifyEnabled) + MF->verify(this, "After spilling"); + } // The live virtual register requesting allocation was spilled, so tell // the caller not to allocate anything during this round.
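To see why the static MemOp counter in the enqueue hunk above yields the documented reverse-arrival order, consider a minimal sketch (toy vreg numbers, with a bare std::priority_queue standing in for the allocator's queue):

  #include <cstdio>
  #include <queue>
  #include <utility>

  int main() {
    // Max-heap of (priority, vreg) pairs, mirroring the (Prio, Reg) pairs
    // the allocator enqueues. Deferred ranges receive Prio = MemOp++.
    std::priority_queue<std::pair<unsigned, unsigned>> Queue;
    unsigned MemOp = 0;
    for (unsigned VReg : {100u, 101u, 102u}) // arrival order
      Queue.push({MemOp++, VReg});
    // The highest priority pops first, so deferred ranges are revisited in
    // reverse arrival order: 102, 101, 100.
    while (!Queue.empty()) {
      std::printf("revisit vreg %u\n", Queue.top().second);
      Queue.pop();
    }
    return 0;
  }

The monotonically increasing priority is what makes the most recently deferred range the first one to be reconsidered once everything else has been colored.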
@@ -2555,7 +2586,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { initializeCSRCost(); - calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI); + calculateSpillWeightsAndHints(*LIS, mf, VRM, *Loops, *MBFI); DEBUG(LIS->dump()); diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp index eeff73d..fd28b05 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -497,8 +498,8 @@ void PBQPRAConstraintList::anchor() {} void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.setPreservesCFG(); - au.addRequired<AliasAnalysis>(); - au.addPreserved<AliasAnalysis>(); + au.addRequired<AAResultsWrapperPass>(); + au.addPreserved<AAResultsWrapperPass>(); au.addRequired<SlotIndexes>(); au.addPreserved<SlotIndexes>(); au.addRequired<LiveIntervals>(); @@ -724,11 +725,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { MachineBlockFrequencyInfo &MBFI = getAnalysis<MachineBlockFrequencyInfo>(); - calculateSpillWeightsAndHints(LIS, MF, getAnalysis<MachineLoopInfo>(), MBFI, - normalizePBQPSpillWeight); - VirtRegMap &VRM = getAnalysis<VirtRegMap>(); + calculateSpillWeightsAndHints(LIS, MF, &VRM, getAnalysis<MachineLoopInfo>(), + MBFI, normalizePBQPSpillWeight); + std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM)); MF.getRegInfo().freezeReservedRegs(MF); @@ -805,33 +806,17 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -namespace { -// A helper class for printing node and register info in a consistent way -class PrintNodeInfo { -public: - typedef PBQP::RegAlloc::PBQPRAGraph Graph; - typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; - - PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} - - void print(raw_ostream &OS) const { +/// Create Printable object for node and register info. 
+static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, + const PBQP::RegAlloc::PBQPRAGraph &G) { + return Printable([NId, &G](raw_ostream &OS) { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - } - -private: - const Graph &G; - NodeId NId; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { - PR.print(OS); - return OS; + }); } -} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp index c911b9b..c1ff13e 100644 --- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -93,7 +92,7 @@ namespace { /// A LaneMask to remember on which subregister live ranges we need to call /// shrinkToUses() later. - unsigned ShrinkMask; + LaneBitmask ShrinkMask; /// True if the main range of the currently coalesced intervals should be /// checked for smaller live intervals. @@ -164,15 +163,13 @@ namespace { /// LaneMask are split as necessary. @p LaneMask are the lanes that /// @p ToMerge will occupy in the coalescer register. @p LI has its subrange /// lanemasks already adjusted to the coalesced register. - /// @returns false if live range conflicts couldn't get resolved. - bool mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - unsigned LaneMask, CoalescerPair &CP); + void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, + LaneBitmask LaneMask, CoalescerPair &CP); /// Join the liveranges of two subregisters. Joins @p RRange into /// @p LRange, @p RRange may be invalid afterwards. - /// @returns false if live range conflicts couldn't get resolved. - bool joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, - unsigned LaneMask, const CoalescerPair &CP); + void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, + LaneBitmask LaneMask, const CoalescerPair &CP); /// We found a non-trivially-coalescable copy. If the source value number is /// defined by a copy from the destination reg see if we can merge these two @@ -224,30 +221,17 @@ namespace { /// Dst, we can drop \p Copy. bool applyTerminalRule(const MachineInstr &Copy) const; - /// Check whether or not \p LI is composed by multiple connected - /// components and if that is the case, fix that. - void splitNewRanges(LiveInterval *LI) { - ConnectedVNInfoEqClasses ConEQ(*LIS); - unsigned NumComps = ConEQ.Classify(LI); - if (NumComps <= 1) - return; - SmallVector<LiveInterval*, 8> NewComps(1, LI); - for (unsigned i = 1; i != NumComps; ++i) { - unsigned VReg = MRI->createVirtualRegister(MRI->getRegClass(LI->reg)); - NewComps.push_back(&LIS->createEmptyInterval(VReg)); - } - - ConEQ.Distribute(&NewComps[0], *MRI); - } - /// Wrapper method for \see LiveIntervals::shrinkToUses. /// This method does the proper fixing of the live-ranges when the afore /// mentioned method returns true. 
void shrinkToUses(LiveInterval *LI, SmallVectorImpl<MachineInstr * > *Dead = nullptr) { - if (LIS->shrinkToUses(LI, Dead)) - // We may have created multiple connected components, split them. - splitNewRanges(LI); + if (LIS->shrinkToUses(LI, Dead)) { + /// Check whether or not \p LI is composed of multiple connected + /// components and if that is the case, fix that. + SmallVector<LiveInterval*, 8> SplitLIs; + LIS->splitSeparateComponents(*LI, SplitLIs); + } } public: @@ -275,7 +259,7 @@ INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing", INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing", "Simple Register Coalescing", false, false) @@ -453,7 +437,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); @@ -679,14 +663,18 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, unsigned UseOpIdx; if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx)) return false; - unsigned Op1, Op2, NewDstIdx; - if (!TII->findCommutedOpIndices(DefMI, Op1, Op2)) - return false; - if (Op1 == UseOpIdx) - NewDstIdx = Op2; - else if (Op2 == UseOpIdx) - NewDstIdx = Op1; - else + + // FIXME: The code below tries to commute 'UseOpIdx' operand with some other + // commutable operand which is expressed by the 'CommuteAnyOperandIndex' value + // passed to the method. That _other_ operand is chosen by + // the findCommutedOpIndices() method. + // + // That is obviously an area for improvement in case of instructions having + // more than 2 operands. For example, if some instruction has 3 commutable + // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3, + // op#2<->op#3) of commute transformation should be considered/tried here. + unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex; + if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx)) return false; MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx); @@ -719,7 +707,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // At this point we have decided that it is legal to do this // transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent(); - MachineInstr *NewMI = TII->commuteInstruction(DefMI); + MachineInstr *NewMI = + TII->commuteInstruction(DefMI, false, UseOpIdx, NewDstIdx); if (!NewMI) return false; if (TargetRegisterInfo::isVirtualRegister(IntA.reg) && @@ -804,7 +793,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); if (IntB.hasSubRanges()) { if (!IntA.hasSubRanges()) { - unsigned Mask = MRI->getMaxLaneMaskForVReg(IntA.reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg); IntA.createSubRangeFrom(Allocator, Mask, IntA); } SlotIndex AIdx = CopyIdx.getRegSlot(true); @@ -812,20 +801,21 @@ VNInfo *ASubValNo = SA.getVNInfoAt(AIdx); assert(ASubValNo != nullptr); - unsigned AMask = SA.LaneMask; + LaneBitmask AMask = SA.LaneMask; for (LiveInterval::SubRange &SB : IntB.subranges()) { - unsigned BMask = SB.LaneMask; - unsigned Common = BMask & AMask; + LaneBitmask BMask = SB.LaneMask; + LaneBitmask Common = BMask & AMask; if (Common == 0) continue; - DEBUG( - dbgs() << format("\t\tCopy+Merge %04X into %04X\n", BMask, Common)); - unsigned BRest = BMask & ~AMask; + DEBUG( dbgs() << "\t\tCopy+Merge " << PrintLaneMask(BMask) + << " into " << PrintLaneMask(Common) << '\n'); + LaneBitmask BRest = BMask & ~AMask; LiveInterval::SubRange *CommonRange; if (BRest != 0) { SB.LaneMask = BRest; - DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", BRest)); + DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) + << '\n'); // Duplicate SubRange for newly merged common stuff. CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); } else { @@ -842,7 +832,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, AMask &= ~BMask; } if (AMask != 0) { - DEBUG(dbgs() << format("\t\tNew Lane %04X\n", AMask)); + DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n'); LiveRange *NewRange = IntB.createSubRange(Allocator, AMask); VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator); addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo); @@ -1107,7 +1097,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { const LiveInterval &SrcLI = LIS->getInterval(SrcReg); // CopyMI is undef iff SrcReg is not live before the instruction. if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) { - unsigned SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx); + LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx); for (const LiveInterval::SubRange &SR : SrcLI.subranges()) { if ((SR.LaneMask & SrcMask) == 0) continue; @@ -1128,7 +1118,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { DstLI.MergeValueNumberInto(VNI, PrevVNI); // The affected subregister segments can be removed.
- unsigned DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx); + LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx); for (LiveInterval::SubRange &SR : DstLI.subranges()) { if ((SR.LaneMask & DstMask) == 0) continue; @@ -1147,7 +1137,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { continue; const MachineInstr &MI = *MO.getParent(); SlotIndex UseIdx = LIS->getInstructionIndex(&MI); - unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); bool isLive; if (UseMask != ~0u && DstLI.hasSubRanges()) { isLive = false; @@ -1213,10 +1203,10 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) { if (!DstInt->hasSubRanges()) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - unsigned Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg); DstInt->createSubRangeFrom(Allocator, Mask, *DstInt); } - unsigned Mask = TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubIdx); bool IsUndef = true; SlotIndex MIIdx = UseMI->isDebugValue() ? LIS->getSlotIndexes()->getIndexBefore(UseMI) @@ -1445,8 +1435,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & ShrinkMask) == 0) continue; - DEBUG(dbgs() << "Shrink LaneUses (Lane " - << format("%04X", S.LaneMask) << ")\n"); + DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask) + << ")\n"); LIS->shrinkToUses(S, LI.reg); } LI.removeEmptySubRanges(); @@ -1644,7 +1634,7 @@ class JoinVals { const unsigned SubIdx; /// The LaneMask that this liverange will occupy the coalesced register. May /// be smaller than the lanemask produced by SubIdx when merging subranges. - const unsigned LaneMask; + const LaneBitmask LaneMask; /// This is true when joining sub register ranges, false when joining main /// ranges. @@ -1699,11 +1689,11 @@ class JoinVals { ConflictResolution Resolution; /// Lanes written by this def, 0 for unanalyzed values. - unsigned WriteLanes; + LaneBitmask WriteLanes; /// Lanes with defined values in this register. Other lanes are undef and /// safe to clobber. - unsigned ValidLanes; + LaneBitmask ValidLanes; /// Value in LI being redefined by this def. VNInfo *RedefVNI; @@ -1744,7 +1734,7 @@ class JoinVals { /// Compute the bitmask of lanes actually written by DefMI. /// Set Redef if there are any partial register definitions that depend on the /// previous value of the register. - unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const; + LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const; /// Find the ultimate value that VNI was copied from. std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const; @@ -1780,12 +1770,12 @@ class JoinVals { /// entry to TaintedVals. /// /// Returns false if the tainted lanes extend beyond the basic block. - bool taintExtent(unsigned, unsigned, JoinVals&, - SmallVectorImpl<std::pair<SlotIndex, unsigned> >&); + bool taintExtent(unsigned, LaneBitmask, JoinVals&, + SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> >&); /// Return true if MI uses any of the given Lanes from Reg. /// This does not include partial redefinitions of Reg. 
- bool usesLanes(const MachineInstr *MI, unsigned, unsigned, unsigned) const; + bool usesLanes(const MachineInstr *MI, unsigned, unsigned, LaneBitmask) const; /// Determine if ValNo is a copy of a value number in LR or Other.LR that will /// be pruned: @@ -1796,7 +1786,7 @@ class JoinVals { bool isPrunedValue(unsigned ValNo, JoinVals &Other); public: - JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, unsigned LaneMask, + JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask, SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp, LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin, bool TrackSubRegLiveness) @@ -1822,8 +1812,8 @@ public: /// Removes subranges starting at copies that get removed. This sometimes /// happens when undefined subranges are copied around. These ranges contain - /// no usefull information and can be removed. - void pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask); + /// no useful information and can be removed. + void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask); /// Erase any machine instructions that have been coalesced away. /// Add erased instructions to ErasedInstrs. @@ -1840,9 +1830,9 @@ public: }; } // end anonymous namespace -unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) +LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const { - unsigned L = 0; + LaneBitmask L = 0; for (const MachineOperand &MO : DefMI->operands()) { if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef()) continue; @@ -1879,7 +1869,7 @@ std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain( ValueIn = nullptr; for (const LiveInterval::SubRange &S : LI.subranges()) { // Transform lanemask to a mask in the joined live interval. - unsigned SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); + LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); if ((SMask & LaneMask) == 0) continue; LiveQueryResult LRQ = S.Query(Def); @@ -1928,7 +1918,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { const MachineInstr *DefMI = nullptr; if (VNI->isPHIDef()) { // Conservatively assume that all lanes in a PHI are valid. - unsigned Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx); V.ValidLanes = V.WriteLanes = Lanes; } else { DefMI = Indexes->getInstructionFromIndex(VNI->def); @@ -2190,8 +2180,8 @@ bool JoinVals::mapValues(JoinVals &Other) { } bool JoinVals:: -taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, - SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) { +taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other, + SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> > &TaintExtent) { VNInfo *VNI = LR.getValNumInfo(ValNo); MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def); SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB); @@ -2230,7 +2220,7 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, } bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx, - unsigned Lanes) const { + LaneBitmask Lanes) const { if (MI->isDebugValue()) return false; for (const MachineOperand &MO : MI->operands()) { @@ -2264,8 +2254,8 @@ bool JoinVals::resolveConflicts(JoinVals &Other) { // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the // join, those lanes will be tainted with a wrong value. Get the extent of // the tainted lanes. 
- unsigned TaintedLanes = V.WriteLanes & OtherV.ValidLanes; - SmallVector<std::pair<SlotIndex, unsigned>, 8> TaintExtent; + LaneBitmask TaintedLanes = V.WriteLanes & OtherV.ValidLanes; + SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> TaintExtent; if (!taintExtent(i, TaintedLanes, Other, TaintExtent)) // Tainted lanes would extend beyond the basic block. return false; @@ -2384,7 +2374,7 @@ void JoinVals::pruneValues(JoinVals &Other, } } -void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) +void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { // Look for values being erased. bool DidPrune = false; @@ -2401,7 +2391,7 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) // copied and we must remove that subrange value as well. VNInfo *ValueOut = Q.valueOutOrDead(); if (ValueOut != nullptr && Q.valueIn() == nullptr) { - DEBUG(dbgs() << "\t\tPrune sublane " << format("%04X", S.LaneMask) + DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask) << " at " << Def << "\n"); LIS->pruneValue(S, Def, nullptr); DidPrune = true; @@ -2410,10 +2400,10 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) continue; } // If a subrange ends at the copy, then a value was copied but only - // partially used later. Shrink the subregister range apropriately. + // partially used later. Shrink the subregister range appropriately. if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) { - DEBUG(dbgs() << "\t\tDead uses at sublane " - << format("%04X", S.LaneMask) << " at " << Def << "\n"); + DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask) + << " at " << Def << "\n"); ShrinkMask |= S.LaneMask; } } @@ -2477,8 +2467,8 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs, } } -bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, - unsigned LaneMask, +void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, + LaneBitmask LaneMask, const CoalescerPair &CP) { SmallVector<VNInfo*, 16> NewVNInfo; JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask, @@ -2492,13 +2482,15 @@ bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, // ranges get mapped to the "overflow" lane mask bit which creates unexpected // interferences. if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) { - DEBUG(dbgs() << "*** Couldn't join subrange!\n"); - return false; + // We already determined that it is legal to merge the intervals, so this + // should never fail. + llvm_unreachable("*** Couldn't join subrange!\n"); } if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals)) { - DEBUG(dbgs() << "*** Couldn't join subrange!\n"); - return false; + // We already determined that it is legal to merge the intervals, so this + // should never fail. + llvm_unreachable("*** Couldn't join subrange!\n"); } // The merging algorithm in LiveInterval::join() can't handle conflicting @@ -2521,36 +2513,37 @@ bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n"); if (EndPoints.empty()) - return true; + return; // Recompute the parts of the live range we had to remove because of // CR_Replace conflicts. 
DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: " << LRange << '\n'); LIS->extendToIndices(LRange, EndPoints); - return true; } -bool RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, +void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - unsigned LaneMask, CoalescerPair &CP) { + LaneBitmask LaneMask, + CoalescerPair &CP) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); for (LiveInterval::SubRange &R : LI.subranges()) { - unsigned RMask = R.LaneMask; + LaneBitmask RMask = R.LaneMask; // LaneMask of subregisters common to subrange R and ToMerge. - unsigned Common = RMask & LaneMask; + LaneBitmask Common = RMask & LaneMask; // There is nothing to do without common subregs. if (Common == 0) continue; - DEBUG(dbgs() << format("\t\tCopy+Merge %04X into %04X\n", RMask, Common)); + DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into " + << PrintLaneMask(Common) << '\n'); // LaneMask of subregisters contained in the R range but not in ToMerge, // they have to split into their own subrange. - unsigned LRest = RMask & ~LaneMask; + LaneBitmask LRest = RMask & ~LaneMask; LiveInterval::SubRange *CommonRange; if (LRest != 0) { R.LaneMask = LRest; - DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", LRest)); + DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n'); // Duplicate SubRange for newly merged common stuff. CommonRange = LI.createSubRangeFrom(Allocator, Common, R); } else { @@ -2559,16 +2552,14 @@ bool RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, CommonRange = &R; } LiveRange RangeCopy(ToMerge, Allocator); - if (!joinSubRegRanges(*CommonRange, RangeCopy, Common, CP)) - return false; + joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); LaneMask &= ~RMask; } if (LaneMask != 0) { - DEBUG(dbgs() << format("\t\tNew Lane %04X\n", LaneMask)); + DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n'); LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); } - return true; } bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { @@ -2602,15 +2593,15 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // create initial subranges if necessary. unsigned DstIdx = CP.getDstIdx(); if (!LHS.hasSubRanges()) { - unsigned Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask() - : TRI->getSubRegIndexLaneMask(DstIdx); + LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(DstIdx); // LHS must support subregs or we wouldn't be in this codepath. assert(Mask != 0); LHS.createSubRangeFrom(Allocator, Mask, LHS); } else if (DstIdx != 0) { // Transform LHS lanemasks to new register class if necessary. for (LiveInterval::SubRange &R : LHS.subranges()) { - unsigned Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask); + LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask); R.LaneMask = Mask; } } @@ -2619,41 +2610,21 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // Determine lanemasks of RHS in the coalesced register and merge subranges. unsigned SrcIdx = CP.getSrcIdx(); - bool Abort = false; if (!RHS.hasSubRanges()) { - unsigned Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() - : TRI->getSubRegIndexLaneMask(SrcIdx); - if (!mergeSubRangeInto(LHS, RHS, Mask, CP)) - Abort = true; + LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(SrcIdx); + mergeSubRangeInto(LHS, RHS, Mask, CP); } else { // Pair up subranges and merge. 
for (LiveInterval::SubRange &R : RHS.subranges()) { - unsigned Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); - if (!mergeSubRangeInto(LHS, R, Mask, CP)) { - Abort = true; - break; - } + LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); + mergeSubRangeInto(LHS, R, Mask, CP); } } - if (Abort) { - // This shouldn't have happened :-( - // However we are aware of at least one existing problem where we - // can't merge subranges when multiple ranges end up in the - // "overflow bit" 32. As a workaround we drop all subregister ranges - // which means we loose some precision but are back to a well defined - // state. - assert(TargetRegisterInfo::isImpreciseLaneMask( - CP.getNewRC()->getLaneMask()) - && "SubRange merge should only fail when merging into bit 32."); - DEBUG(dbgs() << "\tSubrange join aborted!\n"); - LHS.clearSubRanges(); - RHS.clearSubRanges(); - } else { - DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); + DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); - LHSVals.pruneSubRegValues(LHS, ShrinkMask); - RHSVals.pruneSubRegValues(LHS, ShrinkMask); - } + LHSVals.pruneSubRegValues(LHS, ShrinkMask); + RHSVals.pruneSubRegValues(LHS, ShrinkMask); } // The merging algorithm in LiveInterval::join() can't handle conflicting @@ -2799,7 +2770,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const { !isTerminalReg(DstReg, Copy, MRI)) return false; - // DstReg is a terminal node. Check if it inteferes with any other + // DstReg is a terminal node. Check if it interferes with any other // copy involving SrcReg. const MachineBasicBlock *OrigBB = Copy.getParent(); const LiveInterval &DstLI = LIS->getInterval(DstReg); @@ -2903,8 +2874,8 @@ void RegisterCoalescer::joinAllIntervals() { std::vector<MBBPriorityInfo> MBBs; MBBs.reserve(MF->size()); - for (MachineFunction::iterator I = MF->begin(), E = MF->end();I != E;++I){ - MachineBasicBlock *MBB = I; + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { + MachineBasicBlock *MBB = &*I; MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB), JoinSplitEdges && isSplitEdge(MBB))); } @@ -2943,7 +2914,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { TRI = STI.getRegisterInfo(); TII = STI.getInstrInfo(); LIS = &getAnalysis<LiveIntervals>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); Loops = &getAnalysis<MachineLoopInfo>(); if (EnableGlobalCopies == cl::BOU_UNSET) JoinGlobalCopies = STI.enableJoinGlobalCopies(); @@ -2981,22 +2952,25 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { if (MRI->recomputeRegClass(Reg)) { DEBUG(dbgs() << PrintReg(Reg) << " inflated to " << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n'); + ++NumInflated; + LiveInterval &LI = LIS->getInterval(Reg); - unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); - if (MaxMask == 0) { + if (LI.hasSubRanges()) { // If the inflated register class does not support subregisters anymore // remove the subranges. - LI.clearSubRanges(); - } else { + if (!MRI->shouldTrackSubRegLiveness(Reg)) { + LI.clearSubRanges(); + } else { #ifndef NDEBUG - // If subranges are still supported, then the same subregs should still - // be supported. - for (LiveInterval::SubRange &S : LI.subranges()) { - assert ((S.LaneMask & ~MaxMask) == 0); - } + LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + // If subranges are still supported, then the same subregs + // should still be supported. 
+ for (LiveInterval::SubRange &S : LI.subranges()) { + assert((S.LaneMask & ~MaxMask) == 0); + } #endif + } } - ++NumInflated; } } diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp index c3786e5..f33dc3e 100644 --- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp @@ -59,12 +59,12 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); dbgs() << "Live In: "; - for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveInRegs[i], TRI) << " "; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; dbgs() << "Live Out: "; - for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveOutRegs[i], TRI) << " "; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; } @@ -78,11 +78,13 @@ void RegPressureTracker::dump() const { } void PressureDiff::dump(const TargetRegisterInfo &TRI) const { + const char *sep = ""; for (const PressureChange &Change : *this) { - if (!Change.isValid() || Change.getUnitInc() == 0) - continue; - dbgs() << " " << TRI.getRegPressureSetName(Change.getPSet()) + if (!Change.isValid()) + break; + dbgs() << sep << TRI.getRegPressureSetName(Change.getPSet()) << " " << Change.getUnitInc(); + sep = " "; } dbgs() << '\n'; } @@ -90,22 +92,21 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { /// Increase the current pressure as impacted by these registers and bump /// the high water mark if needed. void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) { - for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { - PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]); + for (unsigned RegUnit : RegUnits) { + PSetIterator PSetI = MRI->getPressureSets(RegUnit); unsigned Weight = PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { CurrSetPressure[*PSetI] += Weight; - if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) { - P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI]; - } + P.MaxSetPressure[*PSetI] = + std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]); } } } /// Simply decrease the current pressure as impacted by these registers. void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) { - for (unsigned I = 0, E = RegUnits.size(); I != E; ++I) - decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I])); + for (unsigned RegUnit : RegUnits) + decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit)); } /// Clear the result so it can be used for another round of pressure tracking. 
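In the RegisterPressure.cpp hunks above, increaseRegPressure folds the high-water-mark update into std::max, and PressureDiff::dump may now stop at the first invalid slot because zero increments no longer linger (see the addPressureChange hunk further down). The sticky-maximum pattern in isolation, as a standalone sketch with plain vectors standing in for the tracker's pressure arrays:

#include <algorithm>
#include <cassert>
#include <vector>

// Bump the current pressure of a set and fold it into the running maximum
// in one statement, as the increaseRegPressure hunk does.
void increase(std::vector<unsigned> &Curr, std::vector<unsigned> &Max,
              unsigned PSet, unsigned Weight) {
  Curr[PSet] += Weight;
  Max[PSet] = std::max(Max[PSet], Curr[PSet]);
}

int main() {
  std::vector<unsigned> Curr(2, 0), Max(2, 0);
  increase(Curr, Max, 0, 3);
  Curr[0] -= 3;            // pressure drops again...
  increase(Curr, Max, 0, 1);
  assert(Max[0] == 3);     // ...but the high-water mark is sticky
}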
@@ -157,10 +158,22 @@ void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) { LiveInRegs.clear(); } -const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const { +void LiveRegSet::init(const MachineRegisterInfo &MRI) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + unsigned NumRegUnits = TRI.getNumRegs(); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + Regs.setUniverse(NumRegUnits + NumVirtRegs); + this->NumRegUnits = NumRegUnits; +} + +void LiveRegSet::clear() { + Regs.clear(); +} + +static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) { if (TargetRegisterInfo::isVirtualRegister(Reg)) - return &LIS->getInterval(Reg); - return LIS->getCachedRegUnit(Reg); + return &LIS.getInterval(Reg); + return LIS.getCachedRegUnit(Reg); } void RegPressureTracker::reset() { @@ -176,8 +189,7 @@ void RegPressureTracker::reset() { else static_cast<RegionPressure&>(P).reset(); - LiveRegs.PhysRegs.clear(); - LiveRegs.VirtRegs.clear(); + LiveRegs.clear(); UntiedDefs.clear(); } @@ -210,8 +222,7 @@ void RegPressureTracker::init(const MachineFunction *mf, P.MaxSetPressure = CurrSetPressure; - LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs()); - LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs()); + LiveRegs.init(*MRI); if (TrackUntiedDefs) UntiedDefs.setUniverse(MRI->getNumVirtRegs()); } @@ -250,14 +261,8 @@ void RegPressureTracker::closeTop() { static_cast<RegionPressure&>(P).TopPos = CurrPos; assert(P.LiveInRegs.empty() && "inconsistent max pressure result"); - P.LiveInRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size()); - P.LiveInRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end()); - for (SparseSet<unsigned>::const_iterator I = - LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I) - P.LiveInRegs.push_back(*I); - std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end()); - P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()), - P.LiveInRegs.end()); + P.LiveInRegs.reserve(LiveRegs.size()); + LiveRegs.appendTo(P.LiveInRegs); } /// Set the boundary for the bottom of the region and summarize live outs. @@ -268,21 +273,14 @@ void RegPressureTracker::closeBottom() { static_cast<RegionPressure&>(P).BottomPos = CurrPos; assert(P.LiveOutRegs.empty() && "inconsistent max pressure result"); - P.LiveOutRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size()); - P.LiveOutRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end()); - for (SparseSet<unsigned>::const_iterator I = - LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I) - P.LiveOutRegs.push_back(*I); - std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end()); - P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()), - P.LiveOutRegs.end()); + P.LiveOutRegs.reserve(LiveRegs.size()); + LiveRegs.appendTo(P.LiveOutRegs); } /// Finalize the region boundaries and record live ins and live outs. 
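The new LiveRegSet::init above folds physical register units and virtual registers into one SparseSet universe, replacing the separate PhysRegs/VirtRegs sets the tracker carried before; that is what lets closeTop/closeBottom shrink to a single appendTo call. A standalone sketch of the disjoint-index idea, with std::unordered_set standing in for llvm::SparseSet and the virtual-register indexing kept abstract (the real code maps through TargetRegisterInfo):

#include <cassert>
#include <unordered_set>

struct LiveRegSet {
  unsigned NumRegUnits = 0;
  std::unordered_set<unsigned> Regs;

  void init(unsigned NumUnits, unsigned NumVirtRegs) {
    NumRegUnits = NumUnits;
    Regs.reserve(NumUnits + NumVirtRegs); // SparseSet: setUniverse(...)
  }
  // Physical units occupy [0, NumRegUnits); virtreg indices are shifted past
  // them, so one flat set can hold both without collisions.
  unsigned key(unsigned Idx, bool IsVirtual) const {
    return IsVirtual ? NumRegUnits + Idx : Idx;
  }
  bool insert(unsigned Idx, bool IsVirtual) {
    return Regs.insert(key(Idx, IsVirtual)).second;
  }
  bool contains(unsigned Idx, bool IsVirtual) const {
    return Regs.count(key(Idx, IsVirtual)) != 0;
  }
};

int main() {
  LiveRegSet S;
  S.init(/*NumUnits=*/8, /*NumVirtRegs=*/4);
  S.insert(3, /*IsVirtual=*/false); // reg unit 3
  S.insert(3, /*IsVirtual=*/true);  // virtreg index 3 -> key 11, no collision
  assert(S.contains(3, false) && S.contains(3, true));
}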
void RegPressureTracker::closeRegion() { if (!isTopClosed() && !isBottomClosed()) { - assert(LiveRegs.PhysRegs.empty() && LiveRegs.VirtRegs.empty() && - "no region boundary"); + assert(LiveRegs.size() == 0 && "no region boundary"); return; } if (!isBottomClosed()) @@ -299,8 +297,7 @@ void RegPressureTracker::closeRegion() { void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0); assert(isBottomClosed() && "need bottom-up tracking to intialize."); - for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) { - unsigned Reg = P.LiveOutRegs[i]; + for (unsigned Reg : P.LiveOutRegs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !RPTracker.hasUntiedDef(Reg)) { increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg)); @@ -315,69 +312,96 @@ static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) { } namespace { + /// Collect this instruction's unique uses and defs into SmallVectors for /// processing defs and uses in order. /// /// FIXME: always ignore tied opers -class RegisterOperands { - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; +class RegisterOperandsCollector { + RegisterOperands &RegOpers; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; bool IgnoreDead; -public: - SmallVector<unsigned, 8> Uses; - SmallVector<unsigned, 8> Defs; - SmallVector<unsigned, 8> DeadDefs; - - RegisterOperands(const TargetRegisterInfo *tri, - const MachineRegisterInfo *mri, bool ID = false): - TRI(tri), MRI(mri), IgnoreDead(ID) {} + RegisterOperandsCollector(RegisterOperands &RegOpers, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) + : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {} + + void collectInstr(const MachineInstr &MI) const { + for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI) + collectOperand(*OperI); + + // Remove redundant physreg dead defs. + SmallVectorImpl<unsigned>::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); + } - /// Push this operand's register onto the correct vector. - void collect(const MachineOperand &MO) { + /// Push this operand's register onto the correct vectors. + void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; + unsigned Reg = MO.getReg(); if (MO.readsReg()) - pushRegUnits(MO.getReg(), Uses); + pushRegUnits(Reg, RegOpers.Uses); if (MO.isDef()) { if (MO.isDead()) { if (!IgnoreDead) - pushRegUnits(MO.getReg(), DeadDefs); - } - else - pushRegUnits(MO.getReg(), Defs); + pushRegUnits(Reg, RegOpers.DeadDefs); + } else + pushRegUnits(Reg, RegOpers.Defs); } } -protected: - void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) { + void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (containsReg(RegUnits, Reg)) return; RegUnits.push_back(Reg); - } - else if (MRI->isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + } else if (MRI.isAllocatable(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) { if (containsReg(RegUnits, *Units)) continue; RegUnits.push_back(*Units); } } } + + friend class llvm::RegisterOperands; }; -} // namespace -/// Collect physical and virtual register operands. 
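collectInstr above prunes DeadDefs entries that also occur in Defs using the erase(remove_if(...)) idiom, spelled through the long-deprecated std::bind1st/std::ptr_fun pair. The same pruning with a lambda, as a standalone sketch:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<unsigned> Defs = {1, 4, 7};
  std::vector<unsigned> DeadDefs = {2, 4, 9, 7};

  // Drop any dead def that is also a live def; remove_if compacts survivors
  // to the front and erase trims the tail.
  DeadDefs.erase(std::remove_if(DeadDefs.begin(), DeadDefs.end(),
                                [&](unsigned R) {
                                  return std::find(Defs.begin(), Defs.end(),
                                                   R) != Defs.end();
                                }),
                 DeadDefs.end());

  assert((DeadDefs == std::vector<unsigned>{2, 9}));
}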
-static void collectOperands(const MachineInstr *MI, - RegisterOperands &RegOpers) { - for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) - RegOpers.collect(*OperI); +} // namespace - // Remove redundant physreg dead defs. - SmallVectorImpl<unsigned>::iterator I = - std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), - std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); - RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); +void RegisterOperands::collect(const MachineInstr &MI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) { + RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead); + Collector.collectInstr(MI); +} + +void RegisterOperands::detectDeadDefs(const MachineInstr &MI, + const LiveIntervals &LIS) { + SlotIndex SlotIdx = LIS.getInstructionIndex(&MI); + for (SmallVectorImpl<unsigned>::iterator RI = Defs.begin(); + RI != Defs.end(); /*empty*/) { + unsigned Reg = *RI; + const LiveRange *LR = getLiveRange(LIS, Reg); + if (LR != nullptr) { + LiveQueryResult LRQ = LR->Query(SlotIdx); + if (LRQ.isDeadDef()) { + // LiveIntervals knows this is a dead even though it's MachineOperand is + // not flagged as such. + DeadDefs.push_back(Reg); + RI = Defs.erase(RI); + continue; + } + } + ++RI; + } } /// Initialize an array of N PressureDiffs. @@ -392,6 +416,18 @@ void PressureDiffs::init(unsigned N) { PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff))); } +void PressureDiffs::addInstruction(unsigned Idx, + const RegisterOperands &RegOpers, + const MachineRegisterInfo &MRI) { + PressureDiff &PDiff = (*this)[Idx]; + assert(!PDiff.begin()->isValid() && "stale PDiff"); + for (unsigned Reg : RegOpers.Defs) + PDiff.addPressureChange(Reg, true, &MRI); + + for (unsigned Reg : RegOpers.Uses) + PDiff.addPressureChange(Reg, false, &MRI); +} + /// Add a change in pressure to the pressure diff of a given instruction. void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, const MachineRegisterInfo *MRI) { @@ -399,7 +435,7 @@ void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { // Find an existing entry in the pressure diff for this PSet. - PressureDiff::iterator I = begin(), E = end(); + PressureDiff::iterator I = nonconst_begin(), E = nonconst_end(); for (; I != E && I->isValid(); ++I) { if (I->getPSet() >= *PSetI) break; @@ -411,30 +447,28 @@ void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, if (!I->isValid() || I->getPSet() != *PSetI) { PressureChange PTmp = PressureChange(*PSetI); for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J) - std::swap(*J,PTmp); + std::swap(*J, PTmp); } // Update the units for this pressure set. - I->setUnitInc(I->getUnitInc() + Weight); + unsigned NewUnitInc = I->getUnitInc() + Weight; + if (NewUnitInc != 0) { + I->setUnitInc(NewUnitInc); + } else { + // Remove entry + PressureDiff::iterator J; + for (J = std::next(I); J != E && J->isValid(); ++J, ++I) + *I = *J; + if (J != E) + *I = *J; + } } } -/// Record the pressure difference induced by the given operand list. 
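Two related changes land above: detectDeadDefs moves the LiveIntervals dead-def query out of the tracker's hot path, and addPressureChange now erases an entry whose unit increment cancels to zero instead of keeping a stale "+0" record. The cancel-to-zero compaction in isolation, sketched over a sorted vector of (PSet, UnitInc) pairs:

#include <cassert>
#include <utility>
#include <vector>

void addPressureChange(std::vector<std::pair<unsigned, int>> &Diff,
                       unsigned PSet, int Weight) {
  // Find the slot for PSet, keeping the vector sorted by set number.
  auto I = Diff.begin();
  while (I != Diff.end() && I->first < PSet)
    ++I;
  if (I == Diff.end() || I->first != PSet)
    I = Diff.insert(I, {PSet, 0});
  I->second += Weight;
  if (I->second == 0)
    Diff.erase(I); // increments cancelled out: drop the entry entirely
}

int main() {
  std::vector<std::pair<unsigned, int>> Diff;
  addPressureChange(Diff, 2, +3);
  addPressureChange(Diff, 2, -3);
  assert(Diff.empty()); // a net-zero change leaves no stale entry behind
}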
-static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers, - const MachineRegisterInfo *MRI) { - assert(!PDiff.begin()->isValid() && "stale PDiff"); - - for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Defs[i], true, MRI); - - for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Uses[i], false, MRI); -} - /// Force liveness of registers. void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) { - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - if (LiveRegs.insert(Regs[i])) - increaseRegPressure(Regs[i]); + for (unsigned Reg : Regs) { + if (LiveRegs.insert(Reg)) + increaseRegPressure(Reg); } } @@ -465,42 +499,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) { /// registers that are both defined and used by the instruction. If a pressure /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. -bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, - PressureDiff *PDiff) { - // Check for the top of the analyzable region. - if (CurrPos == MBB->begin()) { - closeRegion(); - return false; - } - if (!isBottomClosed()) - closeBottom(); - - // Open the top of the region using block iterators. - if (!RequireIntervals && isTopClosed()) - static_cast<RegionPressure&>(P).openTop(CurrPos); - - // Find the previous instruction. - do - --CurrPos; - while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); - - if (CurrPos->isDebugValue()) { - closeRegion(); - return false; - } - SlotIndex SlotIdx; - if (RequireIntervals) - SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); - - // Open the top of the region using slot indexes. - if (RequireIntervals && isTopClosed()) - static_cast<IntervalPressure&>(P).openTop(SlotIdx); - - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); - - if (PDiff) - collectPDiff(*PDiff, RegOpers, MRI); +void RegPressureTracker::recede(const RegisterOperands &RegOpers, + SmallVectorImpl<unsigned> *LiveUses) { + assert(!CurrPos->isDebugValue()); // Boost pressure for all dead defs together. increaseRegPressure(RegOpers.DeadDefs); @@ -508,37 +509,23 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, // Kill liveness at live defs. // TODO: consider earlyclobbers? - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (DeadDef) { - // LiveIntervals knows this is a dead even though it's MachineOperand is - // not flagged as such. Since this register will not be recorded as - // live-out, increase its PDiff value to avoid underflowing pressure. - if (PDiff) - PDiff->addPressureChange(Reg, false, MRI); - } else { - if (LiveRegs.erase(Reg)) - decreaseRegPressure(Reg); - else - discoverLiveOut(Reg); - } + for (unsigned Reg : RegOpers.Defs) { + if (LiveRegs.erase(Reg)) + decreaseRegPressure(Reg); + else + discoverLiveOut(Reg); } + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) { // Adjust liveouts if LiveIntervals are available. 
if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); if (!LRQ.isKill() && !LRQ.valueDefined()) @@ -552,24 +539,53 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, } } if (TrackUntiedDefs) { - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg)) UntiedDefs.insert(Reg); } } - return true; +} + +void RegPressureTracker::recedeSkipDebugValues() { + assert(CurrPos != MBB->begin()); + if (!isBottomClosed()) + closeBottom(); + + // Open the top of the region using block iterators. + if (!RequireIntervals && isTopClosed()) + static_cast<RegionPressure&>(P).openTop(CurrPos); + + // Find the previous instruction. + do + --CurrPos; + while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the top of the region using slot indexes. + if (RequireIntervals && isTopClosed()) + static_cast<IntervalPressure&>(P).openTop(SlotIdx); +} + +void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses) { + recedeSkipDebugValues(); + + const MachineInstr &MI = *CurrPos; + RegisterOperands RegOpers; + RegOpers.collect(MI, *TRI, *MRI); + if (RequireIntervals) + RegOpers.detectDeadDefs(MI, *LIS); + + recede(RegOpers, LiveUses); } /// Advance across the current instruction. -bool RegPressureTracker::advance() { +void RegPressureTracker::advance() { assert(!TrackUntiedDefs && "unsupported mode"); - // Check for the bottom of the analyzable region. - if (CurrPos == MBB->end()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->end()); if (!isTopClosed()) closeTop(); @@ -585,11 +601,10 @@ bool RegPressureTracker::advance() { static_cast<RegionPressure&>(P).openBottom(CurrPos); } - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { // Discover live-ins. bool isLive = LiveRegs.contains(Reg); if (!isLive) @@ -597,24 +612,21 @@ bool RegPressureTracker::advance() { // Kill liveness at last uses. bool lastUse = false; if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); lastUse = LR && LR->Query(SlotIdx).isKill(); - } - else { + } else { // Allocatable physregs are always single-use before register rewriting. lastUse = !TargetRegisterInfo::isVirtualRegister(Reg); } if (lastUse && isLive) { LiveRegs.erase(Reg); decreaseRegPressure(Reg); - } - else if (!lastUse && !isLive) + } else if (!lastUse && !isLive) increaseRegPressure(Reg); } // Generate liveness for defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (LiveRegs.insert(Reg)) increaseRegPressure(Reg); } @@ -627,7 +639,6 @@ bool RegPressureTracker::advance() { do ++CurrPos; while (CurrPos != MBB->end() && CurrPos->isDebugValue()); - return true; } /// Find the max change in excess pressure across all sets. 
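The recede() split above (recedeSkipDebugValues, then recede over a caller-supplied RegisterOperands) exists so a client can reuse one operand walk for several purposes. A sketch of the calling sequence, matching the pattern the buildSchedGraph hunk later in this diff adopts; every member function shown appears in this commit, but the wrapper itself is hypothetical and only compiles inside an LLVM tree of this vintage:

#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;

void recedeOverInstr(RegPressureTracker &RPTracker, PressureDiffs &PDiffs,
                     unsigned NodeNum, const TargetRegisterInfo &TRI,
                     const MachineRegisterInfo &MRI) {
  RPTracker.recedeSkipDebugValues();             // step CurrPos past DBG_VALUEs
  const MachineInstr &MI = *RPTracker.getPos();
  RegisterOperands RegOpers;
  RegOpers.collect(MI, TRI, MRI);                // one operand walk...
  PDiffs.addInstruction(NodeNum, RegOpers, MRI); // ...feeds pressure diffs
  RPTracker.recede(RegOpers);                    // ...and liveness/pressure
}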
@@ -653,8 +664,7 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec, PDiff = 0; // Under the limit else PDiff = PNew - Limit; // Just exceeded limit. - } - else if (Limit > PNew) + } else if (Limit > PNew) PDiff = Limit - POld; // Just obeyed limit. if (PDiff) { @@ -719,34 +729,19 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true); - collectOperands(MI, RegOpers); - - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - increaseRegPressure(RegOpers.DeadDefs); - decreaseRegPressure(RegOpers.DeadDefs); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); + assert(RegOpers.DeadDefs.size() == 0); + if (RequireIntervals) + RegOpers.detectDeadDefs(*MI, *LIS); // Kill liveness at live defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - SlotIndex SlotIdx = LIS->getInstructionIndex(MI); - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (!DeadDef) { - if (!containsReg(RegOpers.Uses, Reg)) - decreaseRegPressure(Reg); - } + for (unsigned Reg : RegOpers.Defs) { + if (!containsReg(RegOpers.Uses, Reg)) + decreaseRegPressure(Reg); } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) increaseRegPressure(Reg); } @@ -853,7 +848,8 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, unsigned MNew = MOld; // Ignore DeadDefs here because they aren't captured by PressureChange. unsigned PNew = POld + PDiffI->getUnitInc(); - assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) && "PSet overflow"); + assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) + && "PSet overflow/underflow"); if (PNew > MOld) MNew = PNew; // Check if current pressure has exceeded the limit. @@ -892,19 +888,13 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, } /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). -static bool findUseBetween(unsigned Reg, - SlotIndex PriorUseIdx, SlotIndex NextUseIdx, - const MachineRegisterInfo *MRI, +static bool findUseBetween(unsigned Reg, SlotIndex PriorUseIdx, + SlotIndex NextUseIdx, const MachineRegisterInfo &MRI, const LiveIntervals *LIS) { - for (MachineRegisterInfo::use_instr_nodbg_iterator - UI = MRI->use_instr_nodbg_begin(Reg), - UE = MRI->use_instr_nodbg_end(); UI != UE; ++UI) { - const MachineInstr* MI = &*UI; - if (MI->isDebugValue()) - continue; - SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); - if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) - return true; + for (const MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { + SlotIndex InstSlot = LIS->getInstructionIndex(&MI).getRegSlot(); + if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) + return true; } return false; } @@ -919,8 +909,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). 
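The rewritten findUseBetween above scans uses inside the half-open slot window [PriorUseIdx, NextUseIdx), now via the use_nodbg_instructions range so debug values never appear. The window semantics in isolation, with plain unsigned slot numbers standing in for llvm::SlotIndex:

#include <cassert>
#include <vector>

bool findUseBetween(const std::vector<unsigned> &UseSlots,
                    unsigned PriorUseIdx, unsigned NextUseIdx) {
  for (unsigned Slot : UseSlots)
    if (Slot >= PriorUseIdx && Slot < NextUseIdx) // inclusive low, exclusive high
      return true;
  return false;
}

int main() {
  std::vector<unsigned> Uses = {4, 10, 16};
  assert(findUseBetween(Uses, 8, 12));   // 10 lies inside [8, 12)
  assert(!findUseBetween(Uses, 11, 16)); // 16 is excluded: half-open window
}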
- RegisterOperands RegOpers(TRI, MRI); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI); // Kill liveness at last uses. Assume allocatable physregs are single-use // rather than checking LiveIntervals. @@ -928,21 +918,18 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (RequireIntervals) { // FIXME: allow the caller to pass in the list of vreg uses that remain // to be bottom-scheduled to avoid searching uses at each query. SlotIndex CurrIdx = getCurrSlot(); - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); - if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) { + if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS)) decreaseRegPressure(Reg); - } } - } - else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { // Allocatable physregs are always single-use before register rewriting. decreaseRegPressure(Reg); } @@ -966,7 +953,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { /// This is expensive for an on-the-fly query because it calls /// bumpDownwardPressure to recompute the pressure sets based on current /// liveness. We don't yet have a fast version of downward pressure tracking -/// analagous to getUpwardPressureDelta. +/// analogous to getUpwardPressureDelta. void RegPressureTracker:: getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta, ArrayRef<PressureChange> CriticalPSets, diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp index 4176686..8fa1bf7 100644 --- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -31,9 +31,12 @@ using namespace llvm; #define DEBUG_TYPE "reg-scavenging" /// setUsed - Set the register units of this register as used. -void RegScavenger::setRegUsed(unsigned Reg) { - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - RegUnitsAvailable.reset(*RUI); +void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { + for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { + LaneBitmask UnitMask = (*RUI).second; + if (UnitMask == 0 || (LaneMask & UnitMask) != 0) + RegUnitsAvailable.reset((*RUI).first); + } } void RegScavenger::initRegState() { @@ -50,9 +53,8 @@ void RegScavenger::initRegState() { return; // Live-in registers are in use. - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) - setRegUsed(*I); + for (const auto &LI : MBB->liveins()) + setRegUsed(LI.PhysReg, LI.LaneMask); // Pristine CSRs are also unavailable. 
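RegScavenger::setRegUsed above gains a lane-mask parameter: a register unit is reserved only if its per-unit mask overlaps the requested lanes, and a unit mask of zero (no lane information) is treated conservatively as always used. A standalone sketch, with (Unit, UnitMask) pairs standing in for MCRegUnitMaskIterator output:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using LaneBitmask = std::uint32_t; // stand-in, as elsewhere in this release

void setRegUsed(std::vector<bool> &UnitsAvailable,
                const std::vector<std::pair<unsigned, LaneBitmask>> &RegUnits,
                LaneBitmask LaneMask) {
  for (const auto &UM : RegUnits)
    if (UM.second == 0 || (LaneMask & UM.second) != 0)
      UnitsAvailable[UM.first] = false; // overlapping or unknown lanes: used
}

int main() {
  std::vector<bool> Avail(4, true);
  // Unit 0 covers lane 0b01, unit 1 covers lane 0b10, unit 2 has no mask info.
  setRegUsed(Avail, {{0, 0b01}, {1, 0b10}, {2, 0}}, /*LaneMask=*/0b01);
  assert(!Avail[0] && Avail[1] && !Avail[2] && Avail[3]);
}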
const MachineFunction &MF = *MBB->getParent(); diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp index 76a7fef..efde61e 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -372,7 +372,6 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { dbgs() << "\n"; } } - dbgs() << "\n"; } #endif diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 390b6d2..11b246a 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,12 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/ADT/IntEqClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -51,15 +51,11 @@ static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - bool IsPostRAFlag, bool RemoveKillFlags, - LiveIntervals *lis) - : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis), - IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags), - CanHandleTerminators(false), FirstDbgValue(nullptr) { - assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); + bool RemoveKillFlags) + : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), + RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), + TrackLaneMasks(false), FirstDbgValue(nullptr) { DbgValues.clear(); - assert(!(IsPostRA && MRI.getNumVirtRegs()) && - "Virtual registers must be removed prior to PostRA scheduling"); const TargetSubtargetInfo &ST = mf.getSubtarget(); SchedModel.init(ST.getSchedModel(), &ST, TII); @@ -230,11 +226,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { if (TRI->isPhysicalRegister(Reg)) Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); - else { - assert(!IsPostRA && "Virtual register encountered after regalloc."); - if (MO.readsReg()) // ignore undef operands - addVRegUseDeps(&ExitSU, i); - } + else if (MO.readsReg()) // ignore undef operands + addVRegUseDeps(&ExitSU, i); } } else { // For others, e.g. fallthrough, conditional branch, assume the exit @@ -242,11 +235,9 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { assert(Uses.empty() && "Uses in set before adding deps?"); for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (!Uses.contains(Reg)) - Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); + for (const auto &LI : (*SI)->liveins()) { + if (!Uses.contains(LI.PhysReg)) + Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg)); } } } @@ -371,6 +362,20 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } } +LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const +{ + unsigned Reg = MO.getReg(); + // No point in tracking lanemasks if we don't have interesting subregisters. 
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + if (!RC.HasDisjunctSubRegs) + return ~0u; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0) + return RC.getLaneMask(); + return TRI->getSubRegIndexLaneMask(SubReg); +} + /// addVRegDefDeps - Add register output and data dependencies from this SUnit /// to instructions that occur later in the same scheduling region if they read /// from or write to the virtual register defined at OperIdx. @@ -378,35 +383,106 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { - const MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + MachineInstr *MI = SU->getInstr(); + MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + LaneBitmask DefLaneMask; + LaneBitmask KillLaneMask; + if (TrackLaneMasks) { + bool IsKill = MO.getSubReg() == 0 || MO.isUndef(); + DefLaneMask = getLaneMaskForMO(MO); + // If we have a <read-undef> flag, none of the lane values comes from an + // earlier instruction. + KillLaneMask = IsKill ? ~0u : DefLaneMask; + + // Clear undef flag, we'll re-add it later once we know which subregister + // Def is first. + MO.setIsUndef(false); + } else { + DefLaneMask = ~0u; + KillLaneMask = ~0u; + } - // Singly defined vregs do not have output/anti dependencies. - // The current operand is a def, so we have at least one. - // Check here if there are any others... + if (MO.isDead()) { + assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && + "Dead defs should have no uses"); + } else { + // Add data dependence to all uses we found so far. + const TargetSubtargetInfo &ST = MF.getSubtarget(); + for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg), + E = CurrentVRegUses.end(); I != E; /*empty*/) { + LaneBitmask LaneMask = I->LaneMask; + // Ignore uses of other lanes. + if ((LaneMask & KillLaneMask) == 0) { + ++I; + continue; + } + + if ((LaneMask & DefLaneMask) != 0) { + SUnit *UseSU = I->SU; + MachineInstr *Use = UseSU->getInstr(); + SDep Dep(SU, SDep::Data, Reg); + Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, + I->OperandIndex)); + ST.adjustSchedDependency(SU, UseSU, Dep); + UseSU->addPred(Dep); + } + + LaneMask &= ~KillLaneMask; + // If we found a Def for all lanes of this use, remove it from the list. + if (LaneMask != 0) { + I->LaneMask = LaneMask; + ++I; + } else + I = CurrentVRegUses.erase(I); + } + } + + // Shortcut: Singly defined vregs do not have output/anti dependencies. if (MRI.hasOneDef(Reg)) return; - // Add output dependence to the next nearest def of this vreg. + // Add output dependence to the next nearest defs of this vreg. // // Unless this definition is dead, the output dependence should be // transitively redundant with antidependencies from this definition's // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. 
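The addVRegDefDeps hunk above retires each pending use lane by lane: a def with mask KillLaneMask consumes the matching lanes of every recorded use, erasing a use only once all of its lanes have found a def. The pruning loop in isolation, as a standalone sketch over bare lane masks:

#include <cassert>
#include <cstdint>
#include <vector>

using LaneBitmask = std::uint32_t;

void pruneUses(std::vector<LaneBitmask> &PendingUseLanes,
               LaneBitmask KillMask) {
  for (auto I = PendingUseLanes.begin(); I != PendingUseLanes.end();) {
    LaneBitmask Remaining = *I & ~KillMask;
    if (Remaining != 0) {
      *I = Remaining; // def covered only part of this use; keep the rest
      ++I;
    } else {
      I = PendingUseLanes.erase(I); // def covered every lane of the use
    }
  }
}

int main() {
  std::vector<LaneBitmask> Uses = {0b11, 0b10, 0b01};
  pruneUses(Uses, /*KillMask=*/0b01);
  assert((Uses == std::vector<LaneBitmask>{0b10, 0b10}));
}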
- VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI == VRegDefs.end()) - VRegDefs.insert(VReg2SUnit(Reg, SU)); - else { - SUnit *DefSU = DefI->SU; - if (DefSU != SU && DefSU != &ExitSU) { - SDep Dep(SU, SDep::Output, Reg); - Dep.setLatency( - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); - DefSU->addPred(Dep); - } - DefI->SU = SU; + LaneBitmask LaneMask = DefLaneMask; + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + if ((V2SU.LaneMask & LaneMask) == 0) + continue; + // Add an output dependence. + SUnit *DefSU = V2SU.SU; + // Ignore additional defs of the same lanes in one instruction. This can + // happen because lanemasks are shared for targets with too many + // subregisters. We also use some representration tricks/hacks where we + // add super-register defs/uses, to imply that although we only access parts + // of the reg we care about the full one. + if (DefSU == SU) + continue; + SDep Dep(SU, SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); + DefSU->addPred(Dep); + + // Update current definition. This can get tricky if the def was about a + // bigger lanemask before. We then have to shrink it and create a new + // VReg2SUnit for the non-overlapping part. + LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask; + LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask; + if (NonOverlapMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU)); + V2SU.SU = SU; + V2SU.LaneMask = OverlapMask; } + // If there was no CurrentVRegDefs entry for some lanes yet, create one. + if (LaneMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } /// addVRegUseDeps - Add a register data dependency if the instruction that @@ -416,59 +492,34 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { - MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineInstr *MI = SU->getInstr(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + // Remember the use. Data dependencies will be added when we find the def. + LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u; + CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU)); + + // Add antidependences to the following defs of the vreg. + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for unrelated lanes. + LaneBitmask PrevDefLaneMask = V2SU.LaneMask; + if ((PrevDefLaneMask & LaneMask) == 0) + continue; + if (V2SU.SU == SU) + continue; - // Record this local VReg use. - VReg2UseMap::iterator UI = VRegUses.find(Reg); - for (; UI != VRegUses.end(); ++UI) { - if (UI->SU == SU) - break; + V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg)); } - if (UI == VRegUses.end()) - VRegUses.insert(VReg2SUnit(Reg, SU)); - - // Lookup this operand's reaching definition. - assert(LIS && "vreg dependencies requires LiveIntervals"); - LiveQueryResult LRQ - = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI)); - VNInfo *VNI = LRQ.valueIn(); - - // VNI will be valid because MachineOperand::readsReg() is checked by caller. 
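The CurrentVRegDefs update above has a subtle case the old single-def map never faced: a new def may cover only part of an existing entry's lanes, so the entry is split into an overlapping part (retargeted to the new def) and a non-overlapping part (which keeps the old def). The mask arithmetic of that split, checked standalone:

#include <cassert>
#include <cstdint>

using LaneBitmask = std::uint32_t;

int main() {
  LaneBitmask PrevDef = 0b1110, NewDef = 0b0110;
  LaneBitmask Overlap = PrevDef & NewDef;     // lanes now owned by the new def
  LaneBitmask NonOverlap = PrevDef & ~NewDef; // lanes still owned by the old def
  assert(Overlap == 0b0110 && NonOverlap == 0b1000);
  assert((Overlap | NonOverlap) == PrevDef);  // no lanes lost in the split
}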
- assert(VNI && "No value to read by operand"); - MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); - // Phis and other noninstructions (after coalescing) have a NULL Def. - if (Def) { - SUnit *DefSU = getSUnit(Def); - if (DefSU) { - // The reaching Def lives within this scheduling region. - // Create a data dependence. - SDep dep(DefSU, SDep::Data, Reg); - // Adjust the dependence latency using operand def/use information, then - // allow the target to perform its own adjustments. - int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); - - const TargetSubtargetInfo &ST = MF.getSubtarget(); - ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep)); - SU->addPred(dep); - } - } - - // Add antidependence to the following def of the vreg it uses. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI != VRegDefs.end() && DefI->SU != SU) - DefI->SU->addPred(SDep(SU, SDep::Anti, Reg)); } /// Return true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { - if (MI->isCall() || MI->hasUnmodeledSideEffects() || - (MI->hasOrderedMemoryRef() && - (!MI->mayLoad() || !MI->isInvariantLoad(AA)))) - return true; - return false; + return MI->isCall() || MI->hasUnmodeledSideEffects() || + (MI->hasOrderedMemoryRef() && + (!MI->mayLoad() || !MI->isInvariantLoad(AA))); } // This MI might have either incomplete info, or known to be unsafe @@ -508,7 +559,7 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI, return false; } -/// This returns true if the two MIs need a chain edge betwee them. +/// This returns true if the two MIs need a chain edge between them. /// If these are not even memory operations, we still may need /// chain deps between them. The question really is - could /// these two MIs be reordered during scheduling from memory dependency @@ -670,7 +721,7 @@ static inline void addChainDependency(AliasAnalysis *AA, unsigned TrueMemOrderLatency = 0, bool isNormalMemory = false) { // If this is a false dependency, - // do not add the edge, but rememeber the rejected node. + // do not add the edge, but remember the rejected node. if (MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) { SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier); Dep.setLatency(TrueMemOrderLatency); @@ -685,7 +736,7 @@ static inline void addChainDependency(AliasAnalysis *AA, } } -/// Create an SUnit for each real instruction, numbered in top-down toplological +/// Create an SUnit for each real instruction, numbered in top-down topological /// order. The instruction order A < B, implies that no edge exists from B to A. /// /// Map each real instruction to its SUnit. @@ -743,17 +794,44 @@ void ScheduleDAGInstrs::initSUnits() { } } +void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) { + const MachineInstr *MI = SU->getInstr(); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.readsReg()) + continue; + if (TrackLaneMasks && !MO.isUse()) + continue; + + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // Record this local VReg use. 
+ VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); + for (; UI != VRegUses.end(); ++UI) { + if (UI->SU == SU) + break; + } + if (UI == VRegUses.end()) + VRegUses.insert(VReg2SUnit(Reg, 0, SU)); + } +} + /// If RegPressure is non-null, compute register pressure as a side effect. The /// DAG builder is an efficient place to do it because it already visits /// operands. void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, - PressureDiffs *PDiffs) { + PressureDiffs *PDiffs, + bool TrackLaneMasks) { const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); AliasAnalysis *AAForDep = UseAA ? AA : nullptr; + this->TrackLaneMasks = TrackLaneMasks; MISUnitMap.clear(); ScheduleDAG::clearDAG(); @@ -766,7 +844,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // We build scheduling units by walking a block's instruction list from bottom // to top. - // Remember where a generic side-effecting instruction is as we procede. + // Remember where a generic side-effecting instruction is as we proceed. SUnit *BarrierChain = nullptr, *AliasChain = nullptr; // Memory references to specific known memory locations are tracked @@ -787,10 +865,14 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.setUniverse(TRI->getNumRegs()); Uses.setUniverse(TRI->getNumRegs()); - assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); + assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); + assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + CurrentVRegDefs.setUniverse(NumVirtRegs); + CurrentVRegUses.setUniverse(NumVirtRegs); + VRegUses.clear(); - VRegDefs.setUniverse(MRI.getNumVirtRegs()); - VRegUses.setUniverse(MRI.getNumVirtRegs()); + VRegUses.setUniverse(NumVirtRegs); // Model data dependencies between instructions being scheduled and the // ExitSU. @@ -814,10 +896,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, assert(SU && "No SUnit mapped to this MI"); if (RPTracker) { - PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : nullptr; - RPTracker->recede(/*LiveUses=*/nullptr, PDiff); - assert(RPTracker->getPos() == std::prev(MII) && - "RPTracker can't find MI"); + collectVRegUses(SU); + + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI); + if (PDiffs != nullptr) + PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI); + + RPTracker->recedeSkipDebugValues(); + assert(&*RPTracker->getPos() == MI && "RPTracker in sync"); + RPTracker->recede(RegOpers); } assert( @@ -835,7 +923,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, if (TRI->isPhysicalRegister(Reg)) addPhysRegDeps(SU, j); else { - assert(!IsPostRA && "Virtual register encountered!"); if (MO.isDef()) { HasVRegDef = true; addVRegDefDeps(SU, j); @@ -890,7 +977,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, BarrierChain = SU; // This is a barrier event that acts as a pivotal node in the DAG, // so it is safe to clear list of exposed nodes. 
- adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); RejectMemNodes.clear(); NonAliasMemDefs.clear(); @@ -903,27 +990,30 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, unsigned ChainLatency = 0; if (AliasChain->getInstr()->mayLoad()) ChainLatency = TrueMemOrderLatency; - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes, ChainLatency); } AliasChain = SU; for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, PendingLoads[k], RejectMemNodes, TrueMemOrderLatency); for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes); } for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, TrueMemOrderLatency); } - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() possibly + // adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); PendingLoads.clear(); AliasMemDefs.clear(); @@ -937,7 +1027,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, BarrierChain->addPred(SDep(SU, SDep::Barrier)); UnderlyingObjectsVector Objs; - getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout()); + getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout()); if (Objs.empty()) { // Treat all other stores conservatively. @@ -961,7 +1051,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, 0, true); // If we're not using AA, then we only need one store per object. @@ -986,7 +1076,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end()); if (J != JE) { for (unsigned i = 0, e = J->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, J->second[i], RejectMemNodes, TrueMemOrderLatency, true); J->second.clear(); @@ -996,15 +1086,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // Add dependencies from all the PendingLoads, i.e. loads // with no underlying object. 
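The mechanical change repeated throughout this hunk and the next: the DataLayout is taken from the MachineFunction instead of being dereferenced off the TargetMachine, so the per-function layout is consulted directly. A one-function sketch of the replacement (LLVM API of this vintage; MF is assumed to be in scope in the real call sites):

#include "llvm/CodeGen/MachineFunction.h"

const llvm::DataLayout &layoutFor(const llvm::MachineFunction &MF) {
  return MF.getDataLayout(); // was: *TM.getDataLayout()
}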
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, PendingLoads[k], RejectMemNodes, TrueMemOrderLatency); // Add dependence on alias chain, if needed. if (AliasChain) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes); } - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() possibly + // adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); } else if (MI->mayLoad()) { bool MayAlias = true; @@ -1012,7 +1105,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // Invariant load, no chain dependencies needed! } else { UnderlyingObjectsVector Objs; - getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout()); + getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout()); if (Objs.empty()) { // A load with no underlying object. Depend on all @@ -1020,7 +1113,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes); PendingLoads.push_back(SU); @@ -1044,20 +1137,23 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, 0, true); if (ThisMayAlias) AliasMemUses[V].push_back(SU); else NonAliasMemUses[V].push_back(SU); } - if (MayAlias) - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, - RejectMemNodes, /*Latency=*/0); // Add dependencies on alias and barrier chains, if needed. if (MayAlias && AliasChain) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes); + if (MayAlias) + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() + // possibly adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, + RejectMemNodes, /*Latency=*/0); if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Barrier)); } @@ -1068,7 +1164,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.clear(); Uses.clear(); - VRegDefs.clear(); + CurrentVRegDefs.clear(); + CurrentVRegUses.clear(); PendingLoads.clear(); } @@ -1080,11 +1177,9 @@ void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { // Examine the live-in regs of all successors. for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) { - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; + for (const auto &LI : (*SI)->liveins()) { // Repeat, for reg and all subregs. 
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) LiveRegs.set(*SubRegs); } @@ -1103,7 +1198,7 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, // Once we set a kill flag on an instruction, we bail out, as otherwise we // might set it on too many operands. We will clear as many flags as we // can though. - MachineBasicBlock::instr_iterator Begin = MI; + MachineBasicBlock::instr_iterator Begin = MI->getIterator(); MachineBasicBlock::instr_iterator End = getBundleEnd(MI); while (Begin != End) { for (MachineOperand &MO : (--End)->operands()) { @@ -1237,7 +1332,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { toggleKillFlag(MI, MO); DEBUG(MI->dump()); DEBUG(if (MI->getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI; + MachineBasicBlock::instr_iterator Begin = MI->getIterator(); MachineBasicBlock::instr_iterator End = getBundleEnd(MI); while (++Begin != End) DEBUG(Begin->dump()); diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp index b2e4617..1150d26 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -43,9 +43,12 @@ namespace llvm { return (Node->NumPreds > 10 || Node->NumSuccs > 10); } - static bool hasNodeAddressLabel(const SUnit *Node, - const ScheduleDAG *Graph) { - return true; + static std::string getNodeIdentifierLabel(const SUnit *Node, + const ScheduleDAG *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast<const void *>(Node); + return R; } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3b29306..c741982 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -156,13 +156,16 @@ namespace { void deleteAndRecombine(SDNode *N); bool recursivelyDeleteUnusedNodes(SDNode *N); + /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); + /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { return CombineTo(N, &Res, 1, AddTo); } + /// Replaces all uses of the results of one DAG node with new values. 
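Another recurring migration visible above: MachineInstr no longer converts implicitly to an instruction-list iterator, so the iterator must be requested explicitly. A minimal sketch of the new spelling (LLVM API of this vintage; MI is assumed valid and linked into a block):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

llvm::MachineBasicBlock::instr_iterator instrIterFor(llvm::MachineInstr *MI) {
  return MI->getIterator(); // was: implicit MachineInstr* -> iterator conversion
}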
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true) { SDValue To[] = { Res0, Res1 }; @@ -233,18 +236,17 @@ namespace { SDValue visitADDE(SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitMUL(SDNode *N); + SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitUDIV(SDNode *N); - SDValue visitSREM(SDNode *N); - SDValue visitUREM(SDNode *N); + SDValue visitREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitSMULO(SDNode *N); SDValue visitUMULO(SDNode *N); - SDValue visitSDIVREM(SDNode *N); - SDValue visitUDIVREM(SDNode *N); + SDValue visitIMINMAX(SDNode *N); SDValue visitAND(SDNode *N); SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitOR(SDNode *N); @@ -265,6 +267,7 @@ namespace { SDValue visitVSELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); + SDValue visitSETCCE(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); @@ -298,6 +301,10 @@ namespace { SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); + + SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); + SDValue replaceStoreOfFPConstant(StoreSDNode *ST); + SDValue visitSTORE(SDNode *N); SDValue visitINSERT_VECTOR_ELT(SDNode *N); SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); @@ -312,9 +319,11 @@ namespace { SDValue visitMGATHER(SDNode *N); SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); + SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); + SDValue visitFMULForFMACombine(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); @@ -338,14 +347,17 @@ namespace { unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); + SDValue combineRepeatedFPDivisors(SDNode *N); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); - SDValue BuildReciprocalEstimate(SDValue Op); - SDValue BuildRsqrtEstimate(SDValue Op); - SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations); - SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations); + SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags); + SDValue BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags); + SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations, + SDNodeFlags *Flags); + SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations, + SDNodeFlags *Flags); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -374,6 +386,10 @@ namespace { /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); + /// Do FindBetterChain for a store and any possibly adjacent stores on + /// consecutive chains. + bool findBetterNeighborChains(StoreSDNode *St); + /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. 
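The visitor-table hunk above folds SREM/UREM into a single visitREM and routes the four integer min/max opcodes to one visitIMINMAX; inside the shared visitor the opcode still distinguishes signedness where it matters. The fall-through grouping in miniature, with a stand-in enum:

#include <cassert>

enum Opcode { SREM, UREM, SMIN, SMAX, UMIN, UMAX };

int dispatch(Opcode Opc) {
  switch (Opc) {
  case SREM:
  case UREM:
    return 1; // visitREM(N): shared logic, branching on Opc internally
  case SMIN:
  case SMAX:
  case UMIN:
  case UMAX:
    return 2; // visitIMINMAX(N)
  }
  return 0;
}

int main() {
  assert(dispatch(SREM) == dispatch(UREM));
  assert(dispatch(SMIN) == dispatch(UMAX));
}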
struct MemOpLink { @@ -388,19 +404,37 @@ namespace { unsigned SequenceNum; }; + /// This is a helper function for visitMUL to check the profitability + /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). + /// MulNode is the original multiply, AddNode is (add x, c1), + /// and ConstNode is c2. + bool isMulAddWithConstProfitable(SDNode *MulNode, + SDValue &AddNode, + SDValue &ConstNode); + /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a /// constant build_vector of the stored constant values in Stores. SDValue getMergedConstantVectorStore(SelectionDAG &DAG, SDLoc SL, ArrayRef<MemOpLink> Stores, + SmallVectorImpl<SDValue> &Chains, EVT Ty) const; + /// This is a helper function for visitAND and visitZERO_EXTEND. Returns + /// true if the (and (load x) c) pattern matches an extload. ExtVT returns + /// the type of the loaded value to be extended. LoadedVT returns the type + /// of the original loaded value. NarrowLoad returns whether the load would + /// need to be narrowed in order to match. + bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, + EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, + bool &NarrowLoad); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. /// \return True if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, - EVT MemVT, unsigned NumElem, + EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector); /// This is a helper function for MergeConsecutiveStores. @@ -409,7 +443,7 @@ namespace { void getStoreMergeAndAliasCandidates( StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); - + /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return True if some memory operations were changed. @@ -427,9 +461,7 @@ namespace { DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { - auto *F = DAG.getMachineFunction().getFunction(); - ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) || - F->hasFnAttribute(Attribute::MinSize); + ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); } /// Runs the dag combiner on all nodes in the work list @@ -606,6 +638,9 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Op.hasOneUse() && "Unknown reuse!"); assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); + + const SDNodeFlags *Flags = Op.getNode()->getFlags(); + switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); case ISD::ConstantFP: { @@ -623,12 +658,12 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), - Op.getOperand(1)); + Op.getOperand(1), Flags); // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), - Op.getOperand(0)); + Op.getOperand(0), Flags); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. 
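The DAGCombiner constructor hunk above replaces the two explicit attribute checks with Function::optForSize(), which folds OptimizeForSize and MinSize into a single query. A stand-in sketch of the predicate being consolidated (the real method lives on llvm::Function; this struct only models its logic):

#include <cassert>

struct FuncAttrs {
  bool OptimizeForSize = false;
  bool MinSize = false;
  // Either attribute implies optimizing for size.
  bool optForSize() const { return OptimizeForSize || MinSize; }
};

int main() {
  FuncAttrs F;
  F.MinSize = true;
  assert(F.optForSize());
}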
assert(Options.UnsafeFPMath); @@ -640,7 +675,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, // fold (fneg (fsub A, B)) -> (fsub B, A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0)); + Op.getOperand(1), Op.getOperand(0), Flags); case ISD::FMUL: case ISD::FDIV: @@ -652,13 +687,13 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), - Op.getOperand(1)); + Op.getOperand(1), Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, Depth+1)); + LegalOperations, Depth+1), Flags); case ISD::FP_EXTEND: case ISD::FSIN: @@ -1216,9 +1251,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) { LegalTypes = Level >= AfterLegalizeTypes; // Add all the dag nodes to the worklist. - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) - AddToWorklist(I); + for (SDNode &Node : DAG.allnodes()) + AddToWorklist(&Node); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any @@ -1333,16 +1367,18 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); - case ISD::SREM: return visitSREM(N); - case ISD::UREM: return visitUREM(N); + case ISD::SREM: + case ISD::UREM: return visitREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: return visitSMULO(N); case ISD::UMULO: return visitUMULO(N); - case ISD::SDIVREM: return visitSDIVREM(N); - case ISD::UDIVREM: return visitUDIVREM(N); + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: return visitIMINMAX(N); case ISD::AND: return visitAND(N); case ISD::OR: return visitOR(N); case ISD::XOR: return visitXOR(N); @@ -1361,6 +1397,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); + case ISD::SETCCE: return visitSETCCE(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); @@ -1408,6 +1445,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); + case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); } return SDValue(); } @@ -1470,13 +1508,8 @@ SDValue DAGCombiner::combine(SDNode *N) { // Constant operands are canonicalized to RHS. 
if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) { SDValue Ops[] = {N1, N0}; - SDNode *CSENode; - if (const auto *BinNode = dyn_cast<BinaryWithFlagsSDNode>(N)) { - CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, - &BinNode->Flags); - } else { - CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops); - } + SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, + N->getFlags()); if (CSENode) return SDValue(CSENode, 0); } @@ -1595,26 +1628,6 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } -static bool isNullConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isNullValue(); -} - -static bool isNullFPConstant(SDValue V) { - ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V); - return Const != nullptr && Const->isZero() && !Const->isNegative(); -} - -static bool isAllOnesConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isAllOnesValue(); -} - -static bool isOneConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isOne(); -} - /// If \p N is a ConstantSDNode with isOpaque() == false, return it cast to a /// ConstantSDNode pointer; else return nullptr. static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { @@ -1721,22 +1734,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return SDValue(N, 0); // fold (a+b) -> (a|b) iff a and b share no bits. - if (VT.isInteger() && !VT.isVector()) { - APInt LHSZero, LHSOne; - APInt RHSZero, RHSOne; - DAG.computeKnownBits(N0, LHSZero, LHSOne); - - if (LHSZero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSZero, RHSOne); - - // If all possibly-set bits on the LHS are clear on the RHS, return an OR. - // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero){ - if (!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1); - } - } - } + if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && + VT.isInteger() && !VT.isVector() && DAG.haveNoCommonBitsSet(N0, N1)) + return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1); // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && @@ -1971,31 +1971,26 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) - return CombineTo(N, DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, x) -> 0 + no borrow - if (N0 == N1) { - SDLoc DL(N); + if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), - DAG.getNode(ISD::CARRY_FALSE, DL, - MVT::Glue)); - } + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, 0) -> x + no borrow if (isNullConstant(N1)) - return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // Canonicalize (sub -1, x) -> ~x, i.e.
(xor x, -1) + no borrow if (isAllOnesConstant(N0)) - return CombineTo(N, DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0), - DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } @@ -2130,14 +2125,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) - if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && - (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || - isa<ConstantSDNode>(N0.getOperand(1)))) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, - DAG.getNode(ISD::MUL, SDLoc(N0), VT, - N0.getOperand(0), N1), - DAG.getNode(ISD::MUL, SDLoc(N1), VT, - N0.getOperand(1), N1)); + if (isConstantIntBuildVectorOrConstantInt(N1) && + N0.getOpcode() == ISD::ADD && + isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) && + isMulAddWithConstProfitable(N, N0, N1)) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, + DAG.getNode(ISD::MUL, SDLoc(N0), VT, + N0.getOperand(0), N1), + DAG.getNode(ISD::MUL, SDLoc(N1), VT, + N0.getOperand(1), N1)); // reassociate mul if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1)) @@ -2146,6 +2142,88 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { return SDValue(); } +/// Return true if divmod libcall is available. +static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, + const TargetLowering &TLI) { + RTLIB::Libcall LC; + switch (Node->getSimpleValueType(0).SimpleTy) { + default: return false; // No libcall for vector types. + case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; + } + + return TLI.getLibcallName(LC) != nullptr; +} + +/// Issue divrem if both quotient and remainder are needed. +SDValue DAGCombiner::useDivRem(SDNode *Node) { + if (Node->use_empty()) + return SDValue(); // This is a dead node, leave it alone. + + EVT VT = Node->getValueType(0); + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + unsigned Opcode = Node->getOpcode(); + bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM); + + unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; + // If DIVREM is going to get expanded into a libcall, + // but there is no libcall available, then don't combine. + if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) && + !isDivRemLibcallAvailable(Node, isSigned, TLI)) + return SDValue(); + + // If div is legal, it's better to do the normal expansion + unsigned OtherOpcode = 0; + if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) { + OtherOpcode = isSigned ? ISD::SREM : ISD::UREM; + if (TLI.isOperationLegalOrCustom(Opcode, VT)) + return SDValue(); + } else { + OtherOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; + if (TLI.isOperationLegalOrCustom(OtherOpcode, VT)) + return SDValue(); + } + + SDValue Op0 = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDValue combined; + for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), + UE = Op0.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User == Node || User->use_empty()) + continue; + // Convert the other matching node(s), too; + // otherwise, the DIVREM may get target-legalized into something + // target-specific that we won't be able to recognize. + unsigned UserOpc = User->getOpcode(); + if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) && + User->getOperand(0) == Op0 && + User->getOperand(1) == Op1) { + if (!combined) { + if (UserOpc == OtherOpcode) { + SDVTList VTs = DAG.getVTList(VT, VT); + combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1); + } else if (UserOpc == DivRemOpc) { + combined = SDValue(User, 0); + } else { + assert(UserOpc == Opcode); + continue; + } + } + if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV) + CombineTo(User, combined); + else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM) + CombineTo(User, combined.getValue(1)); + } + } + return combined; +} + SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2156,26 +2234,26 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; + SDLoc DL(N); + // fold (sdiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque()) - return DAG.FoldConstantArithmetic(ISD::SDIV, SDLoc(N), VT, N0C, N1C); + return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C); // fold (sdiv X, 1) -> X if (N1C && N1C->isOne()) return N0; // fold (sdiv X, -1) -> 0-X - if (N1C && N1C->isAllOnesValue()) { - SDLoc DL(N); + if (N1C && N1C->isAllOnesValue()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); - } + // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 if (!VT.isVector()) { if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UDIV, SDLoc(N), N1.getValueType(), - N0, N1); + return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1); } // fold (sdiv X, pow2) -> simple ops after legalize @@ -2186,18 +2264,11 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() && (N1C->getAPIntValue().isPowerOf2() || (-N1C->getAPIntValue()).isPowerOf2())) { - // If dividing by powers of two is cheap, then don't perform the following - // fold. - if (TLI.isPow2SDivCheap()) - return SDValue(); - // Target-specific implementation of sdiv x, pow2. - SDValue Res = BuildSDIVPow2(N); - if (Res.getNode()) + if (SDValue Res = BuildSDIVPow2(N)) return Res; unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); - SDLoc DL(N); // Splat the sign bit into the register SDValue SGN = @@ -2228,15 +2299,23 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { } // If integer divide is expensive and we satisfy the requirements, emit an - // alternate sequence. - if (N1C && !TLI.isIntDivCheap()) { - SDValue Op = BuildSDIV(N); - if (Op.getNode()) return Op; - } + // alternate sequence. Targets may check function attributes for size/speed + // trade-offs. 
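The visitSDIV path above (sdiv X, pow2) splats the sign bit into a rounding bias before shifting, so the arithmetic shift truncates toward zero the way sdiv requires. A scalar rendition in plain C++, with names of our own; it covers only the positive 2^k case, and assumes an arithmetic right shift on signed ints and k in [1, 31]:

    // Sketch of the shift sequence emitted for sdiv X, 2^k.
    #include <cassert>
    #include <cstdint>

    int32_t sdiv_pow2(int32_t X, unsigned k) {
      int32_t Sign = X >> 31;                     // splat the sign bit
      uint32_t Bias = uint32_t(Sign) >> (32 - k); // 2^k - 1 if X < 0, else 0
      return int32_t(uint32_t(X) + Bias) >> k;    // shift now rounds toward zero
    }

    int main() {
      for (int32_t X : {-37, -8, -1, 0, 5, 100})
        assert(sdiv_pow2(X, 3) == X / 8);
      return 0;
    }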
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue Op = BuildSDIV(N)) + return Op; + + // sdiv, srem -> sdivrem + // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true. + // Otherwise, we break the simplification logic in visitREM(). + if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue DivRem = useDivRem(N)) + return DivRem; // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2254,26 +2333,26 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; + SDLoc DL(N); + // fold (udiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, SDLoc(N), VT, + if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, N0C, N1C)) return Folded; // fold (udiv x, (1 << c)) -> x >>u c - if (N1C && !N1C->isOpaque() && N1C->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); + if (N1C && !N1C->isOpaque() && N1C->getAPIntValue().isPowerOf2()) return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(N1C->getAPIntValue().logBase2(), DL, getShiftAmountTy(N0.getValueType()))); - } + // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { if (SHC->getAPIntValue().isPowerOf2()) { EVT ADDVT = N1.getOperand(1).getValueType(); - SDLoc DL(N); SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), DAG.getConstant(SHC->getAPIntValue() @@ -2284,15 +2363,23 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } } } + // fold (udiv x, c) -> alternate - if (N1C && !TLI.isIntDivCheap()) { - SDValue Op = BuildUDIV(N); - if (Op.getNode()) return Op; - } + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue Op = BuildUDIV(N)) + return Op; + + // udiv, urem -> udivrem + // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true. + // Otherwise, we break the simplification logic in visitREM().
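The payoff of the new useDivRem combine: when both the quotient and the remainder of the same operands are live, a single DIVREM node (or a single divmod-style libcall) replaces two separate divisions. A source-level illustration of the shape being targeted; the function name is ours, and std::div merely stands in for the fused operation:

    // Quotient and remainder of the same operands: one divrem, not two divs.
    #include <cstdio>
    #include <cstdlib>

    void print_min_sec(int total) {
      std::div_t qr = std::div(total, 60); // total/60 and total%60, computed together
      std::printf("%d min %d sec\n", qr.quot, qr.rem);
    }

    int main() {
      print_min_sec(125); // prints "2 min 5 sec"
      return 0;
    }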
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue DivRem = useDivRem(N)) + return DivRem; // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2300,102 +2387,83 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSREM(SDNode *N) { +// handles ISD::SREM and ISD::UREM +SDValue DAGCombiner::visitREM(SDNode *N) { + unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + bool isSigned = (Opcode == ISD::SREM); + SDLoc DL(N); - // fold (srem c1, c2) -> c1%c2 + // fold (rem c1, c2) -> c1%c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::SREM, SDLoc(N), VT, - N0C, N1C)) + if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; - // If we know the sign bits of both operands are zero, strength reduce to a - // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 - if (!VT.isVector()) { - if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UREM, SDLoc(N), VT, N0, N1); - } - // If X/C can be simplified by the division-by-constant logic, lower - // X%C to the equivalent of X-X/C*C. - if (N1C && !N1C->isNullValue()) { - SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1); - AddToWorklist(Div.getNode()); - SDValue OptimizedDiv = combine(Div.getNode()); - if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { - SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, - OptimizedDiv, N1); - SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); - AddToWorklist(Mul.getNode()); - return Sub; + if (isSigned) { + // If we know the sign bits of both operands are zero, strength reduce to a + // urem instead. 
Handles (X & 0x0FFFFFFF) %s 16 -> X&15 + if (!VT.isVector()) { + if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } - } - - // undef % X -> 0 - if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); - // X % undef -> undef - if (N1.getOpcode() == ISD::UNDEF) - return N1; - - return SDValue(); -} - -SDValue DAGCombiner::visitUREM(SDNode *N) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - EVT VT = N->getValueType(0); - - // fold (urem c1, c2) -> c1%c2 - ConstantSDNode *N0C = isConstOrConstSplat(N0); - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UREM, SDLoc(N), VT, - N0C, N1C)) - return Folded; - // fold (urem x, pow2) -> (and x, pow2-1) - if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && - N1C->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, N0, - DAG.getConstant(N1C->getAPIntValue() - 1, DL, VT)); - } - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) - if (N1.getOpcode() == ISD::SHL) { - if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { - if (SHC->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, + } else { + // fold (urem x, pow2) -> (and x, pow2-1) + if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && + N1C->getAPIntValue().isPowerOf2()) { + return DAG.getNode(ISD::AND, DL, VT, N0, + DAG.getConstant(N1C->getAPIntValue() - 1, DL, VT)); + } + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + if (N1.getOpcode() == ISD::SHL) { + if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { + if (SHC->getAPIntValue().isPowerOf2()) { + SDValue Add = + DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT)); - AddToWorklist(Add.getNode()); - return DAG.getNode(ISD::AND, DL, VT, N0, Add); + AddToWorklist(Add.getNode()); + return DAG.getNode(ISD::AND, DL, VT, N0, Add); + } } } } + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. - if (N1C && !N1C->isNullValue()) { - SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1); + // To avoid mangling nodes, this simplification requires that the combine() + // call for the speculative DIV must not cause a DIVREM conversion. We guard + // against this by skipping the simplification if isIntDivCheap(). When + // div is not cheap, combine will not return a DIVREM. Regardless, + // checking cheapness here makes sense since the simplification results in + // fatter code. + if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) { + unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; + SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1); AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { - SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, - OptimizedDiv, N1); - SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); + assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) && + (OptimizedDiv.getOpcode() != ISD::SDIVREM)); + SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1); + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); AddToWorklist(Mul.getNode()); return Sub; } } + // srem -> sdivrem, urem -> udivrem + if (SDValue DivRem = useDivRem(N)) + return DivRem.getValue(1); + // undef % X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X % undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2532,8 +2600,8 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, } SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS); - if (Res.getNode()) return Res; + if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS)) + return Res; EVT VT = N->getValueType(0); SDLoc DL(N); @@ -2563,8 +2631,8 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { } SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU); - if (Res.getNode()) return Res; + if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU)) + return Res; EVT VT = N->getValueType(0); SDLoc DL(N); @@ -2613,16 +2681,26 @@ SDValue DAGCombiner::visitUMULO(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSDIVREM(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM); - if (Res.getNode()) return Res; +SDValue DAGCombiner::visitIMINMAX(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N)) + return FoldedVOp; - return SDValue(); -} + // fold (minmax c1, c2) -> c1 minmax c2 + ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); + ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); + if (N0C && N1C) + return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C); -SDValue DAGCombiner::visitUDIVREM(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM); - if (Res.getNode()) return Res; + // canonicalize constant to RHS + if (isConstantIntBuildVectorOrConstantInt(N0) && + !isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); return SDValue(); } @@ -2848,10 +2926,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, - getSetCCResultType(N0.getSimpleValueType()))))) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); + TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { + EVT CCVT = getSetCCResultType(LL.getValueType()); + if (N0.getValueType() == CCVT || + (!LegalOperations && N0.getValueType() == MVT::i1)) + return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), + LL, LR, Result); + } } } @@ -2887,6 +2968,46 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, return SDValue(); } +bool
DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, + EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, + bool &NarrowLoad) { + uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); + + if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) + return false; + + ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); + LoadedVT = LoadN->getMemoryVT(); + + if (ExtVT == LoadedVT && + (!LegalOperations || + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { + // ZEXTLOAD will match without needing to change the size of the value being + // loaded. + NarrowLoad = false; + return true; + } + + // Do not change the width of a volatile load. + if (LoadN->isVolatile()) + return false; + + // Do not generate loads of non-round integer types since these can + // be expensive (and would be wrong if the type is not byte sized). + if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound()) + return false; + + if (LegalOperations && + !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT)) + return false; + + if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT)) + return false; + + NarrowLoad = true; + return true; +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -3079,16 +3200,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { : cast<LoadSDNode>(N0); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) { - uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits(); - if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){ - EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); - EVT LoadedVT = LN0->getMemoryVT(); - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; - - if (ExtVT == LoadedVT && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, - ExtVT))) { - + auto NarrowLoad = false; + EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; + EVT ExtVT, LoadedVT; + if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT, + NarrowLoad)) { + if (!NarrowLoad) { SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), LN0->getBasePtr(), ExtVT, @@ -3096,14 +3213,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { AddToWorklist(N); CombineTo(LN0, NewLoad, NewLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! - } - - // Do not change the width of a volatile load. - // Do not generate loads of non-round integer types since these can - // be expensive (and would be wrong if the type is not byte sized). - if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, - ExtVT))) { + } else { EVT PtrType = LN0->getOperand(1).getValueType(); unsigned Alignment = LN0->getAlignment(); @@ -3142,10 +3252,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return Combined; // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. 
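isAndLoadExtLoad codifies a simple observation: masking a wide load down to its low bits observes the same value as a narrower zero-extending load of the same address. A hypothetical standalone check, assuming a little-endian host so that the first byte in memory is the low byte:

    // (and (load i32 p), 255) sees the same bits as (zextload i8 p).
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t word = 0xDEADBEEF;
      uint32_t masked = word & 0xFF;  // the AND-of-load form
      uint8_t narrow;
      std::memcpy(&narrow, &word, 1); // the narrow zero-extending load
      assert(masked == uint32_t(narrow));
      return 0;
    }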
@@ -3507,10 +3616,13 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, - getSetCCResultType(N0.getValueType()))))) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); + TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { + EVT CCVT = getSetCCResultType(LL.getValueType()); + if (N0.getValueType() == CCVT || + (!LegalOperations && N0.getValueType() == MVT::i1)) + return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), + LL, LR, Result); + } } } @@ -3665,11 +3777,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return Combined; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) - SDValue BSwap = MatchBSwapHWord(N, N0, N1); - if (BSwap.getNode()) + if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; - BSwap = MatchBSwapHWordLow(N, N0, N1); - if (BSwap.getNode()) + if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1)) return BSwap; // reassociate or @@ -3690,10 +3800,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) @@ -3710,7 +3819,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { /// Match "(X shl/srl V1) & V2" where V2 may not be present. static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { - if (isa<ConstantSDNode>(Op.getOperand(1))) { + if (isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) { Mask = Op.getOperand(1); Op = Op.getOperand(0); } else { @@ -3727,105 +3836,106 @@ static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { } // Return true if we can prove that, whenever Neg and Pos are both in the -// range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that +// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: // // (or (shift1 X, Neg), (shift2 X, Pos)) // // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate -// in direction shift1 by Neg. The range [0, OpSize) means that we only need +// in direction shift1 by Neg. The range [0, EltSize) means that we only need // to consider shift amounts with defined behavior. -static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) { - // If OpSize is a power of 2 then: +static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) { + // If EltSize is a power of 2 then: // - // (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1) - // (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize). + // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) + // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). // - // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check + // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check // for the stronger condition: // - // Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1) [A] + // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] // - // for all Neg and Pos. 
Since Neg & (OpSize - 1) == Neg' & (OpSize - 1) + // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) // we can just replace Neg with Neg' for the rest of the function. // // In other cases we check for the even stronger condition: // - // Neg == OpSize - Pos [B] + // Neg == EltSize - Pos [B] // // for all Neg and Pos. Note that the (or ...) then invokes undefined - // behavior if Pos == 0 (and consequently Neg == OpSize). + // behavior if Pos == 0 (and consequently Neg == EltSize). // - // We could actually use [A] whenever OpSize is a power of 2, but the + // We could actually use [A] whenever EltSize is a power of 2, but the // only extra cases that it would match are those uninteresting ones // where Neg and Pos are never in range at the same time. E.g. for - // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) + // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) // as well as (sub 32, Pos), but: // // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) // // always invokes undefined behavior for 32-bit X. // - // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise. + // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. unsigned MaskLoBits = 0; - if (Neg.getOpcode() == ISD::AND && - isPowerOf2_64(OpSize) && - Neg.getOperand(1).getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) { - Neg = Neg.getOperand(0); - MaskLoBits = Log2_64(OpSize); + if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { + if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { + if (NegC->getAPIntValue() == EltSize - 1) { + Neg = Neg.getOperand(0); + MaskLoBits = Log2_64(EltSize); + } + } } // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) - return 0; - ConstantSDNode *NegC = dyn_cast<ConstantSDNode>(Neg.getOperand(0)); + return false; + ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) - return 0; + return false; SDValue NegOp1 = Neg.getOperand(1); - // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with + // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with // Pos'. The truncation is redundant for the purpose of the equality. - if (MaskLoBits && - Pos.getOpcode() == ISD::AND && - Pos.getOperand(1).getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() == OpSize - 1) - Pos = Pos.getOperand(0); + if (MaskLoBits && Pos.getOpcode() == ISD::AND) + if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) + if (PosC->getAPIntValue() == EltSize - 1) + Pos = Pos.getOperand(0); // The condition we need is now: // - // (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask + // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask // // If NegOp1 == Pos then we need: // - // OpSize & Mask == NegC & Mask + // EltSize & Mask == NegC & Mask // // (because "x & Mask" is a truncation and distributes through subtraction). APInt Width; if (Pos == NegOp1) Width = NegC->getAPIntValue(); + // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. 
// Then the condition we want to prove becomes: // - // (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask + // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask // // which, again because "x & Mask" is a truncation, becomes: // - // NegC & Mask == (OpSize - PosC) & Mask - // OpSize & Mask == (NegC + PosC) & Mask - else if (Pos.getOpcode() == ISD::ADD && - Pos.getOperand(0) == NegOp1 && - Pos.getOperand(1).getOpcode() == ISD::Constant) - Width = (cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() + - NegC->getAPIntValue()); - else + // NegC & Mask == (EltSize - PosC) & Mask + // EltSize & Mask == (NegC + PosC) & Mask + else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { + if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) + Width = PosC->getAPIntValue() + NegC->getAPIntValue(); + else + return false; + } else return false; - // Now we just need to check that OpSize & Mask == Width & Mask. + // Now we just need to check that EltSize & Mask == Width & Mask. if (MaskLoBits) - // Opsize & Mask is 0 since Mask is Opsize - 1. + // EltSize & Mask is 0 since Mask is EltSize - 1. return Width.getLoBits(MaskLoBits) == 0; - return Width == OpSize; + return Width == EltSize; } // A subroutine of MatchRotate used once we have found an OR of two opposite @@ -3845,7 +3955,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); - if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) { + if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg).getNode(); @@ -3888,10 +3998,10 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); std::swap(LHSShift, RHSShift); - std::swap(LHSMask , RHSMask ); + std::swap(LHSMask, RHSMask); } - unsigned OpSizeInBits = VT.getSizeInBits(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue LHSShiftArg = LHSShift.getOperand(0); SDValue LHSShiftAmt = LHSShift.getOperand(1); SDValue RHSShiftArg = RHSShift.getOperand(0); @@ -3899,11 +4009,10 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (LHSShiftAmt.getOpcode() == ISD::Constant && - RHSShiftAmt.getOpcode() == ISD::Constant) { - uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != OpSizeInBits) + if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { + uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); + uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); + if ((LShVal + RShVal) != EltSizeInBits) return nullptr; SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, @@ -3911,18 +4020,23 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // If there is an AND of either shifted operand, apply it to the result. 
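What matchRotateSub ultimately licenses: an OR of opposing shifts whose amounts sum to the element width is a rotate, and for power-of-two widths the negated amount may be masked with EltSize - 1, which also keeps the Pos == 0 case well defined. A scalar sketch (rotl is our helper, not an LLVM API):

    // (or (shl X, Pos), (srl X, (and (sub 32, Pos), 31))) == rotl(X, Pos)
    #include <cassert>
    #include <cstdint>

    uint32_t rotl(uint32_t X, unsigned Pos) {
      return (X << (Pos & 31)) | (X >> ((32 - Pos) & 31));
    }

    int main() {
      uint32_t X = 0x12345678;
      assert(rotl(X, 8) == 0x34567812);
      assert(rotl(X, 0) == X); // the & 31 mask avoids an undefined shift by 32
      return 0;
    }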
if (LHSMask.getNode() || RHSMask.getNode()) { - APInt Mask = APInt::getAllOnesValue(OpSizeInBits); + APInt AllBits = APInt::getAllOnesValue(EltSizeInBits); + SDValue Mask = DAG.getConstant(AllBits, DL, VT); if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal); - Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits; + APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + Mask = DAG.getNode(ISD::AND, DL, VT, Mask, + DAG.getNode(ISD::OR, DL, VT, LHSMask, + DAG.getConstant(RHSBits, DL, VT))); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal); - Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits; + APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + Mask = DAG.getNode(ISD::AND, DL, VT, Mask, + DAG.getNode(ISD::OR, DL, VT, RHSMask, + DAG.getConstant(LHSBits, DL, VT))); } - Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, DL, VT)); + Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); } return Rot.getNode(); @@ -4112,10 +4226,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // Simplify the expression using non-local knowledge. if (!VT.isVector() && @@ -4434,12 +4547,19 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1); } - if (N1C && !N1C->isOpaque()) { - SDValue NewSHL = visitShiftByConstant(N, N1C); - if (NewSHL.getNode()) - return NewSHL; + // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) + if (N1C && N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse()) { + if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { + if (SDValue Folded = + DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, N0C1, N1C)) + return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Folded); + } } + if (N1C && !N1C->isOpaque()) + if (SDValue NewSHL = visitShiftByConstant(N, N1C)) + return NewSHL; + return SDValue(); } @@ -4583,11 +4703,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); - if (N1C && !N1C->isOpaque()) { - SDValue NewSRA = visitShiftByConstant(N, N1C); - if (NewSRA.getNode()) + if (N1C && !N1C->isOpaque()) + if (SDValue NewSRA = visitShiftByConstant(N, N1C)) return NewSRA; - } return SDValue(); } @@ -4744,8 +4862,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { - SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()); - if (NewOp1.getNode()) + if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); } @@ -4754,15 +4871,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - if (N1C && !N1C->isOpaque()) { - SDValue NewSRL = visitShiftByConstant(N, N1C); - if (NewSRL.getNode()) + if (N1C && !N1C->isOpaque()) + if (SDValue NewSRL = visitShiftByConstant(N, N1C)) return NewSRL; - } // Attempt to convert a srl of a load into a narrower zero-extending load. 
- SDValue NarrowLoad = ReduceLoadWidth(N); - if (NarrowLoad.getNode()) + if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // Here is a common situation. We want to optimize: @@ -4973,70 +5087,47 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. - // fold selects based on a setcc into other things, such as min/max/abs - if (N0.getOpcode() == ISD::SETCC) { - // select x, y (fcmp lt x, y) -> fminnum x, y - // select x, y (fcmp gt x, y) -> fmaxnum x, y - // - // This is OK if we don't care about what happens if either operand is a - // NaN. - // - - // FIXME: Instead of testing for UnsafeFPMath, this should be checking for - // no signed zeros as well as no nans. - const TargetOptions &Options = DAG.getTarget().Options; - if (Options.UnsafeFPMath && - VT.isFloatingPoint() && N0.hasOneUse() && - DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - - SDValue FMinMax = - combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), - N1, N2, CC, TLI, DAG); - if (FMinMax) - return FMinMax; - } - - if ((!LegalOperations && - TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || - TLI.isOperationLegal(ISD::SELECT_CC, VT)) - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), - N1, N2, N0.getOperand(2)); - return SimplifySelect(SDLoc(N), N0, N1, N2); - } - if (VT0 == MVT::i1) { - if (TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { - // select (and Cond0, Cond1), X, Y - // -> select Cond0, (select Cond1, X, Y), Y - if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + // The code in this block deals with the following 2 equivalences: + // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) + // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) + // The target can specify its preferred form with the + // shouldNormalizeToSelectSequence() callback. However, we always transform + // to the right form if the inner select already exists in the DAG, and we + // always transform to the left form if we know that the combined condition + // can be optimized further.
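Both equivalences noted above hold for all boolean conditions, which is what lets the combiner pick whichever shape shouldNormalizeToSelectSequence() or an already-existing inner select favors. A small exhaustive check:

    // select(C0&C1, x, y) and select(C0|C1, x, y) against their nested forms.
    #include <cassert>

    int main() {
      int x = 1, y = 2;
      for (bool C0 : {false, true})
        for (bool C1 : {false, true}) {
          assert(((C0 && C1) ? x : y) == (C0 ? (C1 ? x : y) : y));
          assert(((C0 || C1) ? x : y) == (C0 ? x : (C1 ? x : y)));
        }
      return 0;
    }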
+ bool normalizeToSequence + = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + // select (and Cond0, Cond1), X, Y + // -> select Cond0, (select Cond1, X, Y), Y + if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), + N1.getValueType(), Cond1, N1, N2); + if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, InnerSelect, N2); - } - // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) - if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + } + // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) + if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), + N1.getValueType(), Cond1, N1, N2); + if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1, InnerSelect); - } } // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y - if (N1->getOpcode() == ISD::SELECT) { + if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) { SDValue N1_0 = N1->getOperand(0); SDValue N1_1 = N1->getOperand(1); SDValue N1_2 = N1->getOperand(2); if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { // Create the actual and node if we can generate good code for it. - if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { + if (!normalizeToSequence) { SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(), N0, N1_0); return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And, @@ -5049,13 +5140,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y - if (N2->getOpcode() == ISD::SELECT) { + if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) { SDValue N2_0 = N2->getOperand(0); SDValue N2_1 = N2->getOperand(1); SDValue N2_2 = N2->getOperand(2); if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { // Create the actual or node if we can generate good code for it. - if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { + if (!normalizeToSequence) { SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(), N0, N2_0); return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or, @@ -5069,6 +5160,38 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } } + // fold selects based on a setcc into other things, such as min/max/abs + if (N0.getOpcode() == ISD::SETCC) { + // select x, y (fcmp lt x, y) -> fminnum x, y + // select x, y (fcmp gt x, y) -> fmaxnum x, y + // + // This is OK if we don't care about what happens if either operand is a + // NaN. + // + + // FIXME: Instead of testing for UnsafeFPMath, this should be checking for + // no signed zeros as well as no nans. 
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.UnsafeFPMath && + VT.isFloatingPoint() && N0.hasOneUse() && + DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + + if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), + N0.getOperand(1), N1, N2, CC, + TLI, DAG)) + return FMinMax; + } + + if ((!LegalOperations && + TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || + TLI.isOperationLegal(ISD::SELECT_CC, VT)) + return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + N1, N2, N0.getOperand(2)); + return SimplifySelect(SDLoc(N), N0, N1, N2); + } + return SDValue(); } @@ -5523,8 +5646,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (N1.getOpcode() == ISD::CONCAT_VECTORS && N2.getOpcode() == ISD::CONCAT_VECTORS && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { - SDValue CV = ConvertSelectToConcatVector(N, DAG); - if (CV.getNode()) + if (SDValue CV = ConvertSelectToConcatVector(N, DAG)) return CV; } @@ -5580,7 +5702,20 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) { SDLoc(N)); } -/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or +SDValue DAGCombiner::visitSETCCE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + + // If Carry is false, fold to a regular SETCC. + if (Carry.getOpcode() == ISD::CARRY_FALSE) + return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); + + return SDValue(); +} + +/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). 
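The constant fold this doc comment describes is ordinary two's-complement extension of the node's underlying APInt. In plain integer terms:

    // sext and zext of a small constant, spelled with fixed-width ints.
    #include <cassert>
    #include <cstdint>

    int main() {
      int8_t c = -1;                        // i8 0xFF
      assert(int32_t(c) == -1);             // sext i8 -1 -> i32 -1
      assert(uint32_t(uint8_t(c)) == 255u); // zext i8 -1 -> i32 255
      return 0;
    }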
@@ -5837,8 +5972,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6024,7 +6158,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (!VT.isVector()) { EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); - if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, SetCCVT)) { + if (!LegalOperations || + TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); SDValue SetCC = DAG.getSetCC(DL, SetCCVT, @@ -6120,8 +6255,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6133,32 +6267,45 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } // fold (zext (truncate x)) -> (and x, mask) - if (N0.getOpcode() == ISD::TRUNCATE && - (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) { - + if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { - SDNode* oye = N0.getNode()->getOperand(0).getNode(); + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { + SDNode *oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! } - SDValue Op = N0.getOperand(0); - if (Op.getValueType().bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (Op.getValueType().bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); + EVT SrcVT = N0.getOperand(0).getValueType(); + EVT MinVT = N0.getValueType(); + + // Try to mask before the extension to avoid having to generate a larger mask, + // possibly over several sub-vectors. 
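The (zext (truncate x)) -> (and x, mask) family, including the new mask-before-extend variant above, preserves exactly the low MinVT bits of the source. A scalar check, assuming i8/i32 for concreteness:

    // Truncate-then-zero-extend is the same as masking the low bits.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xCAFEBABE;
      assert(uint32_t(uint8_t(x)) == (x & 0xFF));        // zext(trunc x) == and x, 255
      assert(uint32_t(uint8_t(x & 0xFF)) == (x & 0xFF)); // masking first changes nothing
      return 0;
    }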
+ if (SrcVT.bitsLT(VT)) { + if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && + TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { + SDValue Op = N0.getOperand(0); + Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); + AddToWorklist(Op.getNode()); + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); + } + } + + if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { + SDValue Op = N0.getOperand(0); + if (SrcVT.bitsLT(VT)) { + Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); + AddToWorklist(Op.getNode()); + } else if (SrcVT.bitsGT(VT)) { + Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); + AddToWorklist(Op.getNode()); + } + return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } - return DAG.getZeroExtendInReg(Op, SDLoc(N), - N0.getValueType().getScalarType()); } // Fold (zext (and (trunc x), cst)) -> (and x, cst), @@ -6219,6 +6366,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) + // Unless (and (load x) cst) will match as a zextload already and has + // additional users. if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa<LoadSDNode>(N0.getOperand(0)) && @@ -6229,9 +6378,20 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector<SDNode*, 4> SetCCs; - if (!N0.hasOneUse()) - DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND, - SetCCs, TLI); + if (!N0.hasOneUse()) { + if (N0.getOpcode() == ISD::AND) { + auto *AndC = cast<ConstantSDNode>(N0.getOperand(1)); + auto NarrowLoad = false; + EVT LoadResultTy = AndC->getValueType(0); + EVT ExtVT, LoadedVT; + if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT, + NarrowLoad)) + DoXform = false; + } + if (DoXform) + DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), + ISD::ZERO_EXTEND, SetCCs, TLI); + } if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), @@ -6378,8 +6538,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6546,8 +6705,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { // Watch out for shift count overflow though. if (Amt >= Mask.getBitWidth()) break; APInt NewMask = Mask << Amt; - SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask); - if (SimplifyLHS.getNode()) + if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask)) return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } @@ -6685,9 +6843,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); + // The original load itself didn't wrap, so an offset within it doesn't. 
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), - DAG.getConstant(PtrOff, DL, PtrType)); + DAG.getConstant(PtrOff, DL, PtrType), + &Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -6736,8 +6898,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { unsigned VTBits = VT.getScalarType().getSizeInBits(); unsigned EVTBits = EVT.getScalarType().getSizeInBits(); + if (N0.isUndef()) + return DAG.getUNDEF(VT); + // fold (sext_in_reg c1) -> c1 - if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF) + if (isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension. @@ -6771,8 +6936,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) - SDValue NarrowLoad = ReduceLoadWidth(N); - if (NarrowLoad.getNode()) + if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) @@ -6831,29 +6995,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { BSwap, N1); } - // Fold a sext_inreg of a build_vector of ConstantSDNodes or undefs - // into a build_vector. - if (ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { - SmallVector<SDValue, 8> Elts; - unsigned NumElts = N0->getNumOperands(); - unsigned ShAmt = VTBits - EVTBits; - - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Op = N0->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) { - Elts.push_back(Op); - continue; - } - - ConstantSDNode *CurrentND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue()); - Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(), - SDLoc(Op), Op.getValueType())); - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Elts); - } - return SDValue(); } @@ -6999,9 +7140,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { - SDValue Reduced = ReduceLoadWidth(N); - if (Reduced.getNode()) + if (SDValue Reduced = ReduceLoadWidth(N)) return Reduced; + // Handle the case where the load remains an extending load even // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { @@ -7107,6 +7248,12 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { return SDValue(); } +static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { + // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi + // and Lo parts; on big-endian machines it doesn't. + return DAG.getDataLayout().isBigEndian() ? 
1 : 0; +} + SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7173,6 +7320,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) + // + // For ppc_fp128: + // fold (bitcast (fneg x)) -> + // flipbit = signbit + // (xor (bitcast x) (build_pair flipbit, flipbit)) + // + // fold (bitcast (fabs x)) -> + // flipbit = (and (extract_element (bitcast x), 0), signbit) + // (xor (bitcast x) (build_pair flipbit, flipbit)) // This often reduces constant pool loads. if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && @@ -7183,6 +7339,29 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(NewConv.getNode()); SDLoc DL(N); + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + assert(VT.getSizeInBits() == 128); + SDValue SignBit = DAG.getConstant( + APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); + SDValue FlipBit; + if (N0.getOpcode() == ISD::FNEG) { + FlipBit = SignBit; + AddToWorklist(FlipBit.getNode()); + } else { + assert(N0.getOpcode() == ISD::FABS); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(NewConv))); + AddToWorklist(Hi.getNode()); + FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); + AddToWorklist(FlipBit.getNode()); + } + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, @@ -7196,6 +7375,13 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. 
+ // + // For ppc_fp128: + // fold (bitcast (fcopysign cst, x)) -> + // flipbit = (and (extract_element + // (xor (bitcast cst), (bitcast x)), 0), + // signbit) + // (xor (bitcast cst) (build_pair flipbit, flipbit)) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { @@ -7224,6 +7410,30 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(X.getNode()); } + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2); + SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(0)), VT, + N0.getOperand(0)); + AddToWorklist(Cst.getNode()); + SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(1)), VT, + N0.getOperand(1)); + AddToWorklist(X.getNode()); + SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); + AddToWorklist(XorResult.getNode()); + SDValue XorResult64 = DAG.getNode( + ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(XorResult))); + AddToWorklist(XorResult64.getNode()); + SDValue FlipBit = + DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, + DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); + AddToWorklist(FlipBit.getNode()); + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); @@ -7240,11 +7450,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. - if (N0.getOpcode() == ISD::BUILD_PAIR) { - SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT); - if (CombineLD.getNode()) + if (N0.getOpcode() == ISD::BUILD_PAIR) + if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; - } // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of @@ -7257,10 +7465,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0); // If operands are a bitcast, peek through if it casts the original VT. - // If operands are a UNDEF or constant, just bitcast back to original VT. + // If operands are a constant, just bitcast back to original VT. auto PeekThroughBitcast = [&](SDValue Op) { if (Op.getOpcode() == ISD::BITCAST && - Op.getOperand(0)->getValueType(0) == VT) + Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) @@ -7431,28 +7639,34 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath); + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && - TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. 
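FMAD rounds the product before the add, while FMA rounds only once at the end; that difference is why fusing fmul+fadd is gated on AllowFusion. A small sketch of the two roundings diverging (assumes IEEE doubles and that the compiler does not contract the expression itself; the volatile blocks contraction):

#include <cassert>
#include <cmath>

int main() {
  double E = 1.0 / 134217728.0;              // 2^-27, exactly representable
  double A = 1.0 + E;                        // (1+E)*(1+E) = 1 + 2E + E*E
  volatile double Prod = A * A;              // separate rounding drops the E*E = 2^-54 term
  double TwoRoundings = Prod - 1.0;          // 2^-26 exactly
  double OneRounding = std::fma(A, A, -1.0); // 2^-26 + 2^-54, kept by single rounding
  assert(OneRounding != TwoRoundings);
  return 0;
}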
- bool HasFMA = ((!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - UnsafeFPMath); + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. - unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && N0.getOpcode() == ISD::FMUL && + N1.getOpcode() == ISD::FMUL) { + if (N0.getNode()->use_size() > N1.getNode()->use_size()) + std::swap(N0, N1); + } + // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && (Aggressive || N0->hasOneUse())) { @@ -7469,7 +7683,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -7495,7 +7709,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // More folding opportunities when target permits. - if ((UnsafeFPMath || HasFMAD) && Aggressive) { + if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL) { @@ -7518,7 +7732,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( @@ -7608,25 +7822,23 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath); + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && - TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. - bool HasFMA = ((!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - UnsafeFPMath); + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. - unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); @@ -7659,7 +7871,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. 
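The basic shape matched above is (fadd (fmul x, y), z) -> (fma x, y, z). A sketch with operands chosen so both forms are exact; for arbitrary inputs the two can differ in the last ulp, which is exactly why the combine requires fusion to be allowed:

#include <cassert>
#include <cmath>

int main() {
  double X = 3.0, Y = 0.5, Z = 0.25;       // all products and sums exact
  assert(std::fma(X, Y, Z) == X * Y + Z);  // (fadd (fmul x, y), z)
  assert(std::fma(X, Y, -Z) == X * Y - Z); // the fsub variant
  return 0;
}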
- if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { @@ -7735,7 +7947,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // More folding opportunities when target permits. - if ((UnsafeFPMath || HasFMAD) && Aggressive) { + if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode && @@ -7765,7 +7977,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { @@ -7866,14 +8078,97 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { return SDValue(); } +/// Try to perform FMA combining on a given FMUL node. +SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc SL(N); + + assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); + + const TargetOptions &Options = DAG.getTarget().Options; + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); + + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return SDValue(); + + // Always prefer FMAD to FMA for precision. + unsigned PreferredFusedOpcode = HasFMAD ? 
ISD::FMAD : ISD::FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + + // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) + // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) + auto FuseFADD = [&](SDValue X, SDValue Y) { + if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { + auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); + if (XC1 && XC1->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + if (XC1 && XC1->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + } + return SDValue(); + }; + + if (SDValue FMA = FuseFADD(N0, N1)) + return FMA; + if (SDValue FMA = FuseFADD(N1, N0)) + return FMA; + + // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) + // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) + // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) + // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) + auto FuseFSUB = [&](SDValue X, SDValue Y) { + if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { + auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); + if (XC0 && XC0->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + Y); + if (XC0 && XC0->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + + auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); + if (XC1 && XC1->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + if (XC1 && XC1->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + } + return SDValue(); + }; + + if (SDValue FMA = FuseFSUB(N0, N1)) + return FMA; + if (SDValue FMA = FuseFSUB(N1, N0)) + return FMA; + + return SDValue(); +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -7882,23 +8177,23 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N0, N1); + return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (N0CFP && !N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N1, N0); + return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N1, - GetNegatedExpression(N0, DAG, LegalOperations)); + GetNegatedExpression(N0, DAG, LegalOperations), 
Flags); // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { @@ -7907,14 +8202,17 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { bool AllowNewConst = (Level < AfterLegalizeDAG); // fold (fadd A, 0) -> A - if (N1CFP && N1CFP->isZero()) - return N0; + if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) + if (N1C->isZero()) + return N0; // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && - isa<ConstantFPSDNode>(N0.getOperand(1))) + isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), - DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1)); + DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, + Flags), + Flags); // If allowed, fold (fadd (fneg x), x) -> 0.0 if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) @@ -7929,64 +8227,64 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); - ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); + bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP01, 0), - DAG.getConstantFP(1.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), + DAG.getConstantFP(1.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP01, 0), - DAG.getConstantFP(2.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), + DAG.getConstantFP(2.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags); } } if (N1.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); - ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1)); + bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP11, 0), - DAG.getConstantFP(1.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), + DAG.getConstantFP(1.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP11, 0), - DAG.getConstantFP(2.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), + DAG.getConstantFP(2.0, DL, 
VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags); } } if (N0.getOpcode() == ISD::FADD && AllowNewConst) { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); + bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) - if (!CFP && N0.getOperand(0) == N0.getOperand(1) && + if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { return DAG.getNode(ISD::FMUL, DL, VT, - N1, DAG.getConstantFP(3.0, DL, VT)); + N1, DAG.getConstantFP(3.0, DL, VT), Flags); } } if (N1.getOpcode() == ISD::FADD && AllowNewConst) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); + bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, - N0, DAG.getConstantFP(3.0, DL, VT)); + N0, DAG.getConstantFP(3.0, DL, VT), Flags); } } @@ -7996,15 +8294,14 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { - return DAG.getNode(ISD::FMUL, DL, VT, - N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), + DAG.getConstantFP(4.0, DL, VT), Flags); } } } // enable-unsafe-fp-math // FADD -> FMA combines: - SDValue Fused = visitFADDForFMACombine(N); - if (Fused) { + if (SDValue Fused = visitFADDForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -8020,6 +8317,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { EVT VT = N->getValueType(0); SDLoc dl(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -8028,12 +8326,12 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FSUB, dl, VT, N0, N1); + return DAG.getNode(ISD::FSUB, dl, VT, N0, N1, Flags); // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, dl, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), Flags); // If 'unsafe math' is enabled, fold lots of things. 
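The unsafe-math reassociations in visitFADD above are plain algebra: (fadd (fmul x, c), x) -> (fmul x, c+1), (fadd (fadd x, x), x) -> (fmul x, 3.0), and so on. They can change rounding for general operands, hence the UnsafeFPMath gate. A sketch with exactly representable values where both sides agree:

#include <cassert>

int main() {
  double X = 1.5, C = 4.0;
  assert(X * C + X == X * (C + 1.0));       // (fadd (fmul x, c), x) -> (fmul x, c+1)
  assert(X * C + (X + X) == X * (C + 2.0)); // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
  assert((X + X) + X == X * 3.0);           // (fadd (fadd x, x), x) -> (fmul x, 3.0)
  assert((X + X) + (X + X) == X * 4.0);     // two (fadd x, x) -> (fmul x, 4.0)
  return 0;
}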
if (Options.UnsafeFPMath) { @@ -8068,8 +8366,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } // FSUB -> FMA combines: - SDValue Fused = visitFSUBForFMACombine(N); - if (Fused) { + if (SDValue Fused = visitFSUBForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -8085,6 +8382,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) { @@ -8095,12 +8393,12 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); + return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) - return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); + return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) @@ -8129,8 +8427,8 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // the second operand of the outer multiply are constants. if ((N1CFP && isConstOrConstSplatFP(N01)) || (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1); - return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts); + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags); } } } @@ -8139,16 +8437,18 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs // during an early run of DAGCombiner can prevent folding with fmuls // inserted during lowering. - if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) { + if (N0.getOpcode() == ISD::FADD && + (N0.getOperand(0) == N0.getOperand(1)) && + N0.hasOneUse()) { const SDValue Two = DAG.getConstantFP(2.0, DL, VT); - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) - return DAG.getNode(ISD::FADD, DL, VT, N0, N0); + return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) @@ -8163,10 +8463,17 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FMUL, DL, VT, GetNegatedExpression(N0, DAG, LegalOperations), - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), + Flags); } } + // FMUL -> FMA combines: + if (SDValue Fused = visitFMULForFMACombine(N)) { + AddToWorklist(Fused.getNode()); + return Fused; + } + return SDValue(); } @@ -8193,66 +8500,145 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP && N1CFP->isZero()) return N2; } + // TODO: The FMA node should have flags that propagate to these nodes. 
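The visitFMA folds below rest on identities like (fma x, 1.0, y) -> (fadd x, y) and, under unsafe math, (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2). A sketch with exact operands (again, the constant-combining forms are only rounding-equivalent in general, hence the gating):

#include <cassert>
#include <cmath>

int main() {
  double X = 2.0, C1 = 3.0, C2 = 5.0, Y = 7.0;
  assert(std::fma(X, 1.0, Y) == X + Y);             // (fma x, 1, y) -> (fadd x, y)
  assert(std::fma(X, -1.0, Y) == Y - X);            // (fma x, -1, y) -> (fadd (fneg x), y)
  assert(std::fma(X, C1, X * C2) == X * (C1 + C2)); // (fma x, c1, (fmul x, c2))
  assert(std::fma(X, C1, X) == X * (C1 + 1.0));     // (fma x, c, x) -> (fmul x, c+1)
  return 0;
}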
if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) - if (N0CFP && !N1CFP) + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) - if (Options.UnsafeFPMath && N1CFP && - N2.getOpcode() == ISD::FMUL && - N0 == N2.getOperand(0) && - N2.getOperand(1).getOpcode() == ISD::ConstantFP) { - return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1))); - } + // TODO: FMA nodes should have flags that propagate to the created nodes. + // For now, create a Flags object for use with all unsafe math transforms. + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + if (Options.UnsafeFPMath) { + // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) + if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && + isConstantFPBuildVectorOrConstantFP(N1) && + isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { + return DAG.getNode(ISD::FMUL, dl, VT, N0, + DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1), + &Flags), &Flags); + } - // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) - if (Options.UnsafeFPMath && - N0.getOpcode() == ISD::FMUL && N1CFP && - N0.getOperand(1).getOpcode() == ISD::ConstantFP) { - return DAG.getNode(ISD::FMA, dl, VT, - N0.getOperand(0), - DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)), - N2); + // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) + if (N0.getOpcode() == ISD::FMUL && + isConstantFPBuildVectorOrConstantFP(N1) && + isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { + return DAG.getNode(ISD::FMA, dl, VT, + N0.getOperand(0), + DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1), + &Flags), + N2); + } } // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) + // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, dl, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0); AddToWorklist(RHSNeg.getNode()); + // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg); } } - // (fma x, c, x) -> (fmul x, (c+1)) - if (Options.UnsafeFPMath && N1CFP && N0 == N2) - return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, - N1, DAG.getConstantFP(1.0, dl, VT))); - - // (fma x, c, (fneg x)) -> (fmul x, (c-1)) - if (Options.UnsafeFPMath && N1CFP && - N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) + if (Options.UnsafeFPMath) { + // (fma x, c, x) -> (fmul x, (c+1)) + if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, - N1, DAG.getConstantFP(-1.0, dl, VT))); + DAG.getNode(ISD::FADD, dl, VT, + N1, DAG.getConstantFP(1.0, dl, VT), + &Flags), &Flags); + } + // (fma x, c, (fneg x)) -> (fmul x, (c-1)) + if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { + return DAG.getNode(ISD::FMUL, dl, VT, N0, + DAG.getNode(ISD::FADD, dl, VT, + N1, DAG.getConstantFP(-1.0, dl, VT), + &Flags), &Flags); + } + } return SDValue(); } +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. 
+// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) +// Notice that this is not always beneficial. One reason is different target +// may have different costs for FDIV and FMUL, so sometimes the cost of two +// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason +// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". +SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { + bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; + const SDNodeFlags *Flags = N->getFlags(); + if (!UnsafeMath && !Flags->hasAllowReciprocal()) + return SDValue(); + + // Skip if current node is a reciprocal. + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + if (N0CFP && N0CFP->isExactlyValue(1.0)) + return SDValue(); + + // Exit early if the target does not want this transform or if there can't + // possibly be enough uses of the divisor to make the transform worthwhile. + SDValue N1 = N->getOperand(1); + unsigned MinUses = TLI.combineRepeatedFPDivisors(); + if (!MinUses || N1->use_size() < MinUses) + return SDValue(); + + // Find all FDIV users of the same divisor. + // Use a set because duplicates may be present in the user list. + SetVector<SDNode *> Users; + for (auto *U : N1->uses()) { + if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { + // This division is eligible for optimization only if global unsafe math + // is enabled or if this division allows reciprocal formation. + if (UnsafeMath || U->getFlags()->hasAllowReciprocal()) + Users.insert(U); + } + } + + // Now that we have the actual number of divisor uses, make sure it meets + // the minimum threshold specified by the target. + if (Users.size() < MinUses) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); + SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); + + // Dividend / Divisor -> Dividend * Reciprocal + for (auto *U : Users) { + SDValue Dividend = U->getOperand(0); + if (Dividend != FPOne) { + SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, + Reciprocal, Flags); + CombineTo(U, NewNode); + } else if (U != Reciprocal.getNode()) { + // In the absence of fast-math-flags, this user node is always the + // same node as Reciprocal, but with FMF they may be different nodes. + CombineTo(U, Reciprocal); + } + } + return SDValue(N, 0); // N was replaced. +} + SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8261,6 +8647,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -8269,7 +8656,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. 
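combineRepeatedFPDivisors trades N divisions for one division plus N multiplies. With a power-of-two divisor the reciprocal is exact and the results match bit-for-bit; for general divisors the two forms may differ in the last ulp, which is why the transform needs UnsafeFPMath or the allow-reciprocal fast-math flag. A sketch of the rewrite:

#include <cassert>

int main() {
  double A = 3.0, B = 5.0, D = 8.0; // D a power of two, so 1/D is exact
  double Recip = 1.0 / D;           // computed once, like the hoisted FDIV
  assert(A / D == A * Recip);       // a / D -> a * recip
  assert(B / D == B * Recip);       // b / D -> b * recip
  return 0;
}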
@@ -8288,28 +8675,30 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getConstantFP(Recip, DL, VT)); + DAG.getConstantFP(Recip, DL, VT), Flags); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0), Flags)) { + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0), + Flags)) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0), + Flags)) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, @@ -8326,18 +8715,18 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (SqrtOp.getNode()) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) - if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) { - RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp); + if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { + RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } } // Fold into a reciprocal estimate and multiply instead of a real divide. - if (SDValue RV = BuildReciprocalEstimate(N1)) { + if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } @@ -8349,52 +8738,13 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), + Flags); } } - // Combine multiple FDIVs with the same divisor into multiple FMULs by the - // reciprocal. - // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) - // Notice that this is not always beneficial. One reason is different target - // may have different costs for FDIV and FMUL, so sometimes the cost of two - // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason - // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". - if (Options.UnsafeFPMath) { - // Skip if current node is a reciprocal. - if (N0CFP && N0CFP->isExactlyValue(1.0)) - return SDValue(); - - // Find all FDIV users of the same divisor. 
- // Use a set because duplicates may be present in the user list. - SetVector<SDNode *> Users; - for (auto *U : N1->uses()) - if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) - Users.insert(U); - - if (TLI.combineRepeatedFPDivisors(Users.size())) { - SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); - // FIXME: This optimization requires some level of fast-math, so the - // created reciprocal node should at least have the 'allowReciprocal' - // fast-math-flag set. - SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1); - - // Dividend / Divisor -> Dividend * Reciprocal - for (auto *U : Users) { - SDValue Dividend = U->getOperand(0); - if (Dividend != FPOne) { - SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, - Reciprocal); - CombineTo(U, NewNode); - } else if (U != Reciprocal.getNode()) { - // In the absence of fast-math-flags, this user node is always the - // same node as Reciprocal, but with FMF they may be different nodes. - CombineTo(U, Reciprocal); - } - } - return SDValue(N, 0); // N was replaced. - } - } + if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N)) + return CombineRepeatedDivisors; return SDValue(); } @@ -8408,7 +8758,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, + &cast<BinaryWithFlagsSDNode>(N)->Flags); return SDValue(); } @@ -8417,20 +8768,25 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap()) return SDValue(); + // TODO: FSQRT nodes should have flags that propagate to the created nodes. + // For now, create a Flags object for use with all unsafe math transforms. + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) - SDValue RV = BuildRsqrtEstimate(N->getOperand(0)); + SDValue RV = BuildRsqrtEstimate(N->getOperand(0), &Flags); if (!RV) return SDValue(); - + EVT VT = RV.getValueType(); SDLoc DL(N); - RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV); + RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV, &Flags); AddToWorklist(RV.getNode()); // Unfortunately, RV is now NaN if the input was exactly 0. // Select out this case and force the answer to 0. SDValue Zero = DAG.getConstantFP(0.0, DL, VT); - EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + EVT CCVT = getSetCCResultType(VT); SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, N->getOperand(0), Zero, ISD::SETEQ); AddToWorklist(ZeroCmp.getNode()); AddToWorklist(RV.getNode()); @@ -8439,6 +8795,23 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { ZeroCmp, Zero, RV); } +/// copysign(x, fp_extend(y)) -> copysign(x, y) +/// copysign(x, fp_round(y)) -> copysign(x, y) +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { + SDValue N1 = N->getOperand(1); + if ((N1.getOpcode() == ISD::FP_EXTEND || + N1.getOpcode() == ISD::FP_ROUND)) { + // Do not optimize out type conversion of f128 type yet. + // For some targets like x86_64, configuration is changed to keep one f128 + // value in one SSE register, but instruction selection cannot handle + // FCOPYSIGN on SSE registers yet. 
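visitFSQRT above computes sqrt(x) as x * (1/sqrt(x)) so that a target rsqrt estimate can be used, and then must select 0.0 explicitly because the rewritten form yields 0 * inf = NaN at x == 0. A sketch of both behaviors, assuming IEEE arithmetic:

#include <cassert>
#include <cmath>

int main() {
  double Zero = 0.0;
  assert(std::isnan(Zero * (1.0 / std::sqrt(Zero)))); // 0 * inf, hence the explicit select
  double X = 4.0;                                     // exact case: 1/sqrt(4) == 0.5
  assert(X * (1.0 / std::sqrt(X)) == std::sqrt(X));
  return 0;
}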
+ EVT N1VT = N1->getValueType(0); + EVT N1Op0VT = N1->getOperand(0)->getValueType(0); + return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); + } + return false; +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8482,7 +8855,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) - if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND) + if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); @@ -8837,11 +9210,12 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); if (Level >= AfterLegalizeDAG && - (TLI.isFPImmLegal(CVal, N->getValueType(0)) || - TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0)))) - return DAG.getNode( - ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1))); + (TLI.isFPImmLegal(CVal, VT) || + TLI.isOperationLegal(ISD::ConstantFP, VT))) + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N0.getOperand(1)), + &cast<BinaryWithFlagsSDNode>(N0)->Flags); } } @@ -8851,20 +9225,20 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + EVT VT = N->getValueType(0); + const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), N->getValueType(0)); + return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT); } - if (N0CFP) { - EVT VT = N->getValueType(0); - // Canonicalize to constant on RHS. + // Canonicalize to constant on RHS. + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); - } return SDValue(); } @@ -8872,20 +9246,20 @@ SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + EVT VT = N->getValueType(0); + const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), N->getValueType(0)); + return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT); } - if (N0CFP) { - EVT VT = N->getValueType(0); - // Canonicalize to constant on RHS. + // Canonicalize to constant on RHS. + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); - } return SDValue(); } @@ -9034,8 +9408,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue Op1 = TheXor->getOperand(1); if (Op0.getOpcode() == Op1.getOpcode()) { // Avoid missing important xor optimizations. 
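The fneg/fabs/fcopysign bitcast folds in this area (including the ppc_fp128 variants, which apply the same trick to the high double of the pair) all reduce to sign-bit arithmetic, and copysign(x, fp_extend(y)) -> copysign(x, y) holds because the conversion preserves the sign. A sketch over plain doubles:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint64_t Bits(double D) { uint64_t U; std::memcpy(&U, &D, 8); return U; }
static double FromBits(uint64_t U) { double D; std::memcpy(&D, &U, 8); return D; }

int main() {
  const uint64_t SignBit = 1ull << 63;
  double X = -3.5;
  assert(FromBits(Bits(X) ^ SignBit) == 3.5);           // fneg: xor the sign bit
  assert(FromBits(Bits(X) & ~SignBit) == std::fabs(X)); // fabs: clear the sign bit
  double C = 2.0;                                       // copysign: (or (and x, sign), (and c, ~sign))
  assert(FromBits((Bits(C) & ~SignBit) | (Bits(X) & SignBit)) == std::copysign(C, X));
  float YF = -1.25f; // only the sign of y matters, so fp_extend(y) is redundant
  assert(std::copysign(C, static_cast<double>(YF)) == (std::signbit(YF) ? -C : C));
  return 0;
}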
- SDValue Tmp = visitXOR(TheXor); - if (Tmp.getNode()) { + if (SDValue Tmp = visitXOR(TheXor)) { if (Tmp.getNode() != TheXor) { DEBUG(dbgs() << "\nReplacing.8 "; TheXor->dump(&DAG); @@ -9722,8 +10095,8 @@ struct LoadedSlice { void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); - if (!TLI.isTruncateFree(LS.Inst->getValueType(0), - LS.Inst->getOperand(0).getValueType())) + if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), + LS.Inst->getValueType(0))) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) @@ -10625,30 +10998,109 @@ struct BaseIndexOffset { }; } // namespace +// This is a helper function for visitMUL to check the profitability +// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). +// MulNode is the original multiply, AddNode is (add x, c1), +// and ConstNode is c2. +// +// If the (add x, c1) has multiple uses, we could increase +// the number of adds if we make this transformation. +// It would only be worth doing this if we can remove a +// multiply in the process. Check for that here. +// To illustrate: +// (A + c1) * c3 +// (A + c2) * c3 +// We're checking for cases where we have common "c3 * A" expressions. +bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, + SDValue &AddNode, + SDValue &ConstNode) { + APInt Val; + + // If the add only has one use, this would be OK to do. + if (AddNode.getNode()->hasOneUse()) + return true; + + // Walk all the users of the constant with which we're multiplying. + for (SDNode *Use : ConstNode->uses()) { + + if (Use == MulNode) // This use is the one we're on right now. Skip it. + continue; + + if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. + SDNode *OtherOp; + SDNode *MulVar = AddNode.getOperand(0).getNode(); + + // OtherOp is what we're multiplying against the constant. + if (Use->getOperand(0) == ConstNode) + OtherOp = Use->getOperand(1).getNode(); + else + OtherOp = Use->getOperand(0).getNode(); + + // Check to see if multiply is with the same operand of our "add". + // + // ConstNode = CONST + // Use = ConstNode * A <-- visiting Use. OtherOp is A. + // ... + // AddNode = (A + c1) <-- MulVar is A. + // = AddNode * ConstNode <-- current visiting instruction. + // + // If we make this transformation, we will have a common + // multiply (ConstNode * A) that we can save. + if (OtherOp == MulVar) + return true; + + // Now check to see if a future expansion will give us a common + // multiply. + // + // ConstNode = CONST + // AddNode = (A + c1) + // ... = AddNode * ConstNode <-- current visiting instruction. + // ... + // OtherOp = (A + c2) + // Use = OtherOp * ConstNode <-- visiting Use. + // + // If we make this transformation, we will have a common + // multiply (CONST * A) after we also do the same transformation + // to the "t2" instruction. + if (OtherOp->getOpcode() == ISD::ADD && + isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && + OtherOp->getOperand(0).getNode() == MulVar) + return true; + } + } + + // Didn't find a case where this would be profitable. 
+ return false; +} + SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG, SDLoc SL, ArrayRef<MemOpLink> Stores, + SmallVectorImpl<SDValue> &Chains, EVT Ty) const { SmallVector<SDValue, 8> BuildVector; - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) - BuildVector.push_back(cast<StoreSDNode>(Stores[I].MemNode)->getValue()); + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); + Chains.push_back(St->getChain()); + BuildVector.push_back(St->getValue()); + } return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, - unsigned NumElem, bool IsConstantSrc, bool UseVector) { + unsigned NumStores, bool IsConstantSrc, bool UseVector) { // Make sure we have something to merge. - if (NumElem < 2) + if (NumStores < 2) return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned LatestNodeUsed = 0; - for (unsigned i=0; i < NumElem; ++i) { + for (unsigned i=0; i < NumStores; ++i) { // Find a chain for the new wide-store operand. Notice that some // of the store nodes that we found may not be selected for inclusion // in the wide store. The chain we use needs to be the chain of the @@ -10657,45 +11109,57 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( LatestNodeUsed = i; } + SmallVector<SDValue, 8> Chains; + // The latest Node in the DAG. LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; if (UseVector) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + bool IsVec = MemVT.isVector(); + unsigned Elts = NumStores; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + // Get the type for the merged vector store. + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + if (IsConstantSrc) { - StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Ty); + StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); } else { SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < NumStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); SDValue Val = St->getValue(); - // All of the operands of a BUILD_VECTOR must have the same type. + // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); + Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. - StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops); - } + StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, + DL, Ty, Ops); } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); - unsigned SizeInBits = NumElem * ElementSizeBytes * 8; + unsigned SizeInBits = NumStores * ElementSizeBytes * 8; APInt StoreInt(SizeInBits, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = DAG.getDataLayout().isLittleEndian(); - for (unsigned i = 0; i < NumElem ; ++i) { - unsigned Idx = IsLE ? 
(NumElem - 1 - i) : i; + for (unsigned i = 0; i < NumStores; ++i) { + unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); + Chains.push_back(St->getChain()); + SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { @@ -10712,7 +11176,10 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - SDValue NewStore = DAG.getStore(LatestOp->getChain(), DL, StoredVal, + assert(!Chains.empty()); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, @@ -10721,7 +11188,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( // Replace the last store with the new store CombineTo(LatestOp, NewStore); // Erase all other stores. - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < NumStores; ++i) { if (StoreNodes[i].MemNode == LatestOp) continue; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); @@ -10743,17 +11210,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( return true; } -static bool allowableAlignment(const SelectionDAG &DAG, - const TargetLowering &TLI, EVT EVTTy, - unsigned AS, unsigned Align) { - if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align)) - return true; - - Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getPrefTypeAlignment(Ty); - return (Align >= ABIAlignment); -} - void DAGCombiner::getStoreMergeAndAliasCandidates( StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { @@ -10775,6 +11231,38 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( EVT MemVT = St->getMemoryVT(); unsigned Seq = 0; StoreSDNode *Index = St; + + + bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA + : DAG.getSubtarget().useAA(); + + if (UseAA) { + // Look at other users of the same chain. Stores on the same chain do not + // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized + // to be on the same chain, so don't bother looking at adjacent chains. + + SDValue Chain = St->getChain(); + for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { + if (I.getOperandNo() != 0) + continue; + + if (OtherST->isVolatile() || OtherST->isIndexed()) + continue; + + if (OtherST->getMemoryVT() != MemVT) + continue; + + BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr()); + + if (Ptr.equalBaseIndex(BasePtr)) + StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); + } + } + + return; + } + while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. if (Index != St && !SDValue(Index, 0)->hasOneUse()) @@ -10800,6 +11288,13 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( if (Index->getMemoryVT() != MemVT) break; + // We do not allow under-aligned stores in order to prevent + // overriding stores. NOTE: this is a bad hack. Alignment SHOULD + // be irrelevant here; what MATTERS is that we not move memory + // operations that potentially overlap past each-other. + if (Index->getAlignment() < MemVT.getStoreSize()) + break; + // We found a potential memory operand to merge. 
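In the constant-store path above, the merged value is assembled by shifting each stored constant in; on little-endian targets the loop walks the stores in reverse (Idx = NumStores - 1 - i) so that the first store lands in the low bits. A sketch merging four i8 stores into one i32, assuming a little-endian host:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const unsigned char Vals[4] = {0x11, 0x22, 0x33, 0x44}; // stores at offsets 0..3
  uint32_t StoreInt = 0;
  for (int I = 0; I < 4; ++I) {
    StoreInt <<= 8;
    StoreInt |= Vals[4 - 1 - I]; // little-endian: walk the stores last-to-first
  }
  unsigned char Separate[4], Merged[4];
  std::memcpy(Separate, Vals, 4);    // memory image of the four original i8 stores
  std::memcpy(Merged, &StoreInt, 4); // memory image of one i32 store of 0x44332211
  assert(std::memcmp(Separate, Merged, 4) == 0);
  return 0;
}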
StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); @@ -10844,8 +11339,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (ElementSizeBytes * 8 != MemVT.getSizeInBits()) return false; - // Don't merge vectors into wider inputs. - if (MemVT.isVector() || !MemVT.isSimple()) + if (!MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that @@ -10854,9 +11348,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { bool IsLoadSrc = isa<LoadSDNode>(StoredVal); bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) || isa<ConstantFPSDNode>(StoredVal); - bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT); + bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); - if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc) + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) + return false; + + // Don't merge vectors into wider vectors if the source data comes from loads. + // TODO: This restriction can be lifted by using logic similar to the + // ExtractVecSrc case. + if (MemVT.isVector() && IsLoadSrc) return false; // Only look at ends of store sequences. @@ -10868,22 +11369,28 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // We need to make sure that these nodes do not interfere with // any of the store nodes. SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - + // Save the StoreSDNodes that we find in the chain. SmallVector<MemOpLink, 8> StoreNodes; getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); - + // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; - // Sort the memory operands according to their distance from the base pointer. + // Sort the memory operands according to their distance from the + // base pointer. As a secondary criteria: make sure stores coming + // later in the code come first in the list. This is important for + // the non-UseAA case, because we're merging stores into the FINAL + // store along a chain which potentially contains aliasing stores. + // Thus, if there are multiple stores to the same address, the last + // one can be considered for merging but not the others. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase || (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum > RHS.SequenceNum); + LHS.SequenceNum < RHS.SequenceNum); }); // Scan the memory operations on the chain and find the first non-consecutive @@ -10900,15 +11407,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { break; } - bool Alias = false; // Check if this store interferes with any of the loads that we found. - for (unsigned ld = 0, lde = AliasLoadNodes.size(); ld < lde; ++ld) - if (isAlias(AliasLoadNodes[ld], StoreNodes[i].MemNode)) { - Alias = true; - break; - } - // We found a load that alias with this store. Stop the sequence. - if (Alias) + // If we find a load that alias with this store. Stop the sequence. + if (std::any_of(AliasLoadNodes.begin(), AliasLoadNodes.end(), + [&](LSBaseSDNode* Ldn) { + return isAlias(Ldn, StoreNodes[i].MemNode); + })) break; // Mark this node as useful. 
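After sorting by distance from the base pointer, the scan above keeps the longest prefix of stores at strictly consecutive offsets and stops at the first gap. A small distillation of that scan; numConsecutive is a hypothetical helper for illustration, not an LLVM API:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Count how many of the sorted offsets step by exactly ElemSize from the base.
static unsigned numConsecutive(std::vector<int64_t> Offsets, int64_t ElemSize) {
  if (Offsets.empty())
    return 0;
  std::sort(Offsets.begin(), Offsets.end());
  unsigned Run = 1;
  for (unsigned I = 1, E = Offsets.size(); I != E; ++I) {
    if (Offsets[I] - Offsets[0] != ElemSize * (int64_t)I)
      break; // first non-consecutive store ends the mergeable run
    Run = I + 1;
  }
  return Run;
}

int main() {
  // Offsets 0, 4, 8 form a mergeable run of 4-byte stores; 20 breaks it.
  assert(numConsecutive({8, 0, 4, 20}, 4) == 3);
  return 0;
}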
@@ -10919,6 +11423,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); + LLVMContext &Context = *DAG.getContext(); + const DataLayout &DL = DAG.getDataLayout(); // Store the constants into memory as one consecutive store. if (IsConstantSrc) { @@ -10940,43 +11446,40 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the constant store. unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast; if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, - FirstStoreAlign)) { + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i+1; // Or check whether a truncstore is legal. - } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == + } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); + TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, - FirstStoreAlign)) { + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { LastLegalType = i + 1; } } - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(Ty) && - allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) { - LastLegalVectorType = i + 1; + // We only use vectors if the constant is known to be zero or the target + // allows it and the function is not marked with the noimplicitfloat + // attribute. + if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, + FirstStoreAS)) && + !NoVectors) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); + if (TLI.isTypeLegal(Ty) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) + LastLegalVectorType = i + 1; } } - - // We only use vectors if the constant is known to be zero or the target - // allows it and the function is not marked with the noimplicitfloat - // attribute. - if (NoVectors) { - LastLegalVectorType = 0; - } else if (NonZero && !TLI.storeOfVectorConstantIsCheap(MemVT, - LastLegalVectorType, - FirstStoreAS)) { - LastLegalVectorType = 0; - } - // Check if we found a legal integer type to store. if (LastLegalType == 0 && LastLegalVectorType == 0) return false; @@ -10990,27 +11493,36 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. - if (IsExtractVecEltSrc) { - unsigned NumElem = 0; + if (IsExtractVecSrc) { + unsigned NumStoresToMerge = 0; + bool IsVec = MemVT.isVector(); for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); + unsigned StoreValOpcode = St->getValue().getOpcode(); // This restriction could be loosened. // Bail out if any stored values are not elements extracted from a vector. 
// It should be possible to handle mixed sources, but load sources need // more careful handling (see the block of code below that handles // consecutive loads). - if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && + StoreValOpcode != ISD::EXTRACT_SUBVECTOR) return false; // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + unsigned Elts = i + 1; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); + bool IsFast; if (TLI.isTypeLegal(Ty) && - allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) - NumElem = i + 1; + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) + NumStoresToMerge = i + 1; } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge, false, true); } @@ -11084,7 +11596,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { - // All loads much share the same chain. + // All loads must share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; @@ -11092,35 +11604,41 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; - // Find a legal type for the vector store. - EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1); + bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && - allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) { + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalVectorType = i + 1; } // Find a legal type for the integer store. unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && - allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && IsFastLd) LastLegalIntegerType = i + 1; // Or check whether a truncstore and extload is legal. 
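One pattern worth calling out before the truncating-store alternative that the comment above introduces: every legality test in these hunks has the shape TLI.allowsMemoryAccess(..., &IsFast) && IsFast, which relies on short-circuit evaluation, since the out-parameter is only written when the call itself succeeds. A sketch of the idiom with an invented predicate standing in for the TargetLowering query (the alignment rule here is an assumption, not any real target's):

#include <cstdio>

// Invented stand-in for TargetLowering::allowsMemoryAccess: reports
// whether an access is legal at all and, through the out-parameter,
// whether it is also fast on this imaginary target.
static bool allowsAccess(unsigned SizeInBits, unsigned AlignBytes,
                         bool *IsFast) {
  if (SizeInBits > 128)
    return false;                         // too wide: not legal at all
  *IsFast = AlignBytes * 8 >= SizeInBits; // naturally aligned => fast
  return true;
}

int main() {
  // Mirror of the loops above: keep growing the merged access while it
  // stays both legal and fast; remember the widest size that qualified.
  unsigned LastLegal = 0;
  for (unsigned i = 0; i != 8; ++i) {
    unsigned SizeInBits = (i + 1) * 32;
    bool IsFast;
    if (allowsAccess(SizeInBits, /*AlignBytes=*/8, &IsFast) && IsFast)
      LastLegal = i + 1;
  }
  std::printf("can merge %u consecutive i32 accesses\n", LastLegal);
}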
- else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == + else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy); + TLI.getTypeToTransformTo(Context, StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, - FirstStoreAlign) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstLoadAS, - FirstLoadAlign)) + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstLoadAS, FirstLoadAlign, &IsFastLd) && + IsFastLd) LastLegalIntegerType = i+1; } } @@ -11138,6 +11656,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (NumElem < 2) return false; + // Collect the chains from all merged stores. + SmallVector<SDValue, 8> MergeStoreChains; + MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); + // The latest Node in the DAG. unsigned LatestNodeUsed = 0; for (unsigned i=1; i<NumElem; ++i) { @@ -11147,6 +11669,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // latest store node which is *used* and replaced by the wide store. if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) LatestNodeUsed = i; + + MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); } LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; @@ -11155,34 +11679,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // to memory. EVT JointMemOpVT; if (UseVectorTy) { - JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); } else { unsigned SizeInBits = NumElem * ElementSizeBytes * 8; - JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); } SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); + // The merged loads are required to have the same incoming chain, so + // using the first's chain is acceptable. SDValue NewLoad = DAG.getLoad( JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign); + SDValue NewStoreChain = + DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + SDValue NewStore = DAG.getStore( - LatestOp->getChain(), StoreDL, NewLoad, FirstInChain->getBasePtr(), + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, FirstStoreAlign); - // Replace one of the loads with the new load. - LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), - SDValue(NewLoad.getNode(), 1)); - - // Remove the rest of the load chains. - for (unsigned i = 1; i < NumElem ; ++i) { - // Replace all chain users of the old load nodes with the chain of the new - // load node. + // Transfer chain users from old loads to the new load. 
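The TokenFactor built from MergeStoreChains is a pure join point in the dependency graph: the merged store must be ordered after every chain that fed one of the original stores. A toy model of that invariant, with chains reduced to integer timestamps (this illustrates the ordering rule, not the SelectionDAG API); the chain-user transfer announced in the comment above continues right after this sketch:

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy model: a "chain" is the timestamp of the last memory operation it
// orders after. Joining several chains (a TokenFactor) must order after
// all of them, i.e. after the latest timestamp.
static int tokenFactor(const std::vector<int> &Chains) {
  return *std::max_element(Chains.begin(), Chains.end());
}

int main() {
  // Incoming chains of the individual stores being merged.
  const std::vector<int> MergeStoreChains = {3, 7, 5};
  std::printf("merged store ordered after t=%d\n",
              tokenFactor(MergeStoreChains));
}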
+ for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); } // Replace the last store with the new store. @@ -11200,6 +11723,114 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { return true; } +SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { + SDLoc SL(ST); + SDValue ReplStore; + + // Replace the chain to avoid dependency. + if (ST->isTruncatingStore()) { + ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), + ST->getBasePtr(), ST->getMemoryVT(), + ST->getMemOperand()); + } else { + ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), + ST->getMemOperand()); + } + + // Create token to keep both nodes around. + SDValue Token = DAG.getNode(ISD::TokenFactor, SL, + MVT::Other, ST->getChain(), ReplStore); + + // Make sure the new and old chains are cleaned up. + AddToWorklist(Token.getNode()); + + // Don't add users to work list. + return CombineTo(ST, Token, false); +} + +SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { + SDValue Value = ST->getValue(); + if (Value.getOpcode() == ISD::TargetConstantFP) + return SDValue(); + + SDLoc DL(ST); + + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + + const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value); + + // NOTE: If the original store is volatile, this transform must not increase + // the number of stores. For example, on x86-32 an f64 can be stored in one + // processor operation but an i64 (which is not legal) requires two. So the + // transform should not be done in this case. + + SDValue Tmp; + switch (CFP->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unknown FP type"); + case MVT::f16: // We don't do this for these yet. + case MVT::f80: + case MVT::f128: + case MVT::ppcf128: + return SDValue(); + case MVT::f32: + if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { + ; + Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). + bitcastToAPInt().getZExtValue(), SDLoc(CFP), + MVT::i32); + return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); + } + + return SDValue(); + case MVT::f64: + if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && + !ST->isVolatile()) || + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { + ; + Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + getZExtValue(), SDLoc(CFP), MVT::i64); + return DAG.getStore(Chain, DL, Tmp, + Ptr, ST->getMemOperand()); + } + + if (!ST->isVolatile() && + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { + // Many FP stores are not made apparent until after legalize, e.g. for + // argument passing. Since this is so common, custom legalize the + // 64-bit integer store into two 32-bit stores. 
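replaceStoreOfFPConstant never converts the value; it only reinterprets the constant's bits as an integer of the same width, which is why the f32 case can become a plain i32 store. A host-side sketch of that reinterpretation, using memcpy as the portable stand-in for the bitcastToAPInt call above; the two-halves lowering named in the last comment follows in the next lines:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret an f32 constant as the i32 bit pattern stored in its
// place; memcpy is the portable way to type-pun on the host.
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  // 'store float 1.0' becomes 'store i32 0x3F800000'.
  std::printf("store float 1.0 -> store i32 0x%08X\n",
              (unsigned)bitsOf(1.0f));
}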
+ uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); + SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); + if (DAG.getDataLayout().isBigEndian()) + std::swap(Lo, Hi); + + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + bool isNonTemporal = ST->isNonTemporal(); + AAMDNodes AAInfo = ST->getAAInfo(); + + SDValue St0 = DAG.getStore(Chain, DL, Lo, + Ptr, ST->getPointerInfo(), + isVolatile, isNonTemporal, + ST->getAlignment(), AAInfo); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4, DL, Ptr.getValueType())); + Alignment = MinAlign(Alignment, 4U); + SDValue St1 = DAG.getStore(Chain, DL, Hi, + Ptr, ST->getPointerInfo().getWithOffset(4), + isVolatile, isNonTemporal, + Alignment, AAInfo); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + St0, St1); + } + + return SDValue(); + } +} + SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast<StoreSDNode>(N); SDValue Chain = ST->getChain(); @@ -11227,81 +11858,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed()) return Chain; - // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) { - // NOTE: If the original store is volatile, this transform must not increase - // the number of stores. For example, on x86-32 an f64 can be stored in one - // processor operation but an i64 (which is not legal) requires two. So the - // transform should not be done in this case. - if (Value.getOpcode() != ISD::TargetConstantFP) { - SDValue Tmp; - switch (CFP->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unknown FP type"); - case MVT::f16: // We don't do this for these yet. - case MVT::f80: - case MVT::f128: - case MVT::ppcf128: - break; - case MVT::f32: - if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { - ; - Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). - bitcastToAPInt().getZExtValue(), SDLoc(CFP), - MVT::i32); - return DAG.getStore(Chain, SDLoc(N), Tmp, - Ptr, ST->getMemOperand()); - } - break; - case MVT::f64: - if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && - !ST->isVolatile()) || - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { - ; - Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). - getZExtValue(), SDLoc(CFP), MVT::i64); - return DAG.getStore(Chain, SDLoc(N), Tmp, - Ptr, ST->getMemOperand()); - } - - if (!ST->isVolatile() && - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { - // Many FP stores are not made apparent until after legalize, e.g. for - // argument passing. Since this is so common, custom legalize the - // 64-bit integer store into two 32-bit stores. 
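The two-store lowering above splits the 64-bit pattern at bit 32 and lets the target's byte order decide which half lands at the lower address. A standalone sketch of that arithmetic (the BigEndian flag stands in for DataLayout::isBigEndian); the removed duplicate of this lowering continues below:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>

int main() {
  double D = 1.0;
  uint64_t Val;
  std::memcpy(&Val, &D, sizeof(Val)); // the f64 bits as one integer

  // Split into the two i32 halves stored at Ptr and Ptr+4.
  uint32_t Lo = static_cast<uint32_t>(Val & 0xFFFFFFFF);
  uint32_t Hi = static_cast<uint32_t>(Val >> 32);

  // On a big-endian target the low address holds the high half, hence
  // the std::swap in the hunk above.
  const bool BigEndian = false; // assumption for this sketch
  if (BigEndian)
    std::swap(Lo, Hi);

  std::printf("store i32 0x%08X at Ptr, 0x%08X at Ptr+4\n",
              (unsigned)Lo, (unsigned)Hi);
}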
- uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); - SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); - if (DAG.getDataLayout().isBigEndian()) - std::swap(Lo, Hi); - - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); - AAMDNodes AAInfo = ST->getAAInfo(); - - SDLoc DL(N); - - SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo, - Ptr, ST->getPointerInfo(), - isVolatile, isNonTemporal, - ST->getAlignment(), AAInfo); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4, DL, Ptr.getValueType())); - Alignment = MinAlign(Alignment, 4U); - SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi, - Ptr, ST->getPointerInfo().getWithOffset(4), - isVolatile, isNonTemporal, - Alignment, AAInfo); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - St0, St1); - } - - break; - } - } - } - // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { @@ -11319,8 +11875,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Try transforming a pair floating point load / store ops to integer // load / store ops. - SDValue NewST = TransformFPLoadStorePair(N); - if (NewST.getNode()) + if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA @@ -11331,31 +11886,17 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { UseAA = false; #endif if (UseAA && ST->isUnindexed()) { - // Walk up chain skipping non-aliasing memory nodes. - SDValue BetterChain = FindBetterChain(N, Chain); - - // If there is a better chain. - if (Chain != BetterChain) { - SDValue ReplStore; - - // Replace the chain to avoid dependency. - if (ST->isTruncatingStore()) { - ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr, - ST->getMemoryVT(), ST->getMemOperand()); - } else { - ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr, - ST->getMemOperand()); - } + // FIXME: We should do this even without AA enabled. AA will just allow + // FindBetterChain to work in more situations. The problem with this is that + // any combine that expects memory operations to be on consecutive chains + // first needs to be updated to look for users of the same chain. - // Create token to keep both nodes around. - SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), - MVT::Other, Chain, ReplStore); - - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Don't add users to work list. - return CombineTo(N, Token, false); + // Walk up chain skipping non-aliasing memory nodes, on this store and any + // adjacent stores. + if (findBetterNeighborChains(ST)) { + // replaceStoreChain uses CombineTo, which handled all of the worklist + // manipulation. Return the original node to not do anything else. + return SDValue(ST, 0); } } @@ -11440,6 +11981,16 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return SDValue(N, 0); } + // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' + // + // Make sure to do this only after attempting to merge stores in order to + // avoid changing the types of some subset of stores due to visit order, + // preventing their merging. 
+ if (isa<ConstantFPSDNode>(Value)) { + if (SDValue NewSt = replaceStoreOfFPConstant(ST)) + return NewSt; + } + return ReduceLoadOpStoreWidth(N); } @@ -11613,7 +12164,24 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } SDValue EltNo = N->getOperand(1); - bool ConstEltNo = isa<ConstantSDNode>(EltNo); + ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); + + // extract_vector_elt (build_vector x, y), 1 -> y + if (ConstEltNo && + InVec.getOpcode() == ISD::BUILD_VECTOR && + TLI.isTypeLegal(VT) && + (InVec.hasOneUse() || + TLI.aggressivelyPreferBuildVectorSources(VT))) { + SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue()); + EVT InEltVT = Elt.getValueType(); + + // Sometimes build_vector's scalar input types do not match result type. + if (NVT == InEltVT) + return Elt; + + // TODO: It may be useful to truncate if free if the build_vector implicitly + // converts. + } // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because @@ -11621,13 +12189,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. - if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE - && ConstEltNo) { - int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) { int NumElem = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec); // Find the new index to extract from. - int OrigElt = SVOp->getMaskElt(Elt); + int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue()); // Extracting an undef index is undef. if (OrigElt == -1) @@ -12183,12 +12749,90 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops)); } -SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { - // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of - // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector - // inputs come from at most two distinct vectors, turn this into a shuffle - // node. +// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR +// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at +// most two distinct vectors the same size as the result, attempt to turn this +// into a legal shuffle. +static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + EVT OpVT = N->getOperand(0).getValueType(); + int NumElts = VT.getVectorNumElements(); + int NumOpElts = OpVT.getVectorNumElements(); + + SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); + SmallVector<int, 8> Mask; + + for (SDValue Op : N->ops()) { + // Peek through any bitcast. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + + // UNDEF nodes convert to UNDEF shuffle mask values. + if (Op.getOpcode() == ISD::UNDEF) { + Mask.append((unsigned)NumOpElts, -1); + continue; + } + + if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + // What vector are we extracting the subvector from and at what index? + SDValue ExtVec = Op.getOperand(0); + + // We want the EVT of the original extraction to correctly scale the + // extraction index. + EVT ExtVT = ExtVec.getValueType(); + + // Peek through any bitcast. 
+ while (ExtVec.getOpcode() == ISD::BITCAST) + ExtVec = ExtVec.getOperand(0); + + // UNDEF nodes convert to UNDEF shuffle mask values. + if (ExtVec.getOpcode() == ISD::UNDEF) { + Mask.append((unsigned)NumOpElts, -1); + continue; + } + + if (!isa<ConstantSDNode>(Op.getOperand(1))) + return SDValue(); + int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + // Ensure that we are extracting a subvector from a vector the same + // size as the result. + if (ExtVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + + // Scale the subvector index to account for any bitcast. + int NumExtElts = ExtVT.getVectorNumElements(); + if (0 == (NumExtElts % NumElts)) + ExtIdx /= (NumExtElts / NumElts); + else if (0 == (NumElts % NumExtElts)) + ExtIdx *= (NumElts / NumExtElts); + else + return SDValue(); + // At most we can reference 2 inputs in the final shuffle. + if (SV0.getOpcode() == ISD::UNDEF || SV0 == ExtVec) { + SV0 = ExtVec; + for (int i = 0; i != NumOpElts; ++i) + Mask.push_back(i + ExtIdx); + } else if (SV1.getOpcode() == ISD::UNDEF || SV1 == ExtVec) { + SV1 = ExtVec; + for (int i = 0; i != NumOpElts; ++i) + Mask.push_back(i + ExtIdx + NumElts); + } else { + return SDValue(); + } + } + + if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT)) + return SDValue(); + + return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), + DAG.getBitcast(VT, SV1), Mask); +} + +SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) return N->getOperand(0); @@ -12289,6 +12933,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (SDValue V = combineConcatVectorOfScalars(N, DAG)) return V; + // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) + return V; + // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that @@ -12503,7 +13152,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { std::all_of(SVN->getMask().begin() + NumElemsPerConcat, SVN->getMask().end(), [](int i) { return i == -1; })) { N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), - ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat)); + makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); N1 = DAG.getUNDEF(ConcatVT); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); } @@ -12981,6 +13630,21 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { + SDValue N0 = N->getOperand(0); + + // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) + if (N0->getOpcode() == ISD::AND) { + ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); + if (AndConst && AndConst->getAPIntValue() == 0xffff) { + return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), + N0.getOperand(0)); + } + } + + return SDValue(); +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. 
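A detail of combineConcatVectorOfExtracts above that is easy to misread: the extract index is taken in the pre-bitcast element type, so it must be rescaled once the extract is viewed in the concatenation's element count. A reduced sketch of that rescaling, with the EVTs collapsed to plain element counts:

#include <cstdio>

// Rescale a subvector extract index across a bitcast, as the
// ExtIdx /= and ExtIdx *= lines above do. Returns -1 when the two
// element counts do not divide one another and the fold must bail.
static int scaleExtractIndex(int ExtIdx, int NumExtElts, int NumElts) {
  if (NumExtElts % NumElts == 0)
    return ExtIdx / (NumExtElts / NumElts); // wider elements after cast
  if (NumElts % NumExtElts == 0)
    return ExtIdx * (NumElts / NumExtElts); // narrower elements after cast
  return -1;
}

int main() {
  // e.g. an extract at index 4 in a v8i16 view of a vector the concat
  // sees as v4i32: 8 % 4 == 0, so the index halves to 2.
  std::printf("scaled index: %d\n", scaleExtractIndex(4, 8, 4));
}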
==> @@ -13002,34 +13666,76 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { if (RHS.getOpcode() == ISD::BITCAST) RHS = RHS.getOperand(0); - if (RHS.getOpcode() == ISD::BUILD_VECTOR) { + if (RHS.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + EVT RVT = RHS.getValueType(); + unsigned NumElts = RHS.getNumOperands(); + + // Attempt to create a valid clear mask, splitting the mask into + // sub elements and checking to see if each is + // all zeros or all ones - suitable for shuffle masking. + auto BuildClearMask = [&](int Split) { + int NumSubElts = NumElts * Split; + int NumSubBits = RVT.getScalarSizeInBits() / Split; + SmallVector<int, 8> Indices; - unsigned NumElts = RHS.getNumOperands(); + for (int i = 0; i != NumSubElts; ++i) { + int EltIdx = i / Split; + int SubIdx = i % Split; + SDValue Elt = RHS.getOperand(EltIdx); + if (Elt.getOpcode() == ISD::UNDEF) { + Indices.push_back(-1); + continue; + } - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Elt = RHS.getOperand(i); - if (isAllOnesConstant(Elt)) + APInt Bits; + if (isa<ConstantSDNode>(Elt)) + Bits = cast<ConstantSDNode>(Elt)->getAPIntValue(); + else if (isa<ConstantFPSDNode>(Elt)) + Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt(); + else + return SDValue(); + + // Extract the sub element from the constant bit mask. + if (DAG.getDataLayout().isBigEndian()) { + Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits); + } else { + Bits = Bits.lshr(SubIdx * NumSubBits); + } + + if (Split > 1) + Bits = Bits.trunc(NumSubBits); + + if (Bits.isAllOnesValue()) Indices.push_back(i); - else if (isNullConstant(Elt)) - Indices.push_back(NumElts+i); + else if (Bits == 0) + Indices.push_back(i + NumSubElts); else return SDValue(); } // Let's see if the target supports this vector_shuffle. - EVT RVT = RHS.getValueType(); - if (!TLI.isVectorClearMaskLegal(Indices, RVT)) + EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits); + EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts); + if (!TLI.isVectorClearMaskLegal(Indices, ClearVT)) return SDValue(); - // Return the new VECTOR_SHUFFLE node. - EVT EltVT = RVT.getVectorElementType(); - SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(), - DAG.getConstant(0, dl, EltVT)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps); - LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS); - SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuf); - } + SDValue Zero = DAG.getConstant(0, dl, ClearVT); + return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl, + DAG.getBitcast(ClearVT, LHS), + Zero, &Indices[0])); + }; + + // Determine maximum split level (byte level masking). + int MaxSplit = 1; + if (RVT.getScalarSizeInBits() % 8 == 0) + MaxSplit = RVT.getScalarSizeInBits() / 8; + + for (int Split = 1; Split <= MaxSplit; ++Split) + if (RVT.getScalarSizeInBits() % Split == 0) + if (SDValue S = BuildClearMask(Split)) + return S; return SDValue(); } @@ -13041,60 +13747,17 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + SDValue Ops[] = {LHS, RHS}; + // See if we can constant fold the vector operation. + if (SDValue Fold = DAG.FoldConstantVectorArithmetic( + N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) + return Fold; + + // Try to convert a constant mask AND into a shuffle clear mask. 
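BuildClearMask above walks each constant lane Split sub-elements at a time, and the shift amount is endian-dependent because sub-element 0 is the lowest-addressed piece, not the lowest-order bits. A sketch of that extraction with the APInt lshr/trunc pair replaced by plain 64-bit operations; the XformToShuffleWithZero call that the comment above announces follows immediately:

#include <cstdint>
#include <cstdio>

// Extract sub-element SubIdx of a constant lane whose value is Bits,
// where the lane is viewed as Split pieces of NumSubBits bits each.
// Mirrors the lshr/trunc pair in BuildClearMask above.
static uint64_t subElement(uint64_t Bits, int Split, int SubIdx,
                           int NumSubBits, bool BigEndian) {
  int Shift = BigEndian ? (Split - SubIdx - 1) * NumSubBits
                        : SubIdx * NumSubBits;
  Bits >>= Shift;                            // the lshr
  if (Split > 1 && NumSubBits < 64)
    Bits &= (uint64_t(1) << NumSubBits) - 1; // the trunc
  return Bits;
}

int main() {
  // A 32-bit lane 0x0000FFFF split into four bytes, little-endian:
  // bytes 0 and 1 are all-ones (keep lane), bytes 2 and 3 are zero
  // (clear lane), so a byte-level shuffle mask exists.
  for (int SubIdx = 0; SubIdx != 4; ++SubIdx)
    std::printf("byte %d = 0x%02llX\n", SubIdx,
                (unsigned long long)subElement(0x0000FFFFu, 4, SubIdx, 8,
                                               /*BigEndian=*/false));
}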
if (SDValue Shuffle = XformToShuffleWithZero(N)) return Shuffle; - // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold - // this operation. - if (LHS.getOpcode() == ISD::BUILD_VECTOR && - RHS.getOpcode() == ISD::BUILD_VECTOR) { - // Check if both vectors are constants. If not bail out. - if (!(cast<BuildVectorSDNode>(LHS)->isConstant() && - cast<BuildVectorSDNode>(RHS)->isConstant())) - return SDValue(); - - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) { - SDValue LHSOp = LHS.getOperand(i); - SDValue RHSOp = RHS.getOperand(i); - - // Can't fold divide by zero. - if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV || - N->getOpcode() == ISD::FDIV) { - if (isNullConstant(RHSOp) || (RHSOp.getOpcode() == ISD::ConstantFP && - cast<ConstantFPSDNode>(RHSOp.getNode())->isZero())) - break; - } - - EVT VT = LHSOp.getValueType(); - EVT RVT = RHSOp.getValueType(); - if (RVT != VT) { - // Integer BUILD_VECTOR operands may have types larger than the element - // size (e.g., when the element type is not legal). Prior to type - // legalization, the types may not match between the two BUILD_VECTORS. - // Truncate one of the operands to make them match. - if (RVT.getSizeInBits() > VT.getSizeInBits()) { - RHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, RHSOp); - } else { - LHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), RVT, LHSOp); - VT = RVT; - } - } - SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(LHS), VT, - LHSOp, RHSOp); - if (FoldOp.getOpcode() != ISD::UNDEF && - FoldOp.getOpcode() != ISD::Constant && - FoldOp.getOpcode() != ISD::ConstantFP) - break; - Ops.push_back(FoldOp); - AddToWorklist(FoldOp.getNode()); - } - - if (Ops.size() == LHS.getNumOperands()) - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); - } - // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). @@ -13109,7 +13772,8 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { EVT VT = N->getValueType(0); SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, - LHS.getOperand(0), RHS.getOperand(0)); + LHS.getOperand(0), RHS.getOperand(0), + N->getFlags()); AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, &SVN0->getMask()[0]); @@ -13390,9 +14054,10 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); AddToWorklist(CPIdx.getNode()); - return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), false, - false, false, Alignment); + return DAG.getLoad( + TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); } } @@ -13481,8 +14146,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, // Get a SetCC of the condition // NOTE: Don't create a SETCC if it's not legal on this target. if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, - LegalTypes ? 
getSetCCResultType(N0.getValueType()) : MVT::i1)) { + TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) { SDValue Temp, SCC; // cast from setcc result type to select result type if (LegalTypes) { @@ -13514,51 +14178,6 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, } } - // Check to see if this is the equivalent of setcc - // FIXME: Turn all of these into setcc if setcc if setcc is legal - // otherwise, go ahead with the folds. - if (0 && isNullConstant(N3) && isOneConstant(N2)) { - EVT XType = N0.getValueType(); - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(XType))) { - SDValue Res = DAG.getSetCC(DL, getSetCCResultType(XType), N0, N1, CC); - if (Res.getValueType() != VT) - Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); - return Res; - } - - // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X)))) - if (isNullConstant(N1) && CC == ISD::SETEQ && - (!LegalOperations || - TLI.isOperationLegal(ISD::CTLZ, XType))) { - SDValue Ctlz = DAG.getNode(ISD::CTLZ, SDLoc(N0), XType, N0); - return DAG.getNode(ISD::SRL, DL, XType, Ctlz, - DAG.getConstant(Log2_32(XType.getSizeInBits()), - SDLoc(Ctlz), - getShiftAmountTy(Ctlz.getValueType()))); - } - // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1)) - if (isNullConstant(N1) && CC == ISD::SETGT) { - SDLoc DL(N0); - SDValue NegN0 = DAG.getNode(ISD::SUB, DL, - XType, DAG.getConstant(0, DL, XType), N0); - SDValue NotN0 = DAG.getNOT(DL, N0, XType); - return DAG.getNode(ISD::SRL, DL, XType, - DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0), - DAG.getConstant(XType.getSizeInBits() - 1, DL, - getShiftAmountTy(XType))); - } - // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1)) - if (isAllOnesConstant(N1) && CC == ISD::SETGT) { - SDLoc DL(N0); - SDValue Sign = DAG.getNode(ISD::SRL, DL, XType, N0, - DAG.getConstant(XType.getSizeInBits() - 1, DL, - getShiftAmountTy(N0.getValueType()))); - return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, DL, - XType)); - } - } - // Check to see if this is an integer abs. // select_cc setg[te] X, 0, X, -X -> // select_cc setgt X, -1, X, -X -> @@ -13666,7 +14285,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return S; } -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -13690,16 +14309,16 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { // Newton iterations: Est = Est + Est (1 - Arg * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst); + NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst); + Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } } @@ -13716,31 +14335,32 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) /// As a result, we precompute A/2 prior to the iteration loop. 
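The reciprocal refinement above is textbook Newton-Raphson, Est' = Est + Est*(1 - Arg*Est), which roughly doubles the number of correct bits per step; that is why a handful of iterations suffices after a coarse hardware estimate. A quick host-side demonstration of the convergence (the seed value is an assumption standing in for the target's estimate instruction); the one-constant rsqrt variant described in the comment above is defined next:

#include <cstdio>

int main() {
  const double Arg = 3.0; // compute 1/Arg
  double Est = 0.3;       // coarse hardware-style seed (assumed)

  // Newton iterations: Est = Est + Est * (1 - Arg * Est), exactly the
  // FMUL/FSUB/FMUL/FADD sequence emitted above.
  for (unsigned i = 0; i != 3; ++i) {
    double NewEst = Arg * Est; // FMUL
    NewEst = 1.0 - NewEst;     // FSUB
    NewEst = Est * NewEst;     // FMUL
    Est = Est + NewEst;        // FADD
    std::printf("iter %u: %.17g\n", i, Est);
  }
  // Converges quadratically toward 0.333333...
}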
SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, - unsigned Iterations) { + unsigned Iterations, + SDNodeFlags *Flags) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. - SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg); + SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); AddToWorklist(HalfArg.getNode()); - HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg); + HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); + NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst); + NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } return Est; @@ -13752,7 +14372,8 @@ SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, /// => /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, - unsigned Iterations) { + unsigned Iterations, + SDNodeFlags *Flags) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); @@ -13760,25 +14381,25 @@ SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); + SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); AddToWorklist(HalfEst.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); + Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst, Flags); AddToWorklist(Est.getNode()); } return Est; } -SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { +SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -13790,8 +14411,8 @@ SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { AddToWorklist(Est.getNode()); if (Iterations) { Est = UseOneConstNR ? 
- BuildRsqrtNROneConst(Op, Est, Iterations) : - BuildRsqrtNRTwoConst(Op, Est, Iterations); + BuildRsqrtNROneConst(Op, Est, Iterations, Flags) : + BuildRsqrtNRTwoConst(Op, Est, Iterations, Flags); } return Est; } @@ -13955,14 +14576,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SDValue Chain = Chains.pop_back_val(); // For TokenFactor nodes, look at each operand and only continue up the - // chain until we find two aliases. If we've seen two aliases, assume we'll - // find more and revert to original chain since the xform is unlikely to be - // profitable. + // chain until we reach the depth limit. // // FIXME: The depth check could be made to return the last non-aliasing // chain we found before we hit a tokenfactor rather than the original // chain. - if (Depth > 6 || Aliases.size() == 2) { + if (Depth > TLI.getGatherAllAliasesMaxDepth()) { Aliases.clear(); Aliases.push_back(OriginalChain); return; @@ -14094,6 +14713,83 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) { + // This holds the base pointer, index, and the offset in bytes from the base + // pointer. + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + + // We must have a base and an offset. + if (!BasePtr.Base.getNode()) + return false; + + // Do not handle stores to undef base pointers. + if (BasePtr.Base.getOpcode() == ISD::UNDEF) + return false; + + SmallVector<StoreSDNode *, 8> ChainedStores; + ChainedStores.push_back(St); + + // Walk up the chain and look for nodes with offsets from the same + // base pointer. Stop when reaching an instruction with a different kind + // or instruction which has a different base pointer. + StoreSDNode *Index = St; + while (Index) { + // If the chain has more than one use, then we can't reorder the mem ops. + if (Index != St && !SDValue(Index, 0)->hasOneUse()) + break; + + if (Index->isVolatile() || Index->isIndexed()) + break; + + // Find the base pointer and offset for this memory node. + BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); + + // Check that the base pointer is the same as the original one. + if (!Ptr.equalBaseIndex(BasePtr)) + break; + + // Find the next memory operand in the chain. If the next operand in the + // chain is a store then move up and continue the scan with the next + // memory operand. If the next operand is a load save it and use alias + // information to check if it interferes with anything. + SDNode *NextInChain = Index->getChain().getNode(); + while (true) { + if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { + // We found a store node. Use it for the next iteration. + ChainedStores.push_back(STn); + Index = STn; + break; + } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) { + NextInChain = Ldn->getChain().getNode(); + continue; + } else { + Index = nullptr; + break; + } + } + } + + bool MadeChange = false; + SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; + + for (StoreSDNode *ChainedStore : ChainedStores) { + SDValue Chain = ChainedStore->getChain(); + SDValue BetterChain = FindBetterChain(ChainedStore, Chain); + + if (Chain != BetterChain) { + MadeChange = true; + BetterChains.push_back(std::make_pair(ChainedStore, BetterChain)); + } + } + + // Do all replacements after finding the replacements to make to avoid making + // the chains more complicated by introducing new TokenFactors. 
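findBetterNeighborChains walks up the chain collecting stores that provably share a base pointer, skipping over loads and stopping at anything else; only afterwards are the chains replaced, for the reason the comment above gives. A reduced model of that walk over a singly linked list of memory nodes (Base stands in for BaseIndexOffset::match, and the hasOneUse/volatile guards are omitted); the deferred replacement loop continues below:

#include <cstdio>
#include <vector>

// Reduced memory-node model: each node points up its chain.
struct MemNode {
  enum Kind { Store, Load, Other } K;
  int Base;       // stand-in for BaseIndexOffset's base pointer
  MemNode *Chain; // next node up the chain (null at the entry)
};

int main() {
  // Chain, top to bottom: other <- store(B1) <- load <- store(B1) <- St
  MemNode Top{MemNode::Other, 0, nullptr};
  MemNode S1{MemNode::Store, 1, &Top};
  MemNode L1{MemNode::Load, 2, &S1};
  MemNode S2{MemNode::Store, 1, &L1};
  MemNode St{MemNode::Store, 1, &S2}; // the store we start from

  std::vector<MemNode *> ChainedStores{&St};
  for (MemNode *N = St.Chain; N;) {
    if (N->K == MemNode::Load) { // loads are skipped, not collected
      N = N->Chain;
      continue;
    }
    if (N->K != MemNode::Store || N->Base != St.Base)
      break;                     // different kind or base: stop the scan
    ChainedStores.push_back(N);
    N = N->Chain;
  }
  std::printf("collected %zu chained stores\n", ChainedStores.size());
}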
+ for (auto Replacement : BetterChains) + replaceStoreChain(Replacement.first, Replacement.second); + + return MadeChange; +} + /// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, CodeGenOpt::Level OptLevel) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 2b9ba2c..cfbb209 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -118,9 +118,9 @@ bool FastISel::lowerArguments() { for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(), E = FuncInfo.Fn->arg_end(); I != E; ++I) { - DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I); + DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(&*I); assert(VI != LocalValueMap.end() && "Missed an argument?"); - FuncInfo.ValueMap[I] = VI->second; + FuncInfo.ValueMap[&*I] = VI->second; } return true; } @@ -611,7 +611,7 @@ bool FastISel::selectStackmap(const CallInst *I) { // have to worry about calling conventions and target-specific lowering code. // Instead we perform the call lowering right here. // - // CALLSEQ_START(0) + // CALLSEQ_START(0...) // STACKMAP(id, nbytes, ...) // CALLSEQ_END(0, 0) // @@ -647,8 +647,11 @@ bool FastISel::selectStackmap(const CallInst *I) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(0); + auto Builder = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)); + const MCInstrDesc &MCID = Builder.getInstr()->getDesc(); + for (unsigned I = 0, E = MCID.getNumOperands(); I < E; ++I) + Builder.addImm(0); // Issue STACKMAP. MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1100,13 +1103,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { // The donothing intrinsic does, well, nothing. case Intrinsic::donothing: return true; - case Intrinsic::eh_actions: { - unsigned ResultReg = getRegForValue(UndefValue::get(II->getType())); - if (!ResultReg) - return false; - updateValueMap(II, ResultReg); - return true; - } case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); assert(DI->getVariable() && "Missing variable"); @@ -1326,12 +1322,38 @@ bool FastISel::selectBitCast(const User *I) { return true; } +// Remove local value instructions starting from the instruction after +// SavedLastLocalValue to the current function insert point. +void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue) +{ + MachineInstr *CurLastLocalValue = getLastLocalValue(); + if (CurLastLocalValue != SavedLastLocalValue) { + // Find the first local value instruction to be deleted. + // This is the instruction after SavedLastLocalValue if it is non-NULL. + // Otherwise it's the first instruction in the block. + MachineBasicBlock::iterator FirstDeadInst(SavedLastLocalValue); + if (SavedLastLocalValue) + ++FirstDeadInst; + else + FirstDeadInst = FuncInfo.MBB->getFirstNonPHI(); + setLastLocalValue(SavedLastLocalValue); + removeDeadCode(FirstDeadInst, FuncInfo.InsertPt); + } +} + bool FastISel::selectInstruction(const Instruction *I) { + MachineInstr *SavedLastLocalValue = getLastLocalValue(); // Just before the terminator instruction, insert instructions to // feed PHI nodes in successor blocks. 
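A small but easy-to-miss fix in the FastISel hunk above: CALLSEQ_START is no longer assumed to take a single immediate; instead one zero is appended per operand that the instruction description declares. The shape of that loop, reduced to a plain operand list (InstrDesc is a stand-in for MCInstrDesc); the terminator and PHI handling that the comment above introduces continues below:

#include <cstdio>
#include <vector>

// Stand-in for the MCInstrDesc of a frame-setup pseudo-instruction.
struct InstrDesc {
  unsigned NumOperands;
};

int main() {
  InstrDesc MCID{2}; // e.g. a CALLSEQ_START taking two immediates
  std::vector<long> Operands;

  // Mirror of the fix: one zero immediate per declared operand,
  // instead of assuming the instruction takes exactly one.
  for (unsigned I = 0, E = MCID.NumOperands; I < E; ++I)
    Operands.push_back(0); // Builder.addImm(0)

  std::printf("emitted %zu zero operands\n", Operands.size());
}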
if (isa<TerminatorInst>(I)) - if (!handlePHINodesInSuccessorBlocks(I->getParent())) + if (!handlePHINodesInSuccessorBlocks(I->getParent())) { + // PHI node handling may have generated local value instructions, + // even though it failed to handle all PHI nodes. + // We remove these instructions because SelectionDAGISel will generate + // them again. + removeDeadLocalValueCode(SavedLastLocalValue); return false; + } DbgLoc = I->getDebugLoc(); @@ -1348,7 +1370,7 @@ bool FastISel::selectInstruction(const Instruction *I) { LibInfo->hasOptimizedCodeGen(Func)) return false; - // Don't handle Intrinsic::trap if a trap funciton is specified. + // Don't handle Intrinsic::trap if a trap function is specified. if (F && F->getIntrinsicID() == Intrinsic::trap && Call->hasFnAttr("trap-func-name")) return false; @@ -1380,8 +1402,12 @@ bool FastISel::selectInstruction(const Instruction *I) { DbgLoc = DebugLoc(); // Undo phi node updates, because they will be added again by SelectionDAG. - if (isa<TerminatorInst>(I)) + if (isa<TerminatorInst>(I)) { + // PHI node handling may have generated local value instructions. + // We remove them because SelectionDAGISel will generate them again. + removeDeadLocalValueCode(SavedLastLocalValue); FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate); + } return false; } @@ -1398,11 +1424,30 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr, SmallVector<MachineOperand, 0>(), DbgLoc); } - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(FuncInfo.MBB->getBasicBlock(), - MSucc->getBasicBlock()); - FuncInfo.MBB->addSuccessor(MSucc, BranchWeight); + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + FuncInfo.MBB->getBasicBlock(), MSucc->getBasicBlock()); + FuncInfo.MBB->addSuccessor(MSucc, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(MSucc); +} + +void FastISel::finishCondBranch(const BasicBlock *BranchBB, + MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB) { + // Add TrueMBB as successor unless it is equal to the FalseMBB: This can + // happen in degenerate IR and MachineIR forbids to have a block twice in the + // successor/predecessor lists. + if (TrueMBB != FalseMBB) { + if (FuncInfo.BPI) { + auto BranchProbability = + FuncInfo.BPI->getEdgeProbability(BranchBB, TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(TrueMBB); + } + + fastEmitBranch(FalseMBB, DbgLoc); } /// Emit an FNeg operation. 
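finishCondBranch above has two jobs: never list the same machine block twice as a successor (degenerate IR can branch to one block on both edges, and MachineIR forbids duplicates), and attach an edge probability only when branch-probability info exists. A reduced model of that successor bookkeeping (blocks are integer ids, and the probability printout stands in for addSuccessor):

#include <cstdio>
#include <vector>

struct Block {
  std::vector<int> Successors; // successor ids, duplicates forbidden
};

// HasBPI stands in for "FuncInfo.BPI != nullptr"; Prob is only
// meaningful when it is true.
static void finishCondBranch(Block &BB, int TrueMBB, int FalseMBB,
                             bool HasBPI, double Prob) {
  // Add TrueMBB unless it equals FalseMBB, mirroring the guard above.
  if (TrueMBB != FalseMBB) {
    if (HasBPI)
      std::printf("edge ->%d with probability %.2f\n", TrueMBB, Prob);
    BB.Successors.push_back(TrueMBB);
  }
  BB.Successors.push_back(FalseMBB); // the unconditional fallthrough edge
}

int main() {
  Block BB;
  // Degenerate branch: both edges target block 1; only one entry results.
  finishCondBranch(BB, /*TrueMBB=*/1, /*FalseMBB=*/1, true, 0.5);
  std::printf("successor count: %zu\n", BB.Successors.size()); // prints 1
}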
@@ -1864,21 +1909,18 @@ unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_rf(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, unsigned Op0, - bool Op0IsKill, const ConstantFP *FPImm) { +unsigned FastISel::fastEmitInst_f(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + const ConstantFP *FPImm) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); - Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)) .addFPImm(FPImm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, getKillRegState(Op0IsKill)) .addFPImm(FPImm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); @@ -1912,35 +1954,6 @@ unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_rrii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, unsigned Op1, - bool Op1IsKill, uint64_t Imm1, - uint64_t Imm2) { - const MCInstrDesc &II = TII.get(MachineInstOpcode); - - unsigned ResultReg = createResultReg(RC); - Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); - Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); - - if (II.getNumDefs() >= 1) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)) - .addReg(Op1, getKillRegState(Op1IsKill)) - .addImm(Imm1) - .addImm(Imm2); - else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, getKillRegState(Op0IsKill)) - .addReg(Op1, getKillRegState(Op1IsKill)) - .addImm(Imm1) - .addImm(Imm2); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); - } - return ResultReg; -} - unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm) { unsigned ResultReg = createResultReg(RC); @@ -1957,25 +1970,6 @@ unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_ii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, uint64_t Imm1, - uint64_t Imm2) { - unsigned ResultReg = createResultReg(RC); - const MCInstrDesc &II = TII.get(MachineInstOpcode); - - if (II.getNumDefs() >= 1) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addImm(Imm1) - .addImm(Imm2); - else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1) - .addImm(Imm2); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); - } - return ResultReg; -} - unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index cc306cb..08815ed 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -87,6 +87,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); MachineModuleInfo &MMI = 
MF->getMMI(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; @@ -103,28 +104,29 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - // Static allocas can be folded into the initial stack frame adjustment. - if (AI->isStaticAlloca()) { + Type *Ty = AI->getAllocatedType(); + unsigned Align = + std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), + AI->getAlignment()); + unsigned StackAlign = TFI->getStackAlignment(); + + // Static allocas can be folded into the initial stack frame + // adjustment. For targets that don't realign the stack, don't + // do this if there is an extra alignment requirement. + if (AI->isStaticAlloca() && + (TFI->isStackRealignable() || (Align <= StackAlign))) { const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize()); - Type *Ty = AI->getAllocatedType(); uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty); - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), - AI->getAlignment()); TySize *= CUI->getZExtValue(); // Get total allocated size. if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects. StaticAllocaMap[AI] = MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI); - } else { - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment( - AI->getAllocatedType()), - AI->getAlignment()); - unsigned StackAlign = - MF->getSubtarget().getFrameLowering()->getStackAlignment(); + // FIXME: Overaligned static allocas should be grouped into + // a single dynamic allocation instead of using a separate + // stack allocation for each one. if (Align <= StackAlign) Align = 0; // Inform the Frame Information that we have variable-sized objects. @@ -134,7 +136,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Look for inline asm that clobbers the SP register. if (isa<CallInst>(I) || isa<InvokeInst>(I)) { - ImmutableCallSite CS(I); + ImmutableCallSite CS(&*I); if (isa<InlineAsm>(CS.getCalledValue())) { unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -163,7 +165,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF->getFrameInfo()->setHasVAStart(true); } - // If we have a musttail call in a variadic funciton, we need to ensure we + // If we have a musttail call in a variadic function, we need to ensure we // forward implicit register parameters. if (const auto *CI = dyn_cast<CallInst>(I)) { if (CI->isMustTailCall() && Fn->isVarArg()) @@ -172,10 +174,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Mark values used outside their block as exported, by allocating // a virtual register for them. - if (isUsedOutsideOfDefiningBlock(I)) - if (!isa<AllocaInst>(I) || - !StaticAllocaMap.count(cast<AllocaInst>(I))) - InitializeRegForValue(I); + if (isUsedOutsideOfDefiningBlock(&*I)) + if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(I))) + InitializeRegForValue(&*I); // Collect llvm.dbg.declare information. 
This is done now instead of // during the initial isel pass through the IR so that it is done @@ -205,15 +206,36 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, } // Decide the preferred extend type for a value. - PreferredExtendType[I] = getPreferredExtendForValue(I); + PreferredExtendType[&*I] = getPreferredExtendForValue(&*I); } // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This // also creates the initial PHI MachineInstrs, though none of the input // operands are populated. for (BB = Fn->begin(); BB != EB; ++BB) { - MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(BB); - MBBMap[BB] = MBB; + // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks + // are really data, and no instructions can live here. + if (BB->isEHPad()) { + const Instruction *I = BB->getFirstNonPHI(); + // If this is a non-landingpad EH pad, mark this function as using + // funclets. + // FIXME: SEH catchpads do not create funclets, so we could avoid setting + // this in such cases in order to improve frame layout. + if (!isa<LandingPadInst>(I)) { + MMI.setHasEHFunclets(true); + MF->getFrameInfo()->setHasOpaqueSPAdjustment(true); + } + if (isa<CatchSwitchInst>(I)) { + assert(&*BB->begin() == I && + "WinEHPrepare failed to remove PHIs from imaginary BBs"); + continue; + } + if (isa<FuncletPadInst>(I)) + assert(&*BB->begin() == I && "WinEHPrepare failed to demote PHIs"); + } + + MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&*BB); + MBBMap[&*BB] = MBB; MF->push_back(MBB); // Transfer the address-taken flag. This is necessary because there could @@ -252,94 +274,62 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Mark landing pad blocks. SmallVector<const LandingPadInst *, 4> LPads; for (BB = Fn->begin(); BB != EB; ++BB) { - if (const auto *Invoke = dyn_cast<InvokeInst>(BB->getTerminator())) - MBBMap[Invoke->getSuccessor(1)]->setIsLandingPad(); - if (BB->isLandingPad()) - LPads.push_back(BB->getLandingPadInst()); + const Instruction *FNP = BB->getFirstNonPHI(); + if (BB->isEHPad() && MBBMap.count(&*BB)) + MBBMap[&*BB]->setIsEHPad(); + if (const auto *LPI = dyn_cast<LandingPadInst>(FNP)) + LPads.push_back(LPI); } - // If this is an MSVC EH personality, we need to do a bit more work. - EHPersonality Personality = EHPersonality::Unknown; - if (Fn->hasPersonalityFn()) - Personality = classifyEHPersonality(Fn->getPersonalityFn()); - if (!isMSVCEHPersonality(Personality)) + // If this personality uses funclets, we need to do a bit more work. + if (!Fn->hasPersonalityFn()) + return; + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (!isFuncletEHPersonality(Personality)) return; - if (Personality == EHPersonality::MSVC_Win64SEH || - Personality == EHPersonality::MSVC_X86SEH) { - addSEHHandlersForLPads(LPads); - } - - WinEHFuncInfo &EHInfo = MMI.getWinEHFuncInfo(&fn); - if (Personality == EHPersonality::MSVC_CXX) { - const Function *WinEHParentFn = MMI.getWinEHParent(&fn); - calculateWinCXXEHStateNumbers(WinEHParentFn, EHInfo); - } - - // Copy the state numbers to LandingPadInfo for the current function, which - // could be a handler or the parent. This should happen for 32-bit SEH and - // C++ EH. 
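Returning to the alloca hunk a little earlier: a static alloca is now folded into the initial stack frame only if the frame can actually satisfy its alignment, meaning the target can realign the stack or the required alignment already fits the default stack alignment. That decision in isolation, with the DataLayout and TargetFrameLowering queries reduced to plain parameters; the removed state-number copying continues below:

#include <algorithm>
#include <cstdio>

// Decide whether a static alloca can be folded into the initial stack
// frame adjustment, per the test added above. PrefAlign stands in for
// DataLayout::getPrefTypeAlignment, StackAlign for
// TargetFrameLowering::getStackAlignment (all in bytes).
static bool foldsIntoFrame(bool IsStaticAlloca, unsigned PrefAlign,
                           unsigned ExplicitAlign, unsigned StackAlign,
                           bool StackRealignable) {
  unsigned Align = std::max(PrefAlign, ExplicitAlign);
  return IsStaticAlloca && (StackRealignable || Align <= StackAlign);
}

int main() {
  // A 32-byte-aligned alloca on a 16-byte-aligned, non-realignable
  // stack must stay a dynamic allocation.
  std::printf("folds: %d\n",
              foldsIntoFrame(true, /*PrefAlign=*/8, /*ExplicitAlign=*/32,
                             /*StackAlign=*/16, /*Realignable=*/false));
}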
- if (Personality == EHPersonality::MSVC_CXX || - Personality == EHPersonality::MSVC_X86SEH) { - for (const LandingPadInst *LP : LPads) { - MachineBasicBlock *LPadMBB = MBBMap[LP->getParent()]; - MMI.addWinEHState(LPadMBB, EHInfo.LandingPadStateMap[LP]); - } - } -} - -void FunctionLoweringInfo::addSEHHandlersForLPads( - ArrayRef<const LandingPadInst *> LPads) { - MachineModuleInfo &MMI = MF->getMMI(); - - // Iterate over all landing pads with llvm.eh.actions calls. - for (const LandingPadInst *LP : LPads) { - const IntrinsicInst *ActionsCall = - dyn_cast<IntrinsicInst>(LP->getNextNode()); - if (!ActionsCall || - ActionsCall->getIntrinsicID() != Intrinsic::eh_actions) - continue; - - // Parse the llvm.eh.actions call we found. - MachineBasicBlock *LPadMBB = MBBMap[LP->getParent()]; - SmallVector<std::unique_ptr<ActionHandler>, 4> Actions; - parseEHActions(ActionsCall, Actions); - - // Iterate EH actions from most to least precedence, which means - // iterating in reverse. - for (auto I = Actions.rbegin(), E = Actions.rend(); I != E; ++I) { - ActionHandler *Action = I->get(); - if (auto *CH = dyn_cast<CatchHandler>(Action)) { - const auto *Filter = - dyn_cast<Function>(CH->getSelector()->stripPointerCasts()); - assert((Filter || CH->getSelector()->isNullValue()) && - "expected function or catch-all"); - const auto *RecoverBA = - cast<BlockAddress>(CH->getHandlerBlockOrFunc()); - MMI.addSEHCatchHandler(LPadMBB, Filter, RecoverBA); + // Calculate state numbers if we haven't already. + WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo(); + if (Personality == EHPersonality::MSVC_CXX) + calculateWinCXXEHStateNumbers(&fn, EHInfo); + else if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&fn, EHInfo); + else if (Personality == EHPersonality::CoreCLR) + calculateClrEHStateNumbers(&fn, EHInfo); + + // Map all BB references in the WinEH data to MBBs. + for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { + for (WinEHHandlerType &H : TBME.HandlerArray) { + if (H.CatchObj.Alloca) { + assert(StaticAllocaMap.count(H.CatchObj.Alloca)); + H.CatchObj.FrameIndex = StaticAllocaMap[H.CatchObj.Alloca]; } else { - assert(isa<CleanupHandler>(Action)); - const auto *Fini = cast<Function>(Action->getHandlerBlockOrFunc()); - MMI.addSEHCleanupHandler(LPadMBB, Fini); + H.CatchObj.FrameIndex = INT_MAX; } + if (H.Handler) + H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()]; } } + for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap) + if (UME.Cleanup) + UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()]; + for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) { + const BasicBlock *BB = UME.Handler.get<const BasicBlock *>(); + UME.Handler = MBBMap[BB]; + } + for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) { + const BasicBlock *BB = CME.Handler.get<const BasicBlock *>(); + CME.Handler = MBBMap[BB]; + } } /// clear - Clear out all the function-specific state. This returns this /// FunctionLoweringInfo to an empty state, ready to be used for a /// different function. 
void FunctionLoweringInfo::clear() { - assert(CatchInfoFound.size() == CatchInfoLost.size() && - "Not all catch info was assigned to a landing pad!"); - MBBMap.clear(); ValueMap.clear(); StaticAllocaMap.clear(); -#ifndef NDEBUG - CatchInfoLost.clear(); - CatchInfoFound.clear(); -#endif LiveOutRegInfo.clear(); VisitedBBs.clear(); ArgDbgValues.clear(); @@ -520,6 +510,17 @@ int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { return 0; } +unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( + const Value *CPI, const TargetRegisterClass *RC) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + auto I = CatchPadExceptionPointers.insert({CPI, 0}); + unsigned &VReg = I.first->second; + if (I.second) + VReg = MRI.createVirtualRegister(RC); + assert(VReg && "null vreg in exception pointer table!"); + return VReg; +} + /// ComputeUsesVAFloatArgument - Determine if any floating-point values are /// being passed to this variadic function, and set the MachineModuleInfo's /// usesVAFloatArgument flag if so. This flag is used to emit an undefined @@ -547,10 +548,9 @@ void llvm::ComputeUsesVAFloatArgument(const CallInst &I, /// landingpad instruction and add them to the specified machine module info. void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI, MachineBasicBlock *MBB) { - MMI.addPersonality( - MBB, - cast<Function>( - I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts())); + if (const auto *PF = dyn_cast<Function>( + I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts())) + MMI.addPersonality(PF); if (I.isCleanup()) MMI.addCleanup(MBB); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 5ec1030..a1e2d41 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC); + TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index fbc8f1e..5d572c4 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -39,6 +39,10 @@ using namespace llvm; #define DEBUG_TYPE "legalizedag" +namespace { + +struct FloatSignAsInt; + //===----------------------------------------------------------------------===// /// This takes an arbitrary SelectionDAG as input and /// hacks on it until the target machine can handle it. This involves @@ -51,7 +55,6 @@ using namespace llvm; /// 'setcc' instruction efficiently, but does support 'brcc' instruction, this /// will attempt merge setcc and brc instructions into brcc's. 
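getCatchPadExceptionPointerVReg above uses the classic insert-then-fill map idiom: try to insert a placeholder, and only allocate when the insertion actually happened. A standalone sketch of the same idiom, with std::unordered_map standing in for MachineRegisterInfo's map and a counter faking register allocation:

#include <unordered_map>

struct VRegCache {
  std::unordered_map<const void *, unsigned> Map;
  unsigned NextVReg = 1; // 0 stays reserved as the "no register" value

  unsigned getOrCreate(const void *Key) {
    auto I = Map.insert({Key, 0});  // no-op if Key was already present
    unsigned &VReg = I.first->second;
    if (I.second)                   // freshly inserted: allocate exactly once
      VReg = NextVReg++;
    return VReg;                    // never 0, mirroring the assert
  }
};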
/// -namespace { class SelectionDAGLegalize { const TargetMachine &TM; const TargetLowering &TLI; @@ -130,7 +133,11 @@ private: SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node); void ExpandDYNAMIC_STACKALLOC(SDNode *Node, SmallVectorImpl<SDValue> &Results); - SDValue ExpandFCOPYSIGN(SDNode *Node); + void getSignAsIntValue(FloatSignAsInt &State, SDLoc DL, SDValue Value) const; + SDValue modifySignAsInt(const FloatSignAsInt &State, SDLoc DL, + SDValue NewIntValue) const; + SDValue ExpandFCOPYSIGN(SDNode *Node) const; + SDValue ExpandFABS(SDNode *Node) const; SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT, SDLoc dl); SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, @@ -138,6 +145,7 @@ private: SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned, SDLoc dl); + SDValue ExpandBITREVERSE(SDValue Op, SDLoc dl); SDValue ExpandBSWAP(SDValue Op, SDLoc dl); SDValue ExpandBitCount(unsigned Opc, SDValue Op, SDLoc dl); @@ -146,10 +154,11 @@ private: SDValue ExpandVectorBuildThroughStack(SDNode* Node); SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP); + SDValue ExpandConstant(ConstantSDNode *CP); - std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node); - - void ExpandNode(SDNode *Node); + // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall + bool ExpandNode(SDNode *Node); + void ConvertNodeToLibcall(SDNode *Node); void PromoteNode(SDNode *Node); public: @@ -273,17 +282,30 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); if (Extend) { - SDValue Result = - DAG.getExtLoad(ISD::EXTLOAD, dl, OrigVT, - DAG.getEntryNode(), - CPIdx, MachinePointerInfo::getConstantPool(), - VT, false, false, false, Alignment); + SDValue Result = DAG.getExtLoad( + ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT, + false, false, false, Alignment); return Result; } SDValue Result = - DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), false, false, false, - Alignment); + DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + return Result; +} + +/// Expands the Constant node to a load from the constant pool. +SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { + SDLoc dl(CP); + EVT VT = CP->getValueType(0); + SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(), + TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); + SDValue Result = + DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); return Result; } @@ -594,13 +616,13 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); // Store the vector. - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Tmp1, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - false, false, 0); + SDValue Ch = DAG.getStore( + DAG.getEntryNode(), dl, Tmp1, StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false, + false, 0); // Truncate or zero extend offset to target pointer type. - unsigned CastOpc = IdxVT.bitsGT(PtrVT) ? 
ISD::TRUNCATE : ISD::ZERO_EXTEND; - Tmp3 = DAG.getNode(CastOpc, dl, PtrVT, Tmp3); + Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT); // Add the offset to the index. unsigned EltSize = EltVT.getSizeInBits()/8; Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, @@ -610,9 +632,9 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT, false, false, 0); // Load the updated vector. - return DAG.getLoad(VT, dl, Ch, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), false, false, - false, 0); + return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), SPFI), + false, false, false, 0); } @@ -728,14 +750,12 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { case TargetLowering::Legal: { // If this is an unaligned store and the target doesn't support it, // expand it. + EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); - if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } + const DataLayout &DL = DAG.getDataLayout(); + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); break; } case TargetLowering::Custom: { @@ -839,20 +859,16 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); ReplaceNode(SDValue(Node, 0), Result); } else { - switch (TLI.getTruncStoreAction(ST->getValue().getSimpleValueType(), - StVT.getSimpleVT())) { + switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { + EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); // If this is an unaligned store and the target doesn't support it, // expand it. - if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DL.getABITypeAlignment(Ty); - if (Align < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); break; } case TargetLowering::Custom: { @@ -895,17 +911,14 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { switch (TLI.getOperationAction(Node->getOpcode(), VT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { + EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); + const DataLayout &DL = DAG.getDataLayout(); // If this is an unaligned load and the target doesn't support it, // expand it. 
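The store and load hunks here replace the open-coded pair of checks (does the target tolerate misalignment? is the actual alignment below the ABI alignment?) with a single TLI.allowsMemoryAccess query. The decision being centralized is roughly this predicate (a sketch of the logic only, not the actual hook signature):

// Expansion is needed only when the access is genuinely under-aligned AND
// the target cannot absorb a misaligned access by itself.
bool needsUnalignedExpansion(unsigned Align, unsigned ABIAlign,
                             bool TargetAllowsMisaligned) {
  bool NaturallyAligned = Align >= ABIAlign;
  return !NaturallyAligned && !TargetAllowsMisaligned;
}

Folding the ABI-alignment test into the hook keeps call sites from forgetting half of the condition, which is exactly the duplication the diff deletes.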
- if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain); - } - } + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain); break; } case TargetLowering::Custom: { @@ -1092,23 +1105,20 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Res.getValue(1); } } else { - // If this is an unaligned load and the target doesn't support - // it, expand it. + // If this is an unaligned load and the target doesn't support it, + // expand it. EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); - if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain); - } - } + const DataLayout &DL = DAG.getDataLayout(); + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain); } break; } case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) { + EVT DestVT = Node->getValueType(0); + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) { // If the source type is not legal, see if there is a legal extload to // an intermediate type that we can then extend further. EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); @@ -1127,6 +1137,23 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Load.getValue(1); break; } + + // Handle the special case of fp16 extloads. EXTLOAD doesn't have the + // normal undefined upper bits behavior to allow using an in-reg extend + // with the illegal FP type, so load as an integer and do the + // from-integer conversion. 
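The f16 comment above ends here, and the code that follows implements exactly that: load the half as an i16 via ZEXTLOAD and convert from the integer form. For reference, a scalar version of such a from-integer conversion, decoding an IEEE-754 binary16 payload by hand (a sketch only; actual codegen just emits FP16_TO_FP):

#include <cstdint>
#include <cstring>

float halfBitsToFloat(uint16_t H) {
  uint32_t Sign = (uint32_t)(H & 0x8000u) << 16;
  uint32_t Exp  = (H >> 10) & 0x1F;
  uint32_t Man  = H & 0x3FFu;
  uint32_t Bits;
  if (Exp == 31) {                        // inf / NaN: widen the payload
    Bits = Sign | 0x7F800000u | (Man << 13);
  } else if (Exp != 0) {                  // normal: rebias the exponent
    Bits = Sign | ((Exp - 15 + 127) << 23) | (Man << 13);
  } else if (Man == 0) {                  // signed zero
    Bits = Sign;
  } else {                                // subnormal: renormalize
    int Shift = 0;
    while (!(Man & 0x400u)) { Man <<= 1; ++Shift; }
    Bits = Sign | ((uint32_t)(113 - Shift) << 23) | ((Man & 0x3FFu) << 13);
  }
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}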
+ if (SrcVT.getScalarType() == MVT::f16) { + EVT ISrcVT = SrcVT.changeTypeToInteger(); + EVT IDestVT = DestVT.changeTypeToInteger(); + EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT()); + + SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT, + Chain, Ptr, ISrcVT, + LD->getMemOperand()); + Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result); + Chain = Result.getValue(1); + break; + } } assert(!SrcVT.isVector() && @@ -1180,15 +1207,17 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == - TargetLowering::TypeLegal && + assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Node->getValueType(i))) && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) - assert((TLI.getTypeAction(*DAG.getContext(), - Op.getValueType()) == TargetLowering::TypeLegal || - Op.getOpcode() == ISD::TargetConstant) && - "Unexpected illegal type!"); + assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Op.getValueType()) || + Op.getOpcode() == ISD::TargetConstant) && + "Unexpected illegal type!"); #endif // Figure out the correct action; the way to query this varies by opcode @@ -1201,6 +1230,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1229,7 +1262,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::SETCC: case ISD::BR_CC: { unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 : - Node->getOpcode() == ISD::SETCC ? 2 : 1; + Node->getOpcode() == ISD::SETCC ? 2 : + Node->getOpcode() == ISD::SETCCE ? 3 : 1; unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0; MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType(); ISD::CondCode CCCode = @@ -1265,6 +1299,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::FRAME_TO_ARGS_OFFSET: case ISD::EH_SJLJ_SETJMP: case ISD::EH_SJLJ_LONGJMP: + case ISD::EH_SJLJ_SETUP_DISPATCH: // These operations lie about being legal: when they claim to be legal, // they should actually be expanded. Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1281,6 +1316,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Action == TargetLowering::Legal) Action = TargetLowering::Custom; break; + case ISD::READCYCLECOUNTER: + // READCYCLECOUNTER returns an i64, even if type legalization might have + // expanded that to several smaller types. 
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64); + break; case ISD::READ_REGISTER: case ISD::WRITE_REGISTER: // Named register is legal in the DAG, but blocked by register name @@ -1379,7 +1419,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } // FALL THROUGH case TargetLowering::Expand: - ExpandNode(Node); + if (ExpandNode(Node)) + return; + // FALL THROUGH + case TargetLowering::LibCall: + ConvertNodeToLibcall(Node); return; case TargetLowering::Promote: PromoteNode(Node); @@ -1419,6 +1463,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in // the vector. If all are expanded here, we don't want one store per vector // element. + + // Caches for hasPredecessorHelper + SmallPtrSet<const SDNode *, 32> Visited; + SmallVector<const SDNode *, 16> Worklist; + SDValue StackPtr, Ch; for (SDNode::use_iterator UI = Vec.getNode()->use_begin(), UE = Vec.getNode()->use_end(); UI != UE; ++UI) { @@ -1433,6 +1482,12 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode())) continue; + // If the index is dependent on the store we will introduce a cycle when + // creating the load (the load uses the index, and by replacing the chain + // we will make the index dependent on the load). + if (Idx.getNode()->hasPredecessorHelper(ST, Visited, Worklist)) + continue; + StackPtr = ST->getBasePtr(); Ch = SDValue(ST, 0); break; @@ -1490,7 +1545,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // First store the whole vector. SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, @@ -1528,7 +1584,8 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { SDLoc dl(Node); SDValue FIPtr = DAG.CreateStackTemporary(VT); int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store of each element to the stack slot. SmallVector<SDValue, 8> Stores; @@ -1568,69 +1625,143 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { false, false, false, 0); } -SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) { - SDLoc dl(Node); - SDValue Tmp1 = Node->getOperand(0); - SDValue Tmp2 = Node->getOperand(1); - - // Get the sign bit of the RHS. First obtain a value that has the same - // sign as the sign bit, i.e. negative if and only if the sign bit is 1. - SDValue SignBit; - EVT FloatVT = Tmp2.getValueType(); - EVT IVT = EVT::getIntegerVT(*DAG.getContext(), FloatVT.getSizeInBits()); +namespace { +/// Keeps track of state when getting the sign of a floating-point value as an +/// integer. +struct FloatSignAsInt { + EVT FloatVT; + SDValue Chain; + SDValue FloatPtr; + SDValue IntPtr; + MachinePointerInfo IntPointerInfo; + MachinePointerInfo FloatPointerInfo; + SDValue IntValue; + APInt SignMask; +}; +} + +/// Bitcast a floating-point value to an integer value. 
Only bitcast the part +/// containing the sign bit if the target has no integer value capable of +/// holding all bits of the floating-point value. +void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, + SDLoc DL, SDValue Value) const { + EVT FloatVT = Value.getValueType(); + unsigned NumBits = FloatVT.getSizeInBits(); + State.FloatVT = FloatVT; + EVT IVT = EVT::getIntegerVT(*DAG.getContext(), NumBits); + // Convert to an integer of the same size. if (TLI.isTypeLegal(IVT)) { - // Convert to an integer with the same sign bit. - SignBit = DAG.getNode(ISD::BITCAST, dl, IVT, Tmp2); + State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value); + State.SignMask = APInt::getSignBit(NumBits); + return; + } + + auto &DataLayout = DAG.getDataLayout(); + // Store the float to memory, then load the sign part out as an integer. + MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8); + // First create a temporary that is aligned for both the load and store. + SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); + int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + // Then store the float to it. + State.FloatPtr = StackPtr; + MachineFunction &MF = DAG.getMachineFunction(); + State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI); + State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr, + State.FloatPointerInfo, false, false, 0); + + SDValue IntPtr; + if (DataLayout.isBigEndian()) { + assert(FloatVT.isByteSized() && "Unsupported floating point type!"); + // Load out a legal integer with the same sign bit as the float. + IntPtr = StackPtr; + State.IntPointerInfo = State.FloatPointerInfo; } else { - auto &DL = DAG.getDataLayout(); - // Store the float to memory, then load the sign part out as an integer. - MVT LoadTy = TLI.getPointerTy(DL); - // First create a temporary that is aligned for both the load and store. - SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); - // Then store the float to it. - SDValue Ch = - DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StackPtr, MachinePointerInfo(), - false, false, 0); - if (DL.isBigEndian()) { - assert(FloatVT.isByteSized() && "Unsupported floating point type!"); - // Load out a legal integer with the same sign bit as the float. - SignBit = DAG.getLoad(LoadTy, dl, Ch, StackPtr, MachinePointerInfo(), - false, false, false, 0); - } else { // Little endian - SDValue LoadPtr = StackPtr; - // The float may be wider than the integer we are going to load. Advance - // the pointer so that the loaded integer will contain the sign bit. - unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits(); - unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8; - LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(), LoadPtr, - DAG.getConstant(ByteOffset, dl, - LoadPtr.getValueType())); - // Load a legal integer containing the sign bit. - SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(), - false, false, false, 0); - // Move the sign bit to the top bit of the loaded integer. - unsigned BitShift = LoadTy.getSizeInBits() - - (FloatVT.getSizeInBits() - 8 * ByteOffset); - assert(BitShift < LoadTy.getSizeInBits() && "Pointer advanced wrong?"); - if (BitShift) - SignBit = DAG.getNode( - ISD::SHL, dl, LoadTy, SignBit, - DAG.getConstant(BitShift, dl, - TLI.getShiftAmountTy(SignBit.getValueType(), DL))); - } + // Advance the pointer so that the loaded byte will contain the sign bit. 
+ unsigned ByteOffset = (FloatVT.getSizeInBits() / 8) - 1; + IntPtr = DAG.getNode(ISD::ADD, DL, StackPtr.getValueType(), StackPtr, + DAG.getConstant(ByteOffset, DL, StackPtr.getValueType())); + State.IntPointerInfo = MachinePointerInfo::getFixedStack(MF, FI, + ByteOffset); } - // Now get the sign bit proper, by seeing whether the value is negative. - SignBit = DAG.getSetCC(dl, getSetCCResultType(SignBit.getValueType()), - SignBit, - DAG.getConstant(0, dl, SignBit.getValueType()), - ISD::SETLT); - // Get the absolute value of the result. - SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1); - // Select between the nabs and abs value based on the sign bit of - // the input. - return DAG.getSelect(dl, AbsVal.getValueType(), SignBit, - DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal), - AbsVal); + + State.IntPtr = IntPtr; + State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, + IntPtr, State.IntPointerInfo, MVT::i8, + false, false, false, 0); + State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7); +} + +/// Replace the integer value produced by getSignAsIntValue() with a new value +/// and cast the result back to a floating-point type. +SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State, + SDLoc DL, SDValue NewIntValue) const { + if (!State.Chain) + return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue); + + // Override the part containing the sign bit in the value stored on the stack. + SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr, + State.IntPointerInfo, MVT::i8, false, false, + 0); + return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr, + State.FloatPointerInfo, false, false, false, 0); +} + +SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const { + SDLoc DL(Node); + SDValue Mag = Node->getOperand(0); + SDValue Sign = Node->getOperand(1); + + // Get sign bit into an integer value. + FloatSignAsInt SignAsInt; + getSignAsIntValue(SignAsInt, DL, Sign); + + EVT IntVT = SignAsInt.IntValue.getValueType(); + SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT); + SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue, + SignMask); + + // If FABS is legal transform FCOPYSIGN(x, y) => sign(x) ? -FABS(x) : FABS(X) + EVT FloatVT = Mag.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) && + TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) { + SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag); + SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue); + SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit, + DAG.getConstant(0, DL, IntVT), ISD::SETNE); + return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue); + } + + // Transform values to integer, copy the sign bit and transform back. + FloatSignAsInt MagAsInt; + getSignAsIntValue(MagAsInt, DL, Mag); + assert(SignAsInt.SignMask == MagAsInt.SignMask); + SDValue ClearSignMask = DAG.getConstant(~SignAsInt.SignMask, DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, MagAsInt.IntValue, + ClearSignMask); + SDValue CopiedSign = DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit); + + return modifySignAsInt(MagAsInt, DL, CopiedSign); +} + +SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const { + SDLoc DL(Node); + SDValue Value = Node->getOperand(0); + + // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal. 
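Once the sign lives in an integer (getSignAsIntValue), FCOPYSIGN reduces to two masks and an OR, and FABS to a single mask. The scalar analogue, for the easy case where the whole float fits in a legal integer type (a sketch, not the DAG code):

#include <cstdint>
#include <cstring>

static const uint32_t SignMask = 0x80000000u; // APInt::getSignBit(32)

float copysignViaBits(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof M);
  std::memcpy(&S, &Sign, sizeof S);
  uint32_t R = (M & ~SignMask) | (S & SignMask); // clear old sign, OR in new
  std::memcpy(&Mag, &R, sizeof R);
  return Mag;
}

float fabsViaBits(float V) {
  uint32_t B;
  std::memcpy(&B, &V, sizeof B);
  B &= ~SignMask;                                // ExpandFABS: just clear it
  std::memcpy(&V, &B, sizeof B);
  return V;
}

The store/load dance in getSignAsIntValue exists only for targets where no legal integer is as wide as the float; there the code touches the single byte containing the sign bit instead.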
+ EVT FloatVT = Value.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT)) { + SDValue Zero = DAG.getConstantFP(0.0, DL, FloatVT); + return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Value, Zero); + } + + // Transform value to integer, clear the sign bit and transform back. + FloatSignAsInt ValueAsInt; + getSignAsIntValue(ValueAsInt, DL, Value); + EVT IntVT = ValueAsInt.IntValue.getValueType(); + SDValue ClearSignMask = DAG.getConstant(~ValueAsInt.SignMask, DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, ValueAsInt.IntValue, + ClearSignMask); + return modifySignAsInt(ValueAsInt, DL, ClearedSign); } void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, @@ -1798,7 +1929,8 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr); int SPFI = StackPtrFI->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); unsigned SrcSize = SrcOp.getValueType().getSizeInBits(); unsigned SlotSize = SlotVT.getSizeInBits(); @@ -1838,14 +1970,14 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) { FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr); int SPFI = StackPtrFI->getIndex(); - SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(0), - StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - Node->getValueType(0).getVectorElementType(), - false, false, 0); - return DAG.getLoad(Node->getValueType(0), dl, Ch, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - false, false, false, 0); + SDValue Ch = DAG.getTruncStore( + DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), + Node->getValueType(0).getVectorElementType(), false, false, 0); + return DAG.getLoad( + Node->getValueType(0), dl, Ch, StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false, + false, false, 0); } static bool @@ -2011,9 +2143,10 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + return DAG.getLoad( + VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); } SmallSet<SDValue, 16> DefinedValues; @@ -2205,47 +2338,6 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, return ExpandLibCall(LC, Node, isSigned); } -/// Return true if divmod libcall is available. -static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, - const TargetLowering &TLI) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; - case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; - case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; - case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; - case MVT::i128: LC= isSigned ? 
RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; - } - - return TLI.getLibcallName(LC) != nullptr; -} - -/// Only issue divrem libcall if both quotient and remainder are needed. -static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { - // The other use might have been replaced with a divrem already. - unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; - unsigned OtherOpcode = 0; - if (isSigned) - OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV; - else - OtherOpcode = isDIV ? ISD::UREM : ISD::UDIV; - - SDValue Op0 = Node->getOperand(0); - SDValue Op1 = Node->getOperand(1); - for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), - UE = Op0.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User == Node) - continue; - if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) && - User->getOperand(0) == Op0 && - User->getOperand(1) == Op1) - return true; - } - return false; -} - /// Issue libcalls to __{u}divmod to compute div / rem pairs. void SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, @@ -2428,6 +2520,8 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, SDLoc dl) { + // TODO: Should any fast-math-flags be set for the created nodes? + if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) { // simple 32-bit [signed|unsigned] integer to float/double expansion @@ -2611,14 +2705,15 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, Alignment = std::min(Alignment, 4u); SDValue FudgeInReg; if (DestVT == MVT::f32) - FudgeInReg = DAG.getLoad(MVT::f32, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + FudgeInReg = DAG.getLoad( + MVT::f32, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); else { - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, - DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, Alignment); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, Alignment); HandleSDNode Handle(Load); LegalizeOp(Load.getNode()); FudgeInReg = Handle.getValue(); @@ -2713,6 +2808,31 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation); } +/// Open code the operations for BITREVERSE. +SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, SDLoc dl) { + EVT VT = Op.getValueType(); + EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + unsigned Sz = VT.getScalarSizeInBits(); + + SDValue Tmp, Tmp2; + Tmp = DAG.getConstant(0, dl, VT); + for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) { + if (I < J) + Tmp2 = + DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT)); + else + Tmp2 = + DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); + + APInt Shift(Sz, 1); + Shift = Shift.shl(J); + Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); + Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); + } + + return Tmp; +} + /// Open code the operations for BSWAP of the specified operation. SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) { EVT VT = Op.getValueType(); @@ -2821,6 +2941,18 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This trivially expands to CTLZ. 
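ExpandBITREVERSE above emits one shift, one single-bit AND, and one OR per bit. The same routine in plain C++ (a scalar sketch of the open-coded expansion):

#include <cstdint>

uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (unsigned I = 0, J = 31; I < 32; ++I, --J) {
    // Move bit I of X into bit J of the result: SHL when the bit travels
    // left, SRL when it travels right, then isolate it with an AND.
    uint32_t Moved = (I < J) ? (X << (J - I)) : (X >> (I - J));
    R |= Moved & (1u << J);
  }
  return R;
}

This is linear in the bit width per value (Sz shift/and/or triples for an Sz-bit type), which is why targets with a native bit-reverse instruction should mark the node Legal or Custom instead.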
return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); case ISD::CTLZ: { + EVT VT = Op.getValueType(); + unsigned len = VT.getSizeInBits(); + + if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { + EVT SetCCVT = getSetCCResultType(VT); + SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); + return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, + DAG.getConstant(len, dl, VT), CTLZ); + } + // for now, we do this: // x = x | (x >> 1); // x = x | (x >> 2); @@ -2830,9 +2962,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // return popcount(~x); // // Ref: "Hacker's Delight" by Henry Warren - EVT VT = Op.getValueType(); EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - unsigned len = VT.getSizeInBits(); for (unsigned i = 0; (1U << i) <= (len / 2); ++i) { SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, @@ -2865,16 +2995,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } } -std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) { - unsigned Opc = Node->getOpcode(); - MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT(); - RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!"); - - return ExpandChainLibCall(LC, Node, false); -} - -void SelectionDAGLegalize::ExpandNode(SDNode *Node) { +bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { SmallVector<SDValue, 8> Results; SDLoc dl(Node); SDValue Tmp1, Tmp2, Tmp3, Tmp4; @@ -2888,6 +3009,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl); Results.push_back(Tmp1); break; + case ISD::BITREVERSE: + Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl)); + break; case ISD::BSWAP: Results.push_back(ExpandBSWAP(Node->getOperand(0), dl)); break; @@ -2908,30 +3032,19 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // preserve the chain and be done. Results.push_back(Node->getOperand(0)); break; + case ISD::READCYCLECOUNTER: + // If the target didn't expand this, just return 'zero' and preserve the + // chain. + Results.append(Node->getNumValues() - 1, + DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Node->getOperand(0)); + break; case ISD::EH_SJLJ_SETJMP: // If the target didn't expand this, just return 'zero' and preserve the // chain. Results.push_back(DAG.getConstant(0, dl, MVT::i32)); Results.push_back(Node->getOperand(0)); break; - case ISD::ATOMIC_FENCE: { - // If the target didn't lower this, lower it to '__sync_synchronize()' call - // FIXME: handle "fence singlethread" more efficiently. - TargetLowering::ArgListTy Args; - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args), 0); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - Results.push_back(CallResult.second); - break; - } case ISD::ATOMIC_LOAD: { // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP. 
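The new CTLZ path prefers CTLZ_ZERO_UNDEF wrapped in an explicit zero check over the full bit-smearing sequence. Both strategies in scalar form (a sketch; the fallback is the Hacker's Delight recipe the comment cites):

#include <cstdint>

unsigned ctlz32(uint32_t X) {
  // Preferred: guard a zero-undef count with the select the new code emits.
  if (X == 0)
    return 32;
#if defined(__GNUC__) || defined(__clang__)
  return (unsigned)__builtin_clz(X);   // stands in for CTLZ_ZERO_UNDEF
#else
  // Fallback: smear the top set bit all the way right, then count the
  // remaining leading zeros by popcounting the complement.
  X |= X >> 1;  X |= X >> 2;  X |= X >> 4;  X |= X >> 8;  X |= X >> 16;
  uint32_t V = ~X;
  V = V - ((V >> 1) & 0x55555555u);                 // pairwise popcount
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // nibble sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // byte sums
  return (V * 0x01010101u) >> 24;                   // fold into top byte
#endif
}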
SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0)); @@ -2959,26 +3072,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Swap.getValue(1)); break; } - // By default, atomic intrinsics are marked Legal and lowered. Targets - // which don't support them directly, however, may want libcalls, in which - // case they mark them Expand, and we get here. - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_CMP_SWAP: { - std::pair<SDValue, SDValue> Tmp = ExpandAtomic(Node); - Results.push_back(Tmp.first); - Results.push_back(Tmp.second); - break; - } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and // splits out the success value as a comparison. Expanding the resulting @@ -3017,21 +3110,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { } break; } - case ISD::TRAP: { - // If this operation is not supported, lower it to 'abort()' call - TargetLowering::ArgListTy Args; - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("abort", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args), 0); - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - Results.push_back(CallResult.second); - break; - } case ISD::FP_ROUND: case ISD::BITCAST: Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), @@ -3097,6 +3175,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Node->getOperand(0), Tmp1, ISD::SETLT); True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0)); + // TODO: Should any fast-math-flags be set for the FSUB? 
False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, DAG.getNode(ISD::FSUB, dl, VT, Node->getOperand(0), Tmp1)); @@ -3106,57 +3185,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; } - case ISD::VAARG: { - const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); - EVT VT = Node->getValueType(0); - Tmp1 = Node->getOperand(0); - Tmp2 = Node->getOperand(1); - unsigned Align = Node->getConstantOperandVal(3); - - SDValue VAListLoad = - DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl, Tmp1, Tmp2, - MachinePointerInfo(V), false, false, false, 0); - SDValue VAList = VAListLoad; - - if (Align > TLI.getMinStackArgumentAlignment()) { - assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); - - VAList = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList, - DAG.getConstant(Align - 1, dl, - VAList.getValueType())); - - VAList = DAG.getNode(ISD::AND, dl, VAList.getValueType(), VAList, - DAG.getConstant(-(int64_t)Align, dl, - VAList.getValueType())); - } - - // Increment the pointer, VAList, to the next vaarg - Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList, - DAG.getConstant(DAG.getDataLayout().getTypeAllocSize( - VT.getTypeForEVT(*DAG.getContext())), - dl, VAList.getValueType())); - // Store the incremented VAList to the legalized pointer - Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2, - MachinePointerInfo(V), false, false, 0); - // Load the actual argument out of the pointer VAList - Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, MachinePointerInfo(), - false, false, false, 0)); + case ISD::VAARG: + Results.push_back(DAG.expandVAArg(Node)); Results.push_back(Results[0].getValue(1)); break; - } - case ISD::VACOPY: { - // This defaults to loading a pointer from the input and storing it to the - // output, returning the chain. - const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue(); - const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue(); - Tmp1 = DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl, - Node->getOperand(0), Node->getOperand(2), - MachinePointerInfo(VS), false, false, false, 0); - Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), - MachinePointerInfo(VD), false, false, 0); - Results.push_back(Tmp1); + case ISD::VACOPY: + Results.push_back(DAG.expandVACopy(Node)); break; - } case ISD::EXTRACT_VECTOR_ELT: if (Node->getOperand(0).getValueType().getVectorNumElements() == 1) // This must be an access of the only element. Return it. @@ -3302,28 +3337,24 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Node->getOperand(0)); } break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Results[0].getValue(0)); + break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; case ISD::FNEG: // Expand Y = FNEG(X) -> Y = SUB -0.0, X Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0)); + // TODO: If FNEG has fast-math-flags, propagate them to the FSUB. Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1, Node->getOperand(0)); Results.push_back(Tmp1); break; - case ISD::FABS: { - // Expand Y = FABS(X) -> Y = (X >u 0.0) ? X : fneg(X). 
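The select/FSUB pair above is the classic unsigned conversion trick: values below 2^(N-1) convert directly as signed, while larger ones are biased down first and the bias is restored on the integer side. Scalar form (a sketch of the idea, assuming truncating conversion semantics and an in-range input):

#include <cstdint>

uint32_t fptoui32(float F) {
  const float Cut = 2147483648.0f;   // 2^31, exactly representable
  if (F < Cut)                       // the SETLT in the expansion
    return (uint32_t)(int32_t)F;     // plain FP_TO_SINT suffices
  // Bias into signed range, convert, then put the top bit back.
  return (uint32_t)(int32_t)(F - Cut) + 0x80000000u;
}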
- EVT VT = Node->getValueType(0); - Tmp1 = Node->getOperand(0); - Tmp2 = DAG.getConstantFP(0.0, dl, VT); - Tmp2 = DAG.getSetCC(dl, getSetCCResultType(Tmp1.getValueType()), - Tmp1, Tmp2, ISD::SETUGT); - Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1); - Tmp1 = DAG.getSelect(dl, VT, Tmp2, Tmp1, Tmp3); - Results.push_back(Tmp1); + case ISD::FABS: + Results.push_back(ExpandFABS(Node)); break; - } case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -3344,25 +3375,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; } - case ISD::FMINNUM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64, - RTLIB::FMIN_F80, RTLIB::FMIN_F128, - RTLIB::FMIN_PPCF128)); - break; - case ISD::FMAXNUM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, - RTLIB::FMAX_F80, RTLIB::FMAX_F128, - RTLIB::FMAX_PPCF128)); - break; - case ISD::FSQRT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, - RTLIB::SQRT_F80, RTLIB::SQRT_F128, - RTLIB::SQRT_PPCF128)); - break; case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); - bool isSIN = Node->getOpcode() == ISD::FSIN; // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || @@ -3370,137 +3385,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { && useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); - if (!isSIN) + if (Node->getOpcode() == ISD::FCOS) Tmp1 = Tmp1.getValue(1); Results.push_back(Tmp1); - } else if (isSIN) { - Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, - RTLIB::SIN_F80, RTLIB::SIN_F128, - RTLIB::SIN_PPCF128)); - } else { - Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, - RTLIB::COS_F80, RTLIB::COS_F128, - RTLIB::COS_PPCF128)); } break; } - case ISD::FSINCOS: - // Expand into sincos libcall. 
- ExpandSinCosLibCall(Node, Results); - break; - case ISD::FLOG: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, - RTLIB::LOG_F80, RTLIB::LOG_F128, - RTLIB::LOG_PPCF128)); - break; - case ISD::FLOG2: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, - RTLIB::LOG2_F80, RTLIB::LOG2_F128, - RTLIB::LOG2_PPCF128)); - break; - case ISD::FLOG10: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, - RTLIB::LOG10_F80, RTLIB::LOG10_F128, - RTLIB::LOG10_PPCF128)); - break; - case ISD::FEXP: - Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, - RTLIB::EXP_F80, RTLIB::EXP_F128, - RTLIB::EXP_PPCF128)); - break; - case ISD::FEXP2: - Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, - RTLIB::EXP2_F80, RTLIB::EXP2_F128, - RTLIB::EXP2_PPCF128)); - break; - case ISD::FTRUNC: - Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, - RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, - RTLIB::TRUNC_PPCF128)); - break; - case ISD::FFLOOR: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, - RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, - RTLIB::FLOOR_PPCF128)); - break; - case ISD::FCEIL: - Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, - RTLIB::CEIL_F80, RTLIB::CEIL_F128, - RTLIB::CEIL_PPCF128)); - break; - case ISD::FRINT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, - RTLIB::RINT_F80, RTLIB::RINT_F128, - RTLIB::RINT_PPCF128)); - break; - case ISD::FNEARBYINT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, - RTLIB::NEARBYINT_F64, - RTLIB::NEARBYINT_F80, - RTLIB::NEARBYINT_F128, - RTLIB::NEARBYINT_PPCF128)); - break; - case ISD::FROUND: - Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32, - RTLIB::ROUND_F64, - RTLIB::ROUND_F80, - RTLIB::ROUND_F128, - RTLIB::ROUND_PPCF128)); - break; - case ISD::FPOWI: - Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, - RTLIB::POWI_F80, RTLIB::POWI_F128, - RTLIB::POWI_PPCF128)); - break; - case ISD::FPOW: - Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, - RTLIB::POW_F80, RTLIB::POW_F128, - RTLIB::POW_PPCF128)); - break; - case ISD::FDIV: - Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, - RTLIB::DIV_F80, RTLIB::DIV_F128, - RTLIB::DIV_PPCF128)); - break; - case ISD::FREM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64, - RTLIB::REM_F80, RTLIB::REM_F128, - RTLIB::REM_PPCF128)); - break; - case ISD::FMA: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, - RTLIB::FMA_F80, RTLIB::FMA_F128, - RTLIB::FMA_PPCF128)); - break; case ISD::FMAD: llvm_unreachable("Illegal fmad should never be formed"); - case ISD::FADD: - Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, - RTLIB::ADD_F80, RTLIB::ADD_F128, - RTLIB::ADD_PPCF128)); - break; - case ISD::FMUL: - Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, - RTLIB::MUL_F80, RTLIB::MUL_F128, - RTLIB::MUL_PPCF128)); - break; - case ISD::FP16_TO_FP: { - if (Node->getValueType(0) == MVT::f32) { - Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); - break; + case ISD::FP16_TO_FP: + if (Node->getValueType(0) != MVT::f32) { + // We can extend to types bigger than f32 in two steps without changing + // the result. 
Since "f16 -> f32" is much more commonly available, give + // CodeGen the option of emitting that before resorting to a libcall. + SDValue Res = + DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0)); + Results.push_back( + DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); } - - // We can extend to types bigger than f32 in two steps without changing the - // result. Since "f16 -> f32" is much more commonly available, give CodeGen - // the option of emitting that before resorting to a libcall. - SDValue Res = - DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0)); - Results.push_back( - DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); break; - } - case ISD::FP_TO_FP16: { + case ISD::FP_TO_FP16: if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); @@ -3512,16 +3417,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { DAG.getIntPtrConstant(0, dl)); Results.push_back( DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, FloatVal)); - break; } } - - RTLIB::Libcall LC = - RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); - Results.push_back(ExpandLibCall(LC, Node, false)); break; - } case ISD::ConstantFP: { ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node); // Check to see if this FP immediate is already legal. @@ -3530,17 +3428,19 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(ExpandConstantFP(CFP, true)); break; } + case ISD::Constant: { + ConstantSDNode *CP = cast<ConstantSDNode>(Node); + Results.push_back(ExpandConstant(CP)); + break; + } case ISD::FSUB: { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags; Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); - Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1); + Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags); Results.push_back(Tmp1); - } else { - Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, - RTLIB::SUB_F80, RTLIB::SUB_F128, - RTLIB::SUB_PPCF128)); } break; } @@ -3564,29 +3464,17 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { unsigned DivRemOpc = isSigned ? 
ISD::SDIVREM : ISD::UDIVREM; Tmp2 = Node->getOperand(0); Tmp3 = Node->getOperand(1); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || - (isDivRemLibcallAvailable(Node, isSigned, TLI) && - // If div is legal, it's better to do the normal expansion - !TLI.isOperationLegalOrCustom(DivOpc, Node->getValueType(0)) && - useDivRem(Node, isSigned, false))) { + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); + Results.push_back(Tmp1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3); Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1); - } else if (isSigned) - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SREM_I8, - RTLIB::SREM_I16, RTLIB::SREM_I32, - RTLIB::SREM_I64, RTLIB::SREM_I128); - else - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UREM_I8, - RTLIB::UREM_I16, RTLIB::UREM_I32, - RTLIB::UREM_I64, RTLIB::UREM_I128); - Results.push_back(Tmp1); + Results.push_back(Tmp1); + } break; } case ISD::UDIV: @@ -3594,23 +3482,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { bool isSigned = Node->getOpcode() == ISD::SDIV; unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; EVT VT = Node->getValueType(0); - SDVTList VTs = DAG.getVTList(VT, VT); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || - (isDivRemLibcallAvailable(Node, isSigned, TLI) && - useDivRem(Node, isSigned, true))) + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { + SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); - else if (isSigned) - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SDIV_I8, - RTLIB::SDIV_I16, RTLIB::SDIV_I32, - RTLIB::SDIV_I64, RTLIB::SDIV_I128); - else - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UDIV_I8, - RTLIB::UDIV_I16, RTLIB::UDIV_I32, - RTLIB::UDIV_I64, RTLIB::UDIV_I128); - Results.push_back(Tmp1); + Results.push_back(Tmp1); + } break; } case ISD::MULHU: @@ -3626,11 +3503,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1.getValue(1)); break; } - case ISD::SDIVREM: - case ISD::UDIVREM: - // Expand into divrem libcall - ExpandDivRemLibCall(Node, Results); - break; case ISD::MUL: { EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); @@ -3673,14 +3545,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); - break; } - - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::MUL_I8, - RTLIB::MUL_I16, RTLIB::MUL_I32, - RTLIB::MUL_I64, RTLIB::MUL_I128); - Results.push_back(Tmp1); break; } case ISD::SADDO: @@ -3867,9 +3732,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr, - MachinePointerInfo::getJumpTable(), MemVT, - false, false, false, 0); + SDValue LD = DAG.getExtLoad( + ISD::SEXTLOAD, dl, PTy, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT, + false, false, false, 0); Addr = LD; if (TM.getRelocationModel() == Reloc::PIC_) { // For PIC, the sequence is: @@ -4092,16 +3958,276 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { } // Replace the original node with the legalized result. 
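When only division is legal, SREM/UREM are rebuilt from it exactly as the comment says: X % Y -> X - X/Y*Y. Scalar form (a sketch; Y is assumed nonzero, matching the IR's undefined-behavior rule for division by zero):

#include <cstdint>

uint32_t uremViaUdiv(uint32_t X, uint32_t Y) {
  uint32_t Q = X / Y;   // UDIV
  return X - Q * Y;     // MUL + SUB reconstruct the remainder
}

DIVREM is still preferred when legal, since one node yields both results; the integer libcall variants now live in ConvertNodeToLibcall below.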
+ if (Results.empty()) + return false; + + ReplaceNode(Node, Results.data()); + return true; +} + +void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { + SmallVector<SDValue, 8> Results; + SDLoc dl(Node); + SDValue Tmp1, Tmp2, Tmp3, Tmp4; + unsigned Opc = Node->getOpcode(); + switch (Opc) { + case ISD::ATOMIC_FENCE: { + // If the target didn't lower this, lower it to '__sync_synchronize()' call + // FIXME: handle "fence singlethread" more efficiently. + TargetLowering::ArgListTy Args; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + Results.push_back(CallResult.second); + break; + } + // By default, atomic intrinsics are marked Legal and lowered. Targets + // which don't support them directly, however, may want libcalls, in which + // case they mark them Expand, and we get here. + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_CMP_SWAP: { + MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT(); + RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!"); + + std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + break; + } + case ISD::TRAP: { + // If this operation is not supported, lower it to 'abort()' call + TargetLowering::ArgListTy Args; + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("abort", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + Results.push_back(CallResult.second); + break; + } + case ISD::FMINNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64, + RTLIB::FMIN_F80, RTLIB::FMIN_F128, + RTLIB::FMIN_PPCF128)); + break; + case ISD::FMAXNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, + RTLIB::FMAX_F80, RTLIB::FMAX_F128, + RTLIB::FMAX_PPCF128)); + break; + case ISD::FSQRT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, + RTLIB::SQRT_F80, RTLIB::SQRT_F128, + RTLIB::SQRT_PPCF128)); + break; + case ISD::FSIN: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, + RTLIB::SIN_F80, RTLIB::SIN_F128, + RTLIB::SIN_PPCF128)); + break; + case ISD::FCOS: + Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, + RTLIB::COS_F80, RTLIB::COS_F128, + RTLIB::COS_PPCF128)); + break; + case ISD::FSINCOS: + // Expand into sincos libcall. 
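ConvertNodeToLibcall turns nodes the target marked LibCall (or that ExpandNode declined) into plain runtime calls; the ATOMIC_FENCE and TRAP cases show the shape. At the C++ level the emitted code is equivalent to the following (a sketch; __sync_synchronize is the compiler-runtime symbol named in the hunk):

#include <cstdlib>

extern "C" void __sync_synchronize(); // full barrier from libgcc/compiler-rt

void loweredFence() { __sync_synchronize(); } // ATOMIC_FENCE -> libcall
void loweredTrap()  { std::abort(); }         // TRAP -> abort()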
+ ExpandSinCosLibCall(Node, Results); + break; + case ISD::FLOG: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, + RTLIB::LOG_F80, RTLIB::LOG_F128, + RTLIB::LOG_PPCF128)); + break; + case ISD::FLOG2: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, + RTLIB::LOG2_F80, RTLIB::LOG2_F128, + RTLIB::LOG2_PPCF128)); + break; + case ISD::FLOG10: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, + RTLIB::LOG10_F80, RTLIB::LOG10_F128, + RTLIB::LOG10_PPCF128)); + break; + case ISD::FEXP: + Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, + RTLIB::EXP_F80, RTLIB::EXP_F128, + RTLIB::EXP_PPCF128)); + break; + case ISD::FEXP2: + Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, + RTLIB::EXP2_F80, RTLIB::EXP2_F128, + RTLIB::EXP2_PPCF128)); + break; + case ISD::FTRUNC: + Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, + RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, + RTLIB::TRUNC_PPCF128)); + break; + case ISD::FFLOOR: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, + RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, + RTLIB::FLOOR_PPCF128)); + break; + case ISD::FCEIL: + Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, + RTLIB::CEIL_F80, RTLIB::CEIL_F128, + RTLIB::CEIL_PPCF128)); + break; + case ISD::FRINT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, + RTLIB::RINT_F80, RTLIB::RINT_F128, + RTLIB::RINT_PPCF128)); + break; + case ISD::FNEARBYINT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, + RTLIB::NEARBYINT_F64, + RTLIB::NEARBYINT_F80, + RTLIB::NEARBYINT_F128, + RTLIB::NEARBYINT_PPCF128)); + break; + case ISD::FROUND: + Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32, + RTLIB::ROUND_F64, + RTLIB::ROUND_F80, + RTLIB::ROUND_F128, + RTLIB::ROUND_PPCF128)); + break; + case ISD::FPOWI: + Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, + RTLIB::POWI_F80, RTLIB::POWI_F128, + RTLIB::POWI_PPCF128)); + break; + case ISD::FPOW: + Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, + RTLIB::POW_F80, RTLIB::POW_F128, + RTLIB::POW_PPCF128)); + break; + case ISD::FDIV: + Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, + RTLIB::DIV_F80, RTLIB::DIV_F128, + RTLIB::DIV_PPCF128)); + break; + case ISD::FREM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64, + RTLIB::REM_F80, RTLIB::REM_F128, + RTLIB::REM_PPCF128)); + break; + case ISD::FMA: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, + RTLIB::FMA_F80, RTLIB::FMA_F128, + RTLIB::FMA_PPCF128)); + break; + case ISD::FADD: + Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, + RTLIB::ADD_F80, RTLIB::ADD_F128, + RTLIB::ADD_PPCF128)); + break; + case ISD::FMUL: + Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, + RTLIB::MUL_F80, RTLIB::MUL_F128, + RTLIB::MUL_PPCF128)); + break; + case ISD::FP16_TO_FP: + if (Node->getValueType(0) == MVT::f32) { + Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); + } + break; + case ISD::FP_TO_FP16: { + RTLIB::Libcall LC = + RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); + Results.push_back(ExpandLibCall(LC, Node, false)); + break; + } + case ISD::FSUB: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, + 
RTLIB::SUB_F80, RTLIB::SUB_F128, + RTLIB::SUB_PPCF128)); + break; + case ISD::SREM: + Results.push_back(ExpandIntLibCall(Node, true, + RTLIB::SREM_I8, + RTLIB::SREM_I16, RTLIB::SREM_I32, + RTLIB::SREM_I64, RTLIB::SREM_I128)); + break; + case ISD::UREM: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::UREM_I8, + RTLIB::UREM_I16, RTLIB::UREM_I32, + RTLIB::UREM_I64, RTLIB::UREM_I128)); + break; + case ISD::SDIV: + Results.push_back(ExpandIntLibCall(Node, true, + RTLIB::SDIV_I8, + RTLIB::SDIV_I16, RTLIB::SDIV_I32, + RTLIB::SDIV_I64, RTLIB::SDIV_I128)); + break; + case ISD::UDIV: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::UDIV_I8, + RTLIB::UDIV_I16, RTLIB::UDIV_I32, + RTLIB::UDIV_I64, RTLIB::UDIV_I128)); + break; + case ISD::SDIVREM: + case ISD::UDIVREM: + // Expand into divrem libcall + ExpandDivRemLibCall(Node, Results); + break; + case ISD::MUL: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::MUL_I8, + RTLIB::MUL_I16, RTLIB::MUL_I32, + RTLIB::MUL_I64, RTLIB::MUL_I128)); + break; + } + + // Replace the original node with the legalized result. if (!Results.empty()) ReplaceNode(Node, Results.data()); } +// Determine the vector type to use in place of an original scalar element when +// promoting equally sized vectors. +static MVT getPromotedVectorElementType(const TargetLowering &TLI, + MVT EltVT, MVT NewEltVT) { + unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits(); + MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt); + assert(TLI.isTypeLegal(MidVT) && "unexpected"); + return MidVT; +} + void SelectionDAGLegalize::PromoteNode(SDNode *Node) { SmallVector<SDValue, 8> Results; MVT OVT = Node->getSimpleValueType(0); if (Node->getOpcode() == ISD::UINT_TO_FP || Node->getOpcode() == ISD::SINT_TO_FP || - Node->getOpcode() == ISD::SETCC) { + Node->getOpcode() == ISD::SETCC || + Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Node->getOpcode() == ISD::INSERT_VECTOR_ELT) { OVT = Node->getOperand(0).getSimpleValueType(); } if (Node->getOpcode() == ISD::BR_CC) @@ -4284,11 +4410,11 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FREM: case ISD::FMINNUM: case ISD::FMAXNUM: - case ISD::FCOPYSIGN: case ISD::FPOW: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); - Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); + Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, + Node->getFlags()); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0, dl))); break; @@ -4303,12 +4429,20 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getIntPtrConstant(0, dl))); break; } + case ISD::FCOPYSIGN: case ISD::FPOWI: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = Node->getOperand(1); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); + + // fcopysign doesn't change anything but the sign bit, so + // (fp_round (fcopysign (fpext a), b)) + // is as precise as + // (fp_round (fpext a)) + // which is a no-op. Mark it as a TRUNCating FP_ROUND. 
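+    // (An illustrative C-level check of that claim, assuming IEEE types
+    // with OVT = float promoted to NVT = double:
+    //   double e = (double)a;          // FP_EXTEND, exact
+    //   double c = copysign(e, b);     // only the sign bit of e changes
+    //   assert((double)(float)c == c); // rounding back to float is exact
+    // so the FP_ROUND emitted below may carry the truncating flag.)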
+ const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, - Tmp3, DAG.getIntPtrConstant(0, dl))); + Tmp3, DAG.getIntPtrConstant(isTrunc, dl))); break; } case ISD::FFLOOR: @@ -4333,6 +4467,157 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp2, DAG.getIntPtrConstant(0, dl))); break; } + case ISD::BUILD_VECTOR: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size + // + // e.g. v2i64 = build_vector i64:x, i64:y => v4i32 + // => + // v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y)) + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for build_vector"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) { + SDValue Op = Node->getOperand(I); + NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op)); + } + + SDLoc SL(Node); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps); + SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); + Results.push_back(CvtVec); + break; + } + case ISD::EXTRACT_VECTOR_ELT: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size. + // + // e.g. v2i64 = extract_vector_elt x:v2i64, y:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // + // i64 = bitcast + // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), + // (i32 (extract_vector_elt castx, (2 * y + 1))) + // + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for extract_vector_elt"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); + + SDValue Idx = Node->getOperand(1); + EVT IdxVT = Idx.getValueType(); + SDLoc SL(Node); + SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT); + SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); + + SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); + + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); + SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, + CastVec, TmpIdx); + NewOps.push_back(Elt); + } + + SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); + break; + } + case ISD::INSERT_VECTOR_ELT: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size + // + // e.g. 
v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // v2i32:casty = bitcast y:i64 + // + // v2i64 = bitcast + // (v4i32 insert_vector_elt + // (v4i32 insert_vector_elt v4i32:castx, + // (extract_vector_elt casty, 0), 2 * z), + // (extract_vector_elt casty, 1), (2 * z + 1)) + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for insert_vector_elt"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); + + SDValue Val = Node->getOperand(1); + SDValue Idx = Node->getOperand(2); + EVT IdxVT = Idx.getValueType(); + SDLoc SL(Node); + + SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT); + SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); + + SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); + + SDValue NewVec = CastVec; + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); + SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, + CastVal, IdxOffset); + + NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT, + NewVec, Elt, InEltIdx); + } + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec)); + break; + } + case ISD::SCALAR_TO_VECTOR: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size. + // + // e.g. v2i64 = scalar_to_vector x:i64 + // => + // concat_vectors (v2i32 bitcast x:i64), (v2i32 undef) + // + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + SDValue Val = Node->getOperand(0); + SDLoc SL(Node); + + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); + SDValue Undef = DAG.getUNDEF(MidVT); + + SmallVector<SDValue, 8> NewElts; + NewElts.push_back(CastVal); + for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I) + NewElts.push_back(Undef); + + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts); + SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); + Results.push_back(CvtVec); + break; + } } // Replace the original node with the legalized result. @@ -4356,7 +4641,7 @@ void SelectionDAG::Legalize() { for (auto NI = allnodes_end(); NI != allnodes_begin();) { --NI; - SDNode *N = NI; + SDNode *N = &*NI; if (N->use_empty() && N != getRoot().getNode()) { ++NI; DeleteNode(N); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 3c50a41..6c0193a 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -43,10 +43,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT, } //===----------------------------------------------------------------------===// -// Result Float to Integer Conversion. +// Convert Float Results to Integer for Non-HW-supported Operations.
//===----------------------------------------------------------------------===// -void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { +bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); @@ -59,20 +59,26 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { #endif llvm_unreachable("Do not know how to soften the result of this operator!"); + case ISD::Register: + case ISD::CopyFromReg: + case ISD::CopyToReg: + assert(isLegalInHWReg(N->getValueType(ResNo)) && + "Unsupported SoftenFloatRes opcode!"); + // Only when isLegalInHWReg, we can skip check of the operands. + R = SDValue(N, ResNo); + break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; - case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break; + case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; - case ISD::ConstantFP: - R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N)); - break; + case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; - case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; @@ -84,7 +90,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; - case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; + case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; @@ -97,9 +103,9 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; - case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; - case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; - case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; + case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; + case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; + case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; @@ -107,11 +113,19 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { } // If R is null, the sub-method took care of registering the result. 
- if (R.getNode()) + if (R.getNode()) { SetSoftenedFloat(SDValue(N, ResNo), R); + ReplaceSoftenFloatResult(N, ResNo, R); + } + // Return true only if the node is changed, + // assuming that the operands are also converted when necessary. + // Otherwise, return false to tell caller to scan operands. + return R.getNode() && R.getNode() != N; } -SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); return BitConvertToInteger(N->getOperand(0)); } @@ -130,10 +144,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { BitConvertToInteger(N->getOperand(1))); } -SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) { - return DAG.getConstant(N->getValueAPF().bitcastToAPInt(), SDLoc(N), +SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, we can load better from the constant pool. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); + ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N); + return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), - N->getValueType(0))); + CN->getValueType(0))); } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { @@ -143,7 +161,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { NewOp, N->getOperand(1)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FABS can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); @@ -165,7 +186,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { @@ -178,7 +199,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { @@ -191,7 +212,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { @@ -203,10 +224,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. 
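+  // (Bit-level sketch of that claim for an f64 kept in a register,
+  // assuming IEEE-754 layout:
+  //   uint64_t SignMask = 1ull << 63;
+  //   bits(r) = (bits(x) & ~SignMask) | (bits(y) & SignMask);
+  // so no libcall is required on this path.)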
+ if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); @@ -263,7 +287,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { @@ -276,7 +300,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { @@ -288,7 +312,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { @@ -300,7 +324,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { @@ -312,7 +336,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { @@ -324,7 +348,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { @@ -336,7 +360,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { @@ -348,7 +372,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { @@ -362,7 +386,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - NVT, Ops, 3, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { @@ -375,7 +399,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { @@ -387,10 +411,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FNEG can be implemented as native bitwise operations. 
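+  // (Likewise, assuming IEEE-754 layout, FNEG on an f64 in a register is
+  // a single sign-bit flip:
+  //   bits(r) = bits(x) ^ (1ull << 63);
+  // which avoids the SUB(-0.0, x) libcall used on the softened path below.)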
+ if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); // Expand Y = FNEG(X) -> Y = SUB -0.0, X @@ -402,7 +429,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, dl).first; + NVT, Ops, false, dl).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { @@ -418,11 +445,20 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SoftenFloatResult(Op.getNode(), 0); } + if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { + Op = GetPromotedFloat(Op); + // If the promotion did the FP_EXTEND to the destination type for us, + // there's nothing left to do here. + if (Op.getValueType() == N->getValueType(0)) { + return BitConvertToInteger(Op); + } + } + RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special @@ -430,7 +466,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); SDValue Op = N->getOperand(0); - SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, &Op, 1, + SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, false, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) return Res32; @@ -438,7 +474,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Res32, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { @@ -452,7 +488,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { @@ -465,7 +501,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { @@ -479,7 +515,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { @@ -492,7 +528,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue 
DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { @@ -504,7 +540,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { @@ -516,7 +552,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { @@ -528,7 +564,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { @@ -540,7 +576,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { @@ -553,7 +589,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { @@ -568,10 +604,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { + bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); LoadSDNode *L = cast<LoadSDNode>(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); @@ -586,7 +623,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); + if (N != NewL.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); return NewL; } @@ -600,17 +638,24 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); - return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL)); + auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); + if (LegalInHWReg) + return ExtendNode; + return BitConvertToInteger(ExtendNode); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), @@ -636,7 +681,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); + if (N != NewVAARG.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); return NewVAARG; } @@ -665,12 +711,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { NVT, N->getOperand(0)); return TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), - &Op, 1, Signed, dl).first; + Op, Signed, dl).first; } //===----------------------------------------------------------------------===// -// Operand Float to Integer Conversion.. +// Convert Float Operand to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { @@ -680,6 +726,8 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { switch (N->getOpcode()) { default: + if (CanSkipSoftenFloatOperand(N, OpNo)) + return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; @@ -691,18 +739,27 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; - case ISD::FP_TO_SINT: Res = SoftenFloatOp_FP_TO_SINT(N); break; - case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_UINT(N); break; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; - case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break; + case ISD::STORE: + Res = SoftenFloatOp_STORE(N, OpNo); + // Do not try to analyze or soften this node again if the value is + // or can be held in a register. In that case, Res.getNode() should + // be equal to N. + if (Res.getNode() == N && + isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // Otherwise, we need to reanalyze and lower the new Res nodes. + break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer - // core about this. 
+ // core about this to re-analyze. if (Res.getNode() == N) return true; @@ -713,6 +770,41 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return false; } +bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { + if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // When the operand type can be kept in registers, SoftenFloatResult + // will call ReplaceValueWith to replace all references and we can + // skip softening this operand. + switch (N->getOperand(OpNo).getOpcode()) { + case ISD::BITCAST: + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + case ISD::SELECT: + case ISD::SELECT_CC: + return true; + } + // For some opcodes, SoftenFloatResult handles all conversion of softening + // and replacing operands, so that there is no need to soften operands + // again, although such opcode could be scanned for other illegal operands. + switch (N->getOpcode()) { + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + return true; + } + return false; +} + SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); @@ -730,7 +822,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall"); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } @@ -747,7 +839,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { @@ -773,20 +865,33 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { 0); } -SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { + bool Signed = N->getOpcode() == ISD::FP_TO_SINT; + EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); - RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; -} + EVT NVT = EVT(); + SDLoc dl(N); + + // If the result is not legal, eg: fp -> i1, then it needs to be promoted to + // a larger type, eg: fp -> i32. Even if it is legal, no libcall may exactly + // match, eg. we don't have fp -> i8 conversions. + // Look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE; + IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++IntVT) { + NVT = (MVT::SimpleValueType)IntVT; + // The type needs to be big enough to hold the result. + if (NVT.bitsGE(RVT)) + LC = Signed ?
RTLIB::getFPTOSINT(SVT, NVT):RTLIB::getFPTOUINT(SVT, NVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!"); -SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) { - EVT RVT = N->getValueType(0); - RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first; + + // Truncate the result if the libcall returns a larger type. + return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res); } SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { @@ -1028,7 +1133,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1102,7 +1207,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - N->getValueType(0), Ops, 3, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1116,7 +1221,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1231,7 +1336,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1310,7 +1415,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); - Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl).first; + Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first; GetPairElements(Hi, Lo, Hi); } @@ -1341,6 +1446,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, break; } + // TODO: Are there fast-math-flags to propagate to this FADD? Lo = DAG.getNode(ISD::FADD, dl, VT, Hi, DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble, APInt(128, Parts)), @@ -1494,7 +1600,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl).first; + return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { @@ -1511,6 +1617,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. + // TODO: Are there fast-math-flags to propagate to this FSUB? 
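+  // (Why the select below is correct for X in [0, 2^32): when X >= 2^31,
+  // X - 2^31 fits in a signed i32, so the inner FP_TO_SINT is exact, and
+  // adding 0x80000000 restores the bias; when X < 2^31 the plain signed
+  // conversion already yields the correct unsigned value.)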
return DAG.getSelectCC(dl, N->getOperand(0), Tmp, DAG.getNode(ISD::ADD, dl, MVT::i32, DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, @@ -1527,7 +1634,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0), false, dl).first; } @@ -1912,8 +2019,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_BinOp(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); - - return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1); + return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, N->getFlags()); } SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 9f060a09..74f80db 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -53,6 +53,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break; case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break; case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break; + case ISD::BITREVERSE: Res = PromoteIntRes_BITREVERSE(N); break; case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; @@ -65,16 +66,20 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break; - case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break; + case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break; + case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N)); + break; + case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N)); + break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break; case ISD::SMIN: - case ISD::SMAX: + case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UMIN: - case ISD::UMAX: Res = PromoteIntRes_SimpleIntBinOp(N); break; + case ISD::UMAX: Res = PromoteIntRes_ZExtIntBinOp(N); break; + case ISD::SHL: Res = PromoteIntRes_SHL(N); break; case ISD::SIGN_EXTEND_INREG: Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break; @@ -114,10 +119,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: - case ISD::SREM: Res = PromoteIntRes_SDIV(N); break; + case ISD::SREM: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UDIV: - case ISD::UREM: Res = PromoteIntRes_UDIV(N); break; + case ISD::UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break; case ISD::SADDO: case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break; @@ -180,7 +185,7 @@ SDValue 
DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -193,7 +198,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -257,12 +262,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); case TargetLowering::TypePromoteFloat: { // Convert the promoted float by hand. - if (NOutVT.bitsEq(NInVT)) { - SDValue PromotedOp = GetPromotedFloat(InOp); - SDValue Trunc = DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); - return DAG.getNode(ISD::AssertZext, dl, NOutVT, Trunc, - DAG.getValueType(OutVT)); - } + SDValue PromotedOp = GetPromotedFloat(InOp); + return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); break; } case TargetLowering::TypeExpandInteger: @@ -316,6 +317,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } +SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { + SDValue Op = GetPromotedInteger(N->getOperand(0)); + EVT OVT = N->getValueType(0); + EVT NVT = Op.getValueType(); + SDLoc dl(N); + + unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); + return DAG.getNode( + ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), + DAG.getConstant(DiffBits, dl, + TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); +} + SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. @@ -465,7 +479,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -475,20 +489,34 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); - SDValue Mask = N->getMask(); - EVT NewMaskVT = getSetCCResultType(NVT); - if (NewMaskVT != N->getMask().getValueType()) - Mask = PromoteTargetBoolean(Mask, NewMaskVT); SDLoc dl(N); - SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - Mask, ExtSrc0, N->getMemoryVT(), + N->getMask(), ExtSrc0, N->getMemoryVT(), N->getMemOperand(), ISD::SEXTLOAD); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + +SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue ExtSrc0 = GetPromotedInteger(N->getValue()); + assert(NVT == ExtSrc0.getValueType() && + "Gather result type and the passThru argument type should be the same"); + + SDLoc dl(N); + SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(), + N->getIndex()}; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } + /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. @@ -534,14 +562,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) { - // Sign extend the input. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); -} - SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); @@ -629,6 +649,22 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { LHS.getValueType(), LHS, RHS); } +SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { + // Sign extend the input. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + SDValue RHS = SExtPromotedInteger(N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { + // Zero extend the input. + SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -770,14 +806,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { return Mul; } -SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) { - // Zero extend the input.
- SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); -} - SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); @@ -875,6 +903,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N), OpNo); break; + case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N), + OpNo); break; + case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N), + OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; @@ -1143,56 +1175,49 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } -SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, + unsigned OpNo) { SDValue DataOp = N->getValue(); EVT DataVT = DataOp.getValueType(); SDValue Mask = N->getMask(); - EVT MaskVT = Mask.getValueType(); SDLoc dl(N); bool TruncateStore = false; - if (!TLI.isTypeLegal(DataVT)) { - if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) { - DataOp = GetPromotedInteger(DataOp); - if (!TLI.isTypeLegal(MaskVT)) - Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); - TruncateStore = true; - } + if (OpNo == 2) { + // Mask comes before the data operand. If the data operand is legal, we just + // promote the mask. + // When the data operand has illegal type, we should legalize the data + // operand first. The mask will be promoted/split/widened according to + // the data operand type.
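+    // (Operand layout assumed by the OpNo checks in this function:
+    //   0 = chain, 1 = base pointer, 2 = mask, 3 = data value;
+    // hence the re-entry below with OpNo 3 once the data operand itself
+    // needs to be legalized.)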
+ if (TLI.isTypeLegal(DataVT)) + Mask = PromoteTargetBoolean(Mask, DataVT); else { - assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector && - "Unexpected data legalization in MSTORE"); - DataOp = GetWidenedVector(DataOp); - - if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) - Mask = GetWidenedVector(Mask); - else { - EVT BoolVT = getSetCCResultType(DataOp.getValueType()); + if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) + return PromoteIntOp_MSTORE(N, 3); - // We can't use ModifyToType() because we should fill the mask with - // zeroes - unsigned WidenNumElts = BoolVT.getVectorNumElements(); - unsigned MaskNumElts = MaskVT.getVectorNumElements(); + else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector) + return WidenVecOp_MSTORE(N, 3); - unsigned NumConcat = WidenNumElts / MaskNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); - SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT); - Ops[0] = Mask; - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = ZeroVal; - - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + else { + assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector); + return SplitVecOp_MSTORE(N, 3); } } + } else { // Data operand + assert(OpNo == 3 && "Unexpected operand for promotion"); + DataOp = GetPromotedInteger(DataOp); + Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); + TruncateStore = true; } - else - Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType()); + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, + unsigned OpNo) { assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); @@ -1201,6 +1226,31 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, + unsigned OpNo) { + + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValueType(0); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, + unsigned OpNo) { + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValue().getValueType(); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); @@ -1259,6 +1309,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break; case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; + case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; 
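+  // (Note on the ISD::BITREVERSE case above: reversing a 2N-bit value is
+  // reversing each N-bit half and swapping the halves, e.g.
+  //   rev64(x) == ((uint64_t)rev32(lo32(x)) << 32) | rev32(hi32(x));
+  // the expansion implemented further below relies on this identity.)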
case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: @@ -1270,6 +1321,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break; case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break; + case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break; case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break; case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break; @@ -1763,12 +1815,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } -void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo, - SDValue &Lo, SDValue &Hi) { - SDValue Res = DisintegrateMERGE_VALUES(N, ResNo); - SplitInteger(Res, Lo, Hi); -} - void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -1834,6 +1880,14 @@ void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N, } } +void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDLoc dl(N); + GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. + Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo); + Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi); +} + void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); @@ -1918,8 +1972,7 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first, Lo, Hi); } @@ -1934,8 +1987,7 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first, Lo, Hi); } @@ -2055,7 +2107,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, } } - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Ch); } @@ -2096,11 +2148,21 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other); + SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0)); + Lo = R.getValue(0); + Hi = R.getValue(1); + ReplaceValueWith(SDValue(N, 1), R.getValue(2)); +} + void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); @@ -2166,7 +2228,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, @@ -2261,8 +2323,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo, - Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi); return; } @@ -2352,7 +2413,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, @@ -2499,7 +2560,7 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, @@ -2525,7 +2586,7 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, @@ -2605,6 +2666,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; + case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; @@ -2732,6 +2794,47 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, return; } + if (LHSHi == RHSHi) { + // Comparing the low bits is enough. 
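+    // (Rationale: when the two high halves are literally the same value,
+    // the wide comparison is decided by the unsigned comparison of the low
+    // halves, for signed and unsigned predicates alike - and Tmp1, computed
+    // above, is exactly that low-half compare.)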
+ NewLHS = Tmp1; + NewRHS = SDValue(); + return; + } + + // Lower with SETCCE if the target supports it. + // FIXME: Make all targets support this, then remove the other lowering. + if (TLI.getOperationAction( + ISD::SETCCE, + TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == + TargetLowering::Custom) { + // SETCCE can detect < and >= directly. For > and <=, flip operands and + // condition code. + bool FlipOperands = false; + switch (CCCode) { + case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; + case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break; + case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break; + case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break; + default: break; + } + if (FlipOperands) { + std::swap(LHSLo, RHSLo); + std::swap(LHSHi, RHSHi); + } + // Perform a wide subtraction, feeding the carry from the low part into + // SETCCE. The SETCCE operation is essentially looking at the high part of + // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or + // positive iff LHS >= RHS. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); + SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); + SDValue Res = + DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), + LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); + NewLHS = Res; + NewRHS = SDValue(); + return; + } + NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ, false, DagCombineInfo, dl); @@ -2796,6 +2899,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { DAG.getCondCode(CCCode)), 0); } +SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + SDLoc dl = SDLoc(N); + + SDValue LHSLo, LHSHi, RHSLo, RHSHi; + GetExpandedInteger(LHS, LHSLo, LHSHi); + GetExpandedInteger(RHS, RHSLo, RHSHi); + + // Expand to a SUBE for the low part and a smaller SETCCE for the high. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); + SDValue LowCmp = DAG.getNode(ISD::SUBE, dl, VTList, LHSLo, RHSLo, Carry); + return DAG.getNode(ISD::SETCCE, dl, N->getValueType(0), LHSHi, RHSHi, + LowCmp.getValue(1), Cond); +} + SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the @@ -2820,7 +2941,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -2980,11 +3101,10 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? 
- SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), - FudgePtr, - MachinePointerInfo::getConstantPool(), - MVT::f32, - false, false, false, Alignment); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); } @@ -2992,7 +3112,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first; + return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first; } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 54cfaf5..2a0b0aa 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -73,21 +73,20 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { // (for example because it was created but not used). In general, we cannot // distinguish between new nodes and deleted nodes. SmallVector<SDNode*, 16> NewNodes; - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { + for (SDNode &Node : DAG.allnodes()) { // Remember nodes marked NewNode - they are subject to extra checking below. - if (I->getNodeId() == NewNode) - NewNodes.push_back(I); + if (Node.getNodeId() == NewNode) + NewNodes.push_back(&Node); - for (unsigned i = 0, e = I->getNumValues(); i != e; ++i) { - SDValue Res(I, i); + for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) { + SDValue Res(&Node, i); bool Failed = false; unsigned Mapped = 0; if (ReplacedValues.find(Res) != ReplacedValues.end()) { Mapped |= 1; // Check that remapped values are only used by nodes marked NewNode. - for (SDNode::use_iterator UI = I->use_begin(), UE = I->use_end(); + for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end(); UI != UE; ++UI) if (UI.getUse().getResNo() == i) assert(UI->getNodeId() == NewNode && @@ -119,16 +118,16 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { if (WidenedVectors.find(Res) != WidenedVectors.end()) Mapped |= 128; - if (I->getNodeId() != Processed) { + if (Node.getNodeId() != Processed) { // Since we allow ReplacedValues to map deleted nodes, it may map nodes // marked NewNode too, since a deleted node may have been reallocated as // another node that has not been seen by the LegalizeTypes machinery. - if ((I->getNodeId() == NewNode && Mapped > 1) || - (I->getNodeId() != NewNode && Mapped != 0)) { + if ((Node.getNodeId() == NewNode && Mapped > 1) || + (Node.getNodeId() != NewNode && Mapped != 0)) { dbgs() << "Unprocessed value in a map!"; Failed = true; } - } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(I)) { + } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) { if (Mapped > 1) { dbgs() << "Value with legal type was transformed!"; Failed = true; @@ -194,13 +193,12 @@ bool DAGTypeLegalizer::run() { // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess' // (and remembering them) if they are leaves and assigning 'Unanalyzed' if // non-leaves. 
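// The walk just described follows a classic ready-list discipline; a
// self-contained model (plain C++; WLNode and its fields are illustrative,
// and the real pass additionally re-analyzes nodes whose operands are
// replaced along the way):
#include <vector>
struct WLNode {
  std::vector<WLNode *> Users;
  int Pending = 0; // operands not yet processed
};
static void ProcessAll(std::vector<WLNode *> Worklist /*the leaves*/) {
  while (!Worklist.empty()) {
    WLNode *N = Worklist.back();
    Worklist.pop_back();
    // ... legalize N's results and operands here ...
    for (WLNode *U : N->Users)
      if (--U->Pending == 0) // last operand processed: user becomes ready
        Worklist.push_back(U);
  }
}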
- for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { - if (I->getNumOperands() == 0) { - I->setNodeId(ReadyToProcess); - Worklist.push_back(I); + for (SDNode &Node : DAG.allnodes()) { + if (Node.getNumOperands() == 0) { + Node.setNodeId(ReadyToProcess); + Worklist.push_back(&Node); } else { - I->setNodeId(Unanalyzed); + Node.setNodeId(Unanalyzed); } } @@ -240,9 +238,13 @@ bool DAGTypeLegalizer::run() { Changed = true; goto NodeDone; case TargetLowering::TypeSoftenFloat: - SoftenFloatResult(N, i); - Changed = true; - goto NodeDone; + Changed = SoftenFloatResult(N, i); + if (Changed) + goto NodeDone; + // If not changed, the result type should be legal in a register. + assert(isLegalInHWReg(ResultVT) && + "Unchanged SoftenFloatResult should be legal in register!"); + goto ScanOperands; case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; @@ -409,40 +411,48 @@ NodeDone: // In a debug build, scan all the nodes to make sure we found them all. This // ensures that there are no cycles and that everything got processed. #ifndef NDEBUG - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { + for (SDNode &Node : DAG.allnodes()) { bool Failed = false; // Check that all result types are legal. - if (!IgnoreNodeResults(I)) - for (unsigned i = 0, NumVals = I->getNumValues(); i < NumVals; ++i) - if (!isTypeLegal(I->getValueType(i))) { - dbgs() << "Result type " << i << " illegal!\n"; + // A value type is illegal if its TypeAction is not TypeLegal, + // and TLI.RegClassForVT does not have a register class for this type. + // For example, the x86_64 target marks f128 as not TypeLegal in order to + // soften some operations, but it also has an FR128 register class to + // pass and return f128 values. Hence a legalized node can have f128 type. + if (!IgnoreNodeResults(&Node)) + for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i) + if (!isTypeLegal(Node.getValueType(i)) && + !TLI.isTypeLegal(Node.getValueType(i))) { + dbgs() << "Result type " << i << " illegal: "; + Node.dump(); Failed = true; } // Check that all operand types are legal. 
- for (unsigned i = 0, NumOps = I->getNumOperands(); i < NumOps; ++i) - if (!IgnoreNodeResults(I->getOperand(i).getNode()) && - !isTypeLegal(I->getOperand(i).getValueType())) { - dbgs() << "Operand type " << i << " illegal!\n"; + for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i) + if (!IgnoreNodeResults(Node.getOperand(i).getNode()) && + !isTypeLegal(Node.getOperand(i).getValueType()) && + !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { + dbgs() << "Operand type " << i << " illegal: "; + Node.getOperand(i).dump(); Failed = true; } - if (I->getNodeId() != Processed) { - if (I->getNodeId() == NewNode) + if (Node.getNodeId() != Processed) { + if (Node.getNodeId() == NewNode) dbgs() << "New node not analyzed?\n"; - else if (I->getNodeId() == Unanalyzed) + else if (Node.getNodeId() == Unanalyzed) dbgs() << "Unanalyzed node not noticed?\n"; - else if (I->getNodeId() > 0) + else if (Node.getNodeId() > 0) dbgs() << "Operand not processed?\n"; - else if (I->getNodeId() == ReadyToProcess) + else if (Node.getNodeId() == ReadyToProcess) dbgs() << "Not added to worklist?\n"; Failed = true; } if (Failed) { - I->dump(&DAG); dbgs() << "\n"; + Node.dump(&DAG); dbgs() << "\n"; llvm_unreachable(nullptr); } } @@ -751,13 +761,23 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { } void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { - assert(Result.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) && + // On x86_64, f128 can be kept in SSE registers, + // but is sometimes softened to i128. + assert((Result.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && "Invalid type for softened float"); AnalyzeNewValue(Result); SDValue &OpEntry = SoftenedFloats[Op]; - assert(!OpEntry.getNode() && "Node is already converted to integer!"); + // Allow repeated calls for f128 nodes, or for any node whose type + // transforms to itself, since many operations on these types + // are not softened. + assert((!OpEntry.getNode() || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && + "Node is already converted to integer!"); OpEntry = Result; } @@ -1042,23 +1062,22 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, unsigned NumOps = N->getNumOperands(); SDLoc dl(N); if (NumOps == 0) { - return TLI.makeLibCall(DAG, LC, N->getValueType(0), nullptr, 0, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, isSigned, dl).first; } else if (NumOps == 1) { SDValue Op = N->getOperand(0); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, isSigned, dl).first; } else if (NumOps == 2) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first; } SmallVector<SDValue, 8> Ops(NumOps); for (unsigned i = 0; i < NumOps; ++i) Ops[i] = N->getOperand(i); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), - &Ops[0], NumOps, isSigned, dl).first; + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first; } // ExpandChainLibCall - Expand a node into a call to a libcall. 
Similar to @@ -1108,6 +1127,23 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { return DAG.getNode(ExtendCode, dl, BoolVT, Bool); } +/// WidenTargetBoolean - Widen the given target boolean to a target boolean +/// of the given type. The boolean vector is widened and then promoted to match +/// the target boolean type of the given ValVT. +SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT, + bool WithZeroes) { + SDLoc dl(Bool); + EVT BoolVT = Bool.getValueType(); + + assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() && + TLI.isTypeLegal(ValVT) && + "Unexpected types in WidenTargetBoolean"); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(), + ValVT.getVectorNumElements()); + Bool = ModifyToType(Bool, WideVT, WithZeroes); + return PromoteTargetBoolean(Bool, ValVT); +} + /// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT /// bits in Hi. void DAGTypeLegalizer::SplitInteger(SDValue Op, diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d1131a7..8ba19f7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -72,6 +72,20 @@ private: return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } + /// isSimpleLegalType - Return true if this is a simple legal type. + bool isSimpleLegalType(EVT VT) const { + return VT.isSimple() && TLI.isTypeLegal(VT); + } + + /// isLegalInHWReg - Return true if this type can be passed in registers. + /// For example, x86_64's f128 should be legal in registers, with only + /// some operations converted to library calls or integer + /// bitwise operations. + bool isLegalInHWReg(EVT VT) const { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return VT == NVT && isSimpleLegalType(VT); + } + EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } @@ -173,6 +187,11 @@ private: std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node); SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT); + + /// Modify the bit vector to match the SetCC result type of ValVT. + /// The bit vector is widened with zeroes when WithZeroes is true. 
+ SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false); + void ReplaceValueWith(SDValue From, SDValue To); void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi); void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, @@ -234,6 +253,7 @@ private: SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntRes_BITCAST(SDNode *N); SDValue PromoteIntRes_BSWAP(SDNode *N); + SDValue PromoteIntRes_BITREVERSE(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N); @@ -246,21 +266,22 @@ private: SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); + SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_SDIV(SDNode *N); SDValue PromoteIntRes_SELECT(SDNode *N); SDValue PromoteIntRes_VSELECT(SDNode *N); SDValue PromoteIntRes_SELECT_CC(SDNode *N); SDValue PromoteIntRes_SETCC(SDNode *N); SDValue PromoteIntRes_SHL(SDNode *N); SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N); + SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N); + SDValue PromoteIntRes_SExtIntBinOp(SDNode *N); SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N); SDValue PromoteIntRes_SRA(SDNode *N); SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_UDIV(SDNode *N); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); @@ -276,7 +297,6 @@ private: SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); - SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); @@ -284,7 +304,6 @@ private: SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); - SDValue PromoteIntOp_VSETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N); SDValue PromoteIntOp_SINT_TO_FP(SDNode *N); @@ -294,6 +313,8 @@ private: SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -312,8 +333,6 @@ private: // Integer Result Expansion. 
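// The Lo/Hi pairs these ExpandIntRes_* routines produce can be modeled in
// plain C++; for instance, an expanded 128-bit add is the carry-chained
// pair that ExpandIntRes_ADDSUB emits as ADDC/ADDE (a sketch with
// illustrative names, not the legalizer code):
#include <cstdint>
struct U128 { uint64_t Lo, Hi; };
static U128 Add128(U128 A, U128 B) {
  U128 R;
  R.Lo = A.Lo + B.Lo;           // ADDC: low half, produces a carry
  uint64_t Carry = R.Lo < A.Lo; // carry-out of the low addition
  R.Hi = A.Hi + B.Hi + Carry;   // ADDE: high half consumes the carry
  return R;
}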
void ExpandIntegerResult(SDNode *N, unsigned ResNo); - void ExpandIntRes_MERGE_VALUES (SDNode *N, unsigned ResNo, - SDValue &Lo, SDValue &Hi); void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -322,6 +341,7 @@ private: void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_READCYCLECOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -333,6 +353,7 @@ private: void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -354,12 +375,10 @@ private: // Integer Operand Expansion. bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo); - SDValue ExpandIntOp_BITCAST(SDNode *N); SDValue ExpandIntOp_BR_CC(SDNode *N); - SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N); - SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); + SDValue ExpandIntOp_SETCCE(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); @@ -375,32 +394,48 @@ private: // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// - /// GetSoftenedFloat - Given a processed operand Op which was converted to an - /// integer of the same size, this returns the integer. The integer contains - /// exactly the same bits as Op - only the type changed. For example, if Op - /// is an f32 which was softened to an i32, then this method returns an i32, - /// the bits of which coincide with those of Op. + /// GetSoftenedFloat - Given an operand Op of float type, returns the integer + /// it was converted to if Op is not supported by the target HW. + /// The integer contains exactly the same bits as Op - only the type changed. + /// For example, if Op is an f32 which was softened to an i32, then this method + /// returns an i32, the bits of which coincide with those of Op. + /// If Op can be efficiently supported by the target HW, or must stay in + /// a register, it is not converted to an integer; in that case, the given + /// Op is returned unchanged. SDValue GetSoftenedFloat(SDValue Op) { SDValue &SoftenedOp = SoftenedFloats[Op]; + if (!SoftenedOp.getNode() && + isSimpleLegalType(Op.getValueType())) + return Op; RemapValue(SoftenedOp); assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?"); return SoftenedOp; } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Result Float to Integer Conversion. - void SoftenFloatResult(SDNode *N, unsigned OpNo); + // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary. 
+ void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) { + // When the result type can be kept in HW registers, the converted + // NewRes node could have the same type. We can save the effort of + // cloning every user of N in SoftenFloatOperand or other legalization + // functions by calling ReplaceValueWith here to update all users. + if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo))) + ReplaceValueWith(SDValue(N, ResNo), NewRes); + } + + // Convert Float Results to Integer for Non-HW-supported Operations. + bool SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_BITCAST(SDNode *N); + SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); - SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); + SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); - SDValue SoftenFloatRes_FABS(SDNode *N); + SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); - SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); + SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); @@ -412,7 +447,7 @@ private: SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); - SDValue SoftenFloatRes_FNEG(SDNode *N); + SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); @@ -425,21 +460,25 @@ private: SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); - SDValue SoftenFloatRes_LOAD(SDNode *N); - SDValue SoftenFloatRes_SELECT(SDNode *N); - SDValue SoftenFloatRes_SELECT_CC(SDNode *N); + SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); - // Operand Float to Integer Conversion. + // Return true if we can skip softening the given operand or SDNode because + // it was softened earlier by SoftenFloatResult and references to the operand + // were replaced by ReplaceValueWith. + bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); + + // Convert Float Operand to Integer for Non-HW-supported Operations. 
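// A value-level sketch of what softening means (SoftenF32 is an illustrative
// helper; __addsf3 is the usual compiler-rt libcall name): a softened f32 is
// just its i32 bit pattern, and arithmetic on it becomes libcalls such as
// __addsf3 instead of FP instructions.
#include <cstdint>
#include <cstring>
static uint32_t SoftenF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // identical bits, integer type
  return Bits;
}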
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); - SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N); - SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N); + SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N); SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); @@ -575,7 +614,6 @@ private: SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); - SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); @@ -617,20 +655,18 @@ private: void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_MGATHER(MaskedGatherSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi); @@ -650,6 +686,7 @@ private: SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); + SDValue SplitVecOp_FCOPYSIGN(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp @@ -680,8 +717,8 @@ private: SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); + SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); - SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); @@ -693,6 +730,7 @@ private: SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); SDValue WidenVecRes_Convert(SDNode *N); + SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); @@ -707,9 +745,11 @@ private: SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); + SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_Convert(SDNode *N); + SDValue 
WidenVecOp_FCOPYSIGN(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Utilities Support: LegalizeVectorTypes.cpp @@ -745,8 +785,10 @@ private: /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. - SDValue ModifyToType(SDValue InOp, EVT WidenVT); - + /// When FillWithZeroes is "on" the vector will be widened with + /// zeroes. + /// By default, the vector will be widened with undefined values. + SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 14d8f77..593c346 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -53,12 +53,17 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { case TargetLowering::TypePromoteFloat: llvm_unreachable("Bitcast of a promotion-needing float should never need" "expansion"); - case TargetLowering::TypeSoftenFloat: - // Convert the integer operand instead. - SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); + case TargetLowering::TypeSoftenFloat: { + // Expand the floating point operand only if it was converted to integers. + // Otherwise, it is a legal type like f128 that can be saved in a register. + auto SoftenedOp = GetSoftenedFloat(InOp); + if (SoftenedOp == InOp) + break; + SplitInteger(SoftenedOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; + } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: { auto &DL = DAG.getDataLayout(); @@ -161,7 +166,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { NOutVT.getTypeForEVT(*DAG.getContext())); SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment); int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); // Emit a store to the stack slot. SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo, diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 83d4ad5..f61f631 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,8 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandBITREVERSE(SDValue Op); + SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op); /// \brief Implements vector promotion. /// @@ -159,7 +161,7 @@ bool VectorLegalizer::Run() { DAG.AssignTopologicalOrder(); for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I) - LegalizeOp(SDValue(I, 0)); + LegalizeOp(SDValue(&*I, 0)); // Finally, it's possible the root changed. Get the new root. 
SDValue OldRoot = DAG.getRoot(); @@ -218,9 +220,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { assert(Result.getValue(1).use_empty() && "There are still live users of the old chain!"); return LegalizeOp(Lowered); - } else { - return TranslateLegalizeResults(Op, Lowered); } + return TranslateLegalizeResults(Op, Lowered); } case TargetLowering::Expand: Changed = true; @@ -231,7 +232,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { EVT StVT = ST->getMemoryVT(); MVT ValVT = ST->getValue().getSimpleValueType(); if (StVT.isVector() && ST->isTruncatingStore()) - switch (TLI.getTruncStoreAction(ValVT, StVT.getSimpleVT())) { + switch (TLI.getTruncStoreAction(ValVT, StVT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: return TranslateLegalizeResults(Op, Result); @@ -244,7 +245,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Changed = true; return LegalizeOp(ExpandStore(Op)); } - } else if (Op.getOpcode() == ISD::MSCATTER) + } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE) HasVectorValue = true; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); @@ -265,6 +266,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::UDIV: case ISD::SREM: case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: case ISD::FADD: case ISD::FSUB: case ISD::FMUL: @@ -279,6 +282,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::ROTL: case ISD::ROTR: case ISD::BSWAP: + case ISD::BITREVERSE: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: @@ -298,6 +302,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FABS: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: case ISD::FCOPYSIGN: case ISD::FSQRT: case ISD::FSIN: @@ -338,9 +344,13 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::MSCATTER: QueryType = cast<MaskedScatterSDNode>(Node)->getValue().getValueType(); break; + case ISD::MSTORE: + QueryType = cast<MaskedStoreSDNode>(Node)->getValue().getValueType(); + break; } switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { + default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Promote: Result = Promote(Op); Changed = true; @@ -411,7 +421,7 @@ SDValue VectorLegalizer::Promote(SDValue Op) { Operands[j] = Op.getOperand(j); } - Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands); + Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands, Op.getNode()->getFlags()); if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) || (VT.isVector() && VT.getVectorElementType().isFloatingPoint() && NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())) @@ -708,6 +718,11 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::BITREVERSE: + return ExpandBITREVERSE(Op); + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: + return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -893,6 +908,25 @@ SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) { return DAG.getNode(ISD::BITCAST, DL, VT, Op); } +SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) { + EVT VT = Op.getValueType(); + + // If we have the scalar operation, it's probably cheaper to unroll it. 
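// Whichever branch is taken below, each element ultimately computes the
// classic mask-and-shift reversal; a 32-bit scalar model of it (plain C++,
// illustrative only):
#include <cstdint>
static uint32_t BitReverse32(uint32_t V) {
  V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1); // single bits
  V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2); // bit pairs
  V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4); // nibbles
  V = ((V >> 8) & 0x00FF00FFu) | ((V & 0x00FF00FFu) << 8); // bytes
  V = (V >> 16) | (V << 16);                               // half-words
  return V;
}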
+ if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType())) + return DAG.UnrollVectorOp(Op.getNode()); + + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. + if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) || + !TLI.isOperationLegalOrCustom(ISD::SRL, VT) || + !TLI.isOperationLegalOrCustom(ISD::AND, VT) || + !TLI.isOperationLegalOrCustom(ISD::OR, VT)) + return DAG.UnrollVectorOp(Op.getNode()); + + // Let LegalizeDAG handle this later. + return Op; +} + SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { // Implement VSELECT in terms of XOR, AND, OR // on platforms which do not support blend natively. @@ -971,6 +1005,7 @@ SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { // Convert hi and lo to floats // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), HI); fHI = DAG.getNode(ISD::FMUL, DL, Op.getValueType(), fHI, TWOHW); SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), LO); @@ -984,12 +1019,23 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) { if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) { SDLoc DL(Op); SDValue Zero = DAG.getConstantFP(-0.0, DL, Op.getValueType()); + // TODO: If FNEG had fast-math-flags, they'd get propagated to this FSUB. return DAG.getNode(ISD::FSUB, DL, Op.getValueType(), Zero, Op.getOperand(0)); } return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) { + // If the non-ZERO_UNDEF version is supported we can let LegalizeDAG handle. + unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ; + if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) + return Op; + + // Otherwise go ahead and unroll. 
+ return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumElems = VT.getVectorNumElements(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 51cd661..d0187d3 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -67,6 +67,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; case ISD::ANY_EXTEND: + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: @@ -108,6 +109,12 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: case ISD::FPOW: case ISD::FREM: @@ -139,7 +146,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); + LHS.getValueType(), LHS, RHS, N->getFlags()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { @@ -228,7 +235,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { N->isInvariant(), N->getOriginalAlignment(), N->getAAInfo()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); return Result; @@ -594,6 +601,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; + case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; @@ -613,6 +621,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi); break; + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CONVERT_RNDSAT: case ISD::CTLZ: @@ -656,11 +665,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SUB: case ISD::MUL: case ISD::FADD: - case ISD::FCOPYSIGN: case ISD::FSUB: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: case ISD::SDIV: case ISD::UDIV: case ISD::FDIV: @@ -698,8 +708,10 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); - Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo); - Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi); + const SDNodeFlags *Flags = N->getFlags(); + unsigned Opcode = N->getOpcode(); + Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); + Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); } void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, @@ -870,6 +882,25 @@ void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1)); } +void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue LHSLo, LHSHi; + GetSplitVector(N->getOperand(0), LHSLo, LHSHi); + SDLoc DL(N); + + SDValue RHSLo, RHSHi; + SDValue RHS = N->getOperand(1); + EVT RHSVT = RHS.getValueType(); + if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) + GetSplitVector(RHS, RHSLo, RHSHi); + else + std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); + + + Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo); + Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); +} + void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; @@ -989,7 +1020,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(LD, 1), Ch); } @@ -1003,6 +1034,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); SDValue Mask = MLD->getMask(); + SDValue Src0 = MLD->getSrc0(); unsigned Alignment = MLD->getOriginalAlignment(); ISD::LoadExtType ExtType = MLD->getExtensionType(); @@ -1012,16 +1044,22 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? 
Alignment/2 : Alignment; + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -1049,7 +1087,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MLD, 1), Ch); @@ -1064,20 +1102,33 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue Ch = MGT->getChain(); SDValue Ptr = MGT->getBasePtr(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); + SDValue Index = MGT->getIndex(); unsigned Alignment = MGT->getOriginalAlignment(); + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; + // Split MemoryVT std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - std::tie(IndexLo, IndexHi) = DAG.SplitVector(MGT->getIndex(), dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); + else + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1097,7 +1148,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
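// The split halves above preserve simple per-lane semantics; a plain-C++
// model of a masked load with pass-through (the names are illustrative):
static void MaskedLoadModel(const float *Ptr, const bool *Mask,
                            const float *Src0, float *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = Mask[i] ? Ptr[i] : Src0[i]; // disabled lanes keep Src0
}
// Splitting simply runs this over [0, N/2) and [N/2, N), with the pointer,
// mask, and pass-through operands offset by N/2 for the high half.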
ReplaceValueWith(SDValue(MGT, 1), Ch); } @@ -1357,6 +1408,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { Res = SplitVecOp_TruncateHelper(N); break; case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; + case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; @@ -1567,23 +1619,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue Ptr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); unsigned Alignment = MGT->getOriginalAlignment(); SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1609,7 +1669,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(MGT, 1), Ch); @@ -1633,9 +1693,21 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + + MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType()); + MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType()); // if Alignment is equal to the vector size, // take the half of it for the second part @@ -1680,25 +1752,29 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); + // Split all operands EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); - SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); - SDValue PtrLo, PtrHi; - if (Ptr.getValueType().isVector()) // gather form vector of pointers - std::tie(PtrLo, PtrHi) = DAG.SplitVector(Ptr, DL); + SDValue MaskLo, MaskHi; + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); else - PtrLo = PtrHi = Ptr; + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). @@ -1706,7 +1782,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsLo[] = {Ch, DataLo, MaskLo, PtrLo, IndexLo}; + SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); @@ -1715,7 +1791,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsHi[] = {Ch, DataHi, MaskHi, PtrHi, IndexHi}; + SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi}; Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); @@ -1891,6 +1967,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } +SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { + // The result (and the first input) has a legal vector type, but the second + // input needs splitting. 
+ return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements()); +} //===----------------------------------------------------------------------===// @@ -1938,6 +2019,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N)); break; + case ISD::MGATHER: + Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N)); + break; case ISD::ADD: case ISD::AND: @@ -1949,11 +2033,16 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::XOR: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: Res = WidenVecRes_Binary(N); break; case ISD::FADD: - case ISD::FCOPYSIGN: case ISD::FMUL: case ISD::FPOW: case ISD::FSUB: @@ -1966,6 +2055,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_BinaryCanTrap(N); break; + case ISD::FCOPYSIGN: + Res = WidenVecRes_FCOPYSIGN(N); + break; + case ISD::FPOWI: Res = WidenVecRes_POWI(N); break; @@ -1989,6 +2082,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Convert(N); break; + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTPOP: @@ -2037,7 +2131,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); } SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { @@ -2048,6 +2142,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); + const SDNodeFlags *Flags = N->getFlags(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); @@ -2057,7 +2152,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags); } // No legal vector version so unroll the vector operation and then widen. 
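// A sketch of the "unroll then widen" strategy for a trapping op such as
// FDIV (plain C++ model; the padding lanes are undef in the DAG, zero here):
static void UnrollThenWiden(const float *A, const float *B, float *Out,
                            int NumElts, int WidenElts) {
  for (int i = 0; i < NumElts; ++i)
    Out[i] = A[i] / B[i]; // the scalar op, applied per original element
  for (int i = NumElts; i < WidenElts; ++i)
    Out[i] = 0.0f;        // filler lanes, never observed by users
}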
@@ -2087,7 +2182,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { SDValue EOp2 = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags); Idx += NumElts; CurNumElts -= NumElts; } @@ -2105,7 +2200,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, - EOp1, EOp2); + EOp1, EOp2, Flags); } CurNumElts = 0; } @@ -2195,7 +2290,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); - + const SDNodeFlags *Flags = N->getFlags(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); @@ -2203,7 +2298,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (InVTNumElts == WidenNumElts) { if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InOp); - return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } } @@ -2224,7 +2319,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); - return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags); } if (InVTNumElts % WidenNumElts == 0) { @@ -2234,7 +2329,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { // Extract the input and convert the shorten input vector. if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVal); - return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags); } } @@ -2250,7 +2345,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (N->getNumOperands() == 1) Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val); else - Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1)); + Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags); } SDValue UndefVal = DAG.getUNDEF(EltVT); @@ -2260,6 +2355,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); } +SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { + // If this is an FCOPYSIGN with same input types, we can treat it as a + // normal (can trap) binary op. + if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType()) + return WidenVecRes_BinaryCanTrap(N); + + // If the types are different, fall back to unrolling. 
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); +} + SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); @@ -2669,7 +2775,35 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), ExtType); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + +SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { + + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue Src0 = GetWidenedVector(N->getValue()); + unsigned NumElts = WideVT.getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen the Index operand + SDValue Index = N->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -2831,7 +2965,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; + case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; + case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -2928,6 +3064,13 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { } } +SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { + // The result (and first input) is legal, but the second input is illegal. + // We can't do much to fix that, so just unroll and let the extracts off of + // the second input be widened as needed later. 
+ return DAG.UnrollVectorOp(N); +} + SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // Since the result is legal and the input is illegal, it is unlikely // that we can fix the input to a legal type so unroll the convert @@ -3070,6 +3213,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { false); } +SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can widen only data operand of mscatter"); + MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N); + SDValue DataOp = MSC->getValue(); + SDValue Mask = MSC->getMask(); + + // Widen the value + SDValue WideVal = GetWidenedVector(DataOp); + EVT WideVT = WideVal.getValueType(); + unsigned NumElts = WideVal.getValueType().getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen index + SDValue Index = MSC->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + + SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + MSC->getMemoryVT(), dl, Ops, + MSC->getMemOperand()); +} + SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -3533,7 +3704,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain, /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. -SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { +/// FillWithZeroes specifies that the vector should be widened with zeroes. +SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, + bool FillWithZeroes) { // Note that InOp might have been widened so it might already have // the right width or it might need be narrowed. EVT InVT = InOp.getValueType(); @@ -3550,10 +3723,11 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) { unsigned NumConcat = WidenNumElts / InNumElts; SmallVector<SDValue, 16> Ops(NumConcat); - SDValue UndefVal = DAG.getUNDEF(InVT); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) : + DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; + Ops[i] = FillVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } @@ -3573,8 +3747,9 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - SDValue UndefVal = DAG.getUNDEF(EltVT); + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) - Ops[Idx] = UndefVal; + Ops[Idx] = FillVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 6303422..622e06f 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -49,7 +49,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) TII = STI.getInstrInfo(); ResourcesModel.reset(TII->CreateTargetScheduleState(STI)); // This hard requirement could be relaxed, but for now - // do not let it procede. + // do not let it proceed. assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); unsigned NumRC = TRI->getNumRegClasses(); @@ -269,12 +269,12 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { } // Now see if there are no other dependencies - // to instructions alredy in the packet. + // to instructions already in the packet. for (unsigned i = 0, e = Packet.size(); i != e; ++i) for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), E = Packet[i]->Succs.end(); I != E; ++I) { // Since we do not add pseudos to packets, might as well - // ignor order deps. + // ignore order deps. if (I->isCtrl()) continue; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 34e1a70..62e7733 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -440,7 +440,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -519,7 +519,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e9bd520..91024e6 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -141,8 +141,8 @@ private: /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. unsigned NumLiveRegs; - std::vector<SUnit*> LiveRegDefs; - std::vector<SUnit*> LiveRegGens; + std::unique_ptr<SUnit*[]> LiveRegDefs; + std::unique_ptr<SUnit*[]> LiveRegGens; // Collect interferences between physical register use/defs. // Each interference is an SUnit and set of physical registers. @@ -328,8 +328,8 @@ void ScheduleDAGRRList::Schedule() { NumLiveRegs = 0; // Allocate slots for each physical register, plus one for a special register // to track the virtual resource of a calling sequence. 
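// Hedged aside on the replacement below: array new with a trailing '()'
// value-initializes every element, so each slot starts out as nullptr exactly
// as the old resize(N, nullptr) guaranteed. A self-contained illustration:
#include <cassert>
#include <memory>
struct SUnit; // opaque here, as in the scheduler
void demo(unsigned NumRegs) {
  std::unique_ptr<SUnit *[]> Defs(new SUnit *[NumRegs + 1]());
  assert(Defs[0] == nullptr && "'()' value-initializes the array to null");
}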
- LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr); - LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr); + LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]()); + LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]()); CallSeqEndForStart.clear(); assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); @@ -1206,7 +1206,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -1218,7 +1218,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, - std::vector<SUnit*> &LiveRegDefs, + SUnit **LiveRegDefs, SmallSet<unsigned, 4> &RegAdded, SmallVectorImpl<unsigned> &LRegs, const TargetRegisterInfo *TRI) { @@ -1240,7 +1240,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, /// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered /// by RegMask, and add them to LRegs. static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask, - std::vector<SUnit*> &LiveRegDefs, + ArrayRef<SUnit*> LiveRegDefs, SmallSet<unsigned, 4> &RegAdded, SmallVectorImpl<unsigned> &LRegs) { // Look at all live registers. Skip Reg0 and the special CallResource. @@ -1278,7 +1278,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1302,7 +1302,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { for (; NumVals; --NumVals, ++i) { unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } } else i += NumVals; @@ -1328,13 +1328,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { } } if (const uint32_t *RegMask = getNodeRegMask(Node)) - CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs); + CheckForLiveRegDefMasked(SU, RegMask, + makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()), + RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) - CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) + CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } return !LRegs.empty(); @@ -2718,7 +2720,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, ScheduleDAGRRList *scheduleDAG, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - const uint16_t *ImpDefs + const MCPhysReg *ImpDefs = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs(); const uint32_t 
*RegMask = getNodeRegMask(SU->getNode()); if(!ImpDefs && !RegMask) @@ -2737,7 +2739,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, return true; if (ImpDefs) - for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef) + for (const MCPhysReg *ImpDef = ImpDefs; *ImpDef; ++ImpDef) // Return true if SU clobbers this physical register use and the // definition of the register reaches from DepSU. IsReachable queries // a topological forward sort of the DAG (following the successors). @@ -2756,13 +2758,13 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, const TargetRegisterInfo *TRI) { SDNode *N = SuccSU->getNode(); unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); - const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); + const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); assert(ImpDefs && "Caller should check hasPhysRegDefs"); for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (!SUNode->isMachineOpcode()) continue; - const uint16_t *SUImpDefs = + const MCPhysReg *SUImpDefs = TII->get(SUNode->getMachineOpcode()).getImplicitDefs(); const uint32_t *SURegMask = getNodeRegMask(SUNode); if (!SUImpDefs && !SURegMask) diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 159c28c..5cc8066 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -86,12 +86,6 @@ namespace llvm { /// flagged together nodes with a single SUnit. void BuildSchedGraph(AliasAnalysis *AA); - /// InitVRegCycleFlag - Set isVRegCycle if this node's single use is - /// CopyToReg and its only active data operands are CopyFromReg within a - /// single block loop. - /// - void InitVRegCycleFlag(SUnit *SU); - /// InitNumRegDefsLeft - Determine the # of regs defined by this node. /// void InitNumRegDefsLeft(SUnit *SU); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 14f44cc..96bf914 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" +#include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -210,28 +211,6 @@ bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) { return true; } -/// isScalarToVector - Return true if the specified node is a -/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low -/// element is not an undef. -bool ISD::isScalarToVector(const SDNode *N) { - if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) - return true; - - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - if (N->getOperand(0).getOpcode() == ISD::UNDEF) - return false; - unsigned NumElems = N->getNumOperands(); - if (NumElems == 1) - return false; - for (unsigned i = 1; i < NumElems; ++i) { - SDValue V = N->getOperand(i); - if (V.getOpcode() != ISD::UNDEF) - return false; - } - return true; -} - /// allOperandsUndef - Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. 
bool ISD::allOperandsUndef(const SDNode *N) { @@ -397,24 +376,21 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID, ID.AddInteger(Op.getResNo()); } } + /// Add logical or fast math flag values to FoldingSetNodeID value. static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode, const SDNodeFlags *Flags) { - if (!Flags || !isBinOpWithFlags(Opcode)) + if (!isBinOpWithFlags(Opcode)) return; - unsigned RawFlags = Flags->getRawFlags(); - // If no flags are set, do not alter the ID. We must match the ID of nodes - // that were created without explicitly specifying flags. This also saves time - // and allows a gradual increase in API usage of the optional optimization - // flags. - if (RawFlags != 0) - ID.AddInteger(RawFlags); + unsigned RawFlags = 0; + if (Flags) + RawFlags = Flags->getRawFlags(); + ID.AddInteger(RawFlags); } static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) { - if (auto *Node = dyn_cast<BinaryWithFlagsSDNode>(N)) - AddNodeIDFlags(ID, Node->getOpcode(), &Node->Flags); + AddNodeIDFlags(ID, N->getOpcode(), N->getFlags()); } static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, @@ -624,9 +600,9 @@ void SelectionDAG::RemoveDeadNodes() { SmallVector<SDNode*, 128> DeadNodes; // Add all obviously-dead nodes to the DeadNodes worklist. - for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I) - if (I->use_empty()) - DeadNodes.push_back(I); + for (SDNode &Node : allnodes()) + if (Node.use_empty()) + DeadNodes.push_back(&Node); RemoveDeadNodes(DeadNodes); @@ -766,6 +742,7 @@ static void VerifySDNode(SDNode *N) { void SelectionDAG::InsertNode(SDNode *N) { AllNodes.push_back(N); #ifndef NDEBUG + N->PersistentId = NextPersistentId++; VerifySDNode(N); #endif } @@ -929,7 +906,7 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), UpdateListeners(nullptr) { - AllNodes.push_back(&EntryNode); + InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } @@ -950,7 +927,10 @@ void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); while (!AllNodes.empty()) - DeallocateNode(AllNodes.begin()); + DeallocateNode(&AllNodes.front()); +#ifndef NDEBUG + NextPersistentId = 0; +#endif } BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL, @@ -1023,7 +1003,7 @@ void SelectionDAG::clear() { static_cast<SDNode*>(nullptr)); EntryNode.UseList = nullptr; - AllNodes.push_back(&EntryNode); + InsertNode(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); } @@ -1429,8 +1409,8 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, - TargetFlags); + SDNode *N = + new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -1852,8 +1832,58 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout()); if (OpTy == ShTy || OpTy.isVector()) return Op; - ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? 
ISD::TRUNCATE : ISD::ZERO_EXTEND; - return getNode(Opcode, SDLoc(Op), ShTy, Op); + return getZExtOrTrunc(Op, SDLoc(Op), ShTy); +} + +SDValue SelectionDAG::expandVAArg(SDNode *Node) { + SDLoc dl(Node); + const TargetLowering &TLI = getTargetLoweringInfo(); + const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = Node->getOperand(0); + SDValue Tmp2 = Node->getOperand(1); + unsigned Align = Node->getConstantOperandVal(3); + + SDValue VAListLoad = + getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2, + MachinePointerInfo(V), false, false, false, 0); + SDValue VAList = VAListLoad; + + if (Align > TLI.getMinStackArgumentAlignment()) { + assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + + VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, + getConstant(Align - 1, dl, VAList.getValueType())); + + VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, + getConstant(-(int64_t)Align, dl, VAList.getValueType())); + } + + // Increment the pointer, VAList, to the next vaarg + Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, + getConstant(getDataLayout().getTypeAllocSize( + VT.getTypeForEVT(*getContext())), + dl, VAList.getValueType())); + // Store the incremented VAList to the legalized pointer + Tmp1 = getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, + MachinePointerInfo(V), false, false, 0); + // Load the actual argument out of the pointer VAList + return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo(), + false, false, false, 0); +} + +SDValue SelectionDAG::expandVACopy(SDNode *Node) { + SDLoc dl(Node); + const TargetLowering &TLI = getTargetLoweringInfo(); + // This defaults to loading a pointer from the input and storing it to the + // output, returning the chain. + const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue(); + const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue(); + SDValue Tmp1 = getLoad(TLI.getPointerTy(getDataLayout()), dl, + Node->getOperand(0), Node->getOperand(2), + MachinePointerInfo(VS), false, false, false, 0); + return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), + MachinePointerInfo(VD), false, false, 0); } /// CreateStackTemporary - Create a stack temporary, suitable for holding the @@ -1872,8 +1902,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { /// CreateStackTemporary - Create a stack temporary suitable for holding /// either of the specified value types. SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { - unsigned Bytes = std::max(VT1.getStoreSizeInBits(), - VT2.getStoreSizeInBits())/8; + unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize()); Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); const DataLayout &DL = getDataLayout(); @@ -2255,7 +2284,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { - computeKnownBitsFromRangeMetadata(*Ranges, KnownZero); + if (LD->getExtensionType() == ISD::NON_EXTLOAD) + computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); } break; } @@ -2564,6 +2594,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ if (Tmp == 1) return 1; // Early out. 
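// For reference, the pointer alignment in the expandVAArg hunk above is the
// standard power-of-two round-up; a worked instance with assumed values
// P = 0x1003 and Align = 8: 0x1003 + 7 = 0x100A, and 0x100A & ~7 = 0x1008,
// the next 8-byte boundary.
#include <cassert>
#include <cstdint>
uint64_t alignUp(uint64_t P, uint64_t Align) { // illustrative helper only
  assert((Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (P + Align - 1) & ~(Align - 1);
}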
Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1); return std::min(Tmp, Tmp2); + case ISD::SELECT_CC: + Tmp = ComputeNumSignBits(Op.getOperand(2), Depth+1); + if (Tmp == 1) return 1; // Early out. + Tmp2 = ComputeNumSignBits(Op.getOperand(3), Depth+1); + return std::min(Tmp, Tmp2); case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -2679,7 +2714,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ const int rIndex = Items - 1 - cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - // If the sign portion ends in our element the substraction gives correct + // If the sign portion ends in our element the subtraction gives correct // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } @@ -2798,6 +2833,53 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } +bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { + assert(A.getValueType() == B.getValueType() && + "Values must have the same type"); + APInt AZero, AOne; + APInt BZero, BOne; + computeKnownBits(A, AZero, AOne); + computeKnownBits(B, BZero, BOne); + return (AZero | BZero).isAllOnesValue(); +} + +static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops, + llvm::SelectionDAG &DAG) { + if (Ops.size() == 1) + return Ops[0]; + + // Concat of UNDEFs is UNDEF. + if (std::all_of(Ops.begin(), Ops.end(), + [](SDValue Op) { return Op.isUndef(); })) + return DAG.getUNDEF(VT); + + // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified + // to one big BUILD_VECTOR. + // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well. + if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) { + return Op.getOpcode() == ISD::BUILD_VECTOR; + })) + return SDValue(); + + EVT SVT = VT.getScalarType(); + SmallVector<SDValue, 16> Elts; + for (SDValue Op : Ops) + Elts.append(Op->op_begin(), Op->op_end()); + + // BUILD_VECTOR requires all inputs to be of the same type, find the + // maximum type and extend them all. + for (SDValue Op : Elts) + SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); + + if (SVT.bitsGT(VT.getScalarType())) + for (SDValue &Op : Elts) + Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT) + ? DAG.getZExtOrTrunc(Op, DL, SVT) + : DAG.getSExtOrTrunc(Op, DL, SVT); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts); +} + /// getNode - Gets or creates the specified node. 
/// SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { @@ -2848,8 +2930,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT); - else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) + if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT); + if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) + return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), @@ -2954,44 +3038,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: { - EVT SVT = VT.getScalarType(); - EVT InVT = BV->getValueType(0); - EVT InSVT = InVT.getScalarType(); - - // Find legal integer scalar type for constant promotion and - // ensure that its scalar size is at least as large as source. - EVT LegalSVT = SVT; - if (SVT.isInteger()) { - LegalSVT = TLI->getTypeToTransformTo(*getContext(), SVT); - if (LegalSVT.bitsLT(SVT)) break; - } - - // Let the above scalar folding handle the folding of each element. - SmallVector<SDValue, 8> Ops; - for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - SDValue OpN = BV->getOperand(i); - EVT OpVT = OpN.getValueType(); - - // Build vector (integer) scalar operands may need implicit - // truncation - do this before constant folding. - if (OpVT.isInteger() && OpVT.bitsGT(InSVT)) - OpN = getNode(ISD::TRUNCATE, DL, InSVT, OpN); - - OpN = getNode(Opcode, DL, SVT, OpN); - - // Legalize the (integer) scalar constant if necessary. 
- if (LegalSVT != SVT) - OpN = getNode(ISD::ANY_EXTEND, DL, LegalSVT, OpN); - - if (OpN.getOpcode() != ISD::UNDEF && - OpN.getOpcode() != ISD::Constant && - OpN.getOpcode() != ISD::ConstantFP) - break; - Ops.push_back(OpN); - } - if (Ops.size() == VT.getVectorNumElements()) - return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); - break; + SDValue Ops = { Operand }; + if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) + return Fold; } } } @@ -3012,6 +3061,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid fpext node, dst < src!"); if (Operand.getOpcode() == ISD::UNDEF) return getUNDEF(VT); break; @@ -3019,12 +3070,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid sext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) @@ -3035,12 +3086,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid zext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getNode()->getOperand(0)); @@ -3052,12 +3103,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ANY_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid anyext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid anyext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) @@ -3077,12 +3128,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid TRUNCATE!"); if (Operand.getValueType() == VT) return Operand; // noop truncate - assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) && - "Invalid truncate node, src < dst!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsGT(VT) && + "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) return 
getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || @@ -3135,8 +3186,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) + // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0)); + Operand.getNode()->getOperand(0), + &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags); if (OpOpcode == ISD::FNEG) // --X -> X return Operand.getNode()->getOperand(0); break; @@ -3182,6 +3235,10 @@ static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1, case ISD::SRA: return std::make_pair(C1.ashr(C2), true); case ISD::ROTL: return std::make_pair(C1.rotl(C2), true); case ISD::ROTR: return std::make_pair(C1.rotr(C2), true); + case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true); + case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true); + case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true); + case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true); case ISD::UDIV: if (!C2.getBoolValue()) break; @@ -3284,10 +3341,118 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT, return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } +SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL, + EVT VT, + ArrayRef<SDValue> Ops, + const SDNodeFlags *Flags) { + // If the opcode is a target-specific ISD node, there's nothing we can + // do here and the operand rules may not line up with the below, so + // bail early. + if (Opcode >= ISD::BUILTIN_OP_END) + return SDValue(); + + // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? + if (!VT.isVector()) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + + auto IsScalarOrSameVectorSize = [&](const SDValue &Op) { + return !Op.getValueType().isVector() || + Op.getValueType().getVectorNumElements() == NumElts; + }; + + auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op); + return (Op.getOpcode() == ISD::UNDEF) || + (Op.getOpcode() == ISD::CONDCODE) || (BV && BV->isConstant()); + }; + + // All operands must be vector types with the same number of elements as + // the result type and must be either UNDEF or a build vector of constant + // or UNDEF scalars. + if (!std::all_of(Ops.begin(), Ops.end(), IsConstantBuildVectorOrUndef) || + !std::all_of(Ops.begin(), Ops.end(), IsScalarOrSameVectorSize)) + return SDValue(); + + // If we are comparing vectors, then the result needs to be a i1 boolean + // that is then sign-extended back to the legal result type. + EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); + + // Find legal integer scalar type for constant promotion and + // ensure that its scalar size is at least as large as source. + EVT LegalSVT = VT.getScalarType(); + if (LegalSVT.isInteger()) { + LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); + if (LegalSVT.bitsLT(SVT)) + return SDValue(); + } + + // Constant fold each scalar lane separately. 
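// (For instance, with assumed operands: folding ISD::ADD over
//  BUILD_VECTOR(1, 2) and BUILD_VECTOR(3, 4) visits lane 0 with the scalars
//  {1, 3} and lane 1 with {2, 4}, yielding BUILD_VECTOR(4, 6).)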
+ SmallVector<SDValue, 4> ScalarResults; + for (unsigned i = 0; i != NumElts; i++) { + SmallVector<SDValue, 4> ScalarOps; + for (SDValue Op : Ops) { + EVT InSVT = Op.getValueType().getScalarType(); + BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op); + if (!InBV) { + // We've checked that this is UNDEF or a constant of some kind. + if (Op.isUndef()) + ScalarOps.push_back(getUNDEF(InSVT)); + else + ScalarOps.push_back(Op); + continue; + } + + SDValue ScalarOp = InBV->getOperand(i); + EVT ScalarVT = ScalarOp.getValueType(); + + // Build vector (integer) scalar operands may need implicit + // truncation - do this before constant folding. + if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) + ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp); + + ScalarOps.push_back(ScalarOp); + } + + // Constant fold the scalar operands. + SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); + + // Legalize the (integer) scalar constant if necessary. + if (LegalSVT != SVT) + ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); + + // Scalar folding only succeeded if the result is a constant or UNDEF. + if (ScalarResult.getOpcode() != ISD::UNDEF && + ScalarResult.getOpcode() != ISD::Constant && + ScalarResult.getOpcode() != ISD::ConstantFP) + return SDValue(); + ScalarResults.push_back(ScalarResult); + } + + assert(ScalarResults.size() == NumElts && + "Unexpected number of scalar results for BUILD_VECTOR"); + return getNode(ISD::BUILD_VECTOR, DL, VT, ScalarResults); +} + SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags *Flags) { ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); + ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); + + // Canonicalize constant to RHS if commutative. + if (isCommutativeBinOp(Opcode)) { + if (N1C && !N2C) { + std::swap(N1C, N2C); + std::swap(N1, N2); + } else if (N1CFP && !N2CFP) { + std::swap(N1CFP, N2CFP); + std::swap(N1, N2); + } + } + switch (Opcode) { default: break; case ISD::TokenFactor: @@ -3298,34 +3463,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, if (N2.getOpcode() == ISD::EntryToken) return N1; if (N1 == N2) return N1; break; - case ISD::CONCAT_VECTORS: - // Concat of UNDEFs is UNDEF. - if (N1.getOpcode() == ISD::UNDEF && - N2.getOpcode() == ISD::UNDEF) - return getUNDEF(VT); - - // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to - // one big BUILD_VECTOR. - if (N1.getOpcode() == ISD::BUILD_VECTOR && - N2.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), - N1.getNode()->op_end()); - Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); - - // BUILD_VECTOR requires all inputs to be of the same type, find the - // maximum type and extend them all. - EVT SVT = VT.getScalarType(); - for (SDValue Op : Elts) - SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); - if (SVT.bitsGT(VT.getScalarType())) - for (SDValue &Op : Elts) - Op = TLI->isZExtFree(Op.getValueType(), SVT) - ? getZExtOrTrunc(Op, DL, SVT) - : getSExtOrTrunc(Op, DL, SVT); - - return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); - } + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. 
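// (Illustrative, operands assumed: a concat of two constant BUILD_VECTORs
//  becomes one wider BUILD_VECTOR; if the scalar types differ,
//  FoldCONCAT_VECTORS first extends every element to the widest type, with
//  zext when the target reports it free and sext otherwise, and a concat of
//  all-UNDEF operands folds straight to UNDEF.)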
+ SDValue Ops[] = {N1, N2}; + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; break; + } case ISD::AND: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && @@ -3356,6 +3500,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, case ISD::MUL: case ISD::SDIV: case ISD::SREM: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); @@ -3367,37 +3515,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, case ISD::FREM: if (getTarget().Options.UnsafeFPMath) { if (Opcode == ISD::FADD) { - // 0+x --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) - if (CFP->getValueAPF().isZero()) - return N2; // x+0 --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2)) - if (CFP->getValueAPF().isZero()) - return N1; + if (N2CFP && N2CFP->getValueAPF().isZero()) + return N1; } else if (Opcode == ISD::FSUB) { // x-0 --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2)) - if (CFP->getValueAPF().isZero()) - return N1; + if (N2CFP && N2CFP->getValueAPF().isZero()) + return N1; } else if (Opcode == ISD::FMUL) { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1); - SDValue V = N2; - - // If the first operand isn't the constant, try the second - if (!CFP) { - CFP = dyn_cast<ConstantFPSDNode>(N2); - V = N1; - } - - if (CFP) { - // 0*x --> 0 - if (CFP->isZero()) - return SDValue(CFP,0); - // 1*x --> x - if (CFP->isExactlyValue(1.0)) - return V; - } + // x*0 --> 0 + if (N2CFP && N2CFP->isZero()) + return N2; + // x*1 --> x + if (N2CFP && N2CFP->isExactlyValue(1.0)) + return N1; } } assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); @@ -3457,7 +3588,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && VT.bitsLE(N1.getValueType()) && - isa<ConstantSDNode>(N2) && "Invalid FP_ROUND!"); + N2C && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. break; case ISD::AssertSext: @@ -3502,13 +3633,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SmallVector<SDValue, 8> Ops; for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); - if (Op.getValueType() != VT.getScalarType()) break; if (Op.getOpcode() == ISD::UNDEF) { - Ops.push_back(Op); + Ops.push_back(getUNDEF(VT.getScalarType())); continue; } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { APInt Val = C->getAPIntValue(); + Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); Ops.push_back(SignExtendInReg(Val)); continue; } @@ -3590,15 +3721,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, return N1.getOperand(N2C->getZExtValue()); // EXTRACT_ELEMENT of a constant int is also very common. 
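// (A worked instance of the fold below, values assumed: extracting element 1,
//  the high i32, of the i64 constant 0x0000000200000001 gives Shift = 32 * 1,
//  ShiftedVal = C.lshr(32) = 0x2, and trunc(32) yields the i32 value 2.)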
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + if (N1C) { unsigned ElementSize = VT.getSizeInBits(); unsigned Shift = ElementSize * N2C->getZExtValue(); - APInt ShiftedVal = C->getAPIntValue().lshr(Shift); + APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift); return getConstant(ShiftedVal.trunc(ElementSize), DL, VT); } break; - case ISD::EXTRACT_SUBVECTOR: { - SDValue Index = N2; + case ISD::EXTRACT_SUBVECTOR: if (VT.isSimple() && N1.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && "Extract subvector VTs must be a vectors!"); @@ -3608,9 +3738,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, assert(VT.getSimpleVT() <= N1.getSimpleValueType() && "Extract subvector must be from larger vector to smaller vector!"); - if (isa<ConstantSDNode>(Index)) { - assert((VT.getVectorNumElements() + - cast<ConstantSDNode>(Index)->getZExtValue() + if (N2C) { + assert((VT.getVectorNumElements() + N2C->getZExtValue() <= N1.getValueType().getVectorNumElements()) && "Extract subvector overflow!"); } @@ -3621,29 +3750,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } break; } - } // Perform trivial constant folding. if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode())) return SV; - // Canonicalize constant to RHS if commutative. - if (N1C && !N2C && isCommutativeBinOp(Opcode)) { - std::swap(N1C, N2C); - std::swap(N1, N2); - } - // Constant fold FP operations. bool HasFPExceptions = TLI->hasFloatingPointExceptions(); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); - ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); if (N1CFP) { - if (!N2CFP && isCommutativeBinOp(Opcode)) { - // Canonicalize constant to RHS if commutative. - std::swap(N1CFP, N2CFP); - std::swap(N1, N2); - } else if (N2CFP) { + if (N2CFP) { APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF(); APFloat::opStatus s; switch (Opcode) { @@ -3670,7 +3786,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } break; case ISD::FREM : - s = V1.mod(V2, APFloat::rmNearestTiesToEven); + s = V1.mod(V2); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); @@ -3795,7 +3911,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3) { // Perform various simplifications. - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); switch (Opcode) { case ISD::FMA: { ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); @@ -3812,27 +3927,25 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, } break; } - case ISD::CONCAT_VECTORS: - // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to - // one big BUILD_VECTOR. - if (N1.getOpcode() == ISD::BUILD_VECTOR && - N2.getOpcode() == ISD::BUILD_VECTOR && - N3.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), - N1.getNode()->op_end()); - Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); - Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); - return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); - } + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. 
+ SDValue Ops[] = {N1, N2, N3}; + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; break; + } case ISD::SETCC: { // Use FoldSetCC to simplify SETCC's. - SDValue Simp = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL); - if (Simp.getNode()) return Simp; + if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL)) + return V; + // Vector constant folding. + SDValue Ops[] = {N1, N2, N3}; + if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) + return V; break; } case ISD::SELECT: - if (N1C) { + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) { if (N1C->getZExtValue()) return N2; // select true, X, Y -> X return N3; // select false, X, Y -> Y @@ -4153,6 +4266,14 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, return true; } +static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { + // On Darwin, -Os means optimize for size without hurting performance, so + // only really optimize for size when -Oz (MinSize) is used. + if (MF.getTarget().getTargetTriple().isOSDarwin()) + return MF.getFunction()->optForMinSize(); + return MF.getFunction()->optForSize(); +} + static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, @@ -4173,7 +4294,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4286,7 +4407,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4380,7 +4501,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4446,6 +4567,16 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } +static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, + unsigned AS) { + // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all + // pointer operands can be losslessly bitcasted to pointers of address space 0 + if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) { + report_fatal_error("cannot lower memory intrinsic in address space " + + Twine(AS)); + } +} + SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, @@ -4487,6 +4618,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, true, DstPtrInfo, SrcPtrInfo); } + 
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); + // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc // memcpy is not guaranteed to be safe. libc memcpys aren't required to // respect volatile, so they may do things like read or write memory @@ -4548,6 +4682,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, return Result; } + checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); + // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. @@ -4605,6 +4742,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, return Result; } + checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + // Emit a library call. Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; @@ -4872,10 +5011,12 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". -static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) { +static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, + int64_t Offset = 0) { // If this is FI+Offset, we can model it. if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) - return MachinePointerInfo::getFixedStack(FI->getIndex(), Offset); + return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + FI->getIndex(), Offset); // If this is (FI+Offset1)+Offset2, we can model it. if (Ptr.getOpcode() != ISD::ADD || @@ -4884,20 +5025,22 @@ static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) { return MachinePointerInfo(); int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); - return MachinePointerInfo::getFixedStack(FI, Offset+ - cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue()); + return MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI, + Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue()); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". -static MachinePointerInfo InferPointerInfo(SDValue Ptr, SDValue OffsetOp) { +static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, + SDValue OffsetOp) { // If the 'Offset' value isn't a constant, we can't handle this. if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp)) - return InferPointerInfo(Ptr, OffsetNode->getSExtValue()); + return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue()); if (OffsetOp.getOpcode() == ISD::UNDEF) - return InferPointerInfo(Ptr); + return InferPointerInfo(DAG, Ptr); return MachinePointerInfo(); } @@ -4926,7 +5069,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, // If we don't have a PtrInfo, infer the trivial frame index case to simplify // clients. 
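// (Examples of what InferPointerInfo recognizes, operands assumed: a bare
//  FrameIndex<1> yields getFixedStack(MF, 1, 0), and
//  (add FrameIndex<1>, Constant<8>) yields getFixedStack(MF, 1, 8); any other
//  shape falls back to an empty MachinePointerInfo.)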
if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr, Offset); + PtrInfo = InferPointerInfo(*this, Ptr, Offset); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5054,7 +5197,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr); + PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5109,7 +5252,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr); + PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5261,7 +5404,7 @@ SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, SDLoc dl, cast<MaskedGatherSDNode>(E)->refineAlignment(MMO); return SDValue(E, 0); } - MaskedGatherSDNode *N = + MaskedGatherSDNode *N = new (NodeAllocator) MaskedGatherSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, VTs, VT, MMO); CSEMap.InsertNode(N, IP); @@ -5317,18 +5460,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, - ArrayRef<SDValue> Ops) { + ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0]); - case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); + case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } switch (Opcode) { default: break; + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; + break; + } case ISD::SELECT_CC: { assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && @@ -5656,7 +5805,7 @@ UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) { "Update with wrong number of operands"); // If no operands changed just return the input node. - if (Ops.empty() || std::equal(Ops.begin(), Ops.end(), N->op_begin())) + if (std::equal(Ops.begin(), Ops.end(), N->op_begin())) return N; // See if the modified node already exists. @@ -6451,13 +6600,13 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // Node Id fields for nodes At SortedPos and after will contain the // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { - SDNode *N = I++; + SDNode *N = &*I++; checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. N->setNodeId(DAGSize++); - allnodes_iterator Q = N; + allnodes_iterator Q(N); if (Q != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q)); assert(SortedPos != AllNodes.end() && "Overran node list"); @@ -6470,8 +6619,8 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. - for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { - SDNode *N = I; + for (SDNode &Node : allnodes()) { + SDNode *N = &Node; checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. 
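// A hedged sketch of the iterator idiom the hunks around this point adopt:
// ilist iterators no longer convert implicitly to SDNode*, so the address of
// the dereferenced iterator is taken explicitly. DAG is assumed in scope.
for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
                                     E = DAG.allnodes_end(); I != E;) {
  SDNode *N = &*I++; // previously spelled: SDNode *N = I++;
  (void)N;           // ... visit N ...
}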
@@ -6493,9 +6642,10 @@ unsigned SelectionDAG::AssignTopologicalOrder() { P->setNodeId(Degree); } } - if (I == SortedPos) { + if (&Node == SortedPos) { #ifndef NDEBUG - SDNode *S = ++I; + allnodes_iterator I(N); + SDNode *S = &*++I; dbgs() << "Overran sorted position:\n"; S->dumprFull(this); dbgs() << "\n"; dbgs() << "Checking if this is due to cycles\n"; @@ -6559,6 +6709,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { // SDNode Class //===----------------------------------------------------------------------===// +bool llvm::isNullConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isNullValue(); +} + +bool llvm::isNullFPConstant(SDValue V) { + ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V); + return Const != nullptr && Const->isZero() && !Const->isNegative(); +} + +bool llvm::isAllOnesConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isAllOnesValue(); +} + +bool llvm::isOneConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isOne(); +} + HandleSDNode::~HandleSDNode() { DropOperands(); } @@ -6772,6 +6942,12 @@ uint64_t SDNode::getConstantOperandVal(unsigned Num) const { return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); } +const SDNodeFlags *SDNode::getFlags() const { + if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) + return &FlagsNode->Flags; + return nullptr; +} + SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); @@ -6808,9 +6984,11 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { } switch (N->getOpcode()) { - default: - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands)); + default: { + Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, + N->getFlags())); break; + } case ISD::VSELECT: Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; @@ -7101,6 +7279,24 @@ BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements)); } +int32_t +BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, + uint32_t BitWidth) const { + if (ConstantFPSDNode *CN = + dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) { + bool IsExact; + APSInt IntVal(BitWidth); + APFloat APF = CN->getValueAPF(); + if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) != + APFloat::opOK || + !IsExact) + return -1; + + return IntVal.exactLogBase2(); + } + return -1; +} + bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2c3c0eb1..45ae39a 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -63,6 +64,7 @@ #include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" 
#include <algorithm> +#include <utility> using namespace llvm; #define DEBUG_TYPE "isel" @@ -79,7 +81,7 @@ LimitFPPrecision("limit-float-precision", cl::init(0)); static cl::opt<bool> -EnableFMFInDAG("enable-fmf-dag", cl::init(false), cl::Hidden, +EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden, cl::desc("Enable fast-math-flags for DAG nodes")); // Limit the width of DAG chains. This is important in general to prevent @@ -196,6 +198,14 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL, if (PartEVT == ValueVT) return Val; + if (PartEVT.isInteger() && ValueVT.isFloatingPoint() && + ValueVT.bitsLT(PartEVT)) { + // For an FP value in an integer part, we need to truncate to the right + // width first. + PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); + Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val); + } + if (PartEVT.isInteger() && ValueVT.isInteger()) { if (ValueVT.bitsLT(PartEVT)) { // For a truncate, see if we have any information to @@ -319,9 +329,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() && "Cannot handle this kind of promotion"); // Promoted vector extract - bool Smaller = ValueVT.bitsLE(PartEVT); - return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, ValueVT, Val); + return DAG.getAnyExtOrTrunc(Val, DL, ValueVT); } @@ -339,11 +347,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, } if (ValueVT.getVectorNumElements() == 1 && - ValueVT.getVectorElementType() != PartEVT) { - bool Smaller = ValueVT.bitsLE(PartEVT); - Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, ValueVT.getScalarType(), Val); - } + ValueVT.getVectorElementType() != PartEVT) + Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType()); return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); } @@ -387,6 +392,12 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL, assert(NumParts == 1 && "Do not know what to promote to!"); Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); } else { + if (ValueVT.isFloatingPoint()) { + // FP values need to be bitcast, then extended if they are being put + // into a larger container. + ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + } assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); @@ -520,9 +531,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) { // Promoted vector extract - bool Smaller = PartEVT.bitsLE(ValueVT); - Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, PartVT, Val); + Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } else{ // Vector -> scalar conversion. assert(ValueVT.getVectorNumElements() == 1 && @@ -531,9 +540,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); - bool Smaller = ValueVT.bitsLE(PartVT); - Val = DAG.getNode((Smaller ? 
ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, PartVT, Val); + Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } Parts[0] = Val; @@ -595,8 +602,7 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL, unsigned Reg, Type *Ty) { ComputeValueVTs(TLI, DL, Ty, ValueVTs); - for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; + for (EVT ValueVT : ValueVTs) { unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT); MVT RegisterVT = TLI.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) @@ -907,7 +913,8 @@ void SelectionDAGBuilder::visit(const Instruction &I) { visit(I.getOpcode(), I); - if (!isa<TerminatorInst>(&I) && !HasTailCall) + if (!isa<TerminatorInst>(&I) && !HasTailCall && + !isStatepoint(&I)) // statepoints handle their exports internally CopyToExportRegsIfNeeded(&I); CurInst = nullptr; @@ -943,14 +950,12 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, assert(Variable->isValidLocationForIntrinsic(dl) && "Expected inlined-at fields to agree"); uint64_t Offset = DI->getOffset(); - // A dbg.value for an alloca is always indirect. - bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; SDDbgValue *SDV; if (Val.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, IsIndirect, + if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, false, Val)) { SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(), - IsIndirect, Offset, dl, DbgSDNodeOrder); + false, Offset, dl, DbgSDNodeOrder); DAG.AddDbgValue(SDV, Val.getNode(), false); } } else @@ -1168,6 +1173,140 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { llvm_unreachable("Can't get register for value!"); } +void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) { + auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX; + bool IsCoreCLR = Pers == EHPersonality::CoreCLR; + MachineBasicBlock *CatchPadMBB = FuncInfo.MBB; + // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues. + if (IsMSVCCXX || IsCoreCLR) + CatchPadMBB->setIsEHFuncletEntry(); + + DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot())); +} + +void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) { + // Update machine-CFG edge. + MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()]; + FuncInfo.MBB->addSuccessor(TargetMBB); + + auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsSEH = isAsynchronousEHPersonality(Pers); + if (IsSEH) { + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (TargetMBB != NextBlock(FuncInfo.MBB) || + TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(TargetMBB))); + return; + } + + // Figure out the funclet membership for the catchret's successor. + // This will be used by the FuncletLayout pass to determine how to order the + // BB's. + // A 'catchret' returns to the outer scope's color. 
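// (Concrete cases for the color computation below, IR assumed: a catchpad
//  whose catchswitch has parent token 'none' unwinds back to the function
//  itself, so SuccessorColor is the entry block; for a catchswitch nested
//  inside a cleanuppad, the cleanuppad's parent block is the color.)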
+ Value *ParentPad = I.getParentPad(); + const BasicBlock *SuccessorColor; + if (isa<ConstantTokenNone>(ParentPad)) + SuccessorColor = &FuncInfo.Fn->getEntryBlock(); + else + SuccessorColor = cast<Instruction>(ParentPad)->getParent(); + assert(SuccessorColor && "No parent funclet for catchret!"); + MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor]; + assert(SuccessorColorMBB && "No MBB for SuccessorColor!"); + + // Create the terminator node. + SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(TargetMBB), + DAG.getBasicBlock(SuccessorColorMBB)); + DAG.setRoot(Ret); +} + +void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) { + // Don't emit any special code for the cleanuppad instruction. It just marks + // the start of a funclet. + FuncInfo.MBB->setIsEHFuncletEntry(); + FuncInfo.MBB->setIsCleanupFuncletEntry(); +} + +/// When an invoke or a cleanupret unwinds to the next EH pad, there are +/// many places it could ultimately go. In the IR, we have a single unwind +/// destination, but in the machine CFG, we enumerate all the possible blocks. +/// This function skips over imaginary basic blocks that hold catchswitch +/// instructions, and finds all the "real" machine +/// basic block destinations. As those destinations may not be successors of +/// EHPadBB, here we also calculate the edge probability to those destinations. +/// The passed-in Prob is the edge probability to EHPadBB. +static void findUnwindDestinations( + FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB, + BranchProbability Prob, + SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>> + &UnwindDests) { + EHPersonality Personality = + classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX; + bool IsCoreCLR = Personality == EHPersonality::CoreCLR; + + while (EHPadBB) { + const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock *NewEHPadBB = nullptr; + if (isa<LandingPadInst>(Pad)) { + // Stop on landingpads. They are not funclets. + UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); + break; + } else if (isa<CleanupPadInst>(Pad)) { + // Stop on cleanup pads. Cleanups are always funclet entries for all known + // personalities. + UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); + UnwindDests.back().first->setIsEHFuncletEntry(); + break; + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) { + // Add the catchpad handlers to the possible destinations. + for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { + UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob); + // For MSVC++ and the CLR, catchblocks are funclets and need prologues. + if (IsMSVCCXX || IsCoreCLR) + UnwindDests.back().first->setIsEHFuncletEntry(); + } + NewEHPadBB = CatchSwitch->getUnwindDest(); + } else { + continue; + } + + BranchProbabilityInfo *BPI = FuncInfo.BPI; + if (BPI && NewEHPadBB) + Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB); + EHPadBB = NewEHPadBB; + } +} + +void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) { + // Update successor info. + SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests; + auto UnwindDest = I.getUnwindDest(); + BranchProbabilityInfo *BPI = FuncInfo.BPI; + BranchProbability UnwindDestProb = + (BPI && UnwindDest) + ? 
BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest) + : BranchProbability::getZero(); + findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests); + for (auto &UnwindDest : UnwindDests) { + UnwindDest.first->setIsEHPad(); + addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second); + } + FuncInfo.MBB->normalizeSuccProbs(); + + // Create the terminator node. + SDValue Ret = + DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot()); + DAG.setRoot(Ret); +} + +void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) { + report_fatal_error("visitCatchSwitch not yet implemented!"); +} + void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); @@ -1186,7 +1325,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { ComputeValueVTs(TLI, DL, PointerType::getUnqual(F->getReturnType()), PtrValueVTs); - SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]); + SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), + DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector<EVT, 4> ValueVTs; @@ -1194,12 +1334,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); + // An aggregate return value cannot wrap around the address space, so + // offsets to its parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + SmallVector<SDValue, 4> Chains(NumValues); for (unsigned i = 0; i != NumValues; ++i) { SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), RetPtr.getValueType(), RetPtr, DAG.getIntPtrConstant(Offsets[i], - getCurSDLoc())); + getCurSDLoc()), + &Flags); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), @@ -1334,25 +1480,34 @@ bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V, } /// Return branch probability calculated by BranchProbabilityInfo for IR blocks. -uint32_t SelectionDAGBuilder::getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability +SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const { BranchProbabilityInfo *BPI = FuncInfo.BPI; - if (!BPI) - return 0; const BasicBlock *SrcBB = Src->getBasicBlock(); const BasicBlock *DstBB = Dst->getBasicBlock(); - return BPI->getEdgeWeight(SrcBB, DstBB); + if (!BPI) { + // If BPI is not available, set the default probability as 1 / N, where N is + // the number of successors. 
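The uniform fallback implemented just below is easy to illustrate in isolation: with N successors, every edge gets probability 1/N, stored as a numerator/denominator pair rather than a float. A minimal standalone sketch of that rule (plain C++, not the LLVM BranchProbability API):

```cpp
#include <algorithm>
#include <cstdint>

// Rational edge probability, mirroring the Num/Denom representation used
// by BranchProbability.
struct Probability {
  uint32_t Num, Denom; // value = Num / Denom
};

// Uniform fallback when no BranchProbabilityInfo is available: each of the
// N successors gets 1/N; N is clamped to at least 1 so a block with no
// recorded successors stays well-defined.
Probability defaultEdgeProbability(uint32_t NumSuccessors) {
  uint32_t N = std::max<uint32_t>(NumSuccessors, 1u);
  return {1, N}; // e.g. a two-way conditional branch yields 1/2 per edge
}
```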
+ auto SuccSize = std::max<uint32_t>( + std::distance(succ_begin(SrcBB), succ_end(SrcBB)), 1); + return BranchProbability(1, SuccSize); + } + return BPI->getEdgeProbability(SrcBB, DstBB); } -void SelectionDAGBuilder:: -addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst, - uint32_t Weight /* = 0 */) { - if (!Weight) - Weight = getEdgeWeight(Src, Dst); - Src->addSuccessor(Dst, Weight); +void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src, + MachineBasicBlock *Dst, + BranchProbability Prob) { + if (!FuncInfo.BPI) + Src->addSuccessorWithoutProb(Dst); + else { + if (Prob.isUnknown()) + Prob = getEdgeProbability(Src, Dst); + Src->addSuccessor(Dst, Prob); + } } - static bool InBlock(const Value *V, const BasicBlock *BB) { if (const Instruction *I = dyn_cast<Instruction>(V)) return I->getParent() == BB; @@ -1369,8 +1524,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - uint32_t TWeight, - uint32_t FWeight) { + BranchProbability TProb, + BranchProbability FProb) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into @@ -1385,17 +1540,15 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { Condition = getICmpCondCode(IC->getPredicate()); - } else if (const FCmpInst *FC = dyn_cast<FCmpInst>(Cond)) { + } else { + const FCmpInst *FC = cast<FCmpInst>(Cond); Condition = getFCmpCondCode(FC->getPredicate()); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); - } else { - (void)Condition; // silence warning. - llvm_unreachable("Unknown compare instruction"); } CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr, - TBB, FBB, CurBB, TWeight, FWeight); + TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); return; } @@ -1403,26 +1556,19 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), - nullptr, TBB, FBB, CurBB, TWeight, FWeight); + nullptr, TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); } -/// Scale down both weights to fit into uint32_t. -static void ScaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - NewTrue = NewTrue / Scale; - NewFalse = NewFalse / Scale; -} - /// FindMergedConditions - If Cond is an expression like void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - unsigned Opc, uint32_t TWeight, - uint32_t FWeight) { + Instruction::BinaryOps Opc, + BranchProbability TProb, + BranchProbability FProb) { // If this node is not part of the or/and tree, emit it as a branch. const Instruction *BOp = dyn_cast<Instruction>(Cond); if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || @@ -1431,12 +1577,12 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, - TWeight, FWeight); + TProb, FProb); return; } // Create TmpBB after CurBB. 
- MachineFunction::iterator BBI = CurBB; + MachineFunction::iterator BBI(CurBB); MachineFunction &MF = DAG.getMachineFunction(); MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock()); CurBB->getParent()->insert(++BBI, TmpBB); @@ -1455,26 +1601,25 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, // The requirement is that // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) // = TrueProb for original BB. - // Assuming the original weights are A and B, one choice is to set BB1's - // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice - // assumes that + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to + // A/(1+B) and 2B/(1+B). This choice assumes that // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. // Another choice is to assume TrueProb for BB1 equals to TrueProb for // TmpBB, but the math is more complicated. - uint64_t NewTrueWeight = TWeight; - uint64_t NewFalseWeight = (uint64_t)TWeight + 2 * (uint64_t)FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + auto NewTrueProb = TProb / 2; + auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + NewTrueProb, NewFalseProb); - NewTrueWeight = TWeight; - NewFalseWeight = 2 * (uint64_t)FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). + SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + Probs[0], Probs[1]); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: @@ -1491,24 +1636,23 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, // The requirement is that // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) // = FalseProb for original BB. - // Assuming the original weights are A and B, one choice is to set BB1's - // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice - // assumes that - // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. - - uint64_t NewTrueWeight = 2 * (uint64_t)TWeight + (uint64_t)FWeight; - uint64_t NewFalseWeight = FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to + // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 == + // TrueProb for BB1 * FalseProb for TmpBB. + + auto NewTrueProb = TProb + FProb / 2; + auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + NewTrueProb, NewFalseProb); - NewTrueWeight = 2 * (uint64_t)TWeight; - NewFalseWeight = FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). + SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. 
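Before the recursive call into TmpBB below, it is worth verifying the arithmetic these comments assert. Writing A and B for the original true/false probabilities (so A + B = 1), the Or case gives BB1 the pair (A/2, A/2 + B) and TmpBB the normalization of (A/2, B):

```latex
% TmpBB after normalizing (A/2, B), using A + B = 1:
\mathrm{TrueProb}(\mathrm{TmpBB}) = \frac{A/2}{A/2 + B} = \frac{A}{A + 2B} = \frac{A}{1 + B},
\qquad
\mathrm{FalseProb}(\mathrm{TmpBB}) = \frac{B}{A/2 + B} = \frac{2B}{1 + B}.

% Required: TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB) = A
\frac{A}{2} + \Bigl(\frac{A}{2} + B\Bigr)\cdot\frac{A/2}{A/2 + B}
  = \frac{A}{2} + \frac{A}{2} = A.
```

The And case is symmetric: BB1 gets (A + B/2, B/2), TmpBB gets (A, B/2) normalized to (2A/(1+A), B/(1+A)), and FalseProb(BB1) + TrueProb(BB1) * FalseProb(TmpBB) = B/2 + B/2 = B, as required.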
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + Probs[0], Probs[1]); } } @@ -1585,12 +1729,14 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { // jle foo // if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) { - if (!DAG.getTargetLoweringInfo().isJumpExpensive() && - BOp->hasOneUse() && (BOp->getOpcode() == Instruction::And || - BOp->getOpcode() == Instruction::Or)) { + Instruction::BinaryOps Opcode = BOp->getOpcode(); + if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() && + !I.getMetadata(LLVMContext::MD_unpredictable) && + (Opcode == Instruction::And || Opcode == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, - BOp->getOpcode(), getEdgeWeight(BrMBB, Succ0MBB), - getEdgeWeight(BrMBB, Succ1MBB)); + Opcode, + getEdgeProbability(BrMBB, Succ0MBB), + getEdgeProbability(BrMBB, Succ1MBB)); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. @@ -1669,11 +1815,12 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, } // Update successor info - addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight); + addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb); // TrueBB and FalseBB are always different unless the incoming IR is // degenerate. This only happens when running llc on weird IR. if (CB.TrueBB != CB.FalseBB) - addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight); + addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb); + SwitchBB->normalizeSuccProbs(); // If the lhs block is the next block, invert the condition so that we can // fall through to the lhs instead of the rhs block. @@ -1797,10 +1944,10 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, GuardPtr, MachinePointerInfo(IRGuard, 0), true, false, false, Align); - SDValue StackSlot = DAG.getLoad(PtrTy, dl, DAG.getEntryNode(), - StackSlotPtr, - MachinePointerInfo::getFixedStack(FI), - true, false, false, Align); + SDValue StackSlot = DAG.getLoad( + PtrTy, dl, DAG.getEntryNode(), StackSlotPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), true, + false, false, Align); // Perform the comparison via a subtract/getsetcc. 
EVT VT = Guard.getValueType(); @@ -1837,7 +1984,7 @@ SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, - nullptr, 0, false, getCurSDLoc(), false, false).second; + None, false, getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } @@ -1884,8 +2031,9 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock* MBB = B.Cases[0].ThisBB; - addSuccessorWithWeight(SwitchBB, B.Default); - addSuccessorWithWeight(SwitchBB, MBB); + addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); + addSuccessorWithProb(SwitchBB, MBB, B.Prob); + SwitchBB->normalizeSuccProbs(); SDValue BrRange = DAG.getNode(ISD::BRCOND, dl, MVT::Other, CopyTo, RangeCmp, @@ -1902,7 +2050,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, /// visitBitTestCase - this function produces one "bit test" void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, - uint32_t BranchWeightToNext, + BranchProbability BranchProbToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { @@ -1938,10 +2086,14 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE); } - // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight. - addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight); - // The branch weight from SwitchBB to NextMBB is BranchWeightToNext. - addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext); + // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb. + addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb); + // The branch probability from SwitchBB to NextMBB is BranchProbToNext. + addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext); + // It is not guaranteed that the sum of B.ExtraProb and BranchProbToNext is + // one as they are relative probabilities (and thus work more like weights), + // and hence we need to normalize them to let the sum of them become one. + SwitchBB->normalizeSuccProbs(); SDValue BrAnd = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(), @@ -1958,9 +2110,10 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *InvokeMBB = FuncInfo.MBB; - // Retrieve successors. + // Retrieve successors. Look through artificial IR level blocks like + // catchswitch for successors. MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; - MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)]; + const BasicBlock *EHPadBB = I.getSuccessor(1); const Value *Callee(I.getCalledValue()); const Function *Fn = dyn_cast<Function>(Callee); @@ -1975,14 +2128,14 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: - visitPatchpoint(&I, LandingPad); + visitPatchpoint(&I, EHPadBB); break; case Intrinsic::experimental_gc_statepoint: - LowerStatepoint(ImmutableStatepoint(&I), LandingPad); + LowerStatepoint(ImmutableStatepoint(&I), EHPadBB); break; } } else - LowerCallTo(&I, getValue(Callee), false, LandingPad); + LowerCallTo(&I, getValue(Callee), false, EHPadBB); // If the value of the invoke is used outside of its defining block, make it // available as a virtual register. 
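The normalizeSuccProbs calls added throughout these hunks share one rationale, spelled out in the bit-test comment above: probabilities attached edge-by-edge are really relative weights and need not sum to one. A standalone sketch of the invariant being restored:

```cpp
#include <vector>

// Normalize a list of non-negative weights in place so they sum to 1.0.
// This mirrors what MachineBasicBlock::normalizeSuccProbs guarantees for
// successor probabilities after edges are added piecemeal.
void normalize(std::vector<double> &Weights) {
  double Sum = 0.0;
  for (double W : Weights)
    Sum += W;
  if (Sum == 0.0)
    return; // nothing sensible to do; leave the weights untouched
  for (double &W : Weights)
    W /= Sum; // e.g. {1.0, 3.0} becomes {0.25, 0.75}
}
```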
@@ -1992,9 +2145,20 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { CopyToExportRegsIfNeeded(&I); } - // Update successor info - addSuccessorWithWeight(InvokeMBB, Return); - addSuccessorWithWeight(InvokeMBB, LandingPad); + SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests; + BranchProbabilityInfo *BPI = FuncInfo.BPI; + BranchProbability EHPadBBProb = + BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB) + : BranchProbability::getZero(); + findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests); + + // Update successor info. + addSuccessorWithProb(InvokeMBB, Return); + for (auto &UnwindDest : UnwindDests) { + UnwindDest.first->setIsEHPad(); + addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second); + } + InvokeMBB->normalizeSuccProbs(); // Drop into normal successor. DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), @@ -2007,7 +2171,7 @@ void SelectionDAGBuilder::visitResume(const ResumeInst &RI) { } void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { - assert(FuncInfo.MBB->isLandingPad() && + assert(FuncInfo.MBB->isEHPad() && "Call to landingpad not in landing pad!"); MachineBasicBlock *MBB = FuncInfo.MBB; @@ -2017,8 +2181,16 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother to create these DAG nodes. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.getExceptionPointerRegister() == 0 && - TLI.getExceptionSelectorRegister() == 0) + const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn(); + if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && + TLI.getExceptionSelectorRegister(PersonalityFn) == 0) + return; + + // If landingpad's return type is token type, we don't create DAG nodes + // for its exception pointer and selector value. The extraction of exception + // pointer or selector value from token type landingpads is not currently + // supported. + if (LP.getType()->isTokenTy()) return; SmallVector<EVT, 2> ValueVTs; @@ -2074,8 +2246,7 @@ void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) { // If this case has the same successor and is a neighbour, merge it into // the previous cluster. 
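The two statements that follow implement the neighbour merge described in this comment, now accumulating a BranchProbability instead of a uint32_t weight (which is why the old overflow assert disappears). As a worked example, cases 1, 2, 3 that all target the same block collapse into one [1, 3] range whose probability is the sum of the three. A sketch with a hypothetical, simplified Cluster type:

```cpp
#include <cstdint>

// Hypothetical simplified cluster; the real CaseCluster carries
// ConstantInt bounds, an MBB pointer, and a BranchProbability.
struct Cluster {
  int64_t Low, High; // inclusive case-value range
  int Succ;          // successor block id
  double Prob;       // relative probability of selecting this cluster
};

// Merge Next into Prev when it is adjacent and shares the successor,
// mirroring the neighbour merge in sortAndRangeify.
bool tryMerge(Cluster &Prev, const Cluster &Next) {
  if (Next.Succ != Prev.Succ || Next.Low != Prev.High + 1)
    return false;
  Prev.High = Next.High;  // e.g. [1,2] + [3,3] -> [1,3]
  Prev.Prob += Next.Prob; // probabilities simply accumulate
  return true;
}
```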
Clusters[DstIndex - 1].High = CaseVal; - Clusters[DstIndex - 1].Weight += CC.Weight; - assert(Clusters[DstIndex - 1].Weight >= CC.Weight && "Weight overflow!"); + Clusters[DstIndex - 1].Prob += CC.Prob; } else { std::memmove(&Clusters[DstIndex++], &Clusters[SrcIndex], sizeof(Clusters[SrcIndex])); @@ -2109,8 +2280,9 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { continue; MachineBasicBlock *Succ = FuncInfo.MBBMap[BB]; - addSuccessorWithWeight(IndirectBrMBB, Succ); + addSuccessorWithProb(IndirectBrMBB, Succ); } + IndirectBrMBB->normalizeSuccProbs(); DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(), MVT::Other, getControlRoot(), @@ -2119,7 +2291,8 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { if (DAG.getTarget().Options.TrapUnreachable) - DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); + DAG.setRoot( + DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } void SelectionDAGBuilder::visitFSub(const User &I) { @@ -2260,6 +2433,10 @@ void SelectionDAGBuilder::visitFCmp(const User &I) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); ISD::CondCode Condition = getFCmpCondCode(predicate); + + // FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them. + // FIXME: We should propagate the fast-math-flags to the DAG node itself for + // further optimization, but currently FMF is only applicable to binary nodes. if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), @@ -2284,27 +2461,74 @@ void SelectionDAGBuilder::visitSelect(const User &I) { // Min/max matching is only viable if all output VTs are the same. if (std::equal(ValueVTs.begin(), ValueVTs.end(), ValueVTs.begin())) { - Value *LHS, *RHS; - SelectPatternFlavor SPF = matchSelectPattern(const_cast<User*>(&I), LHS, RHS); - ISD::NodeType Opc = ISD::DELETED_NODE; - switch (SPF) { - case SPF_UMAX: Opc = ISD::UMAX; break; - case SPF_UMIN: Opc = ISD::UMIN; break; - case SPF_SMAX: Opc = ISD::SMAX; break; - case SPF_SMIN: Opc = ISD::SMIN; break; - default: break; - } - EVT VT = ValueVTs[0]; LLVMContext &Ctx = *DAG.getContext(); auto &TLI = DAG.getTargetLoweringInfo(); - while (TLI.getTypeAction(Ctx, VT) == TargetLoweringBase::TypeSplitVector) + + // We care about the legality of the operation after it has been type + // legalized. + while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal && + VT != TLI.getTypeToTransformTo(Ctx, VT)) VT = TLI.getTypeToTransformTo(Ctx, VT); - if (Opc != ISD::DELETED_NODE && TLI.isOperationLegalOrCustom(Opc, VT) && - // If the underlying comparison instruction is used by any other instruction, - // the consumed instructions won't be destroyed, so it is not profitable - // to convert to a min/max. + // If the vselect is legal, assume we want to leave this as a vector setcc + + // vselect. Otherwise, if this is going to be scalarized, we want to see if + // min/max is legal on the scalar type. 
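For concreteness, the flavor switch that follows translates compare-plus-select idioms recognized by matchSelectPattern into min/max DAG nodes; the floating-point flavors additionally consult the matched NaN behavior. A minimal illustration with abridged, hypothetical enum names (the IR idiom is shown in the comment):

```cpp
// IR idiom recognized as a signed minimum (SPF_SMIN):
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b   ; picks the smaller operand
enum Flavor { SPF_UMIN, SPF_UMAX, SPF_SMIN, SPF_SMAX, SPF_OTHER };
enum Opcode { UMIN, UMAX, SMIN, SMAX, DELETED_NODE };

// Integer part of the flavor -> DAG-opcode mapping; anything unmatched
// falls back to ordinary select lowering.
Opcode flavorToOpcode(Flavor F) {
  switch (F) {
  case SPF_UMIN: return UMIN;
  case SPF_UMAX: return UMAX;
  case SPF_SMIN: return SMIN;
  case SPF_SMAX: return SMAX;
  default:       return DELETED_NODE;
  }
}
```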
+ bool UseScalarMinMax = VT.isVector() && + !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT); + + Value *LHS, *RHS; + auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS); + ISD::NodeType Opc = ISD::DELETED_NODE; + switch (SPR.Flavor) { + case SPF_UMAX: Opc = ISD::UMAX; break; + case SPF_UMIN: Opc = ISD::UMIN; break; + case SPF_SMAX: Opc = ISD::SMAX; break; + case SPF_SMIN: Opc = ISD::SMIN; break; + case SPF_FMINNUM: + switch (SPR.NaNBehavior) { + case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); + case SPNB_RETURNS_NAN: Opc = ISD::FMINNAN; break; + case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break; + case SPNB_RETURNS_ANY: { + if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT)) + Opc = ISD::FMINNUM; + else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)) + Opc = ISD::FMINNAN; + else if (UseScalarMinMax) + Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ? + ISD::FMINNUM : ISD::FMINNAN; + break; + } + } + break; + case SPF_FMAXNUM: + switch (SPR.NaNBehavior) { + case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); + case SPNB_RETURNS_NAN: Opc = ISD::FMAXNAN; break; + case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break; + case SPNB_RETURNS_ANY: + + if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT)) + Opc = ISD::FMAXNUM; + else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)) + Opc = ISD::FMAXNAN; + else if (UseScalarMinMax) + Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ? + ISD::FMAXNUM : ISD::FMAXNAN; + break; + } + break; + default: break; + } + + if (Opc != ISD::DELETED_NODE && + (TLI.isOperationLegalOrCustom(Opc, VT) || + (UseScalarMinMax && + TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) && + // If the underlying comparison instruction is used by any other + // instruction, the consumed instructions won't be destroyed, so it is + // not profitable to convert to a min/max. cast<SelectInst>(&I)->getCondition()->hasOneUse()) { OpCode = Opc; LHSVal = getValue(LHS); @@ -2781,8 +3005,15 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (Field) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + + // In an inbounds GEP with an offset that is nonnegative even when + // interpreted as signed, assume there is no unsigned overflow. + SDNodeFlags Flags; + if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) + Flags.setNoUnsignedWrap(true); + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, - DAG.getConstant(Offset, dl, N.getValueType())); + DAG.getConstant(Offset, dl, N.getValueType()), &Flags); } Ty = StTy->getElementType(Field); @@ -2807,7 +3038,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue OffsVal = VectorWidth ? DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) : DAG.getConstant(Offs, dl, PtrTy); + + // In an inbounds GEP with an offset that is nonnegative even when + // interpreted as signed, assume there is no unsigned overflow. + SDNodeFlags Flags; + if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds()) + Flags.setNoUnsignedWrap(true); + + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags); continue; } @@ -2879,10 +3117,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { Align = 0; // Round the size of the allocation up to the stack alignment size - // by add SA-1 to the size. + // by adding SA-1 to the size. This doesn't overflow because we're computing + // an address inside an alloca.
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getIntPtrConstant(StackAlign - 1, dl)); + DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, @@ -2920,7 +3161,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { // throughout the function's lifetime. bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr && - isDereferenceablePointer(SV, *DAG.getTarget().getDataLayout()); + isDereferenceablePointer(SV, DAG.getDataLayout()); unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; @@ -2940,8 +3181,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory( - MemoryLocation(SV, AA->getTypeStoreSize(Ty), AAInfo))) { + else if (AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; @@ -2955,6 +3196,11 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile) Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG); + // An aggregate load cannot wrap around the address space, so offsets to its + // parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues)); EVT PtrVT = Ptr.getValueType(); @@ -2975,7 +3221,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { } SDValue A = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT)); + DAG.getConstant(Offsets[i], dl, PtrVT), + &Flags); SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A, MachinePointerInfo(SV, Offsets[i]), isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo, @@ -3030,6 +3277,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { AAMDNodes AAInfo; I.getAAMetadata(AAInfo); + // An aggregate store cannot wrap around the address space, so offsets to its + // parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. @@ -3040,7 +3292,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT)); + DAG.getConstant(Offsets[i], dl, PtrVT), &Flags); SDValue St = DAG.getStore(Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), @@ -3056,7 +3308,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { SDLoc sdl = getCurSDLoc(); - // llvm.masked.store.*(Src0, Ptr, alignemt, Mask) + // llvm.masked.store.*(Src0, Ptr, alignment, Mask) Value *PtrOperand = I.getArgOperand(1); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(I.getArgOperand(0)); @@ -3080,63 +3332,70 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { setValue(&I, StoreNode); } -// Gather/scatter receive a vector of pointers.
-// This vector of pointers may be represented as a base pointer + vector of -// indices, it depends on GEP and instruction preceeding GEP -// that calculates indices -static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index, +// Get a uniform base for the Gather/Scatter intrinsic. +// The first argument of the Gather/Scatter intrinsic is a vector of pointers. +// We try to represent it as a base pointer + vector of indices. +// Usually, the vector of pointers comes from a 'getelementptr' instruction. +// The first operand of the GEP may be a single pointer or a vector of pointers +// Example: +// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind +// or +// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind +// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, .. +// +// When the first GEP operand is a single pointer - it is the uniform base we +// are looking for. If first operand of the GEP is a splat vector - we +// extract the splat value and use it as a uniform base. +// In all other cases the function returns 'false'. +// +static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index, SelectionDAGBuilder* SDB) { - assert (Ptr->getType()->isVectorTy() && "Uexpected pointer type"); - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (!Gep || Gep->getNumOperands() > 2) + SelectionDAG& DAG = SDB->DAG; + LLVMContext &Context = *DAG.getContext(); + + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP || GEP->getNumOperands() > 2) return false; - ShuffleVectorInst *ShuffleInst = - dyn_cast<ShuffleVectorInst>(Gep->getPointerOperand()); - if (!ShuffleInst || !ShuffleInst->getMask()->isNullValue() || - cast<Instruction>(ShuffleInst->getOperand(0))->getOpcode() != - Instruction::InsertElement) + + const Value *GEPPtr = GEP->getPointerOperand(); + if (!GEPPtr->getType()->isVectorTy()) + Ptr = GEPPtr; + else if (!(Ptr = getSplatValue(GEPPtr))) return false; - Ptr = cast<InsertElementInst>(ShuffleInst->getOperand(0))->getOperand(1); + Value *IndexVal = GEP->getOperand(1); - SelectionDAG& DAG = SDB->DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // Check is the Ptr is inside current basic block - // If not, look for the shuffle instruction - if (SDB->findValue(Ptr)) - Base = SDB->getValue(Ptr); - else if (SDB->findValue(ShuffleInst)) { - SDValue ShuffleNode = SDB->getValue(ShuffleInst); - SDLoc sdl = ShuffleNode; - Base = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, sdl, - ShuffleNode.getValueType().getScalarType(), ShuffleNode, - DAG.getConstant(0, sdl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - SDB->setValue(Ptr, Base); - } - else + // The operands of the GEP may be defined in another basic block. + // In this case we'll not find nodes for the operands. + if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal)) return false; - Value *IndexVal = Gep->getOperand(1); - if (SDB->findValue(IndexVal)) { - Index = SDB->getValue(IndexVal); + Base = SDB->getValue(Ptr); + Index = SDB->getValue(IndexVal); - if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) { + // Suppress sign extension.
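The splat case described in the comment block above usually reaches this code in its canonical IR form, an insertelement broadcast by a zero-mask shufflevector; getSplatValue peels that back to the scalar pointer. An illustrative sketch (IR shown in comments; value names are hypothetical):

```cpp
// Canonical splat of a scalar pointer feeding the GEP:
//   %ins   = insertelement <8 x i32*> undef, i32* %ptr, i32 0
//   %splat = shufflevector <8 x i32*> %ins, <8 x i32*> undef,
//                          <8 x i32> zeroinitializer
//   %gep   = getelementptr i32, <8 x i32*> %splat, <8 x i32> %ind
//
// getSplatValue(%splat) returns %ptr, so getUniformBase reports
//   Base  = %ptr   (one scalar pointer)
//   Index = %ind   (the per-lane offsets)
// and the gather/scatter can be emitted against a uniform base rather
// than a full vector of pointers.
```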
+ if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) { + if (SDB->findValue(Sext->getOperand(0))) { IndexVal = Sext->getOperand(0); - if (SDB->findValue(IndexVal)) - Index = SDB->getValue(IndexVal); + Index = SDB->getValue(IndexVal); } - return true; } - return false; + if (!Index.getValueType().isVector()) { + unsigned GEPWidth = GEP->getType()->getVectorNumElements(); + EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth); + SmallVector<SDValue, 16> Ops(GEPWidth, Index); + Index = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Index), VT, Ops); + } + return true; } void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignment, Mask) - Value *Ptr = I.getArgOperand(1); + const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); @@ -3150,10 +3409,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); - Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; + const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), @@ -3190,7 +3449,8 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) { SDValue InChain = DAG.getRoot(); if (AA->pointsToConstantMemory(MemoryLocation( - PtrOperand, AA->getTypeStoreSize(I.getType()), AAInfo))) { + PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), + AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. InChain = DAG.getEntryNode(); } @@ -3212,7 +3472,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) - Value *Ptr = I.getArgOperand(0); + const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); @@ -3229,12 +3489,13 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory( - MemoryLocation(BasePtr, AA->getTypeStoreSize(I.getType()), AAInfo))) { + AA->pointsToConstantMemory(MemoryLocation( + BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), + AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; @@ -3511,6 +3772,8 @@ getF32Constant(SelectionDAG &DAG, unsigned Flt, SDLoc dl) { static SDValue getLimitedPrecisionExp2(SDValue t0, SDLoc dl, SelectionDAG &DAG) { + // TODO: What fast-math-flags should be set on the floating-point nodes? + // IntegerPartOfX = (int32_t)t0; SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0); @@ -3609,6 +3872,8 @@ static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG, // // #define LOG2OFe 1.4426950f // t0 = Op * LOG2OFe + + // TODO: What fast-math-flags should be set here?
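The limited-precision expansions in this region (expandExp here, and the expandLog, expandLog2, expandLog10, and expandPow routines below) all funnel through a base-2 exponential or logarithm plus a small polynomial for the fractional part; the constant 0x3fb8aa3b used just below is log2(e) ≈ 1.4426950f. For reference, the identities being relied on:

```latex
e^{x} = 2^{\,x \log_2 e}, \qquad
\ln x = \frac{\log_2 x}{\log_2 e}, \qquad
\log_{10} x = \log_2 x \cdot \log_{10} 2, \qquad
x^{y} = 2^{\,y \log_2 x}.
```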
SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, getF32Constant(DAG, 0x3fb8aa3b, dl)); return getLimitedPrecisionExp2(t0, dl, DAG); @@ -3622,6 +3887,9 @@ static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3718,6 +3986,9 @@ static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3813,6 +4084,9 @@ static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3922,6 +4196,7 @@ static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS, } } + // TODO: What fast-math-flags should be set on the FMUL node? if (IsExp10) { // Put the exponent in the right bit position for later addition to the // final result: @@ -3955,9 +4230,9 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, return DAG.getConstantFP(1.0, DL, LHS.getValueType()); const Function *F = DAG.getMachineFunction().getFunction(); - if (!F->hasFnAttribute(Attribute::OptimizeForSize) || - // If optimizing for size, don't insert too many multiplies. This - // inserts up to 5 multiplies. + if (!F->optForSize() || + // If optimizing for size, don't insert too many multiplies. + // This inserts up to 5 multiplies. countPopulation(Val) + Log2_32(Val) < 7) { // We use the simple binary decomposition method to generate the multiply // sequence. There are more optimal ways to do this (for example, @@ -3965,6 +4240,8 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, // the benefit of being both really simple and much better than a libcall. SDValue Res; // Logically starts equal to 1.0 SDValue CurSquare = LHS; + // TODO: Intrinsics should have fast-math-flags that propagate to these + // nodes. while (Val) { if (Val & 1) { if (Res.getNode()) @@ -3990,22 +4267,20 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS); } -// getTruncatedArgReg - Find underlying register used for an truncated -// argument. -static unsigned getTruncatedArgReg(const SDValue &N) { - if (N.getOpcode() != ISD::TRUNCATE) +// getUnderlyingArgReg - Find underlying register used for a truncated or +// bitcasted argument. 
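The multiply sequence ExpandPowI emits above is ordinary square-and-multiply over the bits of the exponent; since the first set bit only initializes the running result, the countPopulation(Val) + Log2_32(Val) < 7 test bounds the emitted multiplies at 5. A standalone sketch of the same decomposition:

```cpp
// Square-and-multiply expansion of X^N, mirroring ExpandPowI's binary
// decomposition: popcount(N) - 1 result multiplies plus floor(log2(N))
// squarings.
double powi(double X, unsigned N) {
  double Result = 1.0;  // LLVM elides this initial multiply by starting
                        // the result at the first needed square instead
  double CurSquare = X; // holds X^(2^i) on iteration i
  while (N) {
    if (N & 1)
      Result *= CurSquare;  // fold this exponent bit into the result
    CurSquare *= CurSquare; // square for the next bit position
    N >>= 1;
  }
  return Result;
}
```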
+static unsigned getUnderlyingArgReg(const SDValue &N) { + switch (N.getOpcode()) { + case ISD::CopyFromReg: + return cast<RegisterSDNode>(N.getOperand(1))->getReg(); + case ISD::BITCAST: + case ISD::AssertZext: + case ISD::AssertSext: + case ISD::TRUNCATE: + return getUnderlyingArgReg(N.getOperand(0)); + default: return 0; - - const SDValue &Ext = N.getOperand(0); - if (Ext.getOpcode() == ISD::AssertZext || - Ext.getOpcode() == ISD::AssertSext) { - const SDValue &CFR = Ext.getOperand(0); - if (CFR.getOpcode() == ISD::CopyFromReg) - return cast<RegisterSDNode>(CFR.getOperand(1))->getReg(); - if (CFR.getOpcode() == ISD::TRUNCATE) - return getTruncatedArgReg(CFR); } - return 0; } /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function @@ -4033,11 +4308,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { - unsigned Reg; - if (N.getOpcode() == ISD::CopyFromReg) - Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg(); - else - Reg = getTruncatedArgReg(N); + unsigned Reg = getUnderlyingArgReg(N); if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); unsigned PR = RegInfo.getLiveInPhysReg(Reg); @@ -4145,14 +4416,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::longjmp: return &"_longjmp"[!TLI.usesUnderscoreLongJmp()]; case Intrinsic::memcpy: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4169,12 +4432,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } case Intrinsic::memset: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4189,14 +4446,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } case Intrinsic::memmove: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4238,33 +4487,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) Address = BCI->getOperand(0); // Parameters are handled specially. 
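Returning to getUnderlyingArgReg defined above: it generalizes the old getTruncatedArgReg by recursing through any chain of value-preserving wrappers until it reaches the CopyFromReg that carries the argument. A self-contained sketch of the same unwrap over a hypothetical node type (not the SelectionDAG API):

```cpp
// Hypothetical, simplified node: a CopyFromReg leaf wrapped in any number
// of bitcasts, assertions, or truncations.
enum Kind { CopyFromReg, BitCast, AssertZext, AssertSext, Truncate, Other };
struct Node {
  Kind K;
  const Node *Operand; // wrapped value; null for leaves
  unsigned Reg;        // meaningful only when K == CopyFromReg
};

unsigned underlyingArgReg(const Node &N) {
  switch (N.K) {
  case CopyFromReg:
    return N.Reg; // found the register carrying the argument
  case BitCast:
  case AssertZext:
  case AssertSext:
  case Truncate:
    return underlyingArgReg(*N.Operand); // peel one wrapper and recurse
  default:
    return 0; // not a plainly wrapped argument
  }
}
```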
- bool isParameter = Variable->getTag() == dwarf::DW_TAG_arg_variable || - isa<Argument>(Address); - - const AllocaInst *AI = dyn_cast<AllocaInst>(Address); - - if (isParameter && !AI) { - FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode()); - if (FINode) - // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); - else { - // Address is an argument, so try to emit its dbg value using - // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); - return nullptr; - } - } else if (AI) + bool isParameter = Variable->isParameter() || isa<Argument>(Address); + auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode()); + if (isParameter && FINode) { + // Byval parameter. We have a frame index at this point. + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, + FINode->getIndex(), 0, dl, SDNodeOrder); + } else if (isa<Argument>(Address)) { + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. + EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + N); + return nullptr; + } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); - else { - // Can't do anything with other non-AI cases yet. - DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - DEBUG(dbgs() << "non-AllocaInst issue for Address: \n\t"); - DEBUG(Address->dump()); - return nullptr; } DAG.AddDbgValue(SDV, N.getNode(), isParameter); } else { @@ -4315,12 +4552,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Check unused arguments map. N = UnusedArgNodeMap[V]; if (N.getNode()) { - // A dbg.value for an alloca is always indirect. - bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, - IsIndirect, N)) { + false, N)) { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), - IsIndirect, Offset, dl, SDNodeOrder); + false, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } } else if (!V->use_empty() ) { @@ -4421,6 +4656,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getRoot(), getValue(I.getArgOperand(0)))); return nullptr; } + case Intrinsic::eh_sjlj_setup_dispatch: { + DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other, + getRoot())); + return nullptr; + } case Intrinsic::masked_gather: visitMaskedGather(I); @@ -4614,6 +4854,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); } else { + // TODO: Intrinsic calls should have fast-math-flags. 
SDValue Mul = DAG.getNode(ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), @@ -4652,6 +4893,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(Res.getValue(1)); return nullptr; } + case Intrinsic::bitreverse: + setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); + return nullptr; case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -4693,6 +4939,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } + case Intrinsic::get_dynamic_area_offset: { + SDValue Op = getRoot(); + EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); + EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // Result type for @llvm.get.dynamic.area.offset should match PtrTy for + // target. + if (PtrTy != ResTy) + report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" + " intrinsic!"); + Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), + Op); + DAG.setRoot(Op); + setValue(&I, Res); + return nullptr; + } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); @@ -4743,8 +5004,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue FIN = DAG.getFrameIndex(FI, PtrTy); // Store the stack protector onto the stack. - Res = DAG.getStore(Chain, sdl, Src, FIN, - MachinePointerInfo::getFixedStack(FI), + Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI), true, false, 0); setValue(&I, Res); DAG.setRoot(Res); @@ -4946,9 +5207,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::clear_cache: return TLI.getClearCacheBuiltinName(); - case Intrinsic::eh_actions: - setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout()))); - return nullptr; case Intrinsic::donothing: // ignore return nullptr; @@ -4965,20 +5223,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { visitStatepoint(I); return nullptr; } - case Intrinsic::experimental_gc_result_int: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_ptr: case Intrinsic::experimental_gc_result: { visitGCResult(I); return nullptr; } case Intrinsic::experimental_gc_relocate: { - visitGCRelocate(I); + visitGCRelocate(cast<GCRelocateInst>(I)); return nullptr; } case Intrinsic::instrprof_increment: llvm_unreachable("instrprof failed to lower an increment"); - + case Intrinsic::instrprof_value_profile: + llvm_unreachable("instrprof failed to lower a value profiling call"); case Intrinsic::localescape: { MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); @@ -5032,19 +5288,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } - case Intrinsic::eh_begincatch: - case Intrinsic::eh_endcatch: - llvm_unreachable("begin/end catch intrinsics not lowered in codegen"); + + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { - unsigned Reg = TLI.getExceptionPointerRegister(); - assert(Reg && "cannot get exception code on this platform"); + // Get the exception 
pointer vreg, copy from it, and resize it to fit. + const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0)); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); - assert(FuncInfo.MBB->isLandingPad() && "eh.exceptioncode in non-lpad"); - unsigned VReg = FuncInfo.MBB->addLiveIn(Reg, PtrRC); + unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT); - N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); + if (Intrinsic == Intrinsic::eh_exceptioncode) + N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); setValue(&I, N); return nullptr; } @@ -5053,11 +5308,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); MCSymbol *BeginLabel = nullptr; - if (LandingPad) { + if (EHPadBB) { // Insert a label before the invoke call to mark the try range. This can be // used to detect deletion of the invoke via the MachineModuleInfo. BeginLabel = MMI.getContext().createTempSymbol(); @@ -5067,7 +5322,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, unsigned CallSiteIndex = MMI.getCurrentCallSite(); if (CallSiteIndex) { MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex); - LPadToCallSiteMap[LandingPad].push_back(CallSiteIndex); + LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex); // Now that the call site is handled, stop tracking it. MMI.setCurrentCallSite(0); @@ -5100,14 +5355,21 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, DAG.setRoot(Result.second); } - if (LandingPad) { + if (EHPadBB) { // Insert a label at the end of the invoke call to mark the try range. This // can be used to detect deletion of the invoke via the MachineModuleInfo. MCSymbol *EndLabel = MMI.getContext().createTempSymbol(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel)); // Inform MachineModuleInfo of range. 
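The begin/end labels managed here bracket the lowered call so the exception tables can map its PC range to a handler. Schematically (pseudo-assembly in comments; label names are illustrative):

```cpp
// Try-range bracketing emitted around an invoke's call:
//
//   Ltmp0:            ; EH_LABEL, BeginLabel
//     call foo        ; the lowered call itself
//   Ltmp1:            ; EH_LABEL, EndLabel
//
// DWARF-style EH then records (Ltmp0, Ltmp1) -> landing-pad block via
// MMI.addInvoke, while Windows funclet EH records (Ltmp0, Ltmp1) -> EH
// state via addIPToStateRange, keyed by the invoke instruction.
```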
- MMI.addInvoke(LandingPad, BeginLabel, EndLabel); + if (MMI.hasEHFunclets()) { + assert(CLI.CS); + WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo(); + EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS->getInstruction()), + BeginLabel, EndLabel); + } else { + MMI.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel); + } } return Result; @@ -5115,7 +5377,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool isTailCall, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); FunctionType *FTy = cast<FunctionType>(PT->getElementType()); Type *RetTy = FTy->getReturnType(); @@ -5154,7 +5416,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) .setCallee(RetTy, FTy, Callee, std::move(Args), CS) .setTailCall(isTailCall); - std::pair<SDValue,SDValue> Result = lowerInvokable(CLI, LandingPad); + std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) setValue(CS.getInstruction(), Result.first); @@ -5978,7 +6240,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); std::pair<unsigned, const TargetRegisterClass *> MatchRC = TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode, OpInfo.ConstraintVT); @@ -6037,10 +6299,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, getCurSDLoc(), - OpInfo.CallOperand, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + Chain = DAG.getStore( + Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, 0); OpInfo.CallOperand = StackSlot; } @@ -6460,12 +6722,9 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { /// This is a helper for lowering intrinsics that follow a target calling /// convention or require stack pointer adjustment. Only a subset of the /// intrinsic's operands need to participate in the calling convention. 
-std::pair<SDValue, SDValue> -SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, - unsigned NumArgs, SDValue Callee, - Type *ReturnTy, - MachineBasicBlock *LandingPad, - bool IsPatchPoint) { +std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerCallOperands( + ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, + Type *ReturnTy, const BasicBlock *EHPadBB, bool IsPatchPoint) { TargetLowering::ArgListTy Args; Args.reserve(NumArgs); @@ -6489,7 +6748,7 @@ SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args), NumArgs) .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint); - return lowerInvokable(CLI, LandingPad); + return lowerInvokable(CLI, EHPadBB); } /// \brief Add a stack map intrinsic call's live variable operands to a stackmap @@ -6593,7 +6852,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { /// \brief Lower llvm.experimental.patchpoint directly to its target opcode. void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>, // i32 <numBytes>, // i8* <target>, @@ -6630,9 +6889,8 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; Type *ReturnTy = IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType(); - std::pair<SDValue, SDValue> Result = - lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, - LandingPad, true); + std::pair<SDValue, SDValue> Result = lowerCallOperands( + CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, EHPadBB, true); SDNode *CallEnd = Result.second.getNode(); if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) @@ -6926,8 +7184,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { i, j*Parts[j].getValueType().getStoreSize()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); - else if (j != 0) + else if (j != 0) { MyFlags.Flags.setOrigAlign(1); + if (j == NumParts - 1) + MyFlags.Flags.setSplitEnd(); + } CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); @@ -6980,14 +7241,20 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ReturnValues.resize(NumValues); SmallVector<SDValue, 4> Chains(NumValues); + // An aggregate return value cannot wrap around the address space, so + // offsets to its parts don't wrap either. 
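The loop that follows reloads a demoted (sret-style) aggregate return piece by piece: each part's address is the demote slot plus a known non-negative offset, which is why the ADD may carry the nuw flag. Schematically, for a call whose {i64, i64} result was demoted (pseudo-IR, names illustrative):

```cpp
// Demoted aggregate return, reloaded part by part:
//
//   %slot = alloca { i64, i64 }          ; DemoteStackSlot
//   call void @f(ptr sret %slot, ...)    ; callee stores the result
//   %a0 = add nuw %slot, 0               ; Offsets[0], cannot wrap
//   %v0 = load i64, %a0
//   %a1 = add nuw %slot, 8               ; Offsets[1]
//   %v1 = load i64, %a1
//
// The load chains are collected in Chains[i] so they can be merged before
// the parts are reassembled into the original return value.
```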
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, - PtrVT)); + PtrVT), &Flags); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, - MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, - false, false, 1); + MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), + DemoteStackIdx, Offsets[i]), + false, false, false, 1); ReturnValues[i] = L; Chains[i] = L.getValue(1); } @@ -7069,9 +7336,9 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { if (FastISel) return A->use_empty(); - const BasicBlock *Entry = A->getParent()->begin(); + const BasicBlock &Entry = A->getParent()->front(); for (const User *U : A->users()) - if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U)) + if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U)) return false; // Use not in entry block. return true; @@ -7138,6 +7405,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // in the various CC lowering callbacks. Flags.setByVal(); } + if (F.getCallingConv() == CallingConv::X86_INTR) { + // IA Interrupt passes frame (1st parameter) by value in the stack. + if (Idx == 1) + Flags.setByVal(); + } if (Flags.isByVal() || Flags.isInAlloca()) { PointerType *Ty = cast<PointerType>(I->getType()); Type *ElementTy = Ty->getElementType(); @@ -7165,8 +7437,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 - else if (i > 0) + else if (i > 0) { MyFlags.Flags.setOrigAlign(1); + if (i == NumRegs - 1) + MyFlags.Flags.setSplitEnd(); + } Ins.push_back(MyFlags); } if (NeedsRegBlock && Value == NumValues - 1) @@ -7235,12 +7510,12 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // If this argument is unused then remember its value. It is used to generate // debugging information. if (I->use_empty() && NumValues) { - SDB->setUnusedArgValue(I, InVals[i]); + SDB->setUnusedArgValue(&*I, InVals[i]); // Also remember any frame index for use in FastISel. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(InVals[i].getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { @@ -7270,18 +7545,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); - SDB->setValue(I, Res); + SDB->setValue(&*I, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); } // If this argument is live outside of the entry block, insert a copy from @@ -7293,13 +7568,13 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // uses with vregs. 
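// [Editorial aside on the setSplitEnd() hunks above, illustrative only:
// for a value lowered into NumRegs == 3 parts, the flags now come out as
//   part 0: Split
//   part 1: OrigAlign = 1
//   part 2: OrigAlign = 1, SplitEnd
// so calling-convention code can recognize the last fragment of a split
// argument as well as the first.]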
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - FuncInfo->ValueMap[I] = Reg; + FuncInfo->ValueMap[&*I] = Reg; continue; } } - if (!isOnlyUsedInEntryBlock(I, TM.Options.EnableFastISel)) { - FuncInfo->InitializeRegForValue(I); - SDB->CopyToExportRegsIfNeeded(I); + if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { + FuncInfo->InitializeRegForValue(&*I); + SDB->CopyToExportRegsIfNeeded(&*I); } } @@ -7401,21 +7676,21 @@ AddSuccessorMBB(const BasicBlock *BB, // If SuccBB has not been created yet, create it. if (!SuccMBB) { MachineFunction *MF = ParentMBB->getParent(); - MachineFunction::iterator BBI = ParentMBB; + MachineFunction::iterator BBI(ParentMBB); SuccMBB = MF->CreateMachineBasicBlock(BB); MF->insert(++BBI, SuccMBB); } // Add it as a successor of ParentMBB. ParentMBB->addSuccessor( - SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely)); + SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely)); return SuccMBB; } MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) { - MachineFunction::iterator I = MBB; + MachineFunction::iterator I(MBB); if (++I == FuncInfo.MF->end()) return nullptr; - return I; + return &*I; } /// During lowering new call nodes can be created (such as memset, etc.). @@ -7469,14 +7744,18 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, CaseCluster &JTCluster) { assert(First <= Last); - uint32_t Weight = 0; + auto Prob = BranchProbability::getZero(); unsigned NumCmps = 0; std::vector<MachineBasicBlock*> Table; - DenseMap<MachineBasicBlock*, uint32_t> JTWeights; + DenseMap<MachineBasicBlock*, BranchProbability> JTProbs; + + // Initialize probabilities in JTProbs. + for (unsigned I = First; I <= Last; ++I) + JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); + for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); - Weight += Clusters[I].Weight; - assert(Weight >= Clusters[I].Weight && "Weight overflow!"); + Prob += Clusters[I].Prob; APInt Low = Clusters[I].Low->getValue(); APInt High = Clusters[I].High->getValue(); NumCmps += (Low == High) ? 
1 : 2; @@ -7491,10 +7770,10 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, uint64_t ClusterSize = (High - Low).getLimitedValue() + 1; for (uint64_t J = 0; J < ClusterSize; ++J) Table.push_back(Clusters[I].MBB); - JTWeights[Clusters[I].MBB] += Clusters[I].Weight; + JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } - unsigned NumDests = JTWeights.size(); + unsigned NumDests = JTProbs.size(); if (isSuitableForBitTests(NumDests, NumCmps, Clusters[First].Low->getValue(), Clusters[Last].High->getValue())) { @@ -7513,9 +7792,10 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, for (MachineBasicBlock *Succ : Table) { if (Done.count(Succ)) continue; - addSuccessorWithWeight(JumpTableMBB, Succ, JTWeights[Succ]); + addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]); Done.insert(Succ); } + JumpTableMBB->normalizeSuccProbs(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding()) @@ -7529,7 +7809,7 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, JTCases.emplace_back(std::move(JTH), std::move(JT)); JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High, - JTCases.size() - 1, Weight); + JTCases.size() - 1, Prob); return true; } @@ -7707,19 +7987,29 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, .getSizeInBits(); assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!"); - if (Low.isNonNegative() && High.slt(BitWidth)) { - // Optimize the case where all the case values fit in a - // word without having to subtract minValue. In this case, - // we can optimize away the subtraction. + // Check if the clusters cover a contiguous range such that no value in the + // range will jump to the default statement. + bool ContiguousRange = true; + for (int64_t I = First + 1; I <= Last; ++I) { + if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) { + ContiguousRange = false; + break; + } + } + + if (Low.isStrictlyPositive() && High.slt(BitWidth)) { + // Optimize the case where all the case values fit in a word without having + // to subtract minValue. In this case, we can optimize away the subtraction. LowBound = APInt::getNullValue(Low.getBitWidth()); CmpRange = High; + ContiguousRange = false; } else { LowBound = Low; CmpRange = High - Low; } CaseBitsVector CBV; - uint32_t TotalWeight = 0; + auto TotalProb = BranchProbability::getZero(); for (unsigned i = First; i <= Last; ++i) { // Find the CaseBits for this destination. unsigned j; @@ -7727,39 +8017,40 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, if (CBV[j].BB == Clusters[i].MBB) break; if (j == CBV.size()) - CBV.push_back(CaseBits(0, Clusters[i].MBB, 0, 0)); + CBV.push_back( + CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero())); CaseBits *CB = &CBV[j]; - // Update Mask, Bits and ExtraWeight. + // Update Mask, Bits and ExtraProb. 
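// [Editorial aside, a worked example of the mask math below with made-up
// values: suppose LowBound == 10 and this cluster covers cases 12..14.
// Then Lo == 2, Hi == 4, and
//   CB->Mask |= (-1ULL >> (63 - (4 - 2))) << 2;  // 0b111 << 2 == 0b11100
// sets one bit per case value relative to LowBound, and Bits grows by 3.]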
uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue(); uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue(); assert(Hi >= Lo && Hi < 64 && "Invalid bit case!"); CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo; CB->Bits += Hi - Lo + 1; - CB->ExtraWeight += Clusters[i].Weight; - TotalWeight += Clusters[i].Weight; - assert(TotalWeight >= Clusters[i].Weight && "Weight overflow!"); + CB->ExtraProb += Clusters[i].Prob; + TotalProb += Clusters[i].Prob; } BitTestInfo BTI; std::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) { - // Sort by weight first, number of bits second. - if (a.ExtraWeight != b.ExtraWeight) - return a.ExtraWeight > b.ExtraWeight; + // Sort by probability first, number of bits second. + if (a.ExtraProb != b.ExtraProb) + return a.ExtraProb > b.ExtraProb; return a.Bits > b.Bits; }); for (auto &CB : CBV) { MachineBasicBlock *BitTestBB = FuncInfo.MF->CreateMachineBasicBlock(SI->getParent()); - BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraWeight)); + BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb)); } BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange), - SI->getCondition(), -1U, MVT::Other, false, nullptr, - nullptr, std::move(BTI)); + SI->getCondition(), -1U, MVT::Other, false, + ContiguousRange, nullptr, nullptr, std::move(BTI), + TotalProb); BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High, - BitTestCases.size() - 1, TotalWeight); + BitTestCases.size() - 1, TotalProb); return true; } @@ -7868,9 +8159,9 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, MachineBasicBlock *DefaultMBB) { MachineFunction *CurMF = FuncInfo.MF; MachineBasicBlock *NextMBB = nullptr; - MachineFunction::iterator BBI = W.MBB; + MachineFunction::iterator BBI(W.MBB); if (++BBI != FuncInfo.MF->end()) - NextMBB = BBI; + NextMBB = &*BBI; unsigned Size = W.LastCluster - W.FirstCluster + 1; @@ -7906,13 +8197,16 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, ISD::SETEQ); // Update successor info. - // Both Small and Big will jump to Small.BB, so we sum up the weights. - addSuccessorWithWeight(SwitchMBB, Small.MBB, Small.Weight + Big.Weight); - addSuccessorWithWeight( - SwitchMBB, DefaultMBB, - // The default destination is the first successor in IR. - BPI ? BPI->getEdgeWeight(SwitchMBB->getBasicBlock(), (unsigned)0) - : 0); + // Both Small and Big will jump to Small.BB, so we sum up the + // probabilities. + addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob); + if (BPI) + addSuccessorWithProb( + SwitchMBB, DefaultMBB, + // The default destination is the first successor in IR. + BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0)); + else + addSuccessorWithProb(SwitchMBB, DefaultMBB); // Insert the true branch. SDValue BrCond = @@ -7929,17 +8223,17 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, } if (TM.getOptLevel() != CodeGenOpt::None) { - // Order cases by weight so the most likely case will be checked first. + // Order cases by probability so the most likely case will be checked first. std::sort(W.FirstCluster, W.LastCluster + 1, [](const CaseCluster &a, const CaseCluster &b) { - return a.Weight > b.Weight; + return a.Prob > b.Prob; }); // Rearrange the case blocks so that the last one falls through if possible - // without without changing the order of weights. + // without changing the order of probabilities.
for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) { --I; - if (I->Weight > W.LastCluster->Weight) + if (I->Prob > W.LastCluster->Prob) break; if (I->Kind == CC_Range && I->MBB == NextMBB) { std::swap(*I, *W.LastCluster); @@ -7948,12 +8242,11 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, } } - // Compute total weight. - uint32_t UnhandledWeights = 0; - for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I) { - UnhandledWeights += I->Weight; - assert(UnhandledWeights >= I->Weight && "Weight overflow!"); - } + // Compute total probability. + BranchProbability DefaultProb = W.DefaultProb; + BranchProbability UnhandledProbs = DefaultProb; + for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I) + UnhandledProbs += I->Prob; MachineBasicBlock *CurMBB = W.MBB; for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) { @@ -7967,6 +8260,7 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } + UnhandledProbs -= I->Prob; switch (I->Kind) { case CC_JumpTable: { @@ -7977,8 +8271,28 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, // The jump block hasn't been inserted yet; insert it here. MachineBasicBlock *JumpMBB = JT->MBB; CurMF->insert(BBI, JumpMBB); - addSuccessorWithWeight(CurMBB, Fallthrough); - addSuccessorWithWeight(CurMBB, JumpMBB); + + auto JumpProb = I->Prob; + auto FallthroughProb = UnhandledProbs; + + // If the default statement is a target of the jump table, we evenly + // distribute the default probability to successors of CurMBB. Also + // update the probability on the edge from JumpMBB to Fallthrough. + for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(), + SE = JumpMBB->succ_end(); + SI != SE; ++SI) { + if (*SI == DefaultMBB) { + JumpProb += DefaultProb / 2; + FallthroughProb -= DefaultProb / 2; + JumpMBB->setSuccProbability(SI, DefaultProb / 2); + JumpMBB->normalizeSuccProbs(); + break; + } + } + + addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); + addSuccessorWithProb(CurMBB, JumpMBB, JumpProb); + CurMBB->normalizeSuccProbs(); // The jump table header will be inserted in our current block, do the // range check, and fall through to our fallthrough block. @@ -8004,8 +8318,17 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, BTB->Parent = CurMBB; BTB->Default = Fallthrough; - // If we're in the right place, emit the bit test header header right now. - if (CurMBB ==SwitchMBB) { + BTB->DefaultProb = UnhandledProbs; + // If the cases in bit test don't form a contiguous range, we evenly + // distribute the probability on the edge to Fallthrough to two + // successors of CurMBB. + if (!BTB->ContiguousRange) { + BTB->Prob += DefaultProb / 2; + BTB->DefaultProb -= DefaultProb / 2; + } + + // If we're in the right place, emit the bit test header right now. + if (CurMBB == SwitchMBB) { visitBitTestHeader(*BTB, SwitchMBB); BTB->Emitted = true; } @@ -8028,10 +8351,9 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, RHS = I->High; } - // The false weight is the sum of all unhandled cases. - UnhandledWeights -= I->Weight; - CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Weight, - UnhandledWeights); + // The false probability is the sum of all unhandled cases. 
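// [Editorial aside, illustrative numbers only: with three range clusters
// of probability 2/8, 1/8, 1/8 and DefaultProb == 4/8, UnhandledProbs
// starts at 4/8 + 2/8 + 1/8 + 1/8 == 1. It drops to 6/8 just before
// cluster 0 is lowered, so that cluster's CaseBlock gets TrueProb == 2/8
// and FalseProb == 6/8. The jump-table and bit-test paths above split
// DefaultProb in half the same way whenever the default is also reachable
// through them.]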
+ CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Prob, + UnhandledProbs); if (CurMBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); @@ -8049,8 +8371,8 @@ unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC, CaseClusterIt First, CaseClusterIt Last) { return std::count_if(First, Last + 1, [&](const CaseCluster &X) { - if (X.Weight != CC.Weight) - return X.Weight > CC.Weight; + if (X.Prob != CC.Prob) + return X.Prob > CC.Prob; // Ties are broken by comparing the case value. return X.Low->getValue().slt(CC.Low->getValue()); @@ -8066,24 +8388,24 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!"); - // Balance the tree based on branch weights to create a near-optimal (in terms - // of search time given key frequency) binary search tree. See e.g. Kurt + // Balance the tree based on branch probabilities to create a near-optimal (in + // terms of search time given key frequency) binary search tree. See e.g. Kurt // Mehlhorn "Nearly Optimal Binary Search Trees" (1975). CaseClusterIt LastLeft = W.FirstCluster; CaseClusterIt FirstRight = W.LastCluster; - uint32_t LeftWeight = LastLeft->Weight; - uint32_t RightWeight = FirstRight->Weight; + auto LeftProb = LastLeft->Prob + W.DefaultProb / 2; + auto RightProb = FirstRight->Prob + W.DefaultProb / 2; // Move LastLeft and FirstRight towards each other from opposite directions to - // find a partitioning of the clusters which balances the weight on both - // sides. If LeftWeight and RightWeight are equal, alternate which side is - // taken to ensure 0-weight nodes are distributed evenly. + // find a partitioning of the clusters which balances the probability on both + // sides. If LeftProb and RightProb are equal, alternate which side is + // taken to ensure 0-probability nodes are distributed evenly. unsigned I = 0; while (LastLeft + 1 < FirstRight) { - if (LeftWeight < RightWeight || (LeftWeight == RightWeight && (I & 1))) - LeftWeight += (++LastLeft)->Weight; + if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1))) + LeftProb += (++LastLeft)->Prob; else - RightWeight += (--FirstRight)->Weight; + RightProb += (--FirstRight)->Prob; I++; } @@ -8144,7 +8466,7 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, const ConstantInt *Pivot = PivotCluster->Low; // New blocks will be inserted immediately after the current one. - MachineFunction::iterator BBI = W.MBB; + MachineFunction::iterator BBI(W.MBB); ++BBI; // We will branch to the LHS if Value < Pivot. If LHS is a single cluster, @@ -8158,7 +8480,8 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, } else { LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, LeftMBB); - WorkList.push_back({LeftMBB, FirstLeft, LastLeft, W.GE, Pivot}); + WorkList.push_back( + {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } @@ -8173,14 +8496,15 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, } else { RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, RightMBB); - WorkList.push_back({RightMBB, FirstRight, LastRight, Pivot, W.LT}); + WorkList.push_back( + {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. 
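// [Editorial aside, a worked pass over the balancing loop above with
// made-up numbers: clusters {1/8, 1/8, 4/8} and W.DefaultProb == 2/8.
// Initially LeftProb == 1/8 + 1/8 == 2/8 and RightProb == 4/8 + 1/8 == 5/8.
// LeftProb < RightProb, so LastLeft advances and absorbs cluster 1,
// giving LeftProb == 3/8; the iterators now meet, and the pivot falls
// between clusters 1 and 2, keeping the hot 4/8 cluster near the root of
// the search tree.]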
ExportFromCurrentBlock(Cond); } // Create the CaseBlock record that will be used to lower the branch. CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB, - LeftWeight, RightWeight); + LeftProb, RightProb); if (W.MBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); @@ -8196,9 +8520,10 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { for (auto I : SI.cases()) { MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()]; const ConstantInt *CaseVal = I.getCaseValue(); - uint32_t Weight = - BPI ? BPI->getEdgeWeight(SI.getParent(), I.getSuccessorIndex()) : 0; - Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Weight)); + BranchProbability Prob = + BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex()) + : BranchProbability(1, SI.getNumCases() + 1); + Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob)); } MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()]; @@ -8274,7 +8599,8 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { SwitchWorkList WorkList; CaseClusterIt First = Clusters.begin(); CaseClusterIt Last = Clusters.end() - 1; - WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr}); + auto DefaultProb = getEdgeProbability(SwitchMBB, DefaultMBB); + WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr, DefaultProb}); while (!WorkList.empty()) { SwitchWorkListItem W = WorkList.back(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 7006754..8fb85ff 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -17,6 +17,7 @@ #include "StatepointLowering.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -30,7 +31,6 @@ namespace llvm { class AddrSpaceCastInst; -class AliasAnalysis; class AllocaInst; class BasicBlock; class BitCastInst; @@ -154,39 +154,39 @@ private: unsigned JTCasesIndex; unsigned BTCasesIndex; }; - uint32_t Weight; + BranchProbability Prob; static CaseCluster range(const ConstantInt *Low, const ConstantInt *High, - MachineBasicBlock *MBB, uint32_t Weight) { + MachineBasicBlock *MBB, BranchProbability Prob) { CaseCluster C; C.Kind = CC_Range; C.Low = Low; C.High = High; C.MBB = MBB; - C.Weight = Weight; + C.Prob = Prob; return C; } static CaseCluster jumpTable(const ConstantInt *Low, const ConstantInt *High, unsigned JTCasesIndex, - uint32_t Weight) { + BranchProbability Prob) { CaseCluster C; C.Kind = CC_JumpTable; C.Low = Low; C.High = High; C.JTCasesIndex = JTCasesIndex; - C.Weight = Weight; + C.Prob = Prob; return C; } static CaseCluster bitTests(const ConstantInt *Low, const ConstantInt *High, - unsigned BTCasesIndex, uint32_t Weight) { + unsigned BTCasesIndex, BranchProbability Prob) { CaseCluster C; C.Kind = CC_BitTests; C.Low = Low; C.High = High; C.BTCasesIndex = BTCasesIndex; - C.Weight = Weight; + C.Prob = Prob; return C; } }; @@ -198,13 +198,13 @@ private: uint64_t Mask; MachineBasicBlock* BB; unsigned Bits; - uint32_t ExtraWeight; + BranchProbability ExtraProb; CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits, - uint32_t Weight): - Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { } + BranchProbability Prob): + Mask(mask), BB(bb), Bits(bits), ExtraProb(Prob) { } - CaseBits() : 
Mask(0), BB(nullptr), Bits(0), ExtraWeight(0) {} + CaseBits() : Mask(0), BB(nullptr), Bits(0) {} }; typedef std::vector<CaseBits> CaseBitsVector; @@ -217,13 +217,13 @@ private: /// blocks needed by multi-case switch statements. struct CaseBlock { CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs, - const Value *cmpmiddle, - MachineBasicBlock *truebb, MachineBasicBlock *falsebb, - MachineBasicBlock *me, - uint32_t trueweight = 0, uint32_t falseweight = 0) - : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs), - TrueBB(truebb), FalseBB(falsebb), ThisBB(me), - TrueWeight(trueweight), FalseWeight(falseweight) { } + const Value *cmpmiddle, MachineBasicBlock *truebb, + MachineBasicBlock *falsebb, MachineBasicBlock *me, + BranchProbability trueprob = BranchProbability::getUnknown(), + BranchProbability falseprob = BranchProbability::getUnknown()) + : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs), + TrueBB(truebb), FalseBB(falsebb), ThisBB(me), TrueProb(trueprob), + FalseProb(falseprob) {} // CC - the condition code to use for the case block's setcc node ISD::CondCode CC; @@ -239,8 +239,8 @@ private: // ThisBB - the block into which to emit the code for the setcc and branches MachineBasicBlock *ThisBB; - // TrueWeight/FalseWeight - branch weights. - uint32_t TrueWeight, FalseWeight; + // TrueProb/FalseProb - branch probabilities. + BranchProbability TrueProb, FalseProb; }; struct JumpTable { @@ -272,32 +272,35 @@ private: struct BitTestCase { BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr, - uint32_t Weight): - Mask(M), ThisBB(T), TargetBB(Tr), ExtraWeight(Weight) { } + BranchProbability Prob): - Mask(M), ThisBB(T), TargetBB(Tr), ExtraProb(Prob) { } uint64_t Mask; MachineBasicBlock *ThisBB; MachineBasicBlock *TargetBB; - uint32_t ExtraWeight; + BranchProbability ExtraProb; }; typedef SmallVector<BitTestCase, 3> BitTestInfo; struct BitTestBlock { - BitTestBlock(APInt F, APInt R, const Value* SV, - unsigned Rg, MVT RgVT, bool E, - MachineBasicBlock* P, MachineBasicBlock* D, - BitTestInfo C): - First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), - Parent(P), Default(D), Cases(std::move(C)) { } + BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, + bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D, + BitTestInfo C, BranchProbability Pr) + : First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), + ContiguousRange(CR), Parent(P), Default(D), Cases(std::move(C)), + Prob(Pr) {} APInt First; APInt Range; const Value *SValue; unsigned Reg; MVT RegVT; bool Emitted; + bool ContiguousRange; MachineBasicBlock *Parent; MachineBasicBlock *Default; BitTestInfo Cases; + BranchProbability Prob; + BranchProbability DefaultProb; }; /// Minimum jump table density, in percent. @@ -339,6 +342,7 @@ private: CaseClusterIt LastCluster; const ConstantInt *GE; const ConstantInt *LT; + BranchProbability DefaultProb; }; typedef SmallVector<SwitchWorkListItem, 4> SwitchWorkList; @@ -515,6 +519,7 @@ private: void resetPerFunctionState() { FailureMBB = nullptr; Guard = nullptr; + GuardReg = 0; } MachineBasicBlock *getParentMBB() { return ParentMBB; } @@ -592,10 +597,6 @@ public: /// FunctionLoweringInfo &FuncInfo; - /// OptLevel - What optimization level we're generating code for. - /// - CodeGenOpt::Level OptLevel; - /// GFI - Garbage collection metadata for the function.
GCFunctionInfo *GFI; @@ -613,7 +614,7 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), OptLevel(ol), + DAG(dag), FuncInfo(funcinfo), HasTailCall(false) { } @@ -692,19 +693,20 @@ public: void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, - MachineBasicBlock *SwitchBB, unsigned Opc, - uint32_t TW, uint32_t FW); + MachineBasicBlock *SwitchBB, + Instruction::BinaryOps Opc, BranchProbability TW, + BranchProbability FW); void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - uint32_t TW, uint32_t FW); + BranchProbability TW, BranchProbability FW); bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases); bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB); void CopyToExportRegsIfNeeded(const Value *V); void ExportFromCurrentBlock(const Value *V); void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); std::pair<SDValue, SDValue> lowerCallOperands( ImmutableCallSite CS, @@ -712,7 +714,7 @@ public: unsigned NumArgs, SDValue Callee, Type *ReturnTy, - MachineBasicBlock *LandingPad = nullptr, + const BasicBlock *EHPadBB = nullptr, bool IsPatchPoint = false); /// UpdateSplitBlock - When an MBB was split during scheduling, update the @@ -722,11 +724,11 @@ public: // This function is responsible for the whole statepoint lowering process. // It uniformly handles invoke and call statepoints. void LowerStatepoint(ImmutableStatepoint Statepoint, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); private: - std::pair<SDValue, SDValue> lowerInvokable( - TargetLowering::CallLoweringInfo &CLI, - MachineBasicBlock *LandingPad); + std::pair<SDValue, SDValue> + lowerInvokable(TargetLowering::CallLoweringInfo &CLI, + const BasicBlock *EHPadBB = nullptr); // Terminator instructions. 
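// [Editorial note: the hunk below adds visitors for the new token-based
// EH instructions (cleanupret, catchswitch, catchret, catchpad,
// cleanuppad) and swaps the weight-based successor helpers for
// BranchProbability-based ones.]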
void visitRet(const ReturnInst &I); @@ -734,11 +736,18 @@ private: void visitSwitch(const SwitchInst &I); void visitIndirectBr(const IndirectBrInst &I); void visitUnreachable(const UnreachableInst &I); + void visitCleanupRet(const CleanupReturnInst &I); + void visitCatchSwitch(const CatchSwitchInst &I); + void visitCatchRet(const CatchReturnInst &I); + void visitCatchPad(const CatchPadInst &I); + void visitCleanupPad(const CleanupPadInst &CPI); + + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + void addSuccessorWithProb( + MachineBasicBlock *Src, MachineBasicBlock *Dst, + BranchProbability Prob = BranchProbability::getUnknown()); - uint32_t getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst, - uint32_t Weight = 0); public: void visitSwitchCase(CaseBlock &CB, MachineBasicBlock *SwitchBB); @@ -748,7 +757,7 @@ public: void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB); void visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, - uint32_t BranchWeightToNext, + BranchProbability BranchProbToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB); @@ -842,11 +851,11 @@ private: void visitVACopy(const CallInst &I); void visitStackmap(const CallInst &I); void visitPatchpoint(ImmutableCallSite CS, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); // These three are implemented in StatepointLowering.cpp void visitStatepoint(const CallInst &I); - void visitGCRelocate(const CallInst &I); + void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const CallInst &I); void visitUserOp1(const Instruction &I) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5b9b182..a1c6c4c 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -30,6 +31,11 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +static cl::opt<bool> +VerboseDAGDumping("dag-dump-verbose", cl::Hidden, + cl::desc("Display more information when dumping selection " + "DAG nodes.")); + std::string SDNode::getOperationName(const SelectionDAG *G) const { switch (getOpcode()) { default: @@ -102,6 +108,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EH_RETURN: return "EH_RETURN"; case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP"; + case ISD::EH_SJLJ_SETUP_DISPATCH: return "EH_SJLJ_SETUP_DISPATCH"; case ISD::ConstantPool: return "ConstantPool"; case ISD::TargetIndex: return "TargetIndex"; case ISD::ExternalSymbol: return "ExternalSymbol"; @@ -145,6 +152,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FABS: return "fabs"; case ISD::FMINNUM: return "fminnum"; case ISD::FMAXNUM: return "fmaxnum"; + case ISD::FMINNAN: return "fminnan"; + case ISD::FMAXNAN: return "fmaxnan"; case ISD::FNEG: return "fneg"; case ISD::FSQRT: return "fsqrt"; case ISD::FSIN: return "fsin"; @@ -201,6 +210,7 @@ std::string 
SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; + case ISD::SETCCE: return "setcce"; case ISD::SELECT: return "select"; case ISD::VSELECT: return "vselect"; case ISD::SELECT_CC: return "select_cc"; @@ -273,6 +283,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CALLSEQ_START: return "callseq_start"; case ISD::CALLSEQ_END: return "callseq_end"; + // EH instructions + case ISD::CATCHRET: return "catchret"; + case ISD::CLEANUPRET: return "cleanupret"; + // Other operators case ISD::LOAD: return "load"; case ISD::STORE: return "store"; @@ -295,15 +309,17 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; + case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation + case ISD::BITREVERSE: return "bitreverse"; case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; case ISD::CTTZ: return "cttz"; case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; - + // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; @@ -320,7 +336,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETO: return "seto"; case ISD::SETUO: return "setuo"; - case ISD::SETUEQ: return "setue"; + case ISD::SETUEQ: return "setueq"; case ISD::SETUGT: return "setugt"; case ISD::SETUGE: return "setuge"; case ISD::SETULT: return "setult"; @@ -352,6 +368,16 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } +static Printable PrintNodeId(const SDNode &Node) { + return Printable([&Node](raw_ostream &OS) { +#ifndef NDEBUG + OS << 't' << Node.PersistentId; +#else + OS << (const void*)&Node; +#endif + }); +} + void SDNode::dump() const { dump(nullptr); } void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); @@ -359,8 +385,6 @@ void SDNode::dump(const SelectionDAG *G) const { } void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { - OS << (const void*)this << ": "; - for (unsigned i = 0, e = getNumValues(); i != e; ++i) { if (i) OS << ","; if (getValueType(i) == MVT::Other) @@ -368,7 +392,6 @@ void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { else OS << getValueType(i).getEVTString(); } - OS << " = " << getOperationName(G); } void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { @@ -523,48 +546,58 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << ']'; } - if (unsigned Order = getIROrder()) - OS << " [ORD=" << Order << ']'; + if (VerboseDAGDumping) { + if (unsigned Order = getIROrder()) + OS << " [ORD=" << Order << ']'; - if (getNodeId() != -1) - OS << " [ID=" << getNodeId() << ']'; + if (getNodeId() != -1) + OS << " [ID=" << getNodeId() << ']'; - if (!G) - return; + if (!G) + return; - DILocation *L = getDebugLoc(); - if (!L) - return; + DILocation *L = getDebugLoc(); + if (!L) + return; + + if (auto *Scope = L->getScope()) + OS << Scope->getFilename(); + else + OS << "<unknown>"; + OS << ':' << L->getLine(); + if (unsigned C = L->getColumn()) + OS << ':' << C; + } +} - if (auto *Scope = L->getScope()) - OS << Scope->getFilename(); - else - OS << "<unknown>"; - OS << ':' << L->getLine(); - if (unsigned C = 
L->getColumn()) - OS << ':' << C; +/// Return true if this node is so simple that we should just print it inline +/// if it appears as an operand. +static bool shouldPrintInline(const SDNode &Node) { + if (Node.getOpcode() == ISD::EntryToken) + return false; + return Node.getNumOperands() == 0; } static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { - for (const SDValue &Op : N->op_values()) + for (const SDValue &Op : N->op_values()) { + if (shouldPrintInline(*Op.getNode())) + continue; if (Op.getNode()->hasOneUse()) DumpNodes(Op.getNode(), indent+2, G); - else - dbgs() << "\n" << std::string(indent+2, ' ') - << (void*)Op.getNode() << ": <multiple use>"; + } - dbgs() << '\n'; dbgs().indent(indent); N->dump(G); } void SelectionDAG::dump() const { - dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:"; + dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n"; for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I) { - const SDNode *N = I; - if (!N->hasOneUse() && N != getRoot().getNode()) + const SDNode *N = &*I; + if (!N->hasOneUse() && N != getRoot().getNode() && + (!shouldPrintInline(*N) || N->use_empty())) DumpNodes(N, 2, this); } @@ -573,10 +606,30 @@ void SelectionDAG::dump() const { } void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { + OS << PrintNodeId(*this) << ": "; print_types(OS, G); + OS << " = " << getOperationName(G); print_details(OS, G); } +static bool printOperand(raw_ostream &OS, const SelectionDAG *G, + const SDValue Value) { + if (!Value.getNode()) { + OS << "<null>"; + return false; + } else if (shouldPrintInline(*Value.getNode())) { + OS << Value->getOperationName(G) << ':'; + Value->print_types(OS, G); + Value->print_details(OS, G); + return true; + } else { + OS << PrintNodeId(*Value.getNode()); + if (unsigned RN = Value.getResNo()) + OS << ':' << RN; + return false; + } +} + typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -589,20 +642,13 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, // Having printed this SDNode, walk the children: for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - const SDNode *child = N->getOperand(i).getNode(); - if (i) OS << ","; OS << " "; - if (child->getNumOperands() == 0) { - // This child has no grandchildren; print it inline right here. - child->printr(OS, G); - once.insert(child); - } else { // Just the address. FIXME: also print the child's opcode. 
- OS << (const void*)child; - if (unsigned RN = N->getOperand(i).getResNo()) - OS << ":" << RN; - } + const SDValue Op = N->getOperand(i); + bool printedInline = printOperand(OS, G, Op); + if (printedInline) + once.insert(Op.getNode()); } OS << "\n"; @@ -664,12 +710,9 @@ void SDNode::dumprFull(const SelectionDAG *G) const { } void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { - print_types(OS, G); + printr(OS, G); for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { if (i) OS << ", "; else OS << " "; - OS << (void*)getOperand(i).getNode(); - if (unsigned RN = getOperand(i).getResNo()) - OS << ":" << RN; + printOperand(OS, G, getOperand(i)); } - print_details(OS, G); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 97ece8b..9f8759d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" @@ -263,13 +264,17 @@ namespace llvm { return; IS.OptLevel = NewOptLevel; IS.TM.setOptLevel(NewOptLevel); - SavedFastISel = IS.TM.Options.EnableFastISel; - if (NewOptLevel == CodeGenOpt::None) - IS.TM.setFastISel(true); DEBUG(dbgs() << "\nChanging optimization level for Function " << IS.MF->getFunction()->getName() << "\n"); DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O" << NewOptLevel << "\n"); + SavedFastISel = IS.TM.Options.EnableFastISel; + if (NewOptLevel == CodeGenOpt::None) { + IS.TM.setFastISel(IS.TM.getO0WantsFastISel()); + DEBUG(dbgs() << "\tFastISel is " + << (IS.TM.Options.EnableFastISel ? 
"enabled" : "disabled") + << "\n"); + } } ~OptLevelChanger() { @@ -293,6 +298,11 @@ namespace llvm { const TargetLowering *TLI = IS->TLI; const TargetSubtargetInfo &ST = IS->MF->getSubtarget(); + // Try first to see if the Target has its own way of selecting a scheduler + if (auto *SchedulerCtor = ST.getDAGScheduler(OptLevel)) { + return SchedulerCtor(IS, OptLevel); + } + if (OptLevel == CodeGenOpt::None || (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) || TLI->getSchedulingPreference() == Sched::Source) @@ -350,8 +360,9 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); - initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry()); - initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry()); + initializeBranchProbabilityInfoWrapperPassPass( + *PassRegistry::getPassRegistry()); + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); initializeTargetLibraryInfoWrapperPassPass( *PassRegistry::getPassRegistry()); } @@ -363,13 +374,12 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); AU.addPreserved<GCModuleInfo>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); if (UseMBPI && OptLevel != CodeGenOpt::None) - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -380,10 +390,10 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { /// /// This is required for correctness, so it must be done at -O0. /// -static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { +static void SplitCriticalSideEffectEdges(Function &Fn) { // Loop for blocks with phi nodes. - for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - PHINode *PN = dyn_cast<PHINode>(BB->begin()); + for (BasicBlock &BB : Fn) { + PHINode *PN = dyn_cast<PHINode>(BB.begin()); if (!PN) continue; ReprocessBlock: @@ -391,7 +401,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { // are potentially trapping constant expressions. Constant expressions are // the only potentially trapping value that can occur as the argument to a // PHI. - for (BasicBlock::iterator I = BB->begin(); (PN = dyn_cast<PHINode>(I)); ++I) + for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I)); ++I) for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i)); if (!CE || !CE->canTrap()) continue; @@ -405,8 +415,8 @@ static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { // Okay, we have to split this edge. 
SplitCriticalEdge( - Pred->getTerminator(), GetSuccessorNumber(Pred, BB), - CriticalEdgeSplittingOptions(AA).setMergeIdenticalEdges()); + Pred->getTerminator(), GetSuccessorNumber(Pred, &BB), + CriticalEdgeSplittingOptions().setMergeIdenticalEdges()); goto ReprocessBlock; } } @@ -437,19 +447,19 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), AA); + SplitCriticalSideEffectEdges(const_cast<Function &>(Fn)); CurDAG->init(*MF); FuncInfo->set(Fn, *MF, CurDAG); if (UseMBPI && OptLevel != CodeGenOpt::None) - FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>(); + FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else FuncInfo->BPI = nullptr; @@ -457,15 +467,50 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MF->setHasInlineAsm(false); + FuncInfo->SplitCSR = false; + SmallVector<MachineBasicBlock*, 4> Returns; + + // We split CSR if the target supports it for the given function + // and the function has only return exits. + if (TLI->supportSplitCSR(MF)) { + FuncInfo->SplitCSR = true; + + // Collect all the return blocks. + for (const BasicBlock &BB : Fn) { + if (!succ_empty(&BB)) + continue; + + const TerminatorInst *Term = BB.getTerminator(); + if (isa<UnreachableInst>(Term)) + continue; + if (isa<ReturnInst>(Term)) { + Returns.push_back(FuncInfo->MBBMap[&BB]); + continue; + } + + // Bail out if the exit block is not Return nor Unreachable. + FuncInfo->SplitCSR = false; + break; + } + } + + MachineBasicBlock *EntryMBB = &MF->front(); + if (FuncInfo->SplitCSR) + // This performs initialization so lowering for SplitCSR will be correct. + TLI->initializeSplitCSR(EntryMBB); + SelectAllBasicBlocks(Fn); // If the first basic block in the function has live ins that need to be // copied into vregs, emit the copies into the top of the block before // emitting the code for the block. - MachineBasicBlock *EntryMBB = MF->begin(); const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII); + // Insert copies in the entry block and the return blocks. + if (FuncInfo->SplitCSR) + TLI->insertCopiesSplitCSR(EntryMBB, Returns); + DenseMap<unsigned, unsigned> LiveInMap; if (!FuncInfo->ArgDbgValues.empty()) for (MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin(), @@ -588,6 +633,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MRI.replaceRegWith(From, To); } + if (TLI->hasCopyImplyingStackAdjustment(MF)) + MFI->setHasOpaqueSPAdjustment(true); + // Freeze the set of reserved registers now that MachineFrameInfo has been // set up. All the information required by getReservedRegs() should be // available now. @@ -882,7 +930,7 @@ void SelectionDAGISel::DoInstructionSelection() { // graph) and preceding back toward the beginning (the entry // node). while (ISelPosition != CurDAG->allnodes_begin()) { - SDNode *Node = --ISelPosition; + SDNode *Node = &*--ISelPosition; // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes, // but there are currently some corner cases that it misses. 
Also, this // makes it theoretically possible to disable the DAGCombiner. @@ -916,14 +964,47 @@ void SelectionDAGISel::DoInstructionSelection() { PostprocessISelDAG(); } +static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) { + for (const User *U : CPI->users()) { + if (const IntrinsicInst *EHPtrCall = dyn_cast<IntrinsicInst>(U)) { + Intrinsic::ID IID = EHPtrCall->getIntrinsicID(); + if (IID == Intrinsic::eh_exceptionpointer || + IID == Intrinsic::eh_exceptioncode) + return true; + } + } + return false; +} + /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. bool SelectionDAGISel::PrepareEHLandingPad() { MachineBasicBlock *MBB = FuncInfo->MBB; - + const Constant *PersonalityFn = FuncInfo->Fn->getPersonalityFn(); + const BasicBlock *LLVMBB = MBB->getBasicBlock(); const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout())); + // Catchpads have one live-in register, which typically holds the exception + // pointer or code. + if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) { + if (hasExceptionPointerOrCodeUser(CPI)) { + // Get or create the virtual register to hold the pointer or code. Mark + // the live in physreg and copy into the vreg. + MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn); + assert(EHPhysReg && "target lacks exception pointer register"); + MBB->addLiveIn(EHPhysReg); + unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC); + BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), + TII->get(TargetOpcode::COPY), VReg) + .addReg(EHPhysReg, RegState::Kill); + } + return true; + } + + if (!LLVMBB->isLandingPad()) + return true; + // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MCSymbol *Label = MF->getMMI().addLandingPad(MBB); @@ -935,52 +1016,12 @@ bool SelectionDAGISel::PrepareEHLandingPad() { BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); - // If this is an MSVC-style personality function, we need to split the landing - // pad into several BBs. - const BasicBlock *LLVMBB = MBB->getBasicBlock(); - const LandingPadInst *LPadInst = LLVMBB->getLandingPadInst(); - MF->getMMI().addPersonality(MBB, cast<Function>(LPadInst->getParent() - ->getParent() - ->getPersonalityFn() - ->stripPointerCasts())); - EHPersonality Personality = MF->getMMI().getPersonalityType(); - - if (isMSVCEHPersonality(Personality)) { - SmallVector<MachineBasicBlock *, 4> ClauseBBs; - const IntrinsicInst *ActionsCall = - dyn_cast<IntrinsicInst>(LLVMBB->getFirstInsertionPt()); - // Get all invoke BBs that unwind to this landingpad. - SmallVector<MachineBasicBlock *, 4> InvokeBBs(MBB->pred_begin(), - MBB->pred_end()); - if (ActionsCall && ActionsCall->getIntrinsicID() == Intrinsic::eh_actions) { - // If this is a call to llvm.eh.actions followed by indirectbr, then we've - // run WinEHPrepare, and we should remove this block from the machine CFG. - // Mark the targets of the indirectbr as landingpads instead. - for (const BasicBlock *LLVMSucc : successors(LLVMBB)) { - MachineBasicBlock *ClauseBB = FuncInfo->MBBMap[LLVMSucc]; - // Add the edge from the invoke to the clause. - for (MachineBasicBlock *InvokeBB : InvokeBBs) - InvokeBB->addSuccessor(ClauseBB); - - // Mark the clause as a landing pad or MI passes will delete it. - ClauseBB->setIsLandingPad(); - } - } - - // Remove the edge from the invoke to the lpad. 
- for (MachineBasicBlock *InvokeBB : InvokeBBs) - InvokeBB->removeSuccessor(MBB); - - // Don't select instructions for the landingpad. - return false; - } - // Mark exception register as live in. - if (unsigned Reg = TLI->getExceptionPointerRegister()) + if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn)) FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC); // Mark exception selector register as live in. - if (unsigned Reg = TLI->getExceptionSelectorRegister()) + if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn)) FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC); return true; @@ -992,9 +1033,9 @@ bool SelectionDAGISel::PrepareEHLandingPad() { static bool isFoldedOrDeadInstruction(const Instruction *I, FunctionLoweringInfo *FuncInfo) { return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded. - !isa<TerminatorInst>(I) && // Terminators aren't folded. + !isa<TerminatorInst>(I) && // Terminators aren't folded. !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded. - !isa<LandingPadInst>(I) && // Landingpad instructions aren't folded. + !I->isEHPad() && // EH pad instructions aren't folded. !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } @@ -1143,17 +1184,20 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->VisitedBBs.insert(LLVMBB); } - BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI(); + BasicBlock::const_iterator const Begin = + LLVMBB->getFirstNonPHI()->getIterator(); BasicBlock::const_iterator const End = LLVMBB->end(); BasicBlock::const_iterator BI = End; FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; + if (!FuncInfo->MBB) + continue; // Some blocks like catchpads have no code or MBB. FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; FuncInfo->ExceptionSelectorVirtReg = 0; - if (LLVMBB->isLandingPad()) + if (LLVMBB->isEHPad()) if (!PrepareEHLandingPad()) continue; @@ -1192,7 +1236,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { unsigned NumFastIselRemaining = std::distance(Begin, End); // Do FastISel on as many instructions as possible. for (; BI != Begin; --BI) { - const Instruction *Inst = std::prev(BI); + const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { @@ -1212,8 +1256,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // then see if there is a load right before the selected instructions. // Try to fold the load if so. const Instruction *BeforeInst = Inst; - while (BeforeInst != Begin) { - BeforeInst = std::prev(BasicBlock::const_iterator(BeforeInst)); + while (BeforeInst != &*Begin) { + BeforeInst = &*std::prev(BasicBlock::const_iterator(BeforeInst)); if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo)) break; } @@ -1245,7 +1289,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // For the purpose of debugging, just abort. 
report_fatal_error("FastISel didn't select the entire block"); - if (!Inst->getType()->isVoidTy() && !Inst->use_empty()) { + if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && + !Inst->use_empty()) { unsigned &R = FuncInfo->ValueMap[Inst]; if (!R) R = FuncInfo->CreateRegs(Inst->getType()); @@ -1253,7 +1298,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { bool HadTailCall = false; MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt; - SelectBasicBlock(Inst, BI, HadTailCall); + SelectBasicBlock(Inst->getIterator(), BI, HadTailCall); // If the call was emitted as a tail call, we're done with the block. // We also need to delete any previously emitted instructions. @@ -1483,35 +1528,39 @@ SelectionDAGISel::FinishBasicBlock() { CodeGenAndEmitDAG(); } - uint32_t UnhandledWeight = 0; - for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) - UnhandledWeight += SDB->BitTestCases[i].Cases[j].ExtraWeight; - + BranchProbability UnhandledProb = SDB->BitTestCases[i].Prob; for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) { - UnhandledWeight -= SDB->BitTestCases[i].Cases[j].ExtraWeight; + UnhandledProb -= SDB->BitTestCases[i].Cases[j].ExtraProb; // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code - if (j+1 != ej) - SDB->visitBitTestCase(SDB->BitTestCases[i], - SDB->BitTestCases[i].Cases[j+1].ThisBB, - UnhandledWeight, - SDB->BitTestCases[i].Reg, - SDB->BitTestCases[i].Cases[j], - FuncInfo->MBB); + + // If all cases cover a contiguous range, it is not necessary to jump to + // the default block after the last bit test fails. This is because the + // range check during bit test header creation has guaranteed that every + // case here doesn't go outside the range. + MachineBasicBlock *NextMBB; + if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej) + NextMBB = SDB->BitTestCases[i].Cases[j + 1].TargetBB; + else if (j + 1 != ej) + NextMBB = SDB->BitTestCases[i].Cases[j + 1].ThisBB; else - SDB->visitBitTestCase(SDB->BitTestCases[i], - SDB->BitTestCases[i].Default, - UnhandledWeight, - SDB->BitTestCases[i].Reg, - SDB->BitTestCases[i].Cases[j], - FuncInfo->MBB); + NextMBB = SDB->BitTestCases[i].Default; + SDB->visitBitTestCase(SDB->BitTestCases[i], + NextMBB, + UnhandledProb, + SDB->BitTestCases[i].Reg, + SDB->BitTestCases[i].Cases[j], + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); + + if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej) + break; } // Update PHI Nodes @@ -1642,14 +1691,7 @@ SelectionDAGISel::FinishBasicBlock() { /// one preferred by the target. /// ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() { - RegisterScheduler::FunctionPassCtor Ctor = RegisterScheduler::getDefault(); - - if (!Ctor) { - Ctor = ISHeuristic; - RegisterScheduler::setDefault(Ctor); - } - - return Ctor(this, OptLevel); + return ISHeuristic(this, OptLevel); } //===----------------------------------------------------------------------===// @@ -1961,7 +2003,7 @@ SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) { } /// GetVBR - decode a vbr encoding whose top bit is set. -LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { assert(Val >= 128 && "Not a VBR"); Val &= 127; // Remove first vbr bit. 
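[Editorial aside: the matcher tables decoded by GetVBR above use a
variable-bit-rate encoding in which each byte carries 7 payload bits and
the high bit marks continuation. A self-contained sketch, modeled on the
full GetVBR definition (the hunk shows only its opening) and renamed
DecodeVBR to make clear it is an illustration, with a worked decode of
the value 300:

  #include <cassert>
  #include <cstdint>

  static uint64_t DecodeVBR(uint64_t Val, const unsigned char *Table,
                            unsigned &Idx) {
    assert(Val >= 128 && "Not a VBR");
    Val &= 127; // Remove first vbr bit.
    unsigned Shift = 7;
    uint64_t NextBits;
    do { // Each later byte adds 7 bits; bit 7 flags another byte follows.
      NextBits = Table[Idx++];
      Val |= (NextBits & 127) << Shift;
      Shift += 7;
    } while (NextBits & 128);
    return Val;
  }

  int main() {
    // 300 == 256 + 44: byte 0xAC carries payload 44 plus the continuation
    // bit; byte 0x02 contributes 2 << 7 == 256.
    const unsigned char Table[] = {0xAC, 0x02};
    unsigned Idx = 1; // The caller has already consumed Table[0].
    return DecodeVBR(Table[0], Table, Idx) == 300 ? 0 : 1;
  }
]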
@@ -2287,7 +2329,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, } /// CheckSame - Implements OP_CheckSame. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { @@ -2298,7 +2340,7 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, } /// CheckChildSame - Implements OP_CheckChildXSame. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, @@ -2310,20 +2352,20 @@ CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, } /// CheckPatternPredicate - Implements OP_CheckPatternPredicate. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, const SelectionDAGISel &SDISel) { return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]); } /// CheckNodePredicate - Implements OP_CheckNodePredicate. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, const SelectionDAGISel &SDISel, SDNode *N) { return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDNode *N) { uint16_t Opc = MatcherTable[MatcherIndex++]; @@ -2331,7 +2373,7 @@ CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex, return N->getOpcode() == Opc; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL) { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; @@ -2341,7 +2383,7 @@ CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL, unsigned ChildNo) { @@ -2351,14 +2393,14 @@ CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex, DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N) { return cast<CondCodeSDNode>(N)->get() == (ISD::CondCode)MatcherTable[MatcherIndex++]; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL) { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; @@ -2369,7 +2411,7 @@ CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex, return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool 
CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N) { int64_t Val = MatcherTable[MatcherIndex++]; @@ -2380,7 +2422,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, return C && C->getSExtValue() == Val; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, unsigned ChildNo) { if (ChildNo >= N.getNumOperands()) @@ -2388,7 +2430,7 @@ CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, return ::CheckInteger(MatcherTable, MatcherIndex, N.getOperand(ChildNo)); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; @@ -2401,7 +2443,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, return C && SDISel.CheckAndMask(N.getOperand(0), C, Val); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 4df5ede..2764688 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -80,9 +80,16 @@ namespace llvm { return true; } - static bool hasNodeAddressLabel(const SDNode *Node, - const SelectionDAG *Graph) { - return true; + static std::string getNodeIdentifierLabel(const SDNode *Node, + const SelectionDAG *Graph) { + std::string R; + raw_string_ostream OS(R); +#ifndef NDEBUG + OS << 't' << Node->PersistentId; +#else + OS << static_cast<const void *>(Node); +#endif + return R; } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 34688df..02545a7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -95,6 +96,9 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType); const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo(); + MFI->markAsStatepointSpillSlotObjectIndex(FI); + Builder.FuncInfo.StatepointStackSlots.push_back(FI); AllocatedStackSlots.push_back(true); return SpillSlot; @@ -105,8 +109,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, return Builder.DAG.getFrameIndex(FI, ValueType); } // Note: We deliberately choose to advance this only on the failing path. - // Doing so on the suceeding path involes a bit of complexity that caused a - // minor bug previously. 
Unless performance shows this matters, please + // Doing so on the succeeding path involves a bit of complexity that caused + // a minor bug previously. Unless performance shows this matters, please // keep this code as simple as possible. NextSlotToAllocate++; } @@ -119,18 +123,16 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, static Optional<int> findPreviousSpillSlot(const Value *Val, SelectionDAGBuilder &Builder, int LookUpDepth) { - // Can not look any futher - give up now + // Can not look any further - give up now if (LookUpDepth <= 0) return Optional<int>(); // Spill location is known for gc relocates - if (isGCRelocate(Val)) { - GCRelocateOperands RelocOps(cast<Instruction>(Val)); - + if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) { FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = - Builder.FuncInfo.StatepointRelocatedValues[RelocOps.getStatepoint()]; + Builder.FuncInfo.StatepointRelocatedValues[Relocate->getStatepoint()]; - auto It = SpillMap.find(RelocOps.getDerivedPtr()); + auto It = SpillMap.find(Relocate->getDerivedPtr()); if (It == SpillMap.end()) return Optional<int>(); @@ -196,7 +198,7 @@ static Optional<int> findPreviousSpillSlot(const Value *Val, /// Try to find existing copies of the incoming values in stack slots used for /// statepoint spilling. If we can find a spill slot for the incoming value, /// mark that slot as allocated, and reuse the same slot for this safepoint. -/// This helps to avoid series of loads and stores that only serve to resuffle +/// This helps to avoid series of loads and stores that only serve to reshuffle /// values on the stack between calls. static void reservePreviousStackSlotForValue(const Value *IncomingValue, SelectionDAGBuilder &Builder) { @@ -255,7 +257,7 @@ static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Relocs, SelectionDAGBuilder &Builder) { - // This is horribly ineffecient, but I don't care right now + // This is horribly inefficient, but I don't care right now SmallSet<SDValue, 64> Seen; SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs; @@ -283,13 +285,29 @@ static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases, /// call node. Also update NodeMap so that getValue(statepoint) will /// reference lowered call result static SDNode * -lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, +lowerCallFromStatepoint(ImmutableStatepoint ISP, const BasicBlock *EHPadBB, SelectionDAGBuilder &Builder, SmallVectorImpl<SDValue> &PendingExports) { ImmutableCallSite CS(ISP.getCallSite()); - SDValue ActualCallee = Builder.getValue(ISP.getCalledValue()); + SDValue ActualCallee; + + if (ISP.getNumPatchBytes() > 0) { + // If we've been asked to emit a nop sequence instead of a call instruction + // for this statepoint then don't lower the call target, but use a constant + // `null` instead. Not lowering the call target lets statepoint clients get + // away without providing a physical address for the symbolic call target at + // link time. 
+ + const auto &TLI = Builder.DAG.getTargetLoweringInfo(); + const auto &DL = Builder.DAG.getDataLayout(); + + unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace(); + ActualCallee = Builder.DAG.getConstant(0, Builder.getCurSDLoc(), + TLI.getPointerTy(DL, AS)); + } else + ActualCallee = Builder.getValue(ISP.getCalledValue()); assert(CS.getCallingConv() != CallingConv::AnyReg && "anyregcc is not supported on statepoints!"); @@ -300,7 +318,7 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, SDValue ReturnValue, CallEndVal; std::tie(ReturnValue, CallEndVal) = Builder.lowerCallOperands( ISP.getCallSite(), ImmutableStatepoint::CallArgsBeginPos, - ISP.getNumCallArgs(), ActualCallee, DefTy, LandingPad, + ISP.getNumCallArgs(), ActualCallee, DefTy, EHPadBB, false /* IsPatchPoint */); SDNode *CallEnd = CallEndVal.getNode(); @@ -317,25 +335,33 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, // ch, glue = callseq_end ch, glue // get_return_value ch, glue // - // get_return_value can either be a CopyFromReg to grab the return value from - // %RAX, or it can be a LOAD to load a value returned by reference via a stack - // slot. + // get_return_value can either be a sequence of CopyFromReg instructions + // to grab the return value from the return register(s), or it can be a LOAD + // to load a value returned by reference via a stack slot. - if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg || - CallEnd->getOpcode() == ISD::LOAD)) - CallEnd = CallEnd->getOperand(0).getNode(); + if (HasDef) { + if (CallEnd->getOpcode() == ISD::LOAD) + CallEnd = CallEnd->getOperand(0).getNode(); + else + while (CallEnd->getOpcode() == ISD::CopyFromReg) + CallEnd = CallEnd->getOperand(0).getNode(); + } assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "expected!"); - if (HasDef) { - if (CS.isInvoke()) { - // Result value will be used in different basic block for invokes - // so we need to export it now. But statepoint call has a different type - // than the actuall call. It means that standart exporting mechanism will - // create register of the wrong type. So instead we need to create - // register with correct type and save value into it manually. + // Export the result value if needed + const Instruction *GCResult = ISP.getGCResult(); + if (HasDef && GCResult) { + if (GCResult->getParent() != CS.getParent()) { + // Result value will be used in a different basic block so we need to + // export it now. + // Default exporting mechanism will not work here because statepoint call + // has a different type than the actual call. It means that by default + // llvm will create export register of the wrong type (always i32 in our + // case). So instead we need to create export register with correct type + // manually. // TODO: To eliminate this problem we can remove gc.result intrinsics - // completelly and make statepoint call to return a tuple. + // completely and make statepoint call to return a tuple. unsigned Reg = Builder.FuncInfo.CreateRegs(ISP.getActualReturnType()); RegsForValue RFV( *Builder.DAG.getContext(), Builder.DAG.getTargetLoweringInfo(), @@ -347,8 +373,9 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, PendingExports.push_back(Chain); Builder.FuncInfo.ValueMap[CS.getInstruction()] = Reg; } else { - // The value of the statepoint itself will be the value of call itself. - // We'll replace the actually call node shortly. 
gc_result will grab + // Result value will be used in the same basic block. Don't export it or + // perform any explicit register copies. + // We'll replace the actual call node shortly. gc_result will grab // this value. Builder.setValue(CS.getInstruction(), ReturnValue); } @@ -372,10 +399,10 @@ static void getIncomingStatepointGCValues( SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Ptrs, SmallVectorImpl<const Value *> &Relocs, ImmutableStatepoint StatepointSite, SelectionDAGBuilder &Builder) { - for (GCRelocateOperands relocateOpers : StatepointSite.getRelocates()) { - Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction()); - Bases.push_back(relocateOpers.getBasePtr()); - Ptrs.push_back(relocateOpers.getDerivedPtr()); + for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) { + Relocs.push_back(Relocate); + Bases.push_back(Relocate->getBasePtr()); + Ptrs.push_back(Relocate->getDerivedPtr()); } // Remove any redundant llvm::Values which map to the same SDValue as another @@ -411,7 +438,8 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, // chaining stores one after another, this may allow // a bit more optimal scheduling for them Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc, - MachinePointerInfo::getFixedStack(Index), + MachinePointerInfo::getFixedStack( + Builder.DAG.getMachineFunction(), Index), false, false, 0); Builder.StatepointLowering.setLocation(Incoming, Loc); @@ -433,7 +461,9 @@ static void lowerIncomingStatepointValue(SDValue Incoming, // If the original value was a constant, make sure it gets recorded as // such in the stackmap. This is required so that the consumer can // parse any internal format to the deopt state. It also handles null - // pointers and other constant pointers in GC states + // pointers and other constant pointers in GC states. Note the constant + // vectors do not appear to actually hit this path and that anything larger + // than an i64 value (not type!) will fail asserts here. pushStackMapConstant(Ops, Builder, C->getSExtValue()); } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) { // This handles allocas as arguments to the statepoint (this is only @@ -477,27 +507,27 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, #ifndef NDEBUG // Check that each of the gc pointer and bases we've gotten out of the - // safepoint is something the strategy thinks might be a pointer into the GC - // heap. This is basically just here to help catch errors during statepoint - // insertion. TODO: This should actually be in the Verifier, but we can't get - // to the GCStrategy from there (yet). + // safepoint is something the strategy thinks might be a pointer (or vector + // of pointers) into the GC heap. This is basically just here to help catch + // errors during statepoint insertion. TODO: This should actually be in the + // Verifier, but we can't get to the GCStrategy from there (yet).
GCStrategy &S = Builder.GFI->getStrategy(); for (const Value *V : Bases) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed base pointer found in statepoint"); } } for (const Value *V : Ptrs) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed derived pointer found in statepoint"); } } for (const Value *V : Relocations) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed pointer relocated"); } @@ -572,8 +602,8 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = Builder.FuncInfo.StatepointRelocatedValues[StatepointInstr]; - for (GCRelocateOperands RelocateOpers : StatepointSite.getRelocates()) { - const Value *V = RelocateOpers.getDerivedPtr(); + for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) { + const Value *V = Relocate->getDerivedPtr(); SDValue SDV = Builder.getValue(V); SDValue Loc = Builder.StatepointLowering.getLocation(SDV); @@ -581,19 +611,20 @@ if (Loc.getNode()) { SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex(); } else { // Record value as visited, but not spilled. This is case for allocas - // and constants. For this values we can avoid emiting spill load while + // and constants. For these values we can avoid emitting spill load while // visiting corresponding gc_relocate. // Actually we do not need to record them in this map at all. - // We do this only to check that we are not relocating any unvisited value. + // We do this only to check that we are not relocating any unvisited + // value. SpillMap[V] = None; // Default llvm mechanisms for exporting values which are used in // different basic blocks does not work for gc relocates. // Note that it would be incorrect to teach llvm that all relocates are - // uses of the corresponging values so that it would automatically + // uses of the corresponding values so that it would automatically // export them. Relocates of the spilled values does not use original // value. - if (StatepointSite.getCallSite().isInvoke()) + if (Relocate->getParent() != StatepointInstr->getParent()) Builder.ExportFromCurrentBlock(V); } } @@ -608,7 +639,7 @@ void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) { } void SelectionDAGBuilder::LowerStatepoint( - ImmutableStatepoint ISP, MachineBasicBlock *LandingPad /*=nullptr*/) { + ImmutableStatepoint ISP, const BasicBlock *EHPadBB /*= nullptr*/) { // The basic scheme here is that information about both the original call and // the safepoint is encoded in the CallInst. We create a temporary call and // lower it, then reverse engineer the calling sequence. @@ -620,14 +651,12 @@ void SelectionDAGBuilder::LowerStatepoint( ImmutableCallSite CS(ISP.getCallSite()); #ifndef NDEBUG - // Consistency check. Don't do this for invokes. It would be too - // expensive to preserve this information across different basic blocks - if (!CS.isInvoke()) { - for (const User *U : CS->users()) { - const CallInst *Call = cast<CallInst>(U); - if (isGCRelocate(Call)) - StatepointLowering.scheduleRelocCall(*Call); - } + // Consistency check. Check only relocates in the same basic block as their + // statepoint.
+ for (const User *U : CS->users()) { + const CallInst *Call = cast<CallInst>(U); + if (isa<GCRelocateInst>(Call) && Call->getParent() == CS.getParent()) + StatepointLowering.scheduleRelocCall(*Call); } #endif @@ -648,7 +677,7 @@ void SelectionDAGBuilder::LowerStatepoint( // Get call node, we will replace it later with statepoint SDNode *CallNode = - lowerCallFromStatepoint(ISP, LandingPad, *this, PendingExports); + lowerCallFromStatepoint(ISP, EHPadBB, *this, PendingExports); // Construct the actual GC_TRANSITION_START, STATEPOINT, and GC_TRANSITION_END // nodes with all the appropriate arguments and return values. @@ -790,7 +819,7 @@ void SelectionDAGBuilder::LowerStatepoint( // Replace original call DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root - // Remove originall call node + // Remove original call node DAG.DeleteNode(CallNode); // DON'T set the root - under the assumption that it's already set past the @@ -809,8 +838,9 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) { Instruction *I = cast<Instruction>(CI.getArgOperand(0)); assert(isStatepoint(I) && "first argument must be a statepoint token"); - if (isa<InvokeInst>(I)) { - // For invokes we should have stored call result in a virtual register. + if (I->getParent() != CI.getParent()) { + // Statepoint is in a different basic block so we should have stored call + // result in a virtual register. // We can not use default getValue() functionality to copy value from this // register because statepoint and actuall call return types can be // different, and getValue() will use CopyFromReg of the wrong type, @@ -828,23 +858,22 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) { } } -void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { - GCRelocateOperands RelocateOpers(&CI); - +void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { #ifndef NDEBUG // Consistency check - // We skip this check for invoke statepoints. It would be too expensive to - // preserve validation info through different basic blocks. - if (!RelocateOpers.isTiedToInvoke()) { - StatepointLowering.relocCallVisited(CI); + // We skip this check for relocates not in the same basic block as their + // statepoint. It would be too expensive to preserve validation info through + // different basic blocks. + if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) { + StatepointLowering.relocCallVisited(Relocate); } #endif - const Value *DerivedPtr = RelocateOpers.getDerivedPtr(); + const Value *DerivedPtr = Relocate.getDerivedPtr(); SDValue SD = getValue(DerivedPtr); FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = - FuncInfo.StatepointRelocatedValues[RelocateOpers.getStatepoint()]; + FuncInfo.StatepointRelocatedValues[Relocate.getStatepoint()]; // We should have recorded location for this pointer assert(SpillMap.count(DerivedPtr) && "Relocating not lowered gc value"); @@ -853,7 +882,7 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { // We didn't need to spill these special cases (constants and allocas). // See the handling in spillIncomingValueForStatepoint for detail. if (!DerivedPtrLocation) { - setValue(&CI, SD); + setValue(&Relocate, SD); return; } @@ -862,17 +891,18 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { // Be conservative: flush all pending loads // TODO: Probably we can be less restrictive on this, - // it may allow more scheduling opprtunities + // it may allow more scheduling opportunities.
SDValue Chain = getRoot(); SDValue SpillLoad = - DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, - MachinePointerInfo::getFixedStack(*DerivedPtrLocation), - false, false, false, 0); + DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + *DerivedPtrLocation), + false, false, false, 0); // Again, be conservative, don't emit pending loads DAG.setRoot(SpillLoad.getValue(1)); assert(SpillLoad.getNode()); - setValue(&CI, SpillLoad); + setValue(&Relocate, SpillLoad); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index fbf6512..c64d882 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -85,21 +85,22 @@ void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, std::pair<SDValue, SDValue> TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, - const SDValue *Ops, unsigned NumOps, + ArrayRef<SDValue> Ops, bool isSigned, SDLoc dl, bool doesNotReturn, bool isReturnValueUsed) const { TargetLowering::ArgListTy Args; - Args.reserve(NumOps); + Args.reserve(Ops.size()); TargetLowering::ArgListEntry Entry; - for (unsigned i = 0; i != NumOps; ++i) { - Entry.Node = Ops[i]; + for (SDValue Op : Ops) { + Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned); - Entry.isZExt = !shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned); + Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); Args.push_back(Entry); } + if (LC == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported library call operation!"); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), @@ -115,9 +116,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, return LowerCallTo(CLI); } - -/// SoftenSetCCOperands - Soften the operands of a comparison. This code is -/// shared among BR_CC, SELECT_CC, and SETCC handlers. +/// Soften the operands of a comparison. This code is shared among BR_CC, +/// SELECT_CC, and SETCC handlers. void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, @@ -127,6 +127,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, // Expand into one or more soft-fp libcall(s). RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL; + bool ShouldInvertCC = false; switch (CCCode) { case ISD::SETEQ: case ISD::SETOEQ: @@ -166,34 +167,38 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : (VT == MVT::f64) ? RTLIB::O_F64 : RTLIB::O_F128; break; - default: + case ISD::SETONE: + // SETONE = SETOLT | SETOGT + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUEQ: LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128; + LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : + (VT == MVT::f64) ? 
RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + break; + default: + // Invert CC for unordered comparisons + ShouldInvertCC = true; switch (CCCode) { - case ISD::SETONE: - // SETONE = SETOLT | SETOGT - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; - // Fallthrough - case ISD::SETUGT: - LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : - (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; - break; - case ISD::SETUGE: - LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : - (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; - break; case ISD::SETULT: - LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; break; case ISD::SETULE: - LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUGT: + LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128; break; - case ISD::SETUEQ: - LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : - (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + case ISD::SETUGE: + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; break; default: llvm_unreachable("Do not know how to soften this setcc!"); } @@ -201,17 +206,21 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, // Use the target specific return value for comparions lib calls. EVT RetVT = getCmpLibcallReturnType(); - SDValue Ops[2] = { NewLHS, NewRHS }; - NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, + SDValue Ops[2] = {NewLHS, NewRHS}; + NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/, dl).first; NewRHS = DAG.getConstant(0, dl, RetVT); + CCCode = getCmpLibcallCC(LC1); + if (ShouldInvertCC) + CCCode = getSetCCInverse(CCCode, /*isInteger=*/true); + if (LC2 != RTLIB::UNKNOWN_LIBCALL) { SDValue Tmp = DAG.getNode( ISD::SETCC, dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT), NewLHS, NewRHS, DAG.getCondCode(CCCode)); - NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, + NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false/*sign irrelevant*/, dl).first; NewLHS = DAG.getNode( ISD::SETCC, dl, @@ -222,9 +231,8 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, } } -/// getJumpTableEncoding - Return the entry encoding for a jump table in the -/// current function. The returned value is a member of the -/// MachineJumpTableInfo::JTEntryKind enum. +/// Return the entry encoding for a jump table in the current function. The +/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. unsigned TargetLowering::getJumpTableEncoding() const { // In non-pic modes, just use the address of a block. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) @@ -250,9 +258,8 @@ SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table, return Table; } -/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the -/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an -/// MCExpr. +/// This returns the relocation base for the given PIC jumptable, the same as +/// getPICJumpTableRelocBase, but as an MCExpr. 
const MCExpr * TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,MCContext &Ctx) const{ @@ -279,10 +286,9 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // Optimization Methods //===----------------------------------------------------------------------===// -/// ShrinkDemandedConstant - Check to see if the specified operand of the -/// specified instruction is a constant integer. If so, check to see if there -/// are any bits set in the constant that are not demanded. If so, shrink the -/// constant and return true. +/// Check to see if the specified operand of the specified instruction is a +/// constant integer. If so, check to see if there are any bits set in the +/// constant that are not demanded. If so, shrink the constant and return true. bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded) { SDLoc dl(Op); @@ -317,10 +323,9 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, return false; } -/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the -/// casts are free. This uses isZExtFree and ZERO_EXTEND for the widening -/// cast, but it could be generalized for targets with other types of -/// implicit widening casts. +/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. +/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be +/// generalized for targets with other types of implicit widening casts. bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, @@ -366,13 +371,13 @@ TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, return false; } -/// SimplifyDemandedBits - Look at Op. At this point, we know that only the -/// DemandedMask bits of the result of Op are ever used downstream. If we can -/// use this information to simplify Op, create a new simplified DAG node and -/// return true, returning the original and new nodes in Old and New. Otherwise, -/// analyze the expression and return a mask of KnownOne and KnownZero bits for -/// the expression (used to simplify the caller). The KnownZero/One bits may -/// only be accurate for those bits in the DemandedMask. +/// Look at Op. At this point, we know that only the DemandedMask bits of the +/// result of Op are ever used downstream. If we can use this information to +/// simplify Op, create a new simplified DAG node and return true, returning the +/// original and new nodes in Old and New. Otherwise, analyze the expression and +/// return a mask of KnownOne and KnownZero bits for the expression (used to +/// simplify the caller). The KnownZero/One bits may only be accurate for those +/// bits in the DemandedMask. bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, APInt &KnownZero, @@ -1061,7 +1066,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); - if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) { + if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() && + Op.getOperand(0).getValueType() != MVT::f128) { + // Cannot eliminate/lower SHL for f128 yet. EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32; // Make a FGETSIGN + SHL to move the sign bit into the appropriate // place. We expect the SHL to be eliminated by other optimizations. 
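The ShrinkDemandedConstant comment rewritten above captures a simple idea: when only some result bits of a bitwise operation are ever demanded downstream, constant bits outside that mask are dead weight and the immediate can be narrowed. A self-contained illustration of the masking arithmetic follows (hypothetical example values; the DAG-level implementation must additionally rebuild the node and report Old/New through TargetLoweringOpt):

#include <cstdint>
#include <cstdio>

int main() {
  // Imagine (x & C) where later code only ever inspects the low byte.
  uint64_t C        = 0xFFFF00FFULL; // original immediate
  uint64_t Demanded = 0x000000FFULL; // result bits actually used
  if (C & ~Demanded) {
    // Clearing undemanded constant bits only changes result bits nobody
    // reads, and the narrower immediate is often cheaper to materialize.
    uint64_t Shrunk = C & Demanded;  // 0xFF
    std::printf("0x%llx -> 0x%llx\n", (unsigned long long)C,
                (unsigned long long)Shrunk);
  }
  return 0;
}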
@@ -1120,9 +1127,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } -/// computeKnownBitsForTargetNode - Determine which of the bits specified -/// in Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. +/// Determine which of the bits specified in Mask are known to be either zero or +/// one and return them in the KnownZero/KnownOne bitsets. void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1137,9 +1143,8 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); } -/// ComputeNumSignBitsForTargetNode - This method can be implemented by -/// targets that want to expose additional information about sign bits to the -/// DAG Combiner. +/// This method can be implemented by targets that want to expose additional +/// information about sign bits to the DAG Combiner. unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &, unsigned Depth) const { @@ -1152,10 +1157,8 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } -/// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly -/// one bit set. This differs from computeKnownBits in that it doesn't need to -/// determine which bit is set. -/// +/// Test if the given value is known to have exactly one bit set. This differs +/// from computeKnownBits in that it doesn't need to determine which bit is set. static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) { // A left-shift of a constant one will have exactly one bit set, because // shifting the bit off the end is undefined. @@ -1239,8 +1242,8 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const { return CN->isNullValue(); } -/// SimplifySetCC - Try to simplify a setcc built with the specified operands -/// and cc. If it is unable to simplify it, return a null SDValue. +/// Try to simplify a setcc built with the specified operands and cc. If it is +/// unable to simplify it, return a null SDValue. 
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, bool foldBooleans, @@ -1270,7 +1273,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, isCondCodeLegal(SwappedCC, N0.getSimpleValueType()))) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); - if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) { + if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) { const APInt &C1 = N1C->getAPIntValue(); // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an @@ -1335,7 +1338,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, PreExt = N0->getOperand(0); } else if (N0->getOpcode() == ISD::AND) { // DAGCombine turns costly ZExts into ANDs - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) + if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) if ((C->getAPIntValue()+1).isPowerOf2()) { MinBits = C->getAPIntValue().countTrailingOnes(); PreExt = N0->getOperand(0); @@ -1345,7 +1348,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, MinBits = N0->getOperand(0).getValueSizeInBits(); PreExt = N0->getOperand(0); Signed = true; - } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) { + } else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) { // ZEXTLOAD / SEXTLOAD if (LN0->getExtensionType() == ISD::ZEXTLOAD) { MinBits = LN0->getMemoryVT().getSizeInBits(); @@ -1697,8 +1700,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) && N0.getOpcode() == ISD::AND) { auto &DL = DAG.getDataLayout(); - if (ConstantSDNode *AndRHS = - dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { EVT ShiftTy = DCI.isBeforeLegalize() ? getPointerTy(DL) : getShiftAmountTy(N0.getValueType(), DL); @@ -1728,8 +1730,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // (X & -256) == 256 -> (X >> 8) == 1 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.getOpcode() == ISD::AND && N0.hasOneUse()) { - if (ConstantSDNode *AndRHS = - dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { const APInt &AndRHSC = AndRHS->getAPIntValue(); if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) { unsigned ShiftBits = AndRHSC.countTrailingZeros(); @@ -1783,7 +1784,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Constant fold or commute setcc. SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl); if (O.getNode()) return O; - } else if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) { + } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) { // If the RHS of an FP comparison is a constant, simplify it away in // some cases. if (CFP->getValueAPF().isNaN()) { @@ -1900,8 +1901,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // to be careful about increasing register pressure needlessly. 
bool LegalRHSImm = false; - if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) { - if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) { + if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) { return DAG.getSetCC(dl, VT, N0.getOperand(0), @@ -1924,7 +1925,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // Turn (C1-X) == C2 --> X == C1-C2 - if (ConstantSDNode *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { + if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) { return DAG.getSetCC(dl, VT, N0.getOperand(1), @@ -2075,12 +2076,11 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return SDValue(); } -/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the -/// node is a GlobalAddress + offset. +/// Returns true (and the GlobalValue and the offset) if the node is a +/// GlobalAddress + offset. bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, int64_t &Offset) const { - if (isa<GlobalAddressSDNode>(N)) { - GlobalAddressSDNode *GASD = cast<GlobalAddressSDNode>(N); + if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) { GA = GASD->getGlobal(); Offset += GASD->getOffset(); return true; @@ -2090,14 +2090,12 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, SDValue N1 = N->getOperand(0); SDValue N2 = N->getOperand(1); if (isGAPlusOffset(N1.getNode(), GA, Offset)) { - ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2); - if (V) { + if (auto *V = dyn_cast<ConstantSDNode>(N2)) { Offset += V->getSExtValue(); return true; } } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) { - ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1); - if (V) { + if (auto *V = dyn_cast<ConstantSDNode>(N1)) { Offset += V->getSExtValue(); return true; } @@ -2107,9 +2105,8 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, return false; } - -SDValue TargetLowering:: -PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { +SDValue TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { // Default implementation: no optimization. return SDValue(); } @@ -2159,9 +2156,9 @@ TargetLowering::getConstraintType(StringRef Constraint) const { return C_Unknown; } -/// LowerXConstraint - try to replace an X constraint, which matches anything, -/// with another that has more specific requirements based on the type of the -/// corresponding operand. +/// Try to replace an X constraint, which matches anything, with another that +/// has more specific requirements based on the type of the corresponding +/// operand. const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ if (ConstraintVT.isInteger()) return "r"; @@ -2170,8 +2167,8 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ return nullptr; } -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. +/// Lower the specified operand into the Ops vector. +/// If it is invalid, don't add anything to Ops. 
void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, @@ -2284,31 +2281,30 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, //===----------------------------------------------------------------------===// // Constraint Selection. -/// isMatchingInputConstraint - Return true of this is an input operand that is -/// a matching constraint like "4". +/// Return true if this is an input operand that is a matching constraint like +/// "4". bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const { assert(!ConstraintCode.empty() && "No known constraint!"); return isdigit(static_cast<unsigned char>(ConstraintCode[0])); } -/// getMatchedOperand - If this is an input matching constraint, this method -/// returns the output operand it matches. +/// If this is an input matching constraint, this method returns the output +/// operand it matches. unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const { assert(!ConstraintCode.empty() && "No known constraint!"); return atoi(ConstraintCode.c_str()); } - -/// ParseConstraints - Split up the constraint string from the inline -/// assembly value into the specific constraints and their prefixes, -/// and also tie in the associated operand values. +/// Split up the constraint string from the inline assembly value into the +/// specific constraints and their prefixes, and also tie in the associated +/// operand values. /// If this returns an empty vector, and if the constraint string itself /// isn't empty, there was an error parsing. TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, ImmutableCallSite CS) const { - /// ConstraintOperands - Information about all of the constraints. + /// Information about all of the constraints. AsmOperandInfoVector ConstraintOperands; const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue()); unsigned maCount = 0; // Largest number of multiple alternative constraints. @@ -2483,16 +2479,13 @@ TargetLowering::ParseConstraints(const DataLayout &DL, " incompatible type!"); } } - } } return ConstraintOperands; } - -/// getConstraintGenerality - Return an integer indicating how general CT -/// is. +/// Return an integer indicating how general CT is. static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { switch (CT) { case TargetLowering::C_Other: @@ -2581,8 +2574,8 @@ TargetLowering::ConstraintWeight return weight; } -/// ChooseConstraint - If there are multiple different constraints that we -/// could pick for this operand (e.g. "imr") try to pick the 'best' one. +/// If there are multiple different constraints that we could pick for this +/// operand (e.g. "imr") try to pick the 'best' one. /// This is somewhat tricky: constraints fall into four classes: /// Other -> immediates and magic values /// Register -> one specific register @@ -2649,9 +2642,8 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, OpInfo.ConstraintType = BestType; } -/// ComputeConstraintToUse - Determines the constraint code and constraint -/// type to use for the specific AsmOperandInfo, setting -/// OpInfo.ConstraintCode and OpInfo.ConstraintType. +/// Determines the constraint code and constraint type to use for the specific +/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG) const { @@ -2717,6 +2709,16 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, return Mul; } +SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + return SDValue(); +} + /// \brief Given an ISD::SDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. @@ -3036,3 +3038,46 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, DAG.getConstant(0, dl, NVT), Ret, ISD::SETLT); return true; } + +//===----------------------------------------------------------------------===// +// Implementation of Emulated TLS Model +//===----------------------------------------------------------------------===// + +SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + // Access to address of TLS variable xyz is lowered to a function call: + // __emutls_get_address( address of global variable named "__emutls_v.xyz" ) + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext()); + SDLoc dl(GA); + + ArgListTy Args; + ArgListEntry Entry; + std::string NameString = ("__emutls_v." + GA->getGlobal()->getName()).str(); + Module *VariableModule = const_cast<Module*>(GA->getGlobal()->getParent()); + StringRef EmuTlsVarName(NameString); + GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName); + if (!EmuTlsVar) + EmuTlsVar = dyn_cast_or_null<GlobalVariable>( + VariableModule->getOrInsertGlobal(EmuTlsVarName, VoidPtrType)); + Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT); + Entry.Ty = VoidPtrType; + Args.push_back(Entry); + + SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); + CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args), 0); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + // At least for X86 targets, maybe good for other targets too? + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); // Is this only for X86 target? + MFI->setHasCalls(true); + + assert((GA->getOffset() == 0) && + "Emulated TLS must have zero offset in GlobalAddressSDNode"); + return CallResult.first; +} diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index e7b2a8e..878eeee 100644 --- a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -112,7 +112,7 @@ public: case 1: // Find all 'return', 'resume', and 'unwind' instructions. while (StateBB != StateE) { - BasicBlock *CurBB = StateBB++; + BasicBlock *CurBB = &*StateBB++; // Branches and invokes do not escape, only unwind, resume, and return // do.
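The LowerToTLSEmulatedModel function added to TargetLowering.cpp above has a straightforward source-level reading: under the emulated TLS model, every access to a thread-local xyz becomes a call to the runtime helper __emutls_get_address, passing the address of a per-variable control object named __emutls_v.xyz. A rough C++ sketch of what that corresponds to (the control-struct layout here is a simplified assumption, and the dot in the real symbol name cannot be spelled as a C++ identifier, so an underscore stands in):

extern "C" void *__emutls_get_address(void *control);

// Simplified stand-in for the control object the compiler emits for
// `thread_local int xyz;` under -femulated-tls (field layout assumed).
struct EmuTlsControl {
  unsigned long Size, Align, Index;
  void *Template; // initial-value template, or null for zero-init
};
static EmuTlsControl EmuTlsV_xyz = {sizeof(int), alignof(int), 0, nullptr};

int loadXyz() {
  // What a plain `return xyz;` lowers to: ask the runtime for this
  // thread's copy of the variable, then load through the pointer.
  return *static_cast<int *>(__emutls_get_address(&EmuTlsV_xyz));
}

Because the address is produced by an ordinary call, the hunk also has to mark the MachineFrameInfo as having calls and adjusting the stack, which is what the MFI lines after LowerCallTo do.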
@@ -120,7 +120,7 @@ public: if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI)) continue; - Builder.SetInsertPoint(TI->getParent(), TI); + Builder.SetInsertPoint(TI); return &Builder; } @@ -163,8 +163,8 @@ public: // Split the basic block containing the function call. BasicBlock *CallBB = CI->getParent(); - BasicBlock *NewBB = - CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont"); + BasicBlock *NewBB = CallBB->splitBasicBlock( + CI->getIterator(), CallBB->getName() + ".cont"); // Remove the unconditional branch inserted at the end of CallBB. CallBB->getInstList().pop_back(); @@ -184,7 +184,7 @@ public: delete CI; } - Builder.SetInsertPoint(RI->getParent(), RI); + Builder.SetInsertPoint(RI); return &Builder; } } diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp index 4463cc7..d361a6c 100644 --- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -43,9 +43,12 @@ // points must be in the same loop. // Property #3 is ensured via the MachineBlockFrequencyInfo. // -// If this pass found points matching all this properties, then -// MachineFrameInfo is updated this that information. +// If this pass finds points matching all these properties, then +// MachineFrameInfo is updated with this information. //===----------------------------------------------------------------------===// +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" // To check for profitability. #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -61,11 +64,14 @@ #include "llvm/CodeGen/Passes.h" // To know about callee-saved. #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" // To query the target about frame lowering. #include "llvm/Target/TargetFrameLowering.h" // To know about frame setup operation. #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" // To access TargetInstrInfo. #include "llvm/Target/TargetSubtargetInfo.h" @@ -78,6 +84,10 @@ STATISTIC(NumCandidates, "Number of shrink-wrapping candidates"); STATISTIC(NumCandidatesDropped, "Number of shrink-wrapping candidates dropped because of frequency"); +static cl::opt<cl::boolOrDefault> + EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, + cl::desc("enable the shrink-wrapping pass")); + namespace { /// \brief Class to determine where the safe point to insert the /// prologue and epilogue are. @@ -113,18 +123,38 @@ class ShrinkWrap : public MachineFunctionPass { unsigned FrameDestroyOpcode; /// Entry block. const MachineBasicBlock *Entry; + typedef SmallSetVector<unsigned, 16> SetOfRegs; + /// Registers that need to be saved for the current function. + mutable SetOfRegs CurrentCSRs; + /// Current MachineFunction. + MachineFunction *MachineFunc; /// \brief Check if \p MI uses or defines a callee-saved register or /// a frame index. If this is the case, this means \p MI must happen /// after Save and before Restore.
- bool useOrDefCSROrFI(const MachineInstr &MI) const; + bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const; + + const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const { + if (CurrentCSRs.empty()) { + BitVector SavedRegs; + const TargetFrameLowering *TFI = + MachineFunc->getSubtarget().getFrameLowering(); + + TFI->determineCalleeSaves(*MachineFunc, SavedRegs, RS); + + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + CurrentCSRs.insert((unsigned)Reg); + } + return CurrentCSRs; + } /// \brief Update the Save and Restore points such that \p MBB is in /// the region that is dominated by Save and post-dominated by Restore /// and Save and Restore still match the safe point definition. /// Such point may not exist and Save and/or Restore may be null after /// this call. - void updateSaveRestorePoints(MachineBasicBlock &MBB); + void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS); /// \brief Initialize the pass for \p MF. void init(MachineFunction &MF) { @@ -140,6 +170,8 @@ class ShrinkWrap : public MachineFunctionPass { FrameSetupOpcode = TII.getCallFrameSetupOpcode(); FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); Entry = &MF.front(); + CurrentCSRs.clear(); + MachineFunc = &MF; ++NumFunc; } @@ -148,6 +180,9 @@ class ShrinkWrap : public MachineFunctionPass { /// shrink-wrapping. bool ArePointsInteresting() const { return Save != Entry && Save && Restore; } + /// \brief Check if shrink wrapping is enabled for this target and function. + static bool isShrinkWrapEnabled(const MachineFunction &MF); + public: static char ID; @@ -185,27 +220,34 @@ INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) -bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI) const { +bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, + RegScavenger *RS) const { if (MI.getOpcode() == FrameSetupOpcode || MI.getOpcode() == FrameDestroyOpcode) { DEBUG(dbgs() << "Frame instruction: " << MI << '\n'); return true; } for (const MachineOperand &MO : MI.operands()) { - bool UseCSR = false; + bool UseOrDefCSR = false; if (MO.isReg()) { unsigned PhysReg = MO.getReg(); if (!PhysReg) continue; assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Unallocated register?!"); - UseCSR = RCI.getLastCalleeSavedAlias(PhysReg); + UseOrDefCSR = RCI.getLastCalleeSavedAlias(PhysReg); + } else if (MO.isRegMask()) { + // Check if this regmask clobbers any of the CSRs. + for (unsigned Reg : getCurrentCSRs(RS)) { + if (MO.clobbersPhysReg(Reg)) { + UseOrDefCSR = true; + break; + } + } } - // TODO: Handle regmask more accurately. - // For now, be conservative about them. - if (UseCSR || MO.isFI() || MO.isRegMask()) { - DEBUG(dbgs() << "Use or define CSR(" << UseCSR << ") or FI(" << MO.isFI() - << "): " << MI << '\n'); + if (UseOrDefCSR || MO.isFI()) { + DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI(" + << MO.isFI() << "): " << MI << '\n'); return true; } } @@ -222,10 +264,13 @@ MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs, if (!IDom) break; } + if (IDom == &Block) + return nullptr; return IDom; } -void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { +void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, + RegScavenger *RS) { // Get rid of the easy cases first. 
if (!Save) Save = &MBB; @@ -246,7 +291,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { // terminator. if (Restore == &MBB) { for (const MachineInstr &Terminator : MBB.terminators()) { - if (!useOrDefCSROrFI(Terminator)) + if (!useOrDefCSROrFI(Terminator, RS)) continue; // One of the terminator needs to happen before the restore point. if (MBB.succ_empty()) { @@ -277,7 +322,24 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { while (Save && Restore && (!(SaveDominatesRestore = MDT->dominates(Save, Restore)) || !(RestorePostDominatesSave = MPDT->dominates(Restore, Save)) || - MLI->getLoopFor(Save) != MLI->getLoopFor(Restore))) { + // Post-dominance is not enough in loops to ensure that all uses/defs + // are after the prologue and before the epilogue at runtime. + // E.g., + // while(1) { + // Save + // Restore + // if (...) + // break; + // use/def CSRs + // } + // All the uses/defs of CSRs are dominated by Save and post-dominated + // by Restore. However, the CSR uses are still reachable after + // Restore and before Save are executed. + // + // For now, just push the restore/save points outside of loops. + // FIXME: Refine the criteria to still find interesting cases + // for loops. + MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) { // Fix (A). if (!SaveDominatesRestore) { Save = MDT->findNearestCommonDominator(Save, Restore); @@ -288,35 +350,114 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { Restore = MPDT->findNearestCommonDominator(Restore, Save); // Fix (C). - if (Save && Restore && Save != Restore && - MLI->getLoopFor(Save) != MLI->getLoopFor(Restore)) { - if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) - // Push Save outside of this loop. + if (Save && Restore && + (MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) { + if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) { + // Push Save outside of this loop if immediate dominator is different + // from save block. If immediate dominator is not different, bail out. Save = FindIDom<>(*Save, Save->predecessors(), *MDT); - else + if (!Save) + break; + } else { + // If the loop does not exit, there is no point in looking + // for a post-dominator outside the loop. + SmallVector<MachineBasicBlock*, 4> ExitBlocks; + MLI->getLoopFor(Restore)->getExitingBlocks(ExitBlocks); // Push Restore outside of this loop. - Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT); + // Look for the immediate post-dominator of the loop exits. + MachineBasicBlock *IPdom = Restore; + for (MachineBasicBlock *LoopExitBB: ExitBlocks) { + IPdom = FindIDom<>(*IPdom, LoopExitBB->successors(), *MPDT); + if (!IPdom) + break; + } + // If the immediate post-dominator is not in a less nested loop, + // then we are stuck in a program with an infinite loop. + // In that case, we will not find a safe point, hence, bail out. + if (IPdom && MLI->getLoopDepth(IPdom) < MLI->getLoopDepth(Restore)) + Restore = IPdom; + else { + Restore = nullptr; + break; + } + } + } + } +} + +/// Check whether the edge (\p SrcBB, \p DestBB) is a backedge according to MLI. +/// I.e., check if there exists a loop that contains SrcBB and where DestBB is the +/// loop header.
+static bool isProperBackedge(const MachineLoopInfo &MLI, + const MachineBasicBlock *SrcBB, + const MachineBasicBlock *DestBB) { + for (const MachineLoop *Loop = MLI.getLoopFor(SrcBB); Loop; + Loop = Loop->getParentLoop()) { + if (Loop->getHeader() == DestBB) + return true; + } + return false; +} + +/// Check if the CFG of \p MF is irreducible. +static bool isIrreducibleCFG(const MachineFunction &MF, + const MachineLoopInfo &MLI) { + const MachineBasicBlock *Entry = &*MF.begin(); + ReversePostOrderTraversal<const MachineBasicBlock *> RPOT(Entry); + BitVector VisitedBB(MF.getNumBlockIDs()); + for (const MachineBasicBlock *MBB : RPOT) { + VisitedBB.set(MBB->getNumber()); + for (const MachineBasicBlock *SuccBB : MBB->successors()) { + if (!VisitedBB.test(SuccBB->getNumber())) + continue; + // We already visited SuccBB, thus MBB->SuccBB must be a backedge. + // Check that the head matches what we have in the loop information. + // Otherwise, we have an irreducible graph. + if (!isProperBackedge(MLI, MBB, SuccBB)) + return true; } } + return false; } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty()) + if (MF.empty() || !isShrinkWrapEnabled(MF)) return false; + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); init(MF); + if (isIrreducibleCFG(MF, *MLI)) { + // If MF is irreducible, a block may be in a loop without + // MachineLoopInfo reporting it. I.e., we may use the + // post-dominance property in loops, which leads to incorrect + // results. Moreover, we may miss that the prologue and + // epilogue are not in the same loop, leading to unbalanced + // construction/deconstruction of the stack frame. + DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n"); + return false; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::unique_ptr<RegScavenger> RS( + TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr); + for (MachineBasicBlock &MBB : MF) { DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName() << '\n'); + if (MBB.isEHFuncletEntry()) { + DEBUG(dbgs() << "EH Funclets are not supported yet.\n"); + return false; + } + for (const MachineInstr &MI : MBB) { - if (!useOrDefCSROrFI(MI)) + if (!useOrDefCSROrFI(MI, RS.get())) continue; // Save (resp. restore) point must dominate (resp. post dominate) // MI. Look for the proper basic block for those. - updateSaveRestorePoints(MBB); + updateSaveRestorePoints(MBB, RS.get()); // If we are at a point where we cannot improve the placement of // save/restore instructions, just give up. if (!ArePointsInteresting()) { @@ -368,7 +509,7 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { break; NewBB = Restore; } - updateSaveRestorePoints(*NewBB); + updateSaveRestorePoints(*NewBB, RS.get()); } while (Save && Restore); if (!ArePointsInteresting()) { @@ -386,3 +527,30 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { ++NumCandidates; return false; } + +bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) { + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + + switch (EnableShrinkWrapOpt) { + case cl::BOU_UNSET: + return TFI->enableShrinkWrapping(MF) && + // Windows with CFI has some limitations that make it impossible + // to use shrink-wrapping. + !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + // Sanitizers look at the value of the stack at the location + // of the crash.
Since a crash can happen anywhere, the + // frame must be lowered before anything else happens for the + // sanitizers to be able to get a correct stack frame. + !(MF.getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + MF.getFunction()->hasFnAttribute(Attribute::SanitizeThread) || + MF.getFunction()->hasFnAttribute(Attribute::SanitizeMemory)); + // If EnableShrinkWrap is set, it takes precedence over whatever the + // target sets. The rationale is that we assume we want to test + // something related to shrink-wrapping. + case cl::BOU_TRUE: + return true; + case cl::BOU_FALSE: + return false; + } + llvm_unreachable("Invalid shrink-wrapping state"); +} diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp index d236e1f..e1f242a 100644 --- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -50,7 +50,7 @@ class SjLjEHPrepare : public FunctionPass { Type *FunctionContextTy; Constant *RegisterFn; Constant *UnregisterFn; - Constant *BuiltinSetjmpFn; + Constant *BuiltinSetupDispatchFn; Constant *FrameAddrFn; Constant *StackAddrFn; Constant *StackRestoreFn; @@ -112,7 +112,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); - BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); + BuiltinSetupDispatchFn = + Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite); FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); @@ -178,8 +179,8 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, // values and replace the LPI with that aggregate. Type *LPadType = LPI->getType(); Value *LPadVal = UndefValue::get(LPadType); - IRBuilder<> Builder( - std::next(BasicBlock::iterator(cast<Instruction>(SelVal)))); + auto *SelI = cast<Instruction>(SelVal); + IRBuilder<> Builder(SelI->getParent(), std::next(SelI->getIterator())); LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val"); LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val"); @@ -190,7 +191,7 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, /// it with all of the data that we know at this point. Value *SjLjEHPrepare::setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads) { - BasicBlock *EntryBB = F.begin(); + BasicBlock *EntryBB = &F.front(); // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an alloca @@ -198,12 +199,13 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, auto &DL = F.getParent()->getDataLayout(); unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", - EntryBB->begin()); + &EntryBB->front()); // Fill in the function context structure. for (unsigned I = 0, E = LPads.size(); I != E; ++I) { LandingPadInst *LPI = LPads[I]; - IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(LPI->getParent(), + LPI->getParent()->getFirstInsertionPt()); // Reference the __data field.
Value *FCData = @@ -250,21 +252,20 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) { while (isa<AllocaInst>(AfterAllocaInsPt) && isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize())) ++AfterAllocaInsPt; + assert(AfterAllocaInsPt != F.front().end()); - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; - ++AI) { - Type *Ty = AI->getType(); + for (auto &AI : F.args()) { + Type *Ty = AI.getType(); // Use 'select i8 true, %arg, undef' to simulate a 'no-op' instruction. Value *TrueValue = ConstantInt::getTrue(F.getContext()); Value *UndefValue = UndefValue::get(Ty); - Instruction *SI = SelectInst::Create(TrueValue, AI, UndefValue, - AI->getName() + ".tmp", - AfterAllocaInsPt); - AI->replaceAllUsesWith(SI); + Instruction *SI = SelectInst::Create( + TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt); + AI.replaceAllUsesWith(SI); // Reset the operand, because it was clobbered by the RAUW above. - SI->setOperand(1, AI); + SI->setOperand(1, &AI); } } @@ -279,7 +280,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, // Ignore obvious cases we don't have to handle. In particular, most // instructions either have no uses or only have a single use inside the // current block. Ignore them quickly. - Instruction *Inst = II; + Instruction *Inst = &*II; if (Inst->use_empty()) continue; if (Inst->hasOneUse() && @@ -360,7 +361,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, DemotePHIToStack(PN); // Move the landingpad instruction back to the top of the landing pad block. - LPI->moveBefore(UnwindBlock->begin()); + LPI->moveBefore(&UnwindBlock->front()); } } @@ -400,7 +401,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { Value *FuncCtx = setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end())); - BasicBlock *EntryBB = F.begin(); + BasicBlock *EntryBB = &F.front(); IRBuilder<> Builder(EntryBB->getTerminator()); // Get a reference to the jump buffer. @@ -421,9 +422,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { Val = Builder.CreateCall(StackAddrFn, {}, "sp"); Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true); - // Call the setjmp instrinsic. It fills in the rest of the jmpbuf. - Value *SetjmpArg = Builder.CreateBitCast(JBufPtr, Builder.getInt8PtrTy()); - Builder.CreateCall(BuiltinSetjmpFn, SetjmpArg); + // Call the setup_dispatch intrinsic. It fills in the rest of the jmpbuf. + Builder.CreateCall(BuiltinSetupDispatchFn, {}); // Store a pointer to the function context so that the back-end will know // where to look for it. @@ -475,7 +475,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { continue; } Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp"); - StackAddr->insertAfter(I); + StackAddr->insertAfter(&*I); Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true); StoreStackAddr->insertAfter(StackAddr); } diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp index 025ae70..c9d23f6 100644 --- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp @@ -172,8 +172,8 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, // optionally includes an additional position prior to MBB->begin(), indicated // by the includeStart flag. This is done so that we can iterate MIs in a MBB // in parallel with SlotIndexes, but there should be a better way to do this.
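// The 'select i8 true, %arg, undef' no-op trick used in
// SjLjEHPrepare::lowerIncomingArguments above gives each incoming argument
// an instruction-level definition that later demotion to a stack slot can
// work with. A minimal stand-alone sketch of the idiom; makeArgNoOpCopy and
// the insertion point are illustrative names, not part of the patch:
static llvm::Instruction *makeArgNoOpCopy(llvm::Argument &A,
                                          llvm::Instruction *InsertBefore) {
  llvm::Value *TrueVal = llvm::ConstantInt::getTrue(A.getContext());
  llvm::Value *Undef = llvm::UndefValue::get(A.getType());
  // 'select true, %arg, undef' always evaluates to %arg, but it is a real
  // Instruction, so replaceAllUsesWith() can treat it as the argument's
  // definition (remember to reset operand 1 after the RAUW, as the hunk
  // above does).
  return llvm::SelectInst::Create(TrueVal, &A, Undef, A.getName() + ".tmp",
                                  InsertBefore);
}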
- IndexList::iterator ListB = startIdx.listEntry(); - IndexList::iterator ListI = endIdx.listEntry(); + IndexList::iterator ListB = startIdx.listEntry()->getIterator(); + IndexList::iterator ListI = endIdx.listEntry()->getIterator(); MachineBasicBlock::iterator MBBI = End; bool pastStart = false; while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) { diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp index 97a5424..d30cfc2 100644 --- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp @@ -36,7 +36,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -188,9 +187,9 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) { BlockFrequencies.resize(mf.getNumBlockIDs()); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); setThreshold(MBFI->getEntryFreq()); - for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) { - unsigned Num = I->getNumber(); - BlockFrequencies[Num] = MBFI->getBlockFreq(I); + for (auto &I : mf) { + unsigned Num = I.getNumber(); + BlockFrequencies[Num] = MBFI->getBlockFreq(&I); } // We never change the function. diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp index dab1dfe..51dddab 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.cpp +++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp @@ -56,6 +56,7 @@ void SplitAnalysis::clear() { SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) { const MachineBasicBlock *MBB = MF.getBlockNumbered(Num); + // FIXME: Handle multiple EH pad successors. const MachineBasicBlock *LPad = MBB->getLandingPadSuccessor(); std::pair<SlotIndex, SlotIndex> &LSP = LastSplitPoint[Num]; SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB); @@ -176,10 +177,11 @@ bool SplitAnalysis::calcLiveBlockInfo() { UseE = UseSlots.end(); // Loop over basic blocks where CurLI is live. - MachineFunction::iterator MFI = LIS.getMBBFromIndex(LVI->start); + MachineFunction::iterator MFI = + LIS.getMBBFromIndex(LVI->start)->getIterator(); for (;;) { BlockInfo BI; - BI.MBB = MFI; + BI.MBB = &*MFI; SlotIndex Start, Stop; std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB); @@ -259,7 +261,7 @@ bool SplitAnalysis::calcLiveBlockInfo() { if (LVI->start < Stop) ++MFI; else - MFI = LIS.getMBBFromIndex(LVI->start); + MFI = LIS.getMBBFromIndex(LVI->start)->getIterator(); } assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count"); @@ -275,8 +277,9 @@ unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { unsigned Count = 0; // Loop over basic blocks where li is live. - MachineFunction::const_iterator MFI = LIS.getMBBFromIndex(LVI->start); - SlotIndex Stop = LIS.getMBBEndIdx(MFI); + MachineFunction::const_iterator MFI = + LIS.getMBBFromIndex(LVI->start)->getIterator(); + SlotIndex Stop = LIS.getMBBEndIdx(&*MFI); for (;;) { ++Count; LVI = li->advanceTo(LVI, Stop); @@ -284,7 +287,7 @@ unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { return Count; do { ++MFI; - Stop = LIS.getMBBEndIdx(MFI); + Stop = LIS.getMBBEndIdx(&*MFI); } while (Stop <= LVI->start); } } @@ -864,9 +867,9 @@ bool SplitEditor::transferValues() { // This value has multiple defs in RegIdx, but it wasn't rematerialized, // so the live range is accurate. Add live-in blocks in [Start;End) to the // LiveInBlocks. 
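// The walk that the following SplitKit hunk performs, condensed into one
// helper: start at the block containing Start and advance block by block
// until the block range passes End. forEachBlockInRange is a hypothetical
// name and the usual includes are assumed; the LIS calls are the same ones
// the patch itself uses.
static void forEachBlockInRange(
    llvm::LiveIntervals &LIS, llvm::SlotIndex Start, llvm::SlotIndex End,
    llvm::function_ref<void(llvm::MachineBasicBlock &, llvm::SlotIndex,
                            llvm::SlotIndex)> Visit) {
  llvm::MachineFunction::iterator MBB =
      LIS.getMBBFromIndex(Start)->getIterator();
  llvm::SlotIndex BlockStart, BlockEnd;
  std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB);
  for (;;) {
    // Hand the caller the part of [Start;End) that lies in this block.
    Visit(*MBB, std::max(Start, BlockStart), std::min(End, BlockEnd));
    if (End <= BlockEnd)
      return; // The range ends in this block.
    ++MBB;
    BlockStart = BlockEnd;
    BlockEnd = LIS.getMBBEndIdx(&*MBB);
  }
}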
- MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start); + MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator(); SlotIndex BlockStart, BlockEnd; - std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(MBB); + std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB); // The first block may be live-in, or it may have its own def. if (Start != BlockStart) { @@ -875,7 +878,7 @@ bool SplitEditor::transferValues() { DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber()); // MBB has its own def. Is it also live-out? if (BlockEnd <= End) - LRC.setLiveOutValue(MBB, VNI); + LRC.setLiveOutValue(&*MBB, VNI); // Skip to the next block for live-in. ++MBB; @@ -886,23 +889,23 @@ bool SplitEditor::transferValues() { assert(Start <= BlockStart && "Expected live-in block"); while (BlockStart < End) { DEBUG(dbgs() << ">BB#" << MBB->getNumber()); - BlockEnd = LIS.getMBBEndIdx(MBB); + BlockEnd = LIS.getMBBEndIdx(&*MBB); if (BlockStart == ParentVNI->def) { // This block has the def of a parent PHI, so it isn't live-in. assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?"); VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End)); assert(VNI && "Missing def for complex mapped parent PHI"); if (End >= BlockEnd) - LRC.setLiveOutValue(MBB, VNI); // Live-out as well. + LRC.setLiveOutValue(&*MBB, VNI); // Live-out as well. } else { // This block needs a live-in value. The last block covered may not // be live-out. if (End < BlockEnd) - LRC.addLiveInBlock(LR, MDT[MBB], End); + LRC.addLiveInBlock(LR, MDT[&*MBB], End); else { // Live-through, and we don't know the value. - LRC.addLiveInBlock(LR, MDT[MBB]); - LRC.setLiveOutValue(MBB, nullptr); + LRC.addLiveInBlock(LR, MDT[&*MBB]); + LRC.setLiveOutValue(&*MBB, nullptr); } } BlockStart = BlockEnd; @@ -1081,16 +1084,14 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { ConnectedVNInfoEqClasses ConEQ(LIS); for (unsigned i = 0, e = Edit->size(); i != e; ++i) { // Don't use iterators, they are invalidated by create() below. - LiveInterval *li = &LIS.getInterval(Edit->get(i)); - unsigned NumComp = ConEQ.Classify(li); - if (NumComp <= 1) - continue; - DEBUG(dbgs() << " " << NumComp << " components: " << *li << '\n'); - SmallVector<LiveInterval*, 8> dups; - dups.push_back(li); - for (unsigned j = 1; j != NumComp; ++j) - dups.push_back(&Edit->createEmptyInterval()); - ConEQ.Distribute(&dups[0], MRI); + unsigned VReg = Edit->get(i); + LiveInterval &LI = LIS.getInterval(VReg); + SmallVector<LiveInterval*, 8> SplitLIs; + LIS.splitSeparateComponents(LI, SplitLIs); + unsigned Original = VRM.getOriginal(VReg); + for (LiveInterval *SplitLI : SplitLIs) + VRM.setIsSplitFromReg(SplitLI->reg, Original); + // The new intervals all map back to i. if (LRMap) LRMap->resize(Edit->size(), i); diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp index 3541b33..7b52038 100644 --- a/contrib/llvm/lib/CodeGen/StackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -570,6 +571,14 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { } } + // Update the location of C++ catch objects for the MSVC personality routine. 
+ if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo()) + for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap) + for (WinEHHandlerType &H : TBME.HandlerArray) + if (H.CatchObj.FrameIndex != INT_MAX && + SlotRemap.count(H.CatchObj.FrameIndex)) + H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex]; + DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n"); DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n"); DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n"); diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp index 116eef6..b3cd8b3 100644 --- a/contrib/llvm/lib/CodeGen/StackMaps.cpp +++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp @@ -94,7 +94,9 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, default: llvm_unreachable("Unrecognized operand type."); case StackMaps::DirectMemRefOp: { - unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits(); + auto &DL = AP.MF->getDataLayout(); + + unsigned Size = DL.getPointerSizeInBits(); assert((Size % 8) == 0 && "Need pointer size in bytes."); Size /= 8; unsigned Reg = (++MOI)->getReg(); diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp index bcea37a..db3fef5 100644 --- a/contrib/llvm/lib/CodeGen/StackProtector.cpp +++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp @@ -373,7 +373,7 @@ bool StackProtector::InsertStackProtectors() { Value *StackGuardVar = nullptr; // The stack guard variable. for (Function::iterator I = F->begin(), E = F->end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); if (!RI) continue; @@ -433,7 +433,7 @@ bool StackProtector::InsertStackProtectors() { BasicBlock *FailBB = CreateFailBB(); // Split the basic block before the return instruction. - BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return"); + BasicBlock *NewBB = BB->splitBasicBlock(RI->getIterator(), "SP_return"); // Update the dominator tree if we need to. if (DT && DT->isReachableFromEntry(BB)) { @@ -453,22 +453,20 @@ bool StackProtector::InsertStackProtectors() { LoadInst *LI1 = B.CreateLoad(StackGuardVar); LoadInst *LI2 = B.CreateLoad(AI); Value *Cmp = B.CreateICmpEQ(LI1, LI2); - unsigned SuccessWeight = - BranchProbabilityInfo::getBranchWeightStackProtector(true); - unsigned FailureWeight = - BranchProbabilityInfo::getBranchWeightStackProtector(false); + auto SuccessProb = + BranchProbabilityInfo::getBranchProbStackProtector(true); + auto FailureProb = + BranchProbabilityInfo::getBranchProbStackProtector(false); MDNode *Weights = MDBuilder(F->getContext()) - .createBranchWeights(SuccessWeight, FailureWeight); + .createBranchWeights(SuccessProb.getNumerator(), + FailureProb.getNumerator()); B.CreateCondBr(Cmp, NewBB, FailBB, Weights); } } // Return if we didn't modify any basic blocks. i.e., there are no return // statements in the function. 
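// Condensed, the guard check built in the StackProtector hunk above loads
// the canonical guard and the function's guard slot, compares them, and
// branches with weights that mark the failure path as very unlikely. A
// stand-alone sketch; emitGuardCheck and the 1<<20 weight are illustrative
// (the real pass takes the probability from BranchProbabilityInfo):
static void emitGuardCheck(llvm::Function &F, llvm::IRBuilder<> &B,
                           llvm::Value *StackGuardVar, llvm::AllocaInst *AI,
                           llvm::BasicBlock *SPReturn,
                           llvm::BasicBlock *FailBB) {
  llvm::LoadInst *Guard = B.CreateLoad(StackGuardVar);
  llvm::LoadInst *Slot = B.CreateLoad(AI);
  llvm::Value *Cmp = B.CreateICmpEQ(Guard, Slot);
  llvm::MDNode *Weights =
      llvm::MDBuilder(F.getContext()).createBranchWeights(1u << 20, 1);
  B.CreateCondBr(Cmp, SPReturn, FailBB, Weights);
}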
- if (!HasPrologue) - return false; - - return true; + return HasPrologue; } /// CreateFailBB - Create a basic block to jump to when the stack protector diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp index a5a175f..51f4d0e 100644 --- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -318,7 +318,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { if (NewFI == -1 || (NewFI == (int)SS)) continue; - const PseudoSourceValue *NewSV = PseudoSourceValue::getFixedStack(NewFI); + const PseudoSourceValue *NewSV = MF.getPSVManager().getFixedStack(NewFI); SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS]; for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i) RefMMOs[i]->setValue(NewSV); diff --git a/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp b/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp index 95dfd75..3f60e18 100644 --- a/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp +++ b/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp @@ -34,9 +34,9 @@ public: UsesMetadata = false; CustomRoots = false; } - Optional<bool> isGCManagedPointer(const Value *V) const override { + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. - PointerType *PT = cast<PointerType>(V->getType()); + const PointerType *PT = cast<PointerType>(Ty); // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. Note that addrspace(1) is used diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp index 237460c..d2fbf53 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp @@ -59,7 +59,7 @@ TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden); typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy; namespace { - /// TailDuplicatePass - Perform tail duplication. + /// Perform tail duplication. class TailDuplicatePass : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -69,11 +69,11 @@ namespace { std::unique_ptr<RegScavenger> RS; bool PreRegAlloc; - // SSAUpdateVRs - A list of virtual registers for which to update SSA form. + // A list of virtual registers for which to update SSA form. SmallVector<unsigned, 16> SSAUpdateVRs; - // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of - // source virtual registers. + // For each virtual register in SSAUpdateVals keep a list of source virtual + // registers. DenseMap<unsigned, AvailableValsTy> SSAUpdateVals; public: @@ -161,7 +161,7 @@ void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const { static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(), MBB->pred_end()); MachineBasicBlock::iterator MI = MBB->begin(); @@ -207,7 +207,7 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { } } -/// TailDuplicateAndUpdate - Tail duplicate the block and cleanup. +/// Tail duplicate the block and cleanup. 
bool TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, bool IsSimple, @@ -310,9 +310,9 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, return true; } -/// TailDuplicateBlocks - Look for small blocks that are unconditionally -/// branched to and do not fall through. Tail-duplicate their instructions -/// into their predecessors to eliminate (dynamic) branches. +/// Look for small blocks that are unconditionally branched to and do not fall +/// through. Tail-duplicate their instructions into their predecessors to +/// eliminate (dynamic) branches. bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { bool MadeChange = false; @@ -322,7 +322,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { } for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; if (NumTails == TailDupLimit) break; @@ -375,8 +375,7 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB, } } -/// AddSSAUpdateEntry - Add a definition and source virtual registers pair for -/// SSA update. +/// Add a definition and source virtual registers pair for SSA update. void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, MachineBasicBlock *BB) { DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg); @@ -390,9 +389,8 @@ void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, } } -/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB. -/// Remember the source register that's contributed by PredBB and update SSA -/// update map. +/// Process PHI node in TailBB by turning it into a copy in PredBB. Remember the +/// source register that's contributed by PredBB and update SSA update map. void TailDuplicatePass::ProcessPHI( MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, DenseMap<unsigned, unsigned> &LocalVRMap, @@ -422,7 +420,7 @@ void TailDuplicatePass::ProcessPHI( MI->eraseFromParent(); } -/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update +/// Duplicate a TailBB instruction to PredBB and update /// the source operands due to earlier PHI translation. void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, MachineBasicBlock *TailBB, @@ -459,9 +457,9 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, PredBB->insert(PredBB->instr_end(), NewMI); } -/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor -/// blocks, the successors have gained new predecessors. Update the PHI -/// instructions in them accordingly. +/// After FromBB is tail duplicated into its predecessor blocks, the successors +/// have gained new predecessors. Update the PHI instructions in them +/// accordingly. void TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, SmallVectorImpl<MachineBasicBlock *> &TDBBs, @@ -545,7 +543,7 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, } } -/// shouldTailDuplicate - Determine if it is profitable to duplicate this block. +/// Determine if it is profitable to duplicate this block. bool TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, @@ -563,6 +561,7 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, // compensate for the duplication. unsigned MaxDuplicateCount; if (TailDuplicateSize.getNumOccurrences() == 0 && + // FIXME: Use Function::optForSize(). 
MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) MaxDuplicateCount = 1; else @@ -584,30 +583,51 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, // Check the instructions in the block to determine whether tail-duplication // is invalid or unlikely to be profitable. unsigned InstrCount = 0; - for (MachineBasicBlock::iterator I = TailBB.begin(); I != TailBB.end(); ++I) { + for (MachineInstr &MI : TailBB) { // Non-duplicable things shouldn't be tail-duplicated. - if (I->isNotDuplicable()) + if (MI.isNotDuplicable()) return false; // Do not duplicate 'return' instructions if this is a pre-regalloc run. // A return may expand into a lot more instructions (e.g. reload of callee // saved registers) after PEI. - if (PreRegAlloc && I->isReturn()) + if (PreRegAlloc && MI.isReturn()) return false; // Avoid duplicating calls before register allocation. Calls presents a // barrier to register allocation so duplicating them may end up increasing // spills. - if (PreRegAlloc && I->isCall()) + if (PreRegAlloc && MI.isCall()) return false; - if (!I->isPHI() && !I->isDebugValue()) + if (!MI.isPHI() && !MI.isDebugValue()) InstrCount += 1; if (InstrCount > MaxDuplicateCount) return false; } + // Check if any of the successors of TailBB has a PHI node in which the + // value corresponding to TailBB uses a subregister. + // If a phi node uses a register paired with a subregister, the actual + // "value type" of the phi may differ from the type of the register without + // any subregisters. Due to a bug, tail duplication may add a new operand + // without a necessary subregister, producing invalid code. This is + // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll. + // Disable tail duplication for this case for now, until the problem is + // fixed. + for (auto SB : TailBB.successors()) { + for (auto &I : *SB) { + if (!I.isPHI()) + break; + unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB); + assert(Idx != 0); + MachineOperand &PU = I.getOperand(Idx); + if (PU.getSubReg() != 0) + return false; + } + } + if (HasIndirectbr && PreRegAlloc) return true; @@ -620,7 +640,7 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, return canCompletelyDuplicateBB(TailBB); } -/// isSimpleBB - True if this BB has only one unconditional jump. +/// True if this BB has only one unconditional jump.
bool TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) { if (TailBB->succ_size() != 1) @@ -636,22 +656,16 @@ TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) { static bool bothUsedInPHI(const MachineBasicBlock &A, SmallPtrSet<MachineBasicBlock*, 8> SuccsB) { - for (MachineBasicBlock::const_succ_iterator SI = A.succ_begin(), - SE = A.succ_end(); SI != SE; ++SI) { - MachineBasicBlock *BB = *SI; + for (MachineBasicBlock *BB : A.successors()) if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI()) return true; - } return false; } bool TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) { - for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(), - PE = BB.pred_end(); PI != PE; ++PI) { - MachineBasicBlock *PredBB = *PI; - + for (MachineBasicBlock *PredBB : BB.predecessors()) { if (PredBB->succ_size() > 1) return false; @@ -680,7 +694,7 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; - if (PredBB->getLandingPadSuccessor()) + if (PredBB->hasEHPadSuccessor()) continue; if (bothUsedInPHI(*PredBB, Succs)) @@ -696,7 +710,7 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, << "From simple Succ: " << *TailBB); MachineBasicBlock *NewTarget = *TailBB->succ_begin(); - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(PredBB)); + MachineBasicBlock *NextBB = &*std::next(PredBB->getIterator()); // Make PredFBB explicit. if (PredCond.empty()) @@ -731,19 +745,19 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); - PredBB->removeSuccessor(TailBB); - unsigned NumSuccessors = PredBB->succ_size(); - assert(NumSuccessors <= 1); - if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + if (!PredBB->isSuccessor(NewTarget)) + PredBB->replaceSuccessor(TailBB, NewTarget); + else { + PredBB->removeSuccessor(TailBB, true); + assert(PredBB->succ_size() <= 1); + } TDBBs.push_back(PredBB); } return Changed; } -/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each +/// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, @@ -798,13 +812,12 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, RS->enterBasicBlock(PredBB); if (!PredBB->empty()) RS->forward(std::prev(PredBB->end())); - for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(), - E = TailBB->livein_end(); I != E; ++I) { - if (!RS->isRegUsed(*I, false)) + for (const auto &LI : TailBB->liveins()) { + if (!RS->isRegUsed(LI.PhysReg, false)) // If a register is previously livein to the tail but it's not live // at the end of predecessor BB, then it should be added to its // livein list. 
- PredBB->addLiveIn(*I); + PredBB->addLiveIn(LI); } } @@ -845,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; @@ -854,7 +867,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this // block into the prior block. - MachineBasicBlock *PrevBB = std::prev(MachineFunction::iterator(TailBB)); + MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector<MachineOperand, 4> PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by @@ -960,8 +973,8 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, return Changed; } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG. +/// Remove the specified dead machine basic block from the function, updating +/// the CFG. void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index f3cccd8..679ade1 100644 --- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -32,25 +33,22 @@ bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const { return Attr.getValueAsString() == "true"; } -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. This is the default implementation -/// which is overridden for some targets. -int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI) + MFI->getStackSize() - - getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); -} - +/// Returns the displacement from the frame register to the stack +/// frame of the specified index, along with the frame register used +/// (in output arg FrameReg). This is the default implementation which +/// is overridden for some targets. int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); // By default, assume all frame indices are referenced via whatever // getFrameRegister() says. The target can override this if it's doing // something different. 
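// The default displacement computed in the code that follows is simply
//   ObjectOffset + StackSize - OffsetOfLocalArea + OffsetAdjustment.
// A worked example with made-up numbers: an object at ObjectOffset -8 in a
// 64-byte frame, with a zero local-area offset and no adjustment, resolves
// to -8 + 64 - 0 + 0 = 56 bytes from the returned frame register.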
FrameReg = RI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); + + return MFI->getObjectOffset(FI) + MFI->getStackSize() - + getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); } bool TargetFrameLowering::needsFrameIndexResolution( @@ -84,3 +82,13 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(Reg); } } + +unsigned TargetFrameLowering::getStackAlignmentSkew( + const MachineFunction &MF) const { + // When HHVM function is called, the stack is skewed as the return address + // is removed from the stack before we enter the function. + if (LLVM_UNLIKELY(MF.getFunction()->getCallingConv() == CallingConv::HHVM)) + return MF.getTarget().getPointerSize(); + + return 0; +} diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp index 97ca025..6eaf991 100644 --- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -118,23 +118,24 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, MBB->addSuccessor(NewDest); } -// commuteInstruction - The default implementation of this method just exchanges -// the two operands returned by findCommutedOpIndices. -MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { +MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned Idx1, + unsigned Idx2) const { const MCInstrDesc &MCID = MI->getDesc(); bool HasDef = MCID.getNumDefs(); if (HasDef && !MI->getOperand(0).isReg()) // No idea how to commute this instruction. Target should implement its own. return nullptr; - unsigned Idx1, Idx2; - if (!findCommutedOpIndices(MI, Idx1, Idx2)) { - assert(MI->isCommutable() && "Precondition violation: MI must be commutable."); - return nullptr; - } + unsigned CommutableOpIdx1 = Idx1; (void)CommutableOpIdx1; + unsigned CommutableOpIdx2 = Idx2; (void)CommutableOpIdx2; + assert(findCommutedOpIndices(MI, CommutableOpIdx1, CommutableOpIdx2) && + CommutableOpIdx1 == Idx1 && CommutableOpIdx2 == Idx2 && + "TargetInstrInfo::CommuteInstructionImpl(): not commutable operands."); assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() && "This only knows how to commute register operands so far"); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(Idx1).getReg(); unsigned Reg2 = MI->getOperand(Idx2).getReg(); @@ -184,9 +185,53 @@ MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, return MI; } -/// findCommutedOpIndices - If specified MI is commutable, return the two -/// operand indices that would swap value. Return true if the instruction -/// is not in a form which this routine understands. +MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose + // any commutable operand, which is done in findCommutedOpIndices() method + // called below. 
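// Caller's view of the sentinel (an illustrative fragment, not from the
// patch): passing CommuteAnyOperandIndex lets the target pick the operand
// pair, while explicit indices force a specific swap. TII is assumed to be
// a const TargetInstrInfo * and MI a MachineInstr * known to be commutable.
MachineInstr *Commuted = TII->commuteInstruction(
    MI, /*NewMI=*/false, TargetInstrInfo::CommuteAnyOperandIndex,
    TargetInstrInfo::CommuteAnyOperandIndex);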
+ if ((OpIdx1 == CommuteAnyOperandIndex || OpIdx2 == CommuteAnyOperandIndex) && + !findCommutedOpIndices(MI, OpIdx1, OpIdx2)) { + assert(MI->isCommutable() && + "Precondition violation: MI must be commutable."); + return nullptr; + } + return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); +} + +bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, + unsigned &ResultIdx2, + unsigned CommutableOpIdx1, + unsigned CommutableOpIdx2) { + if (ResultIdx1 == CommuteAnyOperandIndex && + ResultIdx2 == CommuteAnyOperandIndex) { + ResultIdx1 = CommutableOpIdx1; + ResultIdx2 = CommutableOpIdx2; + } else if (ResultIdx1 == CommuteAnyOperandIndex) { + if (ResultIdx2 == CommutableOpIdx1) + ResultIdx1 = CommutableOpIdx2; + else if (ResultIdx2 == CommutableOpIdx2) + ResultIdx1 = CommutableOpIdx1; + else + return false; + } else if (ResultIdx2 == CommuteAnyOperandIndex) { + if (ResultIdx1 == CommutableOpIdx1) + ResultIdx2 = CommutableOpIdx2; + else if (ResultIdx1 == CommutableOpIdx2) + ResultIdx2 = CommutableOpIdx1; + else + return false; + } else + // Check that the result operand indices match the given commutable + // operand indices. + return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) || + (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1); + + return true; +} + bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { @@ -196,10 +241,15 @@ bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; + // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this // is not true, then the target must implement this. - SrcOpIdx1 = MCID.getNumDefs(); - SrcOpIdx2 = SrcOpIdx1 + 1; + unsigned CommutableOpIdx1 = MCID.getNumDefs(); + unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1; + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + if (!MI->getOperand(SrcOpIdx1).isReg() || !MI->getOperand(SrcOpIdx2).isReg()) // No idea. 
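// A sketch of how a target override is expected to use the helper above
// (MyTargetInstrInfo is hypothetical): report the opcode's actually
// commutable operand pair and let fixCommutedOpIndices() reconcile it with
// whatever the caller requested.
bool MyTargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                              unsigned &SrcOpIdx1,
                                              unsigned &SrcOpIdx2) const {
  // Suppose operands 1 and 2 are the commutable sources of this opcode;
  // fixCommutedOpIndices() returns false if the caller's request cannot be
  // mapped onto that pair.
  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}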
@@ -207,7 +257,6 @@ bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, return true; } - bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { if (!MI->isTerminator()) return false; @@ -315,7 +364,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); - if (!MF.getTarget().getDataLayout()->isLittleEndian()) { + if (!MF.getDataLayout().isLittleEndian()) { Offset = RC->getSize() - (Offset + Size); } return true; @@ -384,11 +433,6 @@ void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { llvm_unreachable("Not a MachO target"); } -bool TargetInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]); -} - static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, int FrameIndex, const TargetInstrInfo &TII) { @@ -489,10 +533,9 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, "Folded a use to a non-load!"); const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); NewMI->addMemOperand(MF, MMO); return NewMI; @@ -517,6 +560,217 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, return --Pos; } +bool TargetInstrInfo::hasReassociableOperands( + const MachineInstr &Inst, const MachineBasicBlock *MBB) const { + const MachineOperand &Op1 = Inst.getOperand(1); + const MachineOperand &Op2 = Inst.getOperand(2); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + // We need virtual register definitions for the operands that we will + // reassociate. + MachineInstr *MI1 = nullptr; + MachineInstr *MI2 = nullptr; + if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) + MI1 = MRI.getUniqueVRegDef(Op1.getReg()); + if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) + MI2 = MRI.getUniqueVRegDef(Op2.getReg()); + + // And they need to be in the trace (otherwise, they won't have a depth). + return MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB; +} + +bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst, + bool &Commuted) const { + const MachineBasicBlock *MBB = Inst.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); + MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); + unsigned AssocOpcode = Inst.getOpcode(); + + // If only one operand has the same opcode and it's the second source operand, + // the operands must be commuted. + Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; + if (Commuted) + std::swap(MI1, MI2); + + // 1. The previous instruction must be the same type as Inst. + // 2. The previous instruction must have virtual register definitions for its + // operands in the same basic block as Inst. + // 3. The previous instruction's result must only be used by Inst. 
+ return MI1->getOpcode() == AssocOpcode && + hasReassociableOperands(*MI1, MBB) && + MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()); +} + +// 1. The operation must be associative and commutative. +// 2. The instruction must have virtual register definitions for its +// operands in the same basic block. +// 3. The instruction must have a reassociable sibling. +bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst, + bool &Commuted) const { + return isAssociativeAndCommutative(Inst) && + hasReassociableOperands(Inst, Inst.getParent()) && + hasReassociableSibling(Inst, Commuted); +} + +// The concept of the reassociation pass is that these operations can benefit +// from this kind of transformation: +// +// A = ? op ? +// B = A op X (Prev) +// C = B op Y (Root) +// --> +// A = ? op ? +// B = X op Y +// C = A op B +// +// breaking the dependency between A and B, allowing them to be executed in +// parallel (or back-to-back in a pipeline) instead of depending on each other. + +// FIXME: This has the potential to be expensive (compile time) while not +// improving the code at all. Some ways to limit the overhead: +// 1. Track successful transforms; bail out if hit rate gets too low. +// 2. Only enable at -O3 or some other non-default optimization level. +// 3. Pre-screen pattern candidates here: if an operand of the previous +// instruction is known to not increase the critical path, then don't match +// that pattern. +bool TargetInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + + bool Commute; + if (isReassociationCandidate(Root, Commute)) { + // We found a sequence of instructions that may be suitable for a + // reassociation of operands to increase ILP. Specify each commutation + // possibility for the Prev instruction in the sequence and let the + // machine combiner decide if changing the operands is worthwhile. + if (Commute) { + Patterns.push_back(MachineCombinerPattern::REASSOC_AX_YB); + Patterns.push_back(MachineCombinerPattern::REASSOC_XA_YB); + } else { + Patterns.push_back(MachineCombinerPattern::REASSOC_AX_BY); + Patterns.push_back(MachineCombinerPattern::REASSOC_XA_BY); + } + return true; + } + + return false; +} + +/// Attempt the reassociation transformation to reduce critical path length. +/// See the above comments before getMachineCombinerPatterns(). +void TargetInstrInfo::reassociateOps( + MachineInstr &Root, MachineInstr &Prev, + MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { + MachineFunction *MF = Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + + // This array encodes the operand index for each parameter because the + // operands may be commuted. Each row corresponds to a pattern value, + // and each column specifies the index of A, B, X, Y. 
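// For reference, a concrete instance of the A/B/X/Y reassociation described
// earlier in this hunk, using adds over virtual registers (illustrative):
//   %b = add %a, %x      ; Prev
//   %c = add %b, %y      ; Root: two dependent adds once %a is ready
// becomes
//   %t = add %x, %y      ; independent of %a, can issue early
//   %c = add %a, %t      ; only one add remains on %a's critical path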
+ unsigned OpIdx[4][4] = { + { 1, 1, 2, 2 }, + { 1, 2, 2, 1 }, + { 2, 1, 1, 2 }, + { 2, 2, 1, 1 } + }; + + int Row; + switch (Pattern) { + case MachineCombinerPattern::REASSOC_AX_BY: Row = 0; break; + case MachineCombinerPattern::REASSOC_AX_YB: Row = 1; break; + case MachineCombinerPattern::REASSOC_XA_BY: Row = 2; break; + case MachineCombinerPattern::REASSOC_XA_YB: Row = 3; break; + default: llvm_unreachable("unexpected MachineCombinerPattern"); + } + + MachineOperand &OpA = Prev.getOperand(OpIdx[Row][0]); + MachineOperand &OpB = Root.getOperand(OpIdx[Row][1]); + MachineOperand &OpX = Prev.getOperand(OpIdx[Row][2]); + MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]); + MachineOperand &OpC = Root.getOperand(0); + + unsigned RegA = OpA.getReg(); + unsigned RegB = OpB.getReg(); + unsigned RegX = OpX.getReg(); + unsigned RegY = OpY.getReg(); + unsigned RegC = OpC.getReg(); + + if (TargetRegisterInfo::isVirtualRegister(RegA)) + MRI.constrainRegClass(RegA, RC); + if (TargetRegisterInfo::isVirtualRegister(RegB)) + MRI.constrainRegClass(RegB, RC); + if (TargetRegisterInfo::isVirtualRegister(RegX)) + MRI.constrainRegClass(RegX, RC); + if (TargetRegisterInfo::isVirtualRegister(RegY)) + MRI.constrainRegClass(RegY, RC); + if (TargetRegisterInfo::isVirtualRegister(RegC)) + MRI.constrainRegClass(RegC, RC); + + // Create a new virtual register for the result of (X op Y) instead of + // recycling RegB because the MachineCombiner's computation of the critical + // path requires a new register definition rather than an existing one. + unsigned NewVR = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + + unsigned Opcode = Root.getOpcode(); + bool KillA = OpA.isKill(); + bool KillX = OpX.isKill(); + bool KillY = OpY.isKill(); + + // Create new instructions for insertion. + MachineInstrBuilder MIB1 = + BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegY, getKillRegState(KillY)); + MachineInstrBuilder MIB2 = + BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) + .addReg(RegA, getKillRegState(KillA)) + .addReg(NewVR, getKillRegState(true)); + + setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2); + + // Record new instructions for insertion and old instructions for deletion. + InsInstrs.push_back(MIB1); + InsInstrs.push_back(MIB2); + DelInstrs.push_back(&Prev); + DelInstrs.push_back(&Root); +} + +void TargetInstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { + MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); + + // Select the previous instruction in the sequence based on the input pattern. 
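// Mapping rationale for the switch that follows: in the _BY patterns Prev's
// result (B) is Root's first operand, in the _YB patterns it is the second,
// which is why genAlternativeCodeSequence() looks up the unique def of
// operand 1 for REASSOC_AX_BY/REASSOC_XA_BY and of operand 2 for the other
// two patterns.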
+ MachineInstr *Prev = nullptr; + switch (Pattern) { + case MachineCombinerPattern::REASSOC_AX_BY: + case MachineCombinerPattern::REASSOC_XA_BY: + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + break; + case MachineCombinerPattern::REASSOC_AX_YB: + case MachineCombinerPattern::REASSOC_XA_YB: + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + break; + default: + break; + } + + assert(Prev && "Unknown pattern for machine combiner"); + + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); + return; +} + /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. @@ -661,6 +915,7 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const { return 0; int SPAdj = MI->getOperand(0).getImm(); + SPAdj = TFI->alignSPAdjust(SPAdj); if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) || (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode)) @@ -686,10 +941,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI, // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI)) - return true; - - return false; + return MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp index ecfd659..36a31c9 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -247,13 +247,9 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2"; Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2"; Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2"; - Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi"; - Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi"; Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi"; Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi"; Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti"; - Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi"; - Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi"; Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi"; Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi"; Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti"; @@ -266,13 +262,9 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi"; Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi"; Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti"; - Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi"; - Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi"; Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi"; Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi"; Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti"; - Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi"; - Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi"; Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi"; Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi"; Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti"; @@ -501,10 +493,6 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { /// UNKNOWN_LIBCALL if there is none. 
RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOSINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F32_I16; if (RetVT == MVT::i32) return FPTOSINT_F32_I32; if (RetVT == MVT::i64) @@ -512,10 +500,6 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOSINT_F32_I128; } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOSINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F64_I16; if (RetVT == MVT::i32) return FPTOSINT_F64_I32; if (RetVT == MVT::i64) @@ -551,10 +535,6 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { /// UNKNOWN_LIBCALL if there is none. RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOUINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F32_I16; if (RetVT == MVT::i32) return FPTOUINT_F32_I32; if (RetVT == MVT::i64) @@ -562,10 +542,6 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOUINT_F32_I128; } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOUINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F64_I16; if (RetVT == MVT::i32) return FPTOUINT_F64_I32; if (RetVT == MVT::i64) @@ -758,17 +734,13 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { SelectIsExpensive = false; HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; - IntDivIsCheap = false; FsqrtIsCheap = false; - Pow2SDivIsCheap = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; MaskAndBranchFoldingIsLegal = false; EnableExtLdPromotion = false; HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; - ExceptionPointerRegister = 0; - ExceptionSelectorRegister = 0; BooleanContents = UndefinedBooleanContent; BooleanFloatContents = UndefinedBooleanContent; BooleanVectorContents = UndefinedBooleanContent; @@ -778,6 +750,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinFunctionAlignment = 0; PrefFunctionAlignment = 0; PrefLoopAlignment = 0; + GatherAllAliasesMaxDepth = 6; MinStackArgumentAlignment = 1; InsertFencesForAtomic = false; MinimumJumpTableEntries = 4; @@ -814,6 +787,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); setOperationAction(ISD::FMINNUM, VT, Expand); setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FMINNAN, VT, Expand); + setOperationAction(ISD::FMAXNAN, VT, Expand); setOperationAction(ISD::FMAD, VT, Expand); setOperationAction(ISD::SMIN, VT, Expand); setOperationAction(ISD::SMAX, VT, Expand); @@ -828,6 +803,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction(ISD::BITREVERSE, VT, Expand); + // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); @@ -838,11 +815,17 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } + + // For most targets @llvm.get.dynamic.area.offset just returns 0. + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. setOperationAction(ISD::PREFETCH, MVT::Other, Expand); + // Most targets also ignore the @llvm.readcyclecounter intrinsic.
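// The Expand default that follows is only a default: a target that does
// have a cycle counter keeps the node legal in its own TargetLowering
// constructor. A hypothetical opt-in (MyTargetLowering is not a real class):
MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  // Keep @llvm.readcyclecounter as a native node instead of letting
  // legalization replace it with a constant zero.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
}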
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand); + // ConstantFP nodes default to expand. Targets can either change this to // Legal, in which case all fp constants are legal, or use isFPImmLegal() // to optimize expansions for certain constants. @@ -1111,6 +1094,19 @@ MachineBasicBlock* TargetLoweringBase::emitPatchPoint(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + // We're handling multiple types of operands here: + // PATCHPOINT MetaArgs - live-in, read only, direct + // STATEPOINT Deopt Spill - live-through, read only, indirect + // STATEPOINT Deopt Alloca - live-through, read only, direct + // (We're currently conservative and mark the deopt slots read/write in + // practice.) + // STATEPOINT GC Spill - live-through, read/write, indirect + // STATEPOINT GC Alloca - live-through, read/write, direct + // The live-in vs live-through is handled already (the live through ones are + // all stack slots), but we need to handle the different type of stackmap + // operands and memory effects here. // MI changes inside this loop as we grow operands. for(unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) { @@ -1126,10 +1122,24 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, // Copy operands before the frame-index. for (unsigned i = 0; i < OperIdx; ++i) MIB.addOperand(MI->getOperand(i)); - // Add frame index operands: direct-mem-ref tag, #FI, offset. - MIB.addImm(StackMaps::DirectMemRefOp); - MIB.addOperand(MI->getOperand(OperIdx)); - MIB.addImm(0); + // Add frame index operands recognized by stackmaps.cpp + if (MFI.isStatepointSpillSlotObjectIndex(FI)) { + // indirect-mem-ref tag, size, #FI, offset. + // Used for spills inserted by StatepointLowering. This codepath is not + // used for patchpoints/stackmaps at all, for these spilling is done via + // foldMemoryOperand callback only. + assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity"); + MIB.addImm(StackMaps::IndirectMemRefOp); + MIB.addImm(MFI.getObjectSize(FI)); + MIB.addOperand(MI->getOperand(OperIdx)); + MIB.addImm(0); + } else { + // direct-mem-ref tag, #FI, offset. + // Used by patchpoint, and direct alloca arguments to statepoints + MIB.addImm(StackMaps::DirectMemRefOp); + MIB.addOperand(MI->getOperand(OperIdx)); + MIB.addImm(0); + } // Copy the operands after the frame index. for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) MIB.addOperand(MI->getOperand(i)); @@ -1139,7 +1149,6 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!"); // Add a new memory operand for this FI. - const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); unsigned Flags = MachineMemOperand::MOLoad; @@ -1148,8 +1157,8 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, Flags |= MachineMemOperand::MOVolatile; } MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), Flags, - TM.getDataLayout()->getPointerSize(), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(MF, FI), Flags, + MF.getDataLayout().getPointerSize(), MFI.getObjectAlignment(FI)); MIB->addMemOperand(MF, MMO); // Replace the instruction and update the operand index. @@ -1274,20 +1283,14 @@ void TargetLoweringBase::computeRegisterProperties( ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat); } + // Decide how to handle f16. 
If the target does not have native f16 support, + // promote it to f32, because there are no f16 library calls (except for + // conversions). if (!isTypeLegal(MVT::f16)) { - // If the target has native f32 support, promote f16 operations to f32. If - // f32 is not supported, generate soft float library calls. - if (isTypeLegal(MVT::f32)) { - NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; - RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; - TransformToType[MVT::f16] = MVT::f32; - ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); - } else { - NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16]; - RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16]; - TransformToType[MVT::f16] = MVT::i16; - ValueTypeActions.setTypeAction(MVT::f16, TypeSoftenFloat); - } + NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; + RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; + TransformToType[MVT::f16] = MVT::f32; + ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); } // Loop over all of the vector value types to see which need transformations. @@ -1528,6 +1531,29 @@ unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty, return DL.getABITypeAlignment(Ty); } +bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, + const DataLayout &DL, EVT VT, + unsigned AddrSpace, + unsigned Alignment, + bool *Fast) const { + // Check if the specified alignment is sufficient based on the data layout. + // TODO: While using the data layout works in practice, a better solution + // would be to implement this check directly (make this a virtual function). + // For example, the ABI alignment may change based on software platform while + // this function should only be affected by hardware implementation. + Type *Ty = VT.getTypeForEVT(Context); + if (Alignment >= DL.getABITypeAlignment(Ty)) { + // Assume that an access that meets the ABI-specified alignment is fast. + if (Fast != nullptr) + *Fast = true; + return true; + } + + // This is a misaligned access. + return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Fast); +} + + //===----------------------------------------------------------------------===// // TargetTransformInfo Helpers //===----------------------------------------------------------------------===// @@ -1546,6 +1572,11 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case Invoke: return 0; case Resume: return 0; case Unreachable: return 0; + case CleanupRet: return 0; + case CatchRet: return 0; + case CatchPad: return 0; + case CatchSwitch: return 0; + case CleanupPad: return 0; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; @@ -1603,13 +1634,13 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { llvm_unreachable("Unknown instruction type encountered!"); } -std::pair<unsigned, MVT> +std::pair<int, MVT> TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const { LLVMContext &C = Ty->getContext(); EVT MTy = getValueType(DL, Ty); - unsigned Cost = 1; + int Cost = 1; // We keep legalizing the type until we find a legal kind. We assume that // the only operation that costs anything is the split. After splitting // we need to handle two types. @@ -1622,11 +1653,28 @@ TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) Cost *= 2; + // Do not loop with f128 type. 
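
The new allowsMemoryAccess() helper above answers the ABI-aligned case by itself and only falls back to the target hook for misaligned accesses. A target that tolerates slow unaligned scalar loads would override that hook roughly as follows (target name hypothetical, policy invented for illustration):

bool MyTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *Fast) const {
  // Unaligned integer accesses up to 64 bits are legal but microcoded:
  // permit them, but report them as slow so callers prefer aligned code.
  if (VT.isInteger() && VT.getSizeInBits() <= 64) {
    if (Fast)
      *Fast = false;
    return true;
  }
  // FP and vector accesses must be broken up by the legalizer.
  return false;
}
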
+ if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + // Keep legalizing the type. MTy = LK.second; } } +Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!TM.getTargetTriple().isAndroid()) + return nullptr; + + // Android provides a libc function to retrieve the address of the current + // thread's unsafe stack pointer. + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Type *StackPtrTy = Type::getInt8PtrTy(M->getContext()); + Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", + StackPtrTy->getPointerTo(0), nullptr); + return IRB.CreateCall(Fn); +} + //===----------------------------------------------------------------------===// // Loop Strength Reduction hooks //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 2f78763..58ae9cc 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" @@ -32,6 +33,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" @@ -58,9 +60,8 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( report_fatal_error("We do not support this DWARF encoding yet!"); } -void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, - const TargetMachine &TM, - const MCSymbol *Sym) const { +void TargetLoweringObjectFileELF::emitPersonalityValue( + MCStreamer &Streamer, const DataLayout &DL, const MCSymbol *Sym) const { SmallString<64> NameData("DW.ref."); NameData += Sym->getName(); MCSymbolELF *Label = @@ -72,9 +73,9 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP; MCSection *Sec = getContext().getELFSection(NameData, ELF::SHT_PROGBITS, Flags, 0, Label->getName()); - unsigned Size = TM.getDataLayout()->getPointerSize(); + unsigned Size = DL.getPointerSize(); Streamer.SwitchSection(Sec); - Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment()); + Streamer.EmitValueToAlignment(DL.getPointerABIAlignment()); Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject); const MCExpr *E = MCConstantExpr::create(Size, getContext()); Streamer.emitELFSize(Label, E); @@ -232,14 +233,8 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".tdata"; if (Kind.isThreadBSS()) return ".tbss"; - if (Kind.isDataNoRel()) + if (Kind.isData()) return ".data"; - if (Kind.isDataRelLocal()) - return ".data.rel.local"; - if (Kind.isDataRel()) - return ".data.rel"; - if (Kind.isReadOnlyWithRelLocal()) - return ".data.rel.ro.local"; assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); return ".data.rel.ro"; } @@ -282,8 +277,8 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalValue *GV, // We also need alignment here. // FIXME: this is getting the alignment of the character, not the // alignment of the global! 
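
Stepping back to the getSafeStackPointerLocation() hook added above: on Android it materializes a call to a Bionic helper, so SafeStack-instrumented code addresses the unsafe stack through whatever location libc chooses. The C-level contract that call relies on is roughly the following sketch (declaration reconstructed from the IR built above, not quoted from Bionic):

extern "C" void **__safestack_pointer_address(void);

void example(void) {
  // Instrumented code loads and stores the current thread's unsafe stack
  // pointer through the address this helper returns.
  void **UnsafeStackPtrAddr = __safestack_pointer_address();
  void *UnsafeSP = *UnsafeStackPtrAddr;
  (void)UnsafeSP;
}
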
- unsigned Align = - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)); + unsigned Align = GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)); std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + "."; Name = SizeSpec + utostr(Align); @@ -350,9 +345,8 @@ bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( /// Given a mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. -MCSection * -TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFileELF::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst4() && MergeableConst4Section) return MergeableConst4Section; if (Kind.isMergeableConst8() && MergeableConst8Section) @@ -362,7 +356,6 @@ TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, if (Kind.isReadOnly()) return ReadOnlySection; - if (Kind.isReadOnlyWithRelLocal()) - return DataRelROLocalSection; assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); return DataRelROSection; } @@ -507,7 +500,7 @@ emitModuleFlags(MCStreamer &Streamer, // Get the section. MCSectionMachO *S = getContext().getMachOSection( - Segment, Section, TAA, StubSize, SectionKind::getDataNoRel()); + Segment, Section, TAA, StubSize, SectionKind::getData()); Streamer.SwitchSection(S); Streamer.EmitLabel(getContext(). getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO"))); @@ -589,14 +582,16 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal( // FIXME: Alignment check should be handled by section classifier. if (Kind.isMergeable1ByteCString() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return CStringSection; // Do not put 16-bit arrays in the UString section if they have an // externally visible label; this runs into issues with certain linker // versions. if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return UStringSection; // With MachO only variables whose corresponding symbol starts with 'l' or @@ -634,12 +629,11 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal( return DataSection; } -MCSection * -TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFileMachO::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { // If this constant requires a relocation, we have to put it in the data // segment, not in the text segment.
- if (Kind.isDataRel() || Kind.isReadOnlyWithRel()) + if (Kind.isData() || Kind.isReadOnlyWithRel()) return ConstDataSection; if (Kind.isMergeableConst4()) @@ -706,7 +700,7 @@ MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol( const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { - // Although MachO 32-bit targets do not explictly have a GOTPCREL relocation + // Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation // as 64-bit do, we replace the GOT equivalent by accessing the final symbol // through a non_lazy_ptr stub instead. One advantage is that it allows the // computation of deltas to final external symbols. Example: @@ -740,7 +734,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( // non_lazy_ptr stubs. SmallString<128> Name; StringRef Suffix = "$non_lazy_ptr"; - Name += DL->getPrivateGlobalPrefix(); + Name += MMI->getModule()->getDataLayout().getPrivateGlobalPrefix(); Name += Sym->getName(); Name += Suffix; MCSymbol *Stub = Ctx.getOrCreateSymbol(Name); @@ -763,6 +757,29 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( return MCBinaryExpr::createSub(LHS, RHS, Ctx); } +static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, + const MCSection &Section) { + if (!AsmInfo.isSectionAtomizableBySymbols(Section)) + return true; + + // If it is not dead stripped, it is safe to use private labels. + const MCSectionMachO &SMO = cast<MCSectionMachO>(Section); + if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP)) + return true; + + return false; +} + +void TargetLoweringObjectFileMachO::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM); + const MCSection *TheSection = SectionForGlobal(GV, GVKind, Mang, TM); + bool CannotUsePrivateLabel = + !canUsePrivateLabel(*TM.getMCAsmInfo(), *TheSection); + Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); +} + //===----------------------------------------------------------------------===// // COFF //===----------------------------------------------------------------------===// @@ -918,7 +935,7 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( COMDATSymName, Selection); } else { SmallString<256> TmpData; - getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true, Mang, TM); + Mang.getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true); return getContext().getCOFFSection(Name, Characteristics, Kind, TmpData, Selection); } @@ -943,8 +960,9 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( } void TargetLoweringObjectFileCOFF::getNameWithPrefix( - SmallVectorImpl<char> &OutName, const GlobalValue *GV, - bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + bool CannotUsePrivateLabel = false; if (GV->hasPrivateLinkage() && ((isa<Function>(GV) && TM.getFunctionSections()) || (isa<GlobalVariable>(GV) && TM.getDataSections()))) @@ -1043,7 +1061,7 @@ void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal( raw_string_ostream FlagOS(Flag); Mang.getNameWithPrefix(FlagOS, GV, false); FlagOS.flush(); - if (Flag[0] == DL->getGlobalPrefix()) + if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix()) OS << 
Flag.substr(1); else OS << Flag; diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 61a66b6..0a7042a 100644 --- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -11,13 +11,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define DEBUG_TYPE "target-reg-info" using namespace llvm; @@ -34,54 +40,71 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -void PrintReg::print(raw_ostream &OS) const { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); - else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); +namespace llvm { + +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx) { + return Printable([Reg, TRI, SubIdx](raw_ostream &OS) { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); else - OS << ":sub(" << SubIdx << ')'; - } + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); + else + OS << ":sub(" << SubIdx << ')'; + } + }); } -void PrintRegUnit::print(raw_ostream &OS) const { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); + // Normal units have at least one root. 
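
The PrintReg rewrite above, continued just below for register units and lane masks, replaces one-shot printer classes with functions returning Printable, a value that captures a printing closure and can be streamed directly. A self-contained sketch of the idiom (simplified from llvm/Support/Printable.h; std::ostream stands in for raw_ostream):

#include <functional>
#include <iostream>
#include <utility>

struct Printable {
  std::function<void(std::ostream &)> Print;
  explicit Printable(std::function<void(std::ostream &)> P)
      : Print(std::move(P)) {}
};

inline std::ostream &operator<<(std::ostream &OS, const Printable &P) {
  P.Print(OS);
  return OS;
}

// Mirrors the shape of PrintReg(): build the closure, print nothing yet.
Printable printReg(unsigned Reg) {
  return Printable([Reg](std::ostream &OS) {
    if (!Reg)
      OS << "%noreg";
    else
      OS << "%physreg" << Reg;
  });
}

int main() {
  std::cout << printReg(0) << ' ' << printReg(42) << '\n'; // %noreg %physreg42
}
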
+ MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); + }); } -void PrintVRegOrUnit::print(raw_ostream &OS) const { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - return; - } - PrintRegUnit::print(OS); +Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + } else { + OS << PrintRegUnit(Unit, TRI); + } + }); +} + +Printable PrintLaneMask(LaneBitmask LaneMask) { + return Printable([LaneMask](raw_ostream &OS) { + OS << format("%08X", LaneMask); + }); } +} // End of llvm namespace + /// getAllocatableClass - Return the maximal subclass of the given register /// class that is allocatable, or NULL. const TargetRegisterClass * @@ -161,16 +184,24 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, static inline const TargetRegisterClass *firstCommonClass(const uint32_t *A, const uint32_t *B, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) { + const MVT VT(SVT); for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) - return TRI->getRegClass(I + countTrailingZeros(Common)); + if (unsigned Common = *A++ & *B++) { + const TargetRegisterClass *RC = + TRI->getRegClass(I + countTrailingZeros(Common)); + if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + return RC; + } return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const { + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT) const { // First take care of the trivial cases. if (A == B) return A; @@ -179,7 +210,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class is the common sub-class with the smallest ID. - return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT); } const TargetRegisterClass * @@ -260,13 +291,55 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, return BestRC; } +/// \brief Check if the registers defined by the pair (RegisterClass, SubReg) +/// share the same register file. +static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, + const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) { + // Same register class. + if (DefRC == SrcRC) + return true; + + // Both operands are sub registers. Check if they share a register class. + unsigned SrcIdx, DefIdx; + if (SrcSubReg && DefSubReg) { + return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, + SrcIdx, DefIdx) != nullptr; + } + + // At most one of the registers is a sub register; make it Src to avoid + // duplicating the test. + if (!SrcSubReg) { + std::swap(DefSubReg, SrcSubReg); + std::swap(DefRC, SrcRC); + } + + // One of the registers is a sub register; check if we can get a superclass. + if (SrcSubReg) + return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; + + // Plain copy.
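
firstCommonClass() above depends on register classes being topologically ordered: the first set bit in the AND of two sub-class masks is therefore the largest common sub-class. The same word-at-a-time scan on plain integers, as a standalone illustration (uses the GCC/Clang __builtin_ctz intrinsic in place of LLVM's countTrailingZeros):

#include <cstdint>

// Return the index of the first class present in both bitmasks, or -1.
// Each 32-bit word covers 32 register classes, lowest index first.
int firstCommonIndex(const uint32_t *A, const uint32_t *B,
                     unsigned NumClasses) {
  for (unsigned I = 0; I < NumClasses; I += 32)
    if (uint32_t Common = *A++ & *B++)
      return static_cast<int>(I + __builtin_ctz(Common));
  return -1;
}
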
+ return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; +} + +bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { + // If this source does not incur a cross register bank copy, use it. + return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg); +} + // Compute target-independent register allocator hints to help eliminate copies. void TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const { + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); @@ -295,6 +368,26 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, Hints.push_back(Phys); } +bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const { + return !MF.getFunction()->hasFnAttribute("no-realign-stack"); +} + +bool TargetRegisterInfo::needsStackRealignment( + const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const Function *F = MF.getFunction(); + unsigned StackAlign = TFI->getStackAlignment(); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttribute(Attribute::StackAlignment)); + if (MF.getFunction()->hasFnAttribute("stackrealign") || requiresRealignment) { + if (canRealignStack(MF)) + return true; + DEBUG(dbgs() << "Can't realign function's stack: " << F->getName() << "\n"); + } + return false; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp index 299380d..1c4558c 100644 --- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp @@ -211,11 +211,9 @@ unsigned TargetSchedModel::computeOperandLatency( if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit() && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef() && SchedModel.isComplete()) { - std::string Err; - raw_string_ostream ss(Err); - ss << "DefIdx " << DefIdx << " exceeds machine model writes for " - << *DefMI; - report_fatal_error(ss.str()); + errs() << "DefIdx " << DefIdx << " exceeds machine model writes for " + << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)"; + llvm_unreachable("incomplete machine model"); } #endif // FIXME: Automatically giving all implicit defs defaultDefLatency is diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 1e30821..c6bae24 100644 --- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -83,21 +83,20 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // The current basic block being processed. MachineBasicBlock *MBB; - // DistanceMap - Keep track the distance of a MI from the start of the - // current basic block. + // Keep track of the distance of a MI from the start of the current basic block. DenseMap<MachineInstr*, unsigned> DistanceMap; // Set of already processed instructions in the current block.
SmallPtrSet<MachineInstr*, 8> Processed; - // SrcRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies from physical registers to - // virtual registers. e.g. v1024 = move r0. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies from physical registers to virtual + // registers. e.g. v1024 = move r0. DenseMap<unsigned, unsigned> SrcRegMap; - // DstRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies to physical registers from - // virtual registers. e.g. r1 = move v1024. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies to physical registers from virtual + // registers. e.g. r1 = move v1024. DenseMap<unsigned, unsigned> DstRegMap; bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, @@ -110,8 +109,8 @@ class TwoAddressInstructionPass : public MachineFunctionPass { bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, unsigned Dist); - bool commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist); + bool commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, unsigned RegCIdx, unsigned Dist); bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB); @@ -133,6 +132,11 @@ class TwoAddressInstructionPass : public MachineFunctionPass { unsigned SrcIdx, unsigned DstIdx, unsigned Dist, bool shouldOnlyCommute); + bool tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist); void scanUses(unsigned DstReg); void processCopy(MachineInstr *MI); @@ -151,7 +155,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<LiveVariables>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); @@ -160,7 +164,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - /// runOnMachineFunction - Pass entry point. + /// Pass entry point. bool runOnMachineFunction(MachineFunction&) override; }; } // end anonymous namespace @@ -168,7 +172,7 @@ public: char TwoAddressInstructionPass::ID = 0; INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", "Two-Address instruction pass", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", "Two-Address instruction pass", false, false) @@ -176,10 +180,9 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); -/// sink3AddrInstruction - A two-address instruction has been converted to a -/// three-address instruction to avoid clobbering a register. Try to sink it -/// past the instruction that would kill the above mentioned register to reduce -/// register pressure. +/// A two-address instruction has been converted to a three-address instruction +/// to avoid clobbering a register. Try to sink it past the instruction that +/// would kill the above mentioned register to reduce register pressure. 
bool TwoAddressInstructionPass:: sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, MachineBasicBlock::iterator OldPos) { @@ -195,8 +198,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, unsigned DefReg = 0; SmallSet<unsigned, 4> UseRegs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -231,10 +233,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, KillMI = LIS->getInstructionFromIndex(I->end); } if (!KillMI) { - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(SavedReg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineOperand &UseMO = *UI; + for (MachineOperand &UseMO : MRI->use_nodbg_operands(SavedReg)) { if (!UseMO.isKill()) continue; KillMI = UseMO.getParent(); @@ -312,8 +311,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, return true; } -/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg -/// in current BB. +/// Return the MachineInstr* if it is the single def of the Reg in current BB. static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { MachineInstr *Ret = nullptr; @@ -351,10 +349,10 @@ bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg, return false; } -/// noUseAfterLastDef - Return true if there are no intervening uses between the -/// last instruction in the MBB that defines the specified register and the -/// two-address instruction which is being processed. It also returns the last -/// def location by reference +/// Return true if there are no intervening uses between the last instruction +/// in the MBB that defines the specified register and the two-address +/// instruction which is being processed. It also returns the last def location +/// by reference. bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef) { LastDef = 0; @@ -375,9 +373,9 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, return !(LastUse > LastDef && LastUse < Dist); } -/// isCopyToReg - Return true if the specified MI is a copy instruction or -/// a extract_subreg instruction. It also returns the source and destination -/// registers and whether they are physical registers by reference. +/// Return true if the specified MI is a copy instruction or an extract_subreg +/// instruction. It also returns the source and destination registers and +/// whether they are physical registers by reference. static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, unsigned &SrcReg, unsigned &DstReg, bool &IsSrcPhys, bool &IsDstPhys) { @@ -397,8 +395,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, return true; } -/// isPLainlyKilled - Test if the given register value, which is used by the -// given instruction, is killed by the given instruction. +/// Test if the given register value, which is used by the +/// given instruction, is killed by the given instruction. 
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS) { if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && @@ -424,7 +422,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, return MI->killsRegister(Reg); } -/// isKilled - Test if the given register value, which is used by the given +/// Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. /// @@ -472,8 +470,8 @@ static bool isKilled(MachineInstr &MI, unsigned Reg, } } -/// isTwoAddrUse - Return true if the specified MI uses the specified register -/// as a two-address use. If so, return the destination register by reference. +/// Return true if the specified MI uses the specified register as a two-address +/// use. If so, return the destination register by reference. static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -488,8 +486,8 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findOnlyInterestingUse - Given a register, if has a single in-basic block -/// use, return the use instruction if it's a copy or a two-address use. +/// Given a register, if it has a single in-basic block use, return the use +/// instruction if it's a copy or a two-address use. static MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, @@ -516,8 +514,8 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, return nullptr; } -/// getMappedReg - Return the physical register the specified virtual register -/// might be mapped to. +/// Return the physical register the specified virtual register might be mapped +/// to. static unsigned getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) { while (TargetRegisterInfo::isVirtualRegister(Reg)) { @@ -531,8 +529,7 @@ getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) { return 0; } -/// regsAreCompatible - Return true if the two registers are equal or aliased. -/// +/// Return true if the two registers are equal or aliased. static bool regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { if (RegA == RegB) @@ -543,8 +540,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { } -/// isProfitableToCommute - Return true if it's potentially profitable to commute -/// the two-address instruction that's being processed. +/// Return true if it's potentially profitable to commute the two-address +/// instruction that's being processed. bool TwoAddressInstructionPass:: isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -642,15 +639,15 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, return LastDefB && LastDefC && LastDefC > LastDefB; } -/// commuteInstruction - Commute a two-address instruction and update the basic -/// block, distance map, and live variables if needed. Return true if it is -/// successful. -bool TwoAddressInstructionPass:: -commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist) { - MachineInstr *MI = mi; +/// Commute a two-address instruction and update the basic block, distance map, +/// and live variables if needed. Return true if it is successful.
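
The explicit-index commuteInstruction() overload used below changes the calling convention of the pass: findCommutedOpIndices(MI, Idx1, Idx2) merely verifies that a concrete operand pair is commutable, and the caller then commutes exactly that pair. A sketch of the contract in isolation (operand indices hypothetical):

unsigned Op1 = 1, Op2 = 2; // Candidate use operands chosen by the caller.
if (TII->findCommutedOpIndices(MI, Op1, Op2)) {
  // The pair is commutable; rewrite MI in place (NewMI = false means
  // do not allocate a fresh instruction). Returns null if the target
  // backs out at the last moment.
  MachineInstr *Res = TII->commuteInstruction(MI, /*NewMI=*/false, Op1, Op2);
  if (!Res) {
    // Commutation refused; MI is unchanged.
  }
}
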
+bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, + unsigned RegCIdx, + unsigned Dist) { + unsigned RegC = MI->getOperand(RegCIdx).getReg(); DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); - MachineInstr *NewMI = TII->commuteInstruction(MI); + MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx); if (NewMI == nullptr) { DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n"); @@ -672,8 +669,8 @@ commuteInstruction(MachineBasicBlock::iterator &mi, return true; } -/// isProfitableToConv3Addr - Return true if it is profitable to convert the -/// given 2-address instruction to a 3-address one. +/// Return true if it is profitable to convert the given 2-address instruction +/// to a 3-address one. bool TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ // Look for situations like this: @@ -689,17 +686,18 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI)); } -/// convertInstTo3Addr - Convert the specified two-address instruction into a -/// three address one. Return true if this transformation was successful. +/// Convert the specified two-address instruction into a three address one. +/// Return true if this transformation was successful. bool TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned RegA, unsigned RegB, unsigned Dist) { // FIXME: Why does convertToThreeAddress() need an iterator reference? - MachineFunction::iterator MFI = MBB; + MachineFunction::iterator MFI = MBB->getIterator(); MachineInstr *NewMI = TII->convertToThreeAddress(MFI, mi, LV); - assert(MBB == MFI && "convertToThreeAddress changed iterator reference"); + assert(MBB->getIterator() == MFI && + "convertToThreeAddress changed iterator reference"); if (!NewMI) return false; @@ -730,8 +728,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, return true; } -/// scanUses - Scan forward recursively for only uses, update maps if the use -/// is a copy or a two-address instruction. +/// Scan forward recursively for only uses, update maps if the use is a copy or +/// a two-address instruction. void TwoAddressInstructionPass::scanUses(unsigned DstReg) { SmallVector<unsigned, 4> VirtRegPairs; @@ -777,8 +775,8 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) { } } -/// processCopy - If the specified instruction is not yet processed, process it -/// if it's a copy. For a copy instruction, we find the physical registers the +/// If the specified instruction is not yet processed, process it if it's a +/// copy. For a copy instruction, we find the physical registers the /// source and destination registers might be mapped to. These are kept in /// point-to maps used to determine future optimizations. e.g. /// v1024 = mov r0 @@ -813,9 +811,9 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { return; } -/// rescheduleMIBelowKill - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill -/// instruction in order to eliminate the need for the copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the instruction below the kill instruction in order to +/// eliminate the need for the copy.
bool TwoAddressInstructionPass:: rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -871,8 +869,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, SmallSet<unsigned, 2> Uses; SmallSet<unsigned, 2> Kills; SmallSet<unsigned, 2> Defs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -914,8 +911,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, OtherMI->isBranch() || OtherMI->isTerminator()) // Don't move past calls, etc. return false; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -984,8 +980,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, return true; } -/// isDefTooClose - Return true if the re-scheduling will put the given -/// instruction too close to the defs of its register dependencies. +/// Return true if the re-scheduling will put the given instruction too close +/// to the defs of its register dependencies. bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { @@ -1004,10 +1000,9 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return false; } -/// rescheduleKillAboveMI - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the -/// current two-address instruction in order to eliminate the need for the -/// copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the kill instruction above the current two-address +/// instruction in order to eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1055,8 +1050,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, SmallSet<unsigned, 2> Kills; SmallSet<unsigned, 2> Defs; SmallSet<unsigned, 2> LiveDefs; - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = KillMI->getOperand(i); + for (const MachineOperand &MO : KillMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1094,8 +1088,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, // Don't move past calls, etc. return false; SmallVector<unsigned, 2> OtherDefs; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1155,13 +1148,68 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, return true; } -/// tryInstructionTransform - For the case where an instruction has a single -/// pair of tied register operands, attempt some transformations that may -/// either eliminate the tied operands or improve the opportunities for -/// coalescing away the register copy. Returns true if no copy needs to be -/// inserted to untie mi's operands (either because they were untied, or -/// because mi was rescheduled, and will be visited again later). If the -/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+/// Tries to commute the operand 'BaseOpIdx' and some other operand in the +/// given machine instruction to improve opportunities for coalescing and +/// elimination of a register to register copy. +/// +/// 'DstOpIdx' specifies the index of the MI def operand. +/// 'BaseOpKilled' specifies if the register associated with 'BaseOpIdx' +/// operand is killed by the given instruction. +/// The 'Dist' argument provides the distance of MI from the start of the +/// current basic block and it is used to determine if it is profitable +/// to commute operands in the instruction. +/// +/// Returns true if the transformation happened. Otherwise, returns false. +bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist) { + unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg(); + unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg(); + unsigned OpsNum = MI->getDesc().getNumOperands(); + unsigned OtherOpIdx = MI->getDesc().getNumDefs(); + for (; OtherOpIdx < OpsNum; OtherOpIdx++) { + // The call of findCommutedOpIndices below only checks if BaseOpIdx + // and OtherOpIdx are commutable; it does not really search for + // other commutable operands and does not change the values of passed + // variables. + if (OtherOpIdx == BaseOpIdx || + !TII->findCommutedOpIndices(MI, BaseOpIdx, OtherOpIdx)) + continue; + + unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg(); + bool AggressiveCommute = false; + + // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp + // operands. This makes the live ranges of DstOp and OtherOp joinable. + bool DoCommute = + !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false); + + if (!DoCommute && + isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) { + DoCommute = true; + AggressiveCommute = true; + } + + // If it's profitable to commute, try to do so. + if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) { + ++NumCommuted; + if (AggressiveCommute) + ++NumAggrCommuted; + return true; + } + } + return false; +} + +/// For the case where an instruction has a single pair of tied register +/// operands, attempt some transformations that may either eliminate the tied +/// operands or improve the opportunities for coalescing away the register copy. +/// Returns true if no copy needs to be inserted to untie mi's operands +/// (either because they were untied, or because mi was rescheduled, and will +/// be visited again later). If the shouldOnlyCommute flag is true, only +/// instruction commutation is attempted. bool TwoAddressInstructionPass:: tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1181,51 +1229,18 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, if (TargetRegisterInfo::isVirtualRegister(regA)) scanUses(regA); - // Check if it is profitable to commute the operands. - unsigned SrcOp1, SrcOp2; - unsigned regC = 0; - unsigned regCIdx = ~0U; - bool TryCommute = false; - bool AggressiveCommute = false; - if (MI.isCommutable() && MI.getNumOperands() >= 3 && - TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) { - if (SrcIdx == SrcOp1) - regCIdx = SrcOp2; - else if (SrcIdx == SrcOp2) - regCIdx = SrcOp1; - - if (regCIdx != ~0U) { - regC = MI.getOperand(regCIdx).getReg(); - if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false)) - // If C dies but B does not, swap the B and C operands. - // This makes the live ranges of A and C joinable.
- TryCommute = true; - else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) { - TryCommute = true; - AggressiveCommute = true; - } - } - } + bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); // If the instruction is convertible to 3 Addr, instead // of returning, try the 3 Addr transformation aggressively and // use this variable to check later, because it might be better. // For example, we can just use `leal (%rsi,%rdi), %eax` and `ret` // instead of the following code. - // addl %esi, %edi - // movl %edi, %eax + // addl %esi, %edi + // movl %edi, %eax // ret - bool Commuted = false; - - // If it's profitable to commute, try to do so. - if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) { - Commuted = true; - ++NumCommuted; - if (AggressiveCommute) - ++NumAggrCommuted; - if (!MI.isConvertibleTo3Addr()) - return false; - } + if (Commuted && !MI.isConvertibleTo3Addr()) + return false; if (shouldOnlyCommute) return false; @@ -1237,6 +1252,13 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, return true; } + // If we commuted, regB may have changed so we should re-sample it to avoid + // confusing the three address conversion below. + if (Commuted) { + regB = MI.getOperand(SrcIdx).getReg(); + regBKilled = isKilled(MI, regB, MRI, TII, LIS, true); + } + if (MI.isConvertibleTo3Addr()) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. @@ -1348,10 +1370,9 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, SmallVector<unsigned, 4> OrigRegs; if (LIS) { - for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), - MOE = MI.operands_end(); MOI != MOE; ++MOI) { - if (MOI->isReg()) - OrigRegs.push_back(MOI->getReg()); + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg()) + OrigRegs.push_back(MO.getReg()); } } @@ -1536,12 +1557,10 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, SrcRegMap[RegA] = RegB; } - if (AllUsesCopied) { if (!IsEarlyClobber) { // Replace other (un-tied) uses of regB with LastCopiedReg. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB && MO.isUse()) { if (MO.isKill()) { @@ -1578,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { MO.setIsKill(true); break; @@ -1588,8 +1606,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } } -/// runOnMachineFunction - Reduce two-address instructions to two operands. -/// +/// Reduce two-address instructions to two operands.
bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const TargetMachine &TM = MF->getTarget(); @@ -1599,7 +1616,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); LIS = getAnalysisIfAvailable<LiveIntervals>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); OptLevel = TM.getOptLevel(); bool MadeChange = false; @@ -1614,7 +1631,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { TiedOperandMap TiedOperands; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { - MBB = MBBI; + MBB = &*MBBI; unsigned Dist = 0; DistanceMap.clear(); SrcRegMap.clear(); @@ -1661,8 +1678,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { unsigned DstReg = mi->getOperand(DstIdx).getReg(); if (SrcReg != DstReg && tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) { - // The tied operands have been eliminated or shifted further down the - // block to ease elimination. Continue processing with 'nmi'. + // The tied operands have been eliminated or shifted further down + // the block to ease elimination. Continue processing with 'nmi'. TiedOperands.clear(); mi = nmi; continue; @@ -1671,9 +1688,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { } // Now iterate over the information collected above. - for (TiedOperandMap::iterator OI = TiedOperands.begin(), - OE = TiedOperands.end(); OI != OE; ++OI) { - processTiedPairs(mi, OI->second, Dist); + for (auto &TO : TiedOperands) { + processTiedPairs(mi, TO.second, Dist); DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); } diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp index d393e10..8c9631e 100644 --- a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -71,8 +71,8 @@ bool UnreachableBlockElim::runOnFunction(Function &F) { // in them. std::vector<BasicBlock*> DeadBlocks; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - if (!Reachable.count(I)) { - BasicBlock *BB = I; + if (!Reachable.count(&*I)) { + BasicBlock *BB = &*I; DeadBlocks.push_back(BB); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { PN->replaceAllUsesWith(Constant::getNullValue(PN->getType())); @@ -131,7 +131,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // in them. std::vector<MachineBasicBlock*> DeadBlocks; for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = I; + MachineBasicBlock *BB = &*I; // Test for deadness. if (!Reachable.count(BB)) { @@ -167,7 +167,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // Cleanup PHI nodes. for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = I; + MachineBasicBlock *BB = &*I; // Prune unneeded PHI entries. 
SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(), BB->pred_end()); diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp index 2912bdd..bf1c0dc 100644 --- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp @@ -163,11 +163,12 @@ class VirtRegRewriter : public MachineFunctionPass { SlotIndexes *Indexes; LiveIntervals *LIS; VirtRegMap *VRM; - SparseSet<unsigned> PhysRegs; void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; + void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; + public: static char ID; VirtRegRewriter() : MachineFunctionPass(ID) {} @@ -237,10 +238,52 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { return true; } +void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI, + unsigned PhysReg) const { + assert(!LI.empty()); + assert(LI.hasSubRanges()); + + typedef std::pair<const LiveInterval::SubRange *, + LiveInterval::const_iterator> SubRangeIteratorPair; + SmallVector<SubRangeIteratorPair, 4> SubRanges; + SlotIndex First; + SlotIndex Last; + for (const LiveInterval::SubRange &SR : LI.subranges()) { + SubRanges.push_back(std::make_pair(&SR, SR.begin())); + if (!First.isValid() || SR.segments.front().start < First) + First = SR.segments.front().start; + if (!Last.isValid() || SR.segments.back().end > Last) + Last = SR.segments.back().end; + } + + // Check all MBB start positions between First and Last while + // simultaneously advancing an iterator for each subrange. + for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First); + MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) { + SlotIndex MBBBegin = MBBI->first; + // Advance all subrange iterators so that their end position is just + // behind MBBBegin (or the iterator is at the end). + LaneBitmask LaneMask = 0; + for (auto &RangeIterPair : SubRanges) { + const LiveInterval::SubRange *SR = RangeIterPair.first; + LiveInterval::const_iterator &SRI = RangeIterPair.second; + while (SRI != SR->end() && SRI->end <= MBBBegin) + ++SRI; + if (SRI == SR->end()) + continue; + if (SRI->start <= MBBBegin) + LaneMask |= SR->LaneMask; + } + if (LaneMask == 0) + continue; + MachineBasicBlock *MBB = MBBI->second; + MBB->addLiveIn(PhysReg, LaneMask); + } +} + // Compute MBB live-in lists from virtual register live ranges and their // assignments. void VirtRegRewriter::addMBBLiveIns() { - SmallVector<MachineBasicBlock*, 16> LiveIn; for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx); if (MRI->reg_nodbg_empty(VirtReg)) @@ -254,31 +297,18 @@ void VirtRegRewriter::addMBBLiveIns() { assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); if (LI.hasSubRanges()) { - for (LiveInterval::SubRange &S : LI.subranges()) { - for (const auto &Seg : S.segments) { - if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) - continue; - for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) { - unsigned SubReg = SR.getSubReg(); - unsigned SubRegIndex = SR.getSubRegIndex(); - unsigned SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex); - if ((SubRegLaneMask & S.LaneMask) == 0) - continue; - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) { - LiveIn[i]->addLiveIn(SubReg); - } - } - LiveIn.clear(); - } - } + addLiveInsForSubRanges(LI, PhysReg); } else { - // Scan the segments of LI.
- for (const auto &Seg : LI.segments) { - if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) - continue; - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) - LiveIn[i]->addLiveIn(PhysReg); - LiveIn.clear(); + // Go over MBB begin positions and see if we have segments covering them. + // The following works because segments and the MBBIndex list are both + // sorted by slot indexes. + SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(); + for (const auto &Seg : LI) { + I = Indexes->advanceMBBIndex(I, Seg.start); + for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) { + MachineBasicBlock *MBB = I->second; + MBB->addLiveIn(PhysReg); + } } } } @@ -305,7 +335,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { assert(LI.liveAt(BaseIndex) && "Reads of completely dead register should be marked undef already"); unsigned SubRegIdx = MO.getSubReg(); - unsigned UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); // See if any of the relevant subregister live ranges is defined at this point. for (const LiveInterval::SubRange &SR : LI.subranges()) { if ((SR.LaneMask & UseMask) != 0 && SR.liveAt(BaseIndex)) @@ -319,54 +349,15 @@ void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; - SmallPtrSet<const MachineInstr *, 4> NoReturnInsts; - - // Here we have a SparseSet to hold which PhysRegs are actually encountered - // in the MF we are about to iterate over so that later when we call - // setPhysRegUsed, we are only doing it for physRegs that were actually found - // in the program and not for all of the possible physRegs for the given - // target architecture. If the target has a lot of physRegs, then for a small - // program there will be a significant compile time reduction here. - PhysRegs.clear(); - PhysRegs.setUniverse(TRI->getNumRegs()); - - // The function with uwtable should guarantee that the stack unwinder - // can unwind the stack to the previous frame. Thus, we can't apply the - // noreturn optimization if the caller function has uwtable attribute. - bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable); for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { DEBUG(MBBI->print(dbgs(), Indexes)); - bool IsExitBB = MBBI->succ_empty(); for (MachineBasicBlock::instr_iterator MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) { - MachineInstr *MI = MII; + MachineInstr *MI = &*MII; ++MII; - // Check if this instruction is a call to a noreturn function. If this - // is a call to noreturn function and we don't need the stack unwinding - // functionality (i.e. this function does not have uwtable attribute and - // the callee function has the nounwind attribute), then we can ignore - // the definitions set by this instruction. - if (!HasUWTable && IsExitBB && MI->isCall()) { - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); MOI != MOE; ++MOI) { - MachineOperand &MO = *MOI; - if (!MO.isGlobal()) - continue; - const Function *Func = dyn_cast<Function>(MO.getGlobal()); - if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) || - // We need to keep correct unwind information - // even if the function will not return, since the - // runtime may need it.
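
The rewritten addMBBLiveIns() walk above relies on both the live-interval segments and the MBBIndex list being sorted by slot index, so a single forward pass with one shared iterator suffices. The same two-pointer idea on plain integers, as a self-contained model (numbers stand in for SlotIndexes; interval bounds are half-open like live range segments):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Sorted, non-overlapping [start, end) segments of one live range.
  std::vector<std::pair<int, int>> Segs = {{3, 9}, {14, 16}};
  // Sorted block start positions, the analogue of the MBBIndex list.
  std::vector<int> BlockStarts = {0, 4, 8, 12, 15};

  std::size_t I = 0; // Shared across segments, never rewinds.
  for (const auto &Seg : Segs) {
    // advanceMBBIndex(I, Seg.start): skip blocks starting before the segment.
    while (I < BlockStarts.size() && BlockStarts[I] < Seg.first)
      ++I;
    // Every block start covered by the segment gets a live-in entry.
    for (; I < BlockStarts.size() && BlockStarts[I] < Seg.second; ++I)
      std::printf("block at %d is live-in\n", BlockStarts[I]);
  }
  return 0; // Prints blocks 4, 8 and 15.
}
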
- !Func->hasFnAttribute(Attribute::NoUnwind)) - continue; - NoReturnInsts.insert(MI); - break; - } - } - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { MachineOperand &MO = *MOI; @@ -375,15 +366,6 @@ void VirtRegRewriter::rewrite() { if (MO.isRegMask()) MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); - // If we encounter a VirtReg or PhysReg then get at the PhysReg and add - // it to the physreg bitset. Later we use only the PhysRegs that were - // actually encountered in the MF to populate the MRI's used physregs. - if (MO.isReg() && MO.getReg()) - PhysRegs.insert( - TargetRegisterInfo::isVirtualRegister(MO.getReg()) ? - VRM->getPhys(MO.getReg()) : - MO.getReg()); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); @@ -418,14 +400,6 @@ void VirtRegRewriter::rewrite() { MO.setIsUndef(true); } else if (!MO.isDead()) { assert(MO.isDef()); - // Things get tricky when we ran out of lane mask bits and - // merged multiple lanes into the overflow bit: In this case - // our subregister liveness tracking isn't precise and we can't - // know what subregister parts are undefined, fall back to the - // implicit super-register def then. - unsigned LaneMask = TRI->getSubRegIndexLaneMask(SubReg); - if (TargetRegisterInfo::isImpreciseLaneMask(LaneMask)) - SuperDefs.push_back(PhysReg); } } @@ -470,29 +444,5 @@ void VirtRegRewriter::rewrite() { } } } - - // Tell MRI about physical registers in use. - if (NoReturnInsts.empty()) { - for (SparseSet<unsigned>::iterator - RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI) - if (!MRI->reg_nodbg_empty(*RegI)) - MRI->setPhysRegUsed(*RegI); - } else { - for (SparseSet<unsigned>::iterator - I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) { - unsigned Reg = *I; - if (MRI->reg_nodbg_empty(Reg)) - continue; - // Check if this register has a use that will impact the rest of the - // code. Uses in debug and noreturn instructions do not impact the - // generated code. 
- for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) { - if (!NoReturnInsts.count(&It)) { - MRI->setPhysRegUsed(Reg); - break; - } - } - } - } } diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp index 0d26ed3..886c5f6 100644 --- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -17,67 +17,44 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Triple.h" -#include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/WinEHFuncInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include <memory> +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehprepare" -namespace { - -// This map is used to model frame variable usage during outlining, to -// construct a structure type to hold the frame variables in a frame -// allocation block, and to remap the frame variable allocas (including -// spill locations as needed) to GEPs that get the variable from the -// frame allocation structure. -typedef MapVector<Value *, TinyPtrVector<AllocaInst *>> FrameVarInfoMap; - -// TinyPtrVector cannot hold nullptr, so we need our own sentinel that isn't -// quite null. -AllocaInst *getCatchObjectSentinel() { - return static_cast<AllocaInst *>(nullptr) + 1; -} - -typedef SmallSet<BasicBlock *, 4> VisitedBlockSet; +static cl::opt<bool> DisableDemotion( + "disable-demotion", cl::Hidden, + cl::desc( + "Clone multicolor basic blocks but do not demote cross funclet values"), + cl::init(false)); -class LandingPadActions; -class LandingPadMap; - -typedef DenseMap<const BasicBlock *, CatchHandler *> CatchHandlerMapTy; -typedef DenseMap<const BasicBlock *, CleanupHandler *> CleanupHandlerMapTy; +static cl::opt<bool> DisableCleanups( + "disable-cleanups", cl::Hidden, + cl::desc("Do not remove implausible terminators or other similar cleanups"), + cl::init(false)); +namespace { + class WinEHPrepare : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. 
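The DisableDemotion and DisableCleanups switches introduced above follow the usual pattern for pass-local debugging knobs: a hidden cl::opt global, off by default, consulted at the point the stage would run (and reachable from any LLVM tool's command line, e.g. opt, or -mllvm under the clang driver). A minimal sketch of the same pattern with a hypothetical flag name:

    #include "llvm/Support/CommandLine.h"

    using namespace llvm;

    // Hypothetical switch in the style of -disable-demotion above: hidden
    // from -help, off by default, checked where the stage would run.
    static cl::opt<bool> DisableVerify(
        "disable-wineh-verify", cl::Hidden,
        cl::desc("Skip verification of prepared funclets"), cl::init(false));

    static void runStage() {
      if (DisableVerify)
        return; // stage gated off from the command line
      // ... the verification work would go here ...
    }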
- WinEHPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID) { - if (TM) - TheTriple = TM->getTargetTriple(); - } + WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {} bool runOnFunction(Function &Fn) override; @@ -90,264 +67,27 @@ public: } private: - bool prepareExceptionHandlers(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads); - void identifyEHBlocks(Function &F, SmallVectorImpl<LandingPadInst *> &LPads); - void promoteLandingPadValues(LandingPadInst *LPad); - void demoteValuesLiveAcrossHandlers(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads); - void findSEHEHReturnPoints(Function &F, - SetVector<BasicBlock *> &EHReturnBlocks); - void findCXXEHReturnPoints(Function &F, - SetVector<BasicBlock *> &EHReturnBlocks); - void getPossibleReturnTargets(Function *ParentF, Function *HandlerF, - SetVector<BasicBlock*> &Targets); - void completeNestedLandingPad(Function *ParentFn, - LandingPadInst *OutlinedLPad, - const LandingPadInst *OriginalLPad, - FrameVarInfoMap &VarInfo); - Function *createHandlerFunc(Function *ParentFn, Type *RetTy, - const Twine &Name, Module *M, Value *&ParentFP); - bool outlineHandler(ActionHandler *Action, Function *SrcFn, - LandingPadInst *LPad, BasicBlock *StartBB, - FrameVarInfoMap &VarInfo); - void addStubInvokeToHandlerIfNeeded(Function *Handler); - - void mapLandingPadBlocks(LandingPadInst *LPad, LandingPadActions &Actions); - CatchHandler *findCatchHandler(BasicBlock *BB, BasicBlock *&NextBB, - VisitedBlockSet &VisitedBlocks); - void findCleanupHandlers(LandingPadActions &Actions, BasicBlock *StartBB, - BasicBlock *EndBB); - - void processSEHCatchHandler(CatchHandler *Handler, BasicBlock *StartBB); - - Triple TheTriple; + void insertPHIStores(PHINode *OriginalPHI, AllocaInst *SpillSlot); + void + insertPHIStore(BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot, + SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist); + AllocaInst *insertPHILoads(PHINode *PN, Function &F); + void replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot, + DenseMap<BasicBlock *, Value *> &Loads, Function &F); + bool prepareExplicitEH(Function &F); + void colorFunclets(Function &F); + + void demotePHIsOnFunclets(Function &F); + void cloneCommonBlocks(Function &F); + void removeImplausibleInstructions(Function &F); + void cleanupPreparedFunclets(Function &F); + void verifyPreparedFunclets(Function &F); // All fields are reset by runOnFunction. - DominatorTree *DT = nullptr; - const TargetLibraryInfo *LibInfo = nullptr; EHPersonality Personality = EHPersonality::Unknown; - CatchHandlerMapTy CatchHandlerMap; - CleanupHandlerMapTy CleanupHandlerMap; - DenseMap<const LandingPadInst *, LandingPadMap> LPadMaps; - SmallPtrSet<BasicBlock *, 4> NormalBlocks; - SmallPtrSet<BasicBlock *, 4> EHBlocks; - SetVector<BasicBlock *> EHReturnBlocks; - - // This maps landing pad instructions found in outlined handlers to - // the landing pad instruction in the parent function from which they - // were cloned. The cloned/nested landing pad is used as the key - // because the landing pad may be cloned into multiple handlers. - // This map will be used to add the llvm.eh.actions call to the nested - // landing pads after all handlers have been outlined. - DenseMap<LandingPadInst *, const LandingPadInst *> NestedLPtoOriginalLP; - - // This maps blocks in the parent function which are destinations of - // catch handlers to cloned blocks in (other) outlined handlers. 
This - // handles the case where a nested landing pad has a catch handler that - // returns to a handler function rather than the parent function. - // The original block is used as the key here because there should only - // ever be one handler function from which the cloned block is not pruned. - // The original block will be pruned from the parent function after all - // handlers have been outlined. This map will be used to adjust the - // return instructions of handlers which return to the block that was - // outlined into a handler. This is done after all handlers have been - // outlined but before the outlined code is pruned from the parent function. - DenseMap<const BasicBlock *, BasicBlock *> LPadTargetBlocks; - - // Map from outlined handler to call to parent local address. Only used for - // 32-bit EH. - DenseMap<Function *, Value *> HandlerToParentFP; - - AllocaInst *SEHExceptionCodeSlot = nullptr; -}; - -class WinEHFrameVariableMaterializer : public ValueMaterializer { -public: - WinEHFrameVariableMaterializer(Function *OutlinedFn, Value *ParentFP, - FrameVarInfoMap &FrameVarInfo); - ~WinEHFrameVariableMaterializer() override {} - - Value *materializeValueFor(Value *V) override; - - void escapeCatchObject(Value *V); - -private: - FrameVarInfoMap &FrameVarInfo; - IRBuilder<> Builder; -}; - -class LandingPadMap { -public: - LandingPadMap() : OriginLPad(nullptr) {} - void mapLandingPad(const LandingPadInst *LPad); - - bool isInitialized() { return OriginLPad != nullptr; } - - bool isOriginLandingPadBlock(const BasicBlock *BB) const; - bool isLandingPadSpecificInst(const Instruction *Inst) const; - - void remapEHValues(ValueToValueMapTy &VMap, Value *EHPtrValue, - Value *SelectorValue) const; - -private: - const LandingPadInst *OriginLPad; - // We will normally only see one of each of these instructions, but - // if more than one occurs for some reason we can handle that.
- TinyPtrVector<const ExtractValueInst *> ExtractedEHPtrs; - TinyPtrVector<const ExtractValueInst *> ExtractedSelectors; -}; - -class WinEHCloningDirectorBase : public CloningDirector { -public: - WinEHCloningDirectorBase(Function *HandlerFn, Value *ParentFP, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap) - : Materializer(HandlerFn, ParentFP, VarInfo), - SelectorIDType(Type::getInt32Ty(HandlerFn->getContext())), - Int8PtrType(Type::getInt8PtrTy(HandlerFn->getContext())), - LPadMap(LPadMap), ParentFP(ParentFP) {} - - CloningAction handleInstruction(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - - virtual CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleEndCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) = 0; - virtual CloningAction handleInvoke(ValueToValueMapTy &VMap, - const InvokeInst *Invoke, - BasicBlock *NewBB) = 0; - virtual CloningAction handleResume(ValueToValueMapTy &VMap, - const ResumeInst *Resume, - BasicBlock *NewBB) = 0; - virtual CloningAction handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, - BasicBlock *NewBB) = 0; - virtual CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) = 0; - - ValueMaterializer *getValueMaterializer() override { return &Materializer; } - -protected: - WinEHFrameVariableMaterializer Materializer; - Type *SelectorIDType; - Type *Int8PtrType; - LandingPadMap &LPadMap; - - /// The value representing the parent frame pointer. 
- Value *ParentFP; -}; - -class WinEHCatchDirector : public WinEHCloningDirectorBase { -public: - WinEHCatchDirector( - Function *CatchFn, Value *ParentFP, Value *Selector, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap, - DenseMap<LandingPadInst *, const LandingPadInst *> &NestedLPads, - DominatorTree *DT, SmallPtrSetImpl<BasicBlock *> &EHBlocks) - : WinEHCloningDirectorBase(CatchFn, ParentFP, VarInfo, LPadMap), - CurrentSelector(Selector->stripPointerCasts()), - ExceptionObjectVar(nullptr), NestedLPtoOriginalLP(NestedLPads), - DT(DT), EHBlocks(EHBlocks) {} - - CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleEndCatch(ValueToValueMapTy &VMap, const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) override; - CloningAction handleInvoke(ValueToValueMapTy &VMap, const InvokeInst *Invoke, - BasicBlock *NewBB) override; - CloningAction handleResume(ValueToValueMapTy &VMap, const ResumeInst *Resume, - BasicBlock *NewBB) override; - CloningAction handleCompare(ValueToValueMapTy &VMap, const CmpInst *Compare, - BasicBlock *NewBB) override; - CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) override; - - Value *getExceptionVar() { return ExceptionObjectVar; } - TinyPtrVector<BasicBlock *> &getReturnTargets() { return ReturnTargets; } - -private: - Value *CurrentSelector; - - Value *ExceptionObjectVar; - TinyPtrVector<BasicBlock *> ReturnTargets; - // This will be a reference to the field of the same name in the WinEHPrepare - // object which instantiates this WinEHCatchDirector object. 
- DenseMap<LandingPadInst *, const LandingPadInst *> &NestedLPtoOriginalLP; - DominatorTree *DT; - SmallPtrSetImpl<BasicBlock *> &EHBlocks; -}; - -class WinEHCleanupDirector : public WinEHCloningDirectorBase { -public: - WinEHCleanupDirector(Function *CleanupFn, Value *ParentFP, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap) - : WinEHCloningDirectorBase(CleanupFn, ParentFP, VarInfo, - LPadMap) {} - - CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleEndCatch(ValueToValueMapTy &VMap, const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) override; - CloningAction handleInvoke(ValueToValueMapTy &VMap, const InvokeInst *Invoke, - BasicBlock *NewBB) override; - CloningAction handleResume(ValueToValueMapTy &VMap, const ResumeInst *Resume, - BasicBlock *NewBB) override; - CloningAction handleCompare(ValueToValueMapTy &VMap, const CmpInst *Compare, - BasicBlock *NewBB) override; - CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) override; -}; - -class LandingPadActions { -public: - LandingPadActions() : HasCleanupHandlers(false) {} - - void insertCatchHandler(CatchHandler *Action) { Actions.push_back(Action); } - void insertCleanupHandler(CleanupHandler *Action) { - Actions.push_back(Action); - HasCleanupHandlers = true; - } - - bool includesCleanup() const { return HasCleanupHandlers; } - - SmallVectorImpl<ActionHandler *> &actions() { return Actions; } - SmallVectorImpl<ActionHandler *>::iterator begin() { return Actions.begin(); } - SmallVectorImpl<ActionHandler *>::iterator end() { return Actions.end(); } - -private: - // Note that this class does not own the ActionHandler objects in this vector. - // The ActionHandlers are owned by the CatchHandlerMap and CleanupHandlerMap - // in the WinEHPrepare class. - SmallVector<ActionHandler *, 4> Actions; - bool HasCleanupHandlers; + DenseMap<BasicBlock *, ColorVector> BlockColors; + MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks; }; } // end anonymous namespace @@ -361,2536 +101,1122 @@ FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) { } bool WinEHPrepare::runOnFunction(Function &Fn) { - // No need to prepare outlined handlers. - if (Fn.hasFnAttribute("wineh-parent")) - return false; - - SmallVector<LandingPadInst *, 4> LPads; - SmallVector<ResumeInst *, 4> Resumes; - for (BasicBlock &BB : Fn) { - if (auto *LP = BB.getLandingPadInst()) - LPads.push_back(LP); - if (auto *Resume = dyn_cast<ResumeInst>(BB.getTerminator())) - Resumes.push_back(Resume); - } - - // No need to prepare functions that lack landing pads. - if (LPads.empty()) + if (!Fn.hasPersonalityFn()) return false; // Classify the personality to see what kind of preparation we need. Personality = classifyEHPersonality(Fn.getPersonalityFn()); - // Do nothing if this is not an MSVC personality. - if (!isMSVCEHPersonality(Personality)) + // Do nothing if this is not a funclet-based personality. + if (!isFuncletEHPersonality(Personality)) return false; - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - - // If there were any landing pads, prepareExceptionHandlers will make changes. 
- prepareExceptionHandlers(Fn, LPads); - return true; + return prepareExplicitEH(Fn); } bool WinEHPrepare::doFinalization(Module &M) { return false; } -void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); -} - -static bool isSelectorDispatch(BasicBlock *BB, BasicBlock *&CatchHandler, - Constant *&Selector, BasicBlock *&NextBB); - -// Finds blocks reachable from the starting set Worklist. Does not follow unwind -// edges or blocks listed in StopPoints. -static void findReachableBlocks(SmallPtrSetImpl<BasicBlock *> &ReachableBBs, - SetVector<BasicBlock *> &Worklist, - const SetVector<BasicBlock *> *StopPoints) { - while (!Worklist.empty()) { - BasicBlock *BB = Worklist.pop_back_val(); - - // Don't cross blocks that we should stop at. - if (StopPoints && StopPoints->count(BB)) - continue; - - if (!ReachableBBs.insert(BB).second) - continue; // Already visited. - - // Don't follow unwind edges of invokes. - if (auto *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Worklist.insert(II->getNormalDest()); - continue; - } - - // Otherwise, follow all successors. - Worklist.insert(succ_begin(BB), succ_end(BB)); - } -} - -// Attempt to find an instruction where a block can be split before -// a call to llvm.eh.begincatch and its operands. If the block -// begins with the begincatch call or one of its adjacent operands -// the block will not be split. -static Instruction *findBeginCatchSplitPoint(BasicBlock *BB, - IntrinsicInst *II) { - // If the begincatch call is already the first instruction in the block, - // don't split. - Instruction *FirstNonPHI = BB->getFirstNonPHI(); - if (II == FirstNonPHI) - return nullptr; - - // If either operand is in the same basic block as the instruction and - // isn't used by another instruction before the begincatch call, include it - // in the split block. - auto *Op0 = dyn_cast<Instruction>(II->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(II->getOperand(1)); - - Instruction *I = II->getPrevNode(); - Instruction *LastI = II; - - while (I == Op0 || I == Op1) { - // If the block begins with one of the operands and there are no other - // instructions between the operand and the begincatch call, don't split. - if (I == FirstNonPHI) - return nullptr; - - LastI = I; - I = I->getPrevNode(); - } - - // If there is at least one instruction in the block before the begincatch - // call and its operands, split the block at either the begincatch or - // its operand. - return LastI; -} +void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {} -/// Find all points where exceptional control rejoins normal control flow via -/// llvm.eh.endcatch. Add them to the normal bb reachability worklist. -void WinEHPrepare::findCXXEHReturnPoints( - Function &F, SetVector<BasicBlock *> &EHReturnBlocks) { - for (auto BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (Instruction &I : *BB) { - if (match(&I, m_Intrinsic<Intrinsic::eh_begincatch>())) { - Instruction *SplitPt = - findBeginCatchSplitPoint(BB, cast<IntrinsicInst>(&I)); - if (SplitPt) { - // Split the block before the llvm.eh.begincatch call to allow - // cleanup and catch code to be distinguished later. - // Do not update BBI because we still need to process the - // portion of the block that we are splitting off. 
- SplitBlock(BB, SplitPt, DT); - break; - } - } - if (match(&I, m_Intrinsic<Intrinsic::eh_endcatch>())) { - // Split the block after the call to llvm.eh.endcatch if there is - // anything other than an unconditional branch, or if the successor - // starts with a phi. - auto *Br = dyn_cast<BranchInst>(I.getNextNode()); - if (!Br || !Br->isUnconditional() || - isa<PHINode>(Br->getSuccessor(0)->begin())) { - DEBUG(dbgs() << "splitting block " << BB->getName() - << " with llvm.eh.endcatch\n"); - BBI = SplitBlock(BB, I.getNextNode(), DT); - } - // The next BB is normal control flow. - EHReturnBlocks.insert(BB->getTerminator()->getSuccessor(0)); - break; - } - } - } -} - -static bool isCatchAllLandingPad(const BasicBlock *BB) { - const LandingPadInst *LP = BB->getLandingPadInst(); - if (!LP) - return false; - unsigned N = LP->getNumClauses(); - return (N > 0 && LP->isCatch(N - 1) && - isa<ConstantPointerNull>(LP->getClause(N - 1))); +static int addUnwindMapEntry(WinEHFuncInfo &FuncInfo, int ToState, + const BasicBlock *BB) { + CxxUnwindMapEntry UME; + UME.ToState = ToState; + UME.Cleanup = BB; + FuncInfo.CxxUnwindMap.push_back(UME); + return FuncInfo.getLastStateNumber(); } -/// Find all points where exceptional control rejoins normal control flow via -/// selector dispatch. -void WinEHPrepare::findSEHEHReturnPoints( - Function &F, SetVector<BasicBlock *> &EHReturnBlocks) { - for (auto BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - // If the landingpad is a catch-all, treat the whole lpad as if it is - // reachable from normal control flow. - // FIXME: This is imprecise. We need a better way of identifying where a - // catch-all starts and cleanups stop. As far as LLVM is concerned, there - // is no difference. - if (isCatchAllLandingPad(BB)) { - EHReturnBlocks.insert(BB); - continue; - } - - BasicBlock *CatchHandler; - BasicBlock *NextBB; - Constant *Selector; - if (isSelectorDispatch(BB, CatchHandler, Selector, NextBB)) { - // Split the edge if there are multiple predecessors. This creates a place - // where we can insert EH recovery code.
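Each CxxUnwindMapEntry produced by the new addUnwindMapEntry records only a ToState parent, so the states form a forest: conceptually, unwinding from a state runs that state's cleanup and then moves to its parent until -1 (the caller) is reached. A toy model of that walk (illustrative only; the real runtime stops at the catching state rather than always reaching -1):

    #include <cstdio>
    #include <vector>

    // ToState[s] is the parent of state s; -1 means "unwound out to the caller".
    static void unwindFrom(const std::vector<int> &ToState, int S) {
      for (; S != -1; S = ToState[S])
        std::printf("run actions for state %d\n", S);
    }

    int main() {
      // 0 = outer try body, 1 = a cleanup nested in it, 2 = inner try body.
      std::vector<int> ToState = {-1, 0, 1};
      unwindFrom(ToState, 2); // visits 2, then 1, then 0
    }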
- if (!CatchHandler->getSinglePredecessor()) { - DEBUG(dbgs() << "splitting EH return edge from " << BB->getName() - << " to " << CatchHandler->getName() << '\n'); - BBI = CatchHandler = SplitCriticalEdge( - BB, std::find(succ_begin(BB), succ_end(BB), CatchHandler)); - } - EHReturnBlocks.insert(CatchHandler); - } +static void addTryBlockMapEntry(WinEHFuncInfo &FuncInfo, int TryLow, + int TryHigh, int CatchHigh, + ArrayRef<const CatchPadInst *> Handlers) { + WinEHTryBlockMapEntry TBME; + TBME.TryLow = TryLow; + TBME.TryHigh = TryHigh; + TBME.CatchHigh = CatchHigh; + assert(TBME.TryLow <= TBME.TryHigh); + for (const CatchPadInst *CPI : Handlers) { + WinEHHandlerType HT; + Constant *TypeInfo = cast<Constant>(CPI->getArgOperand(0)); + if (TypeInfo->isNullValue()) + HT.TypeDescriptor = nullptr; + else + HT.TypeDescriptor = cast<GlobalVariable>(TypeInfo->stripPointerCasts()); + HT.Adjectives = cast<ConstantInt>(CPI->getArgOperand(1))->getZExtValue(); + HT.Handler = CPI->getParent(); + if (auto *AI = + dyn_cast<AllocaInst>(CPI->getArgOperand(2)->stripPointerCasts())) + HT.CatchObj.Alloca = AI; + else + HT.CatchObj.Alloca = nullptr; + TBME.HandlerArray.push_back(HT); } + FuncInfo.TryBlockMap.push_back(TBME); } -void WinEHPrepare::identifyEHBlocks(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads) { - DEBUG(dbgs() << "Demoting values live across exception handlers in function " - << F.getName() << '\n'); - - // Build a set of all non-exceptional blocks and exceptional blocks. - // - Non-exceptional blocks are blocks reachable from the entry block while - // not following invoke unwind edges. - // - Exceptional blocks are blocks reachable from landingpads. Analysis does - // not follow llvm.eh.endcatch blocks, which mark a transition from - // exceptional to normal control. - - if (Personality == EHPersonality::MSVC_CXX) - findCXXEHReturnPoints(F, EHReturnBlocks); - else - findSEHEHReturnPoints(F, EHReturnBlocks); - - DEBUG({ - dbgs() << "identified the following blocks as EH return points:\n"; - for (BasicBlock *BB : EHReturnBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - -// Join points should not have phis at this point, unless they are a -// landingpad, in which case we will demote their phis later. -#ifndef NDEBUG - for (BasicBlock *BB : EHReturnBlocks) - assert((BB->isLandingPad() || !isa<PHINode>(BB->begin())) && - "non-lpad EH return block has phi"); -#endif - - // Normal blocks are the blocks reachable from the entry block and all EH - // return points. - SetVector<BasicBlock *> Worklist; - Worklist = EHReturnBlocks; - Worklist.insert(&F.getEntryBlock()); - findReachableBlocks(NormalBlocks, Worklist, nullptr); - DEBUG({ - dbgs() << "marked the following blocks as normal:\n"; - for (BasicBlock *BB : NormalBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - - // Exceptional blocks are the blocks reachable from landingpads that don't - // cross EH return points. 
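addTryBlockMapEntry above captures the other half of the C++ scheme: a try is described by the state interval [TryLow, TryHigh] for its body, and dispatch asks which try intervals contain the faulting state. A toy lookup over such entries (a simplified model, not the MSVC runtime):

    #include <cstdio>
    #include <vector>

    struct TryBlock { int TryLow, TryHigh; const char *CatchName; };

    // Entries are pushed innermost-first (inner tries receive lower state
    // numbers), so the first interval containing S is the innermost try.
    static const char *findCatch(const std::vector<TryBlock> &Map, int S) {
      for (const TryBlock &TB : Map)
        if (TB.TryLow <= S && S <= TB.TryHigh)
          return TB.CatchName;
      return nullptr;
    }

    int main() {
      std::vector<TryBlock> Map = {{1, 1, "inner catch"}, {0, 2, "outer catch"}};
      std::printf("state 1 -> %s\n", findCatch(Map, 1)); // inner catch
      std::printf("state 0 -> %s\n", findCatch(Map, 0)); // outer catch
    }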
- Worklist.clear(); - for (auto *LPI : LPads) - Worklist.insert(LPI->getParent()); - findReachableBlocks(EHBlocks, Worklist, &EHReturnBlocks); - DEBUG({ - dbgs() << "marked the following blocks as exceptional:\n"; - for (BasicBlock *BB : EHBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - +static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CleanupPad) { + for (const User *U : CleanupPad->users()) + if (const auto *CRI = dyn_cast<CleanupReturnInst>(U)) + return CRI->getUnwindDest(); + return nullptr; } -/// Ensure that all values live into and out of exception handlers are stored -/// in memory. -/// FIXME: This falls down when values are defined in one handler and live into -/// another handler. For example, a cleanup defines a value used only by a -/// catch handler. -void WinEHPrepare::demoteValuesLiveAcrossHandlers( - Function &F, SmallVectorImpl<LandingPadInst *> &LPads) { - DEBUG(dbgs() << "Demoting values live across exception handlers in function " - << F.getName() << '\n'); - - // identifyEHBlocks() should have been called before this function. - assert(!NormalBlocks.empty()); - - // Try to avoid demoting EH pointer and selector values. They get in the way - // of our pattern matching. - SmallPtrSet<Instruction *, 10> EHVals; - for (BasicBlock &BB : F) { - LandingPadInst *LP = BB.getLandingPadInst(); - if (!LP) +static void calculateStateNumbersForInvokes(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + auto *F = const_cast<Function *>(Fn); + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(*F); + for (BasicBlock &BB : *F) { + auto *II = dyn_cast<InvokeInst>(BB.getTerminator()); + if (!II) continue; - EHVals.insert(LP); - for (User *U : LP->users()) { - auto *EI = dyn_cast<ExtractValueInst>(U); - if (!EI) - continue; - EHVals.insert(EI); - for (User *U2 : EI->users()) { - if (auto *PN = dyn_cast<PHINode>(U2)) - EHVals.insert(PN); - } - } - } - SetVector<Argument *> ArgsToDemote; - SetVector<Instruction *> InstrsToDemote; - for (BasicBlock &BB : F) { - bool IsNormalBB = NormalBlocks.count(&BB); - bool IsEHBB = EHBlocks.count(&BB); - if (!IsNormalBB && !IsEHBB) - continue; // Blocks that are neither normal nor EH are unreachable. - for (Instruction &I : BB) { - for (Value *Op : I.operands()) { - // Don't demote static allocas, constants, and labels. - if (isa<Constant>(Op) || isa<BasicBlock>(Op) || isa<InlineAsm>(Op)) - continue; - auto *AI = dyn_cast<AllocaInst>(Op); - if (AI && AI->isStaticAlloca()) - continue; - - if (auto *Arg = dyn_cast<Argument>(Op)) { - if (IsEHBB) { - DEBUG(dbgs() << "Demoting argument " << *Arg - << " used by EH instr: " << I << "\n"); - ArgsToDemote.insert(Arg); - } - continue; - } - - // Don't demote EH values. - auto *OpI = cast<Instruction>(Op); - if (EHVals.count(OpI)) - continue; - - BasicBlock *OpBB = OpI->getParent(); - // If a value is produced and consumed in the same BB, we don't need to - // demote it. 
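Demotion itself is the classic reg-to-stack transform the removed code performed with DemoteRegToStack: the SSA value is stored to a dedicated alloca at its definition and reloaded at each use, so no value has to stay live in a register across a handler boundary. A sketch of applying it wholesale, using the same utility the code above calls against the LLVM API of this era (the helper name and the "demote everything cross-block" policy are ours, for illustration):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    #include <vector>

    using namespace llvm;

    // Hypothetical helper: force every value with a use in another basic
    // block through a stack slot. Collect first, then mutate, since
    // DemoteRegToStack rewrites the use lists we would be iterating.
    static void demoteCrossBlockValues(Function &F) {
      Instruction *AllocaInsertPt = &*F.getEntryBlock().getFirstInsertionPt();
      std::vector<Instruction *> Worklist;
      for (BasicBlock &BB : F)
        for (Instruction &I : BB) {
          if (isa<PHINode>(&I))
            continue; // PHIs would go through DemotePHIToStack instead
          for (User *U : I.users())
            if (auto *UI = dyn_cast<Instruction>(U))
              if (UI->getParent() != &BB) {
                Worklist.push_back(&I);
                break;
              }
        }
      for (Instruction *I : Worklist)
        DemoteRegToStack(*I, /*VolatileLoads=*/false, AllocaInsertPt);
    }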
- if (OpBB == &BB) - continue; - bool IsOpNormalBB = NormalBlocks.count(OpBB); - bool IsOpEHBB = EHBlocks.count(OpBB); - if (IsNormalBB != IsOpNormalBB || IsEHBB != IsOpEHBB) { - DEBUG({ - dbgs() << "Demoting instruction live in-out from EH:\n"; - dbgs() << "Instr: " << *OpI << '\n'; - dbgs() << "User: " << I << '\n'; - }); - InstrsToDemote.insert(OpI); - } - } + auto &BBColors = BlockColors[&BB]; + assert(BBColors.size() == 1 && "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + + BasicBlock *FuncletUnwindDest; + auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI()); + assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock()); + if (!FuncletPad) + FuncletUnwindDest = nullptr; + else if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad)) + FuncletUnwindDest = CatchPad->getCatchSwitch()->getUnwindDest(); + else if (auto *CleanupPad = dyn_cast<CleanupPadInst>(FuncletPad)) + FuncletUnwindDest = getCleanupRetUnwindDest(CleanupPad); + else + llvm_unreachable("unexpected funclet pad!"); + + BasicBlock *InvokeUnwindDest = II->getUnwindDest(); + int BaseState = -1; + if (FuncletUnwindDest == InvokeUnwindDest) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; } - } - // Demote values live into and out of handlers. - // FIXME: This demotion is inefficient. We should insert spills at the point - // of definition, insert one reload in each handler that uses the value, and - // insert reloads in the BB used to rejoin normal control flow. - Instruction *AllocaInsertPt = F.getEntryBlock().getFirstInsertionPt(); - for (Instruction *I : InstrsToDemote) - DemoteRegToStack(*I, false, AllocaInsertPt); - - // Demote arguments separately, and only for uses in EH blocks. - for (Argument *Arg : ArgsToDemote) { - auto *Slot = new AllocaInst(Arg->getType(), nullptr, - Arg->getName() + ".reg2mem", AllocaInsertPt); - SmallVector<User *, 4> Users(Arg->user_begin(), Arg->user_end()); - for (User *U : Users) { - auto *I = dyn_cast<Instruction>(U); - if (I && EHBlocks.count(I->getParent())) { - auto *Reload = new LoadInst(Slot, Arg->getName() + ".reload", false, I); - U->replaceUsesOfWith(Arg, Reload); - } + if (BaseState != -1) { + FuncInfo.InvokeStateMap[II] = BaseState; + } else { + Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI(); + assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!"); + FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst]; } - new StoreInst(Arg, Slot, AllocaInsertPt); - } - - // Demote landingpad phis, as the landingpad will be removed from the machine - // CFG. - for (LandingPadInst *LPI : LPads) { - BasicBlock *BB = LPI->getParent(); - while (auto *Phi = dyn_cast<PHINode>(BB->begin())) - DemotePHIToStack(Phi, AllocaInsertPt); } - - DEBUG(dbgs() << "Demoted " << InstrsToDemote.size() << " instructions and " - << ArgsToDemote.size() << " arguments for WinEHPrepare\n\n"); } -bool WinEHPrepare::prepareExceptionHandlers( - Function &F, SmallVectorImpl<LandingPadInst *> &LPads) { - // Don't run on functions that are already prepared. 
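The invoke-state logic above leans on colorEHFunclets: every block is tagged with the funclet entries that can reach it without crossing into another funclet, and preparation must first clone any block that ends up with more than one color (hence the single-color assert). A standalone toy of the coloring idea on a string-keyed CFG (purely illustrative):

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // Toy CFG: "shared" is reachable from the main entry and from a pad.
      std::map<std::string, std::vector<std::string>> Succs = {
          {"entry", {"shared"}}, {"pad", {"shared"}}, {"shared", {}}};
      std::set<std::string> FuncletEntries = {"entry", "pad"};

      // Propagate each funclet entry's color along edges that do not enter
      // another funclet; a block with 2+ colors must be cloned per funclet.
      std::map<std::string, std::set<std::string>> Colors;
      std::vector<std::pair<std::string, std::string>> WL;
      for (const std::string &E : FuncletEntries)
        WL.push_back(std::make_pair(E, E));
      while (!WL.empty()) {
        std::pair<std::string, std::string> Item = WL.back();
        WL.pop_back();
        if (!Colors[Item.first].insert(Item.second).second)
          continue; // this block already had this color
        for (const std::string &S : Succs[Item.first])
          if (!FuncletEntries.count(S))
            WL.push_back(std::make_pair(S, Item.second));
      }
      for (const auto &BC : Colors)
        std::printf("%s: %zu color(s)%s\n", BC.first.c_str(), BC.second.size(),
                    BC.second.size() > 1 ? " -> clone" : "");
    }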
- for (LandingPadInst *LPad : LPads) { - BasicBlock *LPadBB = LPad->getParent(); - for (Instruction &Inst : *LPadBB) - if (match(&Inst, m_Intrinsic<Intrinsic::eh_actions>())) - return false; - } - - identifyEHBlocks(F, LPads); - demoteValuesLiveAcrossHandlers(F, LPads); - - // These containers are used to re-map frame variables that are used in - // outlined catch and cleanup handlers. They will be populated as the - // handlers are outlined. - FrameVarInfoMap FrameVarInfo; - - bool HandlersOutlined = false; - - Module *M = F.getParent(); - LLVMContext &Context = M->getContext(); - - // Create a new function to receive the handler contents. - PointerType *Int8PtrType = Type::getInt8PtrTy(Context); - Type *Int32Type = Type::getInt32Ty(Context); - Function *ActionIntrin = Intrinsic::getDeclaration(M, Intrinsic::eh_actions); - - if (isAsynchronousEHPersonality(Personality)) { - // FIXME: Switch the ehptr type to i32 and then switch this. - SEHExceptionCodeSlot = - new AllocaInst(Int8PtrType, nullptr, "seh_exception_code", - F.getEntryBlock().getFirstInsertionPt()); +// Given BB which ends in an unwind edge, return the EHPad that this BB belongs +// to. If the unwind edge came from an invoke, return null. +static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB, + Value *ParentPad) { + const TerminatorInst *TI = BB->getTerminator(); + if (isa<InvokeInst>(TI)) + return nullptr; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { + if (CatchSwitch->getParentPad() != ParentPad) + return nullptr; + return BB; } + assert(!TI->isEHPad() && "unexpected EHPad!"); + auto *CleanupPad = cast<CleanupReturnInst>(TI)->getCleanupPad(); + if (CleanupPad->getParentPad() != ParentPad) + return nullptr; + return CleanupPad->getParent(); +} - // In order to handle the case where one outlined catch handler returns - // to a block within another outlined catch handler that would otherwise - // be unreachable, we need to outline the nested landing pad before we - // outline the landing pad which encloses it. - if (!isAsynchronousEHPersonality(Personality)) - std::sort(LPads.begin(), LPads.end(), - [this](LandingPadInst *const &L, LandingPadInst *const &R) { - return DT->properlyDominates(R->getParent(), L->getParent()); - }); - - // This container stores the llvm.eh.recover and IndirectBr instructions - // that make up the body of each landing pad after it has been outlined. - // We need to defer the population of the target list for the indirectbr - // until all landing pads have been outlined so that we can handle the - // case of blocks in the target that are reached only from nested - // landing pads. - SmallVector<std::pair<CallInst*, IndirectBrInst *>, 4> LPadImpls; - - for (LandingPadInst *LPad : LPads) { - // Look for evidence that this landingpad has already been processed. - bool LPadHasActionList = false; - BasicBlock *LPadBB = LPad->getParent(); - for (Instruction &Inst : *LPadBB) { - if (match(&Inst, m_Intrinsic<Intrinsic::eh_actions>())) { - LPadHasActionList = true; - break; - } - } - - // If we've already outlined the handlers for this landingpad, - // there's nothing more to do here. - if (LPadHasActionList) - continue; - - // If either of the values in the aggregate returned by the landing pad is - // extracted and stored to memory, promote the stored value to a register. 
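Promotion back out of memory is the mirror image, and is what the removed promoteLandingPadValues did with PromoteMemToReg: pick the allocas whose values should live in registers again and let mem2reg rebuild SSA form. A sketch with the same utilities (the helper and its "everything in the entry block" policy are illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/PromoteMemToReg.h"

    using namespace llvm;

    // Hypothetical helper: promote every promotable static alloca in the
    // entry block back into SSA values.
    static void promoteEntryAllocas(Function &F, DominatorTree &DT) {
      SmallVector<AllocaInst *, 8> Allocas;
      for (Instruction &I : F.getEntryBlock())
        if (auto *AI = dyn_cast<AllocaInst>(&I))
          if (isAllocaPromotable(AI))
            Allocas.push_back(AI);
      if (!Allocas.empty())
        PromoteMemToReg(Allocas, DT); // mem2reg: loads/stores become SSA defs/uses
    }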
- promoteLandingPadValues(LPad); - - LandingPadActions Actions; - mapLandingPadBlocks(LPad, Actions); - - HandlersOutlined |= !Actions.actions().empty(); - for (ActionHandler *Action : Actions) { - if (Action->hasBeenProcessed()) - continue; - BasicBlock *StartBB = Action->getStartBlock(); - - // SEH doesn't do any outlining for catches. Instead, pass the handler - // basic block addr to llvm.eh.actions and list the block as a return - // target. - if (isAsynchronousEHPersonality(Personality)) { - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - processSEHCatchHandler(CatchAction, StartBB); - continue; - } - } - - outlineHandler(Action, &F, LPad, StartBB, FrameVarInfo); - } - - // Split the block after the landingpad instruction so that it is just a - // call to llvm.eh.actions followed by indirectbr. - assert(!isa<PHINode>(LPadBB->begin()) && "lpad phi not removed"); - SplitBlock(LPadBB, LPad->getNextNode(), DT); - // Erase the branch inserted by the split so we can insert indirectbr. - LPadBB->getTerminator()->eraseFromParent(); - - // Replace all extracted values with undef and ultimately replace the - // landingpad with undef. - SmallVector<Instruction *, 4> SEHCodeUses; - SmallVector<Instruction *, 4> EHUndefs; - for (User *U : LPad->users()) { - auto *E = dyn_cast<ExtractValueInst>(U); - if (!E) - continue; - assert(E->getNumIndices() == 1 && - "Unexpected operation: extracting both landing pad values"); - unsigned Idx = *E->idx_begin(); - assert((Idx == 0 || Idx == 1) && "unexpected index"); - if (Idx == 0 && isAsynchronousEHPersonality(Personality)) - SEHCodeUses.push_back(E); - else - EHUndefs.push_back(E); - } - for (Instruction *E : EHUndefs) { - E->replaceAllUsesWith(UndefValue::get(E->getType())); - E->eraseFromParent(); - } - LPad->replaceAllUsesWith(UndefValue::get(LPad->getType())); - - // Rewrite uses of the exception pointer to loads of an alloca. - while (!SEHCodeUses.empty()) { - Instruction *E = SEHCodeUses.pop_back_val(); - SmallVector<Use *, 4> Uses; - for (Use &U : E->uses()) - Uses.push_back(&U); - for (Use *U : Uses) { - auto *I = cast<Instruction>(U->getUser()); - if (isa<ResumeInst>(I)) - continue; - if (auto *Phi = dyn_cast<PHINode>(I)) - SEHCodeUses.push_back(Phi); - else - U->set(new LoadInst(SEHExceptionCodeSlot, "sehcode", false, I)); - } - E->replaceAllUsesWith(UndefValue::get(E->getType())); - E->eraseFromParent(); - } - - // Add a call to describe the actions for this landing pad. - std::vector<Value *> ActionArgs; - for (ActionHandler *Action : Actions) { - // Action codes from docs are: 0 cleanup, 1 catch. - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - ActionArgs.push_back(ConstantInt::get(Int32Type, 1)); - ActionArgs.push_back(CatchAction->getSelector()); - // Find the frame escape index of the exception object alloca in the - // parent. 
- int FrameEscapeIdx = -1; - Value *EHObj = const_cast<Value *>(CatchAction->getExceptionVar()); - if (EHObj && !isa<ConstantPointerNull>(EHObj)) { - auto I = FrameVarInfo.find(EHObj); - assert(I != FrameVarInfo.end() && - "failed to map llvm.eh.begincatch var"); - FrameEscapeIdx = std::distance(FrameVarInfo.begin(), I); - } - ActionArgs.push_back(ConstantInt::get(Int32Type, FrameEscapeIdx)); - } else { - ActionArgs.push_back(ConstantInt::get(Int32Type, 0)); - } - ActionArgs.push_back(Action->getHandlerBlockOrFunc()); - } - CallInst *Recover = - CallInst::Create(ActionIntrin, ActionArgs, "recover", LPadBB); - - SetVector<BasicBlock *> ReturnTargets; - for (ActionHandler *Action : Actions) { - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - const auto &CatchTargets = CatchAction->getReturnTargets(); - ReturnTargets.insert(CatchTargets.begin(), CatchTargets.end()); - } - } - IndirectBrInst *Branch = - IndirectBrInst::Create(Recover, ReturnTargets.size(), LPadBB); - for (BasicBlock *Target : ReturnTargets) - Branch->addDestination(Target); - - if (!isAsynchronousEHPersonality(Personality)) { - // C++ EH must repopulate the targets later to handle the case of - // targets that are reached indirectly through nested landing pads. - LPadImpls.push_back(std::make_pair(Recover, Branch)); - } - - } // End for each landingpad +static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, + const Instruction *FirstNonPHI, + int ParentState) { + const BasicBlock *BB = FirstNonPHI->getParent(); + assert(BB->isEHPad() && "not a funclet!"); - // If nothing got outlined, there is no more processing to be done. - if (!HandlersOutlined) - return false; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) { + assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 && + "shouldn't revist catch funclets!"); - // Replace any nested landing pad stubs with the correct action handler. - // This must be done before we remove unreachable blocks because it - // cleans up references to outlined blocks that will be deleted. - for (auto &LPadPair : NestedLPtoOriginalLP) - completeNestedLandingPad(&F, LPadPair.first, LPadPair.second, FrameVarInfo); - NestedLPtoOriginalLP.clear(); - - // Update the indirectbr instructions' target lists if necessary. - SetVector<BasicBlock*> CheckedTargets; - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - for (auto &LPadImplPair : LPadImpls) { - IntrinsicInst *Recover = cast<IntrinsicInst>(LPadImplPair.first); - IndirectBrInst *Branch = LPadImplPair.second; - - // Get a list of handlers called by - parseEHActions(Recover, ActionList); - - // Add an indirect branch listing possible successors of the catch handlers. - SetVector<BasicBlock *> ReturnTargets; - for (const auto &Action : ActionList) { - if (auto *CA = dyn_cast<CatchHandler>(Action.get())) { - Function *Handler = cast<Function>(CA->getHandlerBlockOrFunc()); - getPossibleReturnTargets(&F, Handler, ReturnTargets); - } + SmallVector<const CatchPadInst *, 2> Handlers; + for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { + auto *CatchPad = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI()); + Handlers.push_back(CatchPad); } - ActionList.clear(); - // Clear any targets we already knew about. 
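For reference, the operand list assembled into llvm.eh.actions by the removed code is a flat encoding: a catch contributes (1, selector, catch-object frame-escape index, handler), with -1 when there is no catch object, and a cleanup contributes (0, handler). A toy decoder over that layout, with ints standing in for the selector and handler values:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Decode the flattened action list: 1 = catch (3 payload slots),
    // 0 = cleanup (1 payload slot).
    static void parseActions(const std::vector<int> &A) {
      for (std::size_t I = 0; I < A.size();) {
        if (A[I] == 1) {
          std::printf("catch: selector=%d frameIdx=%d handler=%d\n",
                      A[I + 1], A[I + 2], A[I + 3]);
          I += 4;
        } else {
          std::printf("cleanup: handler=%d\n", A[I + 1]);
          I += 2;
        }
      }
    }

    int main() {
      // One catch with no catch object (frameIdx -1), then one cleanup.
      parseActions({1, 100, -1, 7, 0, 8});
    }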
- for (unsigned int I = 0, E = Branch->getNumDestinations(); I < E; ++I) { - BasicBlock *KnownTarget = Branch->getDestination(I); - if (ReturnTargets.count(KnownTarget)) - ReturnTargets.remove(KnownTarget); - } - for (BasicBlock *Target : ReturnTargets) { - Branch->addDestination(Target); - // The target may be a block that we excepted to get pruned. - // If it is, it may contain a call to llvm.eh.endcatch. - if (CheckedTargets.insert(Target)) { - // Earlier preparations guarantee that all calls to llvm.eh.endcatch - // will be followed by an unconditional branch. - auto *Br = dyn_cast<BranchInst>(Target->getTerminator()); - if (Br && Br->isUnconditional() && - Br != Target->getFirstNonPHIOrDbgOrLifetime()) { - Instruction *Prev = Br->getPrevNode(); - if (match(cast<Value>(Prev), m_Intrinsic<Intrinsic::eh_endcatch>())) - Prev->eraseFromParent(); - } + int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); + FuncInfo.EHPadStateMap[CatchSwitch] = TryLow; + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CatchSwitch->getParentPad()))) + calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + TryLow); + int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); + + // catchpads are separate funclets in C++ EH due to the way rethrow works. + int TryHigh = CatchLow - 1; + for (const auto *CatchPad : Handlers) { + FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow; + for (const User *U : CatchPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) + if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest()) + calculateCXXStateNumbers(FuncInfo, UserI, CatchLow); + if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) + if (getCleanupRetUnwindDest(InnerCleanupPad) == + CatchSwitch->getUnwindDest()) + calculateCXXStateNumbers(FuncInfo, UserI, CatchLow); } } - } - LPadImpls.clear(); - - F.addFnAttr("wineh-parent", F.getName()); - - // Delete any blocks that were only used by handlers that were outlined above. - removeUnreachableBlocks(F); + int CatchHigh = FuncInfo.getLastStateNumber(); + addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers); + DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n'); + DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh << '\n'); + DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh + << '\n'); + } else { + auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI); - BasicBlock *Entry = &F.getEntryBlock(); - IRBuilder<> Builder(F.getParent()->getContext()); - Builder.SetInsertPoint(Entry->getFirstInsertionPt()); - - Function *FrameEscapeFn = - Intrinsic::getDeclaration(M, Intrinsic::localescape); - Function *RecoverFrameFn = - Intrinsic::getDeclaration(M, Intrinsic::localrecover); - SmallVector<Value *, 8> AllocasToEscape; - - // Scan the entry block for an existing call to llvm.localescape. We need to - // keep escaping those objects. - for (Instruction &I : F.front()) { - auto *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - auto Args = II->arg_operands(); - AllocasToEscape.append(Args.begin(), Args.end()); - II->eraseFromParent(); - break; - } - } + // It's possible for a cleanup to be visited twice: it might have multiple + // cleanupret instructions. 
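The interval bookkeeping in calculateCXXStateNumbers can be replayed on a toy scope tree: a try takes TryLow, then states for pads nested in its body, then CatchLow for the catch funclet (so TryHigh = CatchLow - 1), then states nested under the handlers up to CatchHigh. A small model of that numbering order (ours, for illustration):

    #include <cstdio>
    #include <vector>

    struct Try { std::vector<Try> InBody, InHandler; };

    static int NextState;

    static void number(const Try &T) {
      int TryLow = NextState++;      // state for the try body itself
      for (const Try &Inner : T.InBody)
        number(Inner);               // pads that unwind into this try
      int CatchLow = NextState++;    // state for the catch funclet
      int TryHigh = CatchLow - 1;
      for (const Try &Inner : T.InHandler)
        number(Inner);               // pads nested inside the handler
      int CatchHigh = NextState - 1;
      std::printf("TryLow=%d TryHigh=%d CatchLow=%d CatchHigh=%d\n",
                  TryLow, TryHigh, CatchLow, CatchHigh);
    }

    int main() {
      Try Inner{{}, {}};
      Try Outer{{Inner}, {}};        // one try nested inside the outer body
      NextState = 0;
      number(Outer);                 // reports the inner try first, matching
                                     // the order TryBlockMap entries are pushed
    }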
+ if (FuncInfo.EHPadStateMap.count(CleanupPad)) + return; - // Finally, replace all of the temporary allocas for frame variables used in - // the outlined handlers with calls to llvm.localrecover. - for (auto &VarInfoEntry : FrameVarInfo) { - Value *ParentVal = VarInfoEntry.first; - TinyPtrVector<AllocaInst *> &Allocas = VarInfoEntry.second; - AllocaInst *ParentAlloca = cast<AllocaInst>(ParentVal); - - // FIXME: We should try to sink unescaped allocas from the parent frame into - // the child frame. If the alloca is escaped, we have to use the lifetime - // markers to ensure that the alloca is only live within the child frame. - - // Add this alloca to the list of things to escape. - AllocasToEscape.push_back(ParentAlloca); - - // Next replace all outlined allocas that are mapped to it. - for (AllocaInst *TempAlloca : Allocas) { - if (TempAlloca == getCatchObjectSentinel()) - continue; // Skip catch parameter sentinels. - Function *HandlerFn = TempAlloca->getParent()->getParent(); - llvm::Value *FP = HandlerToParentFP[HandlerFn]; - assert(FP); - - // FIXME: Sink this localrecover into the blocks where it is used. - Builder.SetInsertPoint(TempAlloca); - Builder.SetCurrentDebugLocation(TempAlloca->getDebugLoc()); - Value *RecoverArgs[] = { - Builder.CreateBitCast(&F, Int8PtrType, ""), FP, - llvm::ConstantInt::get(Int32Type, AllocasToEscape.size() - 1)}; - Instruction *RecoveredAlloca = - Builder.CreateCall(RecoverFrameFn, RecoverArgs); - - // Add a pointer bitcast if the alloca wasn't an i8. - if (RecoveredAlloca->getType() != TempAlloca->getType()) { - RecoveredAlloca->setName(Twine(TempAlloca->getName()) + ".i8"); - RecoveredAlloca = cast<Instruction>( - Builder.CreateBitCast(RecoveredAlloca, TempAlloca->getType())); + int CleanupState = addUnwindMapEntry(FuncInfo, ParentState, BB); + FuncInfo.EHPadStateMap[CleanupPad] = CleanupState; + DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB " + << BB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) { + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CleanupPad->getParentPad()))) { + calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + CleanupState); } - TempAlloca->replaceAllUsesWith(RecoveredAlloca); - TempAlloca->removeFromParent(); - RecoveredAlloca->takeName(TempAlloca); - delete TempAlloca; } - } // End for each FrameVarInfo entry. - - // Insert 'call void (...)* @llvm.localescape(...)' at the end of the entry - // block. - Builder.SetInsertPoint(&F.getEntryBlock().back()); - Builder.CreateCall(FrameEscapeFn, AllocasToEscape); - - if (SEHExceptionCodeSlot) { - if (isAllocaPromotable(SEHExceptionCodeSlot)) { - SmallPtrSet<BasicBlock *, 4> UserBlocks; - for (User *U : SEHExceptionCodeSlot->users()) { - if (auto *Inst = dyn_cast<Instruction>(U)) - UserBlocks.insert(Inst->getParent()); - } - PromoteMemToReg(SEHExceptionCodeSlot, *DT); - // After the promotion, kill off dead instructions. 
- for (BasicBlock *BB : UserBlocks) - SimplifyInstructionsInBlock(BB, LibInfo); + for (const User *U : CleanupPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (UserI->isEHPad()) + report_fatal_error("Cleanup funclets for the MSVC++ personality cannot " + "contain exceptional actions"); } } +} - // Clean up the handler action maps we created for this function - DeleteContainerSeconds(CatchHandlerMap); - CatchHandlerMap.clear(); - DeleteContainerSeconds(CleanupHandlerMap); - CleanupHandlerMap.clear(); - HandlerToParentFP.clear(); - DT = nullptr; - LibInfo = nullptr; - SEHExceptionCodeSlot = nullptr; - EHBlocks.clear(); - NormalBlocks.clear(); - EHReturnBlocks.clear(); - - return HandlersOutlined; +static int addSEHExcept(WinEHFuncInfo &FuncInfo, int ParentState, + const Function *Filter, const BasicBlock *Handler) { + SEHUnwindMapEntry Entry; + Entry.ToState = ParentState; + Entry.IsFinally = false; + Entry.Filter = Filter; + Entry.Handler = Handler; + FuncInfo.SEHUnwindMap.push_back(Entry); + return FuncInfo.SEHUnwindMap.size() - 1; } -void WinEHPrepare::promoteLandingPadValues(LandingPadInst *LPad) { - // If the return values of the landing pad instruction are extracted and - // stored to memory, we want to promote the store locations to reg values. - SmallVector<AllocaInst *, 2> EHAllocas; - - // The landingpad instruction returns an aggregate value. Typically, its - // value will be passed to a pair of extract value instructions and the - // results of those extracts are often passed to store instructions. - // In unoptimized code the stored value will often be loaded and then stored - // again. - for (auto *U : LPad->users()) { - ExtractValueInst *Extract = dyn_cast<ExtractValueInst>(U); - if (!Extract) - continue; +static int addSEHFinally(WinEHFuncInfo &FuncInfo, int ParentState, + const BasicBlock *Handler) { + SEHUnwindMapEntry Entry; + Entry.ToState = ParentState; + Entry.IsFinally = true; + Entry.Filter = nullptr; + Entry.Handler = Handler; + FuncInfo.SEHUnwindMap.push_back(Entry); + return FuncInfo.SEHUnwindMap.size() - 1; +} - for (auto *EU : Extract->users()) { - if (auto *Store = dyn_cast<StoreInst>(EU)) { - auto *AV = cast<AllocaInst>(Store->getPointerOperand()); - EHAllocas.push_back(AV); - } +static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, + const Instruction *FirstNonPHI, + int ParentState) { + const BasicBlock *BB = FirstNonPHI->getParent(); + assert(BB->isEHPad() && "no a funclet!"); + + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) { + assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 && + "shouldn't revist catch funclets!"); + + // Extract the filter function and the __except basic block and create a + // state for them. + assert(CatchSwitch->getNumHandlers() == 1 && + "SEH doesn't have multiple handlers per __try"); + const auto *CatchPad = + cast<CatchPadInst>((*CatchSwitch->handler_begin())->getFirstNonPHI()); + const BasicBlock *CatchPadBB = CatchPad->getParent(); + const Constant *FilterOrNull = + cast<Constant>(CatchPad->getArgOperand(0)->stripPointerCasts()); + const Function *Filter = dyn_cast<Function>(FilterOrNull); + assert((Filter || FilterOrNull->isNullValue()) && + "unexpected filter value"); + int TryState = addSEHExcept(FuncInfo, ParentState, Filter, CatchPadBB); + + // Everything in the __try block uses TryState as its parent state. 
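Concretely, the pieces being recorded map onto the source-level construct like this (an MSVC-only example, compiled with cl; the filter expression becomes Entry.Filter and the __except block becomes Entry.Handler):

    #include <windows.h>
    #include <cstdio>

    static int Filter(unsigned Code) {
      // Runs during the search phase; decides whether the handler executes.
      return Code == EXCEPTION_INT_DIVIDE_BY_ZERO ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH;
    }

    int main() {
      int Zero = 0, R = 0;
      __try {
        R = 1 / Zero;          // body executes in the __try's state
      } __except (Filter(GetExceptionCode())) {
        std::puts("handled");  // the __except block: the map entry's Handler
      }
      return R;
    }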
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryState; + DEBUG(dbgs() << "Assigning state #" << TryState << " to BB " + << CatchPadBB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CatchSwitch->getParentPad()))) + calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + TryState); + + // Everything in the __except block unwinds to ParentState, just like code + // outside the __try. + for (const User *U : CatchPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) + if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest()) + calculateSEHStateNumbers(FuncInfo, UserI, ParentState); + if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) + if (getCleanupRetUnwindDest(InnerCleanupPad) == + CatchSwitch->getUnwindDest()) + calculateSEHStateNumbers(FuncInfo, UserI, ParentState); } - } + } else { + auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI); - // We can't do this without a dominator tree. - assert(DT); + // It's possible for a cleanup to be visited twice: it might have multiple + // cleanupret instructions. + if (FuncInfo.EHPadStateMap.count(CleanupPad)) + return; - if (!EHAllocas.empty()) { - PromoteMemToReg(EHAllocas, *DT); - EHAllocas.clear(); + int CleanupState = addSEHFinally(FuncInfo, ParentState, BB); + FuncInfo.EHPadStateMap[CleanupPad] = CleanupState; + DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB " + << BB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = + getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) + calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + CleanupState); + for (const User *U : CleanupPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (UserI->isEHPad()) + report_fatal_error("Cleanup funclets for the SEH personality cannot " + "contain exceptional actions"); + } } +} - // After promotion, some extracts may be trivially dead. Remove them. - SmallVector<Value *, 4> Users(LPad->user_begin(), LPad->user_end()); - for (auto *U : Users) - RecursivelyDeleteTriviallyDeadInstructions(U); +static bool isTopLevelPadForMSVC(const Instruction *EHPad) { + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(EHPad)) + return isa<ConstantTokenNone>(CatchSwitch->getParentPad()) && + CatchSwitch->unwindsToCaller(); + if (auto *CleanupPad = dyn_cast<CleanupPadInst>(EHPad)) + return isa<ConstantTokenNone>(CleanupPad->getParentPad()) && + getCleanupRetUnwindDest(CleanupPad) == nullptr; + if (isa<CatchPadInst>(EHPad)) + return false; + llvm_unreachable("unexpected EHPad!"); } -void WinEHPrepare::getPossibleReturnTargets(Function *ParentF, - Function *HandlerF, - SetVector<BasicBlock*> &Targets) { - for (BasicBlock &BB : *HandlerF) { - // If the handler contains landing pads, check for any - // handlers that may return directly to a block in the - // parent function. 
- if (auto *LPI = BB.getLandingPadInst()) { - IntrinsicInst *Recover = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(Recover, ActionList); - for (const auto &Action : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Action.get())) { - Function *NestedF = cast<Function>(CH->getHandlerBlockOrFunc()); - getPossibleReturnTargets(ParentF, NestedF, Targets); - } - } - } +void llvm::calculateSEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Don't compute state numbers twice. + if (!FuncInfo.SEHUnwindMap.empty()) + return; - auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); - if (!Ret) + for (const BasicBlock &BB : *Fn) { + if (!BB.isEHPad()) continue; - - // Handler functions must always return a block address. - BlockAddress *BA = cast<BlockAddress>(Ret->getReturnValue()); - - // If this is the handler for a nested landing pad, the - // return address may have been remapped to a block in the - // parent handler. We're not interested in those. - if (BA->getFunction() != ParentF) + const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; - - Targets.insert(BA->getBasicBlock()); + ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1); } + + calculateStateNumbersForInvokes(Fn, FuncInfo); } -void WinEHPrepare::completeNestedLandingPad(Function *ParentFn, - LandingPadInst *OutlinedLPad, - const LandingPadInst *OriginalLPad, - FrameVarInfoMap &FrameVarInfo) { - // Get the nested block and erase the unreachable instruction that was - // temporarily inserted as its terminator. - LLVMContext &Context = ParentFn->getContext(); - BasicBlock *OutlinedBB = OutlinedLPad->getParent(); - // If the nested landing pad was outlined before the landing pad that enclosed - // it, it will already be in outlined form. In that case, we just need to see - // if the returns and the enclosing branch instruction need to be updated. - IndirectBrInst *Branch = - dyn_cast<IndirectBrInst>(OutlinedBB->getTerminator()); - if (!Branch) { - // If the landing pad wasn't in outlined form, it should be a stub with - // an unreachable terminator. - assert(isa<UnreachableInst>(OutlinedBB->getTerminator())); - OutlinedBB->getTerminator()->eraseFromParent(); - // That should leave OutlinedLPad as the last instruction in its block. - assert(&OutlinedBB->back() == OutlinedLPad); - } +void llvm::calculateWinCXXEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Return if it's already been done. + if (!FuncInfo.EHPadStateMap.empty()) + return; - // The original landing pad will have already had its action intrinsic - // built by the outlining loop. We need to clone that into the outlined - // location. It may also be necessary to add references to the exception - // variables to the outlined handler in which this landing pad is nested - // and remap return instructions in the nested handlers that should return - // to an address in the outlined handler. - Function *OutlinedHandlerFn = OutlinedBB->getParent(); - BasicBlock::const_iterator II = OriginalLPad; - ++II; - // The instruction after the landing pad should now be a call to eh.actions. - const Instruction *Recover = II; - const IntrinsicInst *EHActions = cast<IntrinsicInst>(Recover); - - // Remap the return target in the nested handler. 
- SmallVector<BlockAddress *, 4> ActionTargets; - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - for (const auto &Action : ActionList) { - auto *Catch = dyn_cast<CatchHandler>(Action.get()); - if (!Catch) + for (const BasicBlock &BB : *Fn) { + if (!BB.isEHPad()) continue; - // The dyn_cast to function here selects C++ catch handlers and skips - // SEH catch handlers. - auto *Handler = dyn_cast<Function>(Catch->getHandlerBlockOrFunc()); - if (!Handler) + const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; - // Visit all the return instructions, looking for places that return - // to a location within OutlinedHandlerFn. - for (BasicBlock &NestedHandlerBB : *Handler) { - auto *Ret = dyn_cast<ReturnInst>(NestedHandlerBB.getTerminator()); - if (!Ret) - continue; - - // Handler functions must always return a block address. - BlockAddress *BA = cast<BlockAddress>(Ret->getReturnValue()); - // The original target will have been in the main parent function, - // but if it is the address of a block that has been outlined, it - // should be a block that was outlined into OutlinedHandlerFn. - assert(BA->getFunction() == ParentFn); - - // Ignore targets that aren't part of an outlined handler function. - if (!LPadTargetBlocks.count(BA->getBasicBlock())) - continue; - - // If the return value is the address ofF a block that we - // previously outlined into the parent handler function, replace - // the return instruction and add the mapped target to the list - // of possible return addresses. - BasicBlock *MappedBB = LPadTargetBlocks[BA->getBasicBlock()]; - assert(MappedBB->getParent() == OutlinedHandlerFn); - BlockAddress *NewBA = BlockAddress::get(OutlinedHandlerFn, MappedBB); - Ret->eraseFromParent(); - ReturnInst::Create(Context, NewBA, &NestedHandlerBB); - ActionTargets.push_back(NewBA); - } - } - ActionList.clear(); - - if (Branch) { - // If the landing pad was already in outlined form, just update its targets. - for (unsigned int I = Branch->getNumDestinations(); I > 0; --I) - Branch->removeDestination(I); - // Add the previously collected action targets. - for (auto *Target : ActionTargets) - Branch->addDestination(Target->getBasicBlock()); - } else { - // If the landing pad was previously stubbed out, fill in its outlined form. - IntrinsicInst *NewEHActions = cast<IntrinsicInst>(EHActions->clone()); - OutlinedBB->getInstList().push_back(NewEHActions); - - // Insert an indirect branch into the outlined landing pad BB. - IndirectBrInst *IBr = IndirectBrInst::Create(NewEHActions, 0, OutlinedBB); - // Add the previously collected action targets. - for (auto *Target : ActionTargets) - IBr->addDestination(Target->getBasicBlock()); + calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1); } -} - -// This function examines a block to determine whether the block ends with a -// conditional branch to a catch handler based on a selector comparison. -// This function is used both by the WinEHPrepare::findSelectorComparison() and -// WinEHCleanupDirector::handleTypeIdFor(). 
-static bool isSelectorDispatch(BasicBlock *BB, BasicBlock *&CatchHandler, - Constant *&Selector, BasicBlock *&NextBB) { - ICmpInst::Predicate Pred; - BasicBlock *TBB, *FBB; - Value *LHS, *RHS; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TBB, FBB))) - return false; - if (!match(LHS, - m_Intrinsic<Intrinsic::eh_typeid_for>(m_Constant(Selector))) && - !match(RHS, m_Intrinsic<Intrinsic::eh_typeid_for>(m_Constant(Selector)))) - return false; - - if (Pred == CmpInst::ICMP_EQ) { - CatchHandler = TBB; - NextBB = FBB; - return true; - } - - if (Pred == CmpInst::ICMP_NE) { - CatchHandler = FBB; - NextBB = TBB; - return true; - } - - return false; + calculateStateNumbersForInvokes(Fn, FuncInfo); } -static bool isCatchBlock(BasicBlock *BB) { - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - if (match(cast<Value>(II), m_Intrinsic<Intrinsic::eh_begincatch>())) - return true; - } - return false; +static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState, + int TryParentState, ClrHandlerType HandlerType, + uint32_t TypeToken, const BasicBlock *Handler) { + ClrEHUnwindMapEntry Entry; + Entry.HandlerParentState = HandlerParentState; + Entry.TryParentState = TryParentState; + Entry.Handler = Handler; + Entry.HandlerType = HandlerType; + Entry.TypeToken = TypeToken; + FuncInfo.ClrEHUnwindMap.push_back(Entry); + return FuncInfo.ClrEHUnwindMap.size() - 1; } -static BasicBlock *createStubLandingPad(Function *Handler) { - // FIXME: Finish this! - LLVMContext &Context = Handler->getContext(); - BasicBlock *StubBB = BasicBlock::Create(Context, "stub"); - Handler->getBasicBlockList().push_back(StubBB); - IRBuilder<> Builder(StubBB); - LandingPadInst *LPad = Builder.CreateLandingPad( - llvm::StructType::get(Type::getInt8PtrTy(Context), - Type::getInt32Ty(Context), nullptr), - 0); - // Insert a call to llvm.eh.actions so that we don't try to outline this lpad. - Function *ActionIntrin = - Intrinsic::getDeclaration(Handler->getParent(), Intrinsic::eh_actions); - Builder.CreateCall(ActionIntrin, {}, "recover"); - LPad->setCleanup(true); - Builder.CreateUnreachable(); - return StubBB; -} - -// Cycles through the blocks in an outlined handler function looking for an -// invoke instruction and inserts an invoke of llvm.donothing with an empty -// landing pad if none is found. The code that generates the .xdata tables for -// the handler needs at least one landing pad to identify the parent function's -// personality. -void WinEHPrepare::addStubInvokeToHandlerIfNeeded(Function *Handler) { - ReturnInst *Ret = nullptr; - UnreachableInst *Unreached = nullptr; - for (BasicBlock &BB : *Handler) { - TerminatorInst *Terminator = BB.getTerminator(); - // If we find an invoke, there is nothing to be done. - auto *II = dyn_cast<InvokeInst>(Terminator); - if (II) - return; - // If we've already recorded a return instruction, keep looking for invokes. - if (!Ret) - Ret = dyn_cast<ReturnInst>(Terminator); - // If we haven't recorded an unreachable instruction, try this terminator. - if (!Unreached) - Unreached = dyn_cast<UnreachableInst>(Terminator); - } - - // If we got this far, the handler contains no invokes. We should have seen - // at least one return or unreachable instruction. We'll insert an invoke of - // llvm.donothing ahead of that instruction. 
- assert(Ret || Unreached); - TerminatorInst *Term; - if (Ret) - Term = Ret; - else - Term = Unreached; - BasicBlock *OldRetBB = Term->getParent(); - BasicBlock *NewRetBB = SplitBlock(OldRetBB, Term, DT); - // SplitBlock adds an unconditional branch instruction at the end of the - // parent block. We want to replace that with an invoke call, so we can - // erase it now. - OldRetBB->getTerminator()->eraseFromParent(); - BasicBlock *StubLandingPad = createStubLandingPad(Handler); - Function *F = - Intrinsic::getDeclaration(Handler->getParent(), Intrinsic::donothing); - InvokeInst::Create(F, NewRetBB, StubLandingPad, None, "", OldRetBB); -} - -// FIXME: Consider sinking this into lib/Target/X86 somehow. TargetLowering -// usually doesn't build LLVM IR, so that's probably the wrong place. -Function *WinEHPrepare::createHandlerFunc(Function *ParentFn, Type *RetTy, - const Twine &Name, Module *M, - Value *&ParentFP) { - // x64 uses a two-argument prototype where the parent FP is the second - // argument. x86 uses no arguments, just the incoming EBP value. - LLVMContext &Context = M->getContext(); - Type *Int8PtrType = Type::getInt8PtrTy(Context); - FunctionType *FnType; - if (TheTriple.getArch() == Triple::x86_64) { - Type *ArgTys[2] = {Int8PtrType, Int8PtrType}; - FnType = FunctionType::get(RetTy, ArgTys, false); - } else { - FnType = FunctionType::get(RetTy, None, false); - } - - Function *Handler = - Function::Create(FnType, GlobalVariable::InternalLinkage, Name, M); - BasicBlock *Entry = BasicBlock::Create(Context, "entry"); - Handler->getBasicBlockList().push_front(Entry); - if (TheTriple.getArch() == Triple::x86_64) { - ParentFP = &(Handler->getArgumentList().back()); - } else { - assert(M); - Function *FrameAddressFn = - Intrinsic::getDeclaration(M, Intrinsic::frameaddress); - Function *RecoverFPFn = - Intrinsic::getDeclaration(M, Intrinsic::x86_seh_recoverfp); - IRBuilder<> Builder(&Handler->getEntryBlock()); - Value *EBP = - Builder.CreateCall(FrameAddressFn, {Builder.getInt32(1)}, "ebp"); - Value *ParentI8Fn = Builder.CreateBitCast(ParentFn, Int8PtrType); - ParentFP = Builder.CreateCall(RecoverFPFn, {ParentI8Fn, EBP}); - } - return Handler; -} +void llvm::calculateClrEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Return if it's already been done. + if (!FuncInfo.EHPadStateMap.empty()) + return; -bool WinEHPrepare::outlineHandler(ActionHandler *Action, Function *SrcFn, - LandingPadInst *LPad, BasicBlock *StartBB, - FrameVarInfoMap &VarInfo) { - Module *M = SrcFn->getParent(); - LLVMContext &Context = M->getContext(); - Type *Int8PtrType = Type::getInt8PtrTy(Context); - - // Create a new function to receive the handler contents. - Value *ParentFP; - Function *Handler; - if (Action->getType() == Catch) { - Handler = createHandlerFunc(SrcFn, Int8PtrType, SrcFn->getName() + ".catch", M, - ParentFP); - } else { - Handler = createHandlerFunc(SrcFn, Type::getVoidTy(Context), - SrcFn->getName() + ".cleanup", M, ParentFP); - } - Handler->setPersonalityFn(SrcFn->getPersonalityFn()); - HandlerToParentFP[Handler] = ParentFP; - Handler->addFnAttr("wineh-parent", SrcFn->getName()); - BasicBlock *Entry = &Handler->getEntryBlock(); - - // Generate a standard prolog to setup the frame recovery structure. 
- IRBuilder<> Builder(Context);
- Builder.SetInsertPoint(Entry);
- Builder.SetCurrentDebugLocation(LPad->getDebugLoc());
-
- std::unique_ptr<WinEHCloningDirectorBase> Director;
-
- ValueToValueMapTy VMap;
-
- LandingPadMap &LPadMap = LPadMaps[LPad];
- if (!LPadMap.isInitialized())
- LPadMap.mapLandingPad(LPad);
- if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) {
- Constant *Sel = CatchAction->getSelector();
- Director.reset(new WinEHCatchDirector(Handler, ParentFP, Sel, VarInfo,
- LPadMap, NestedLPtoOriginalLP, DT,
- EHBlocks));
- LPadMap.remapEHValues(VMap, UndefValue::get(Int8PtrType),
- ConstantInt::get(Type::getInt32Ty(Context), 1));
- } else {
- Director.reset(
- new WinEHCleanupDirector(Handler, ParentFP, VarInfo, LPadMap));
- LPadMap.remapEHValues(VMap, UndefValue::get(Int8PtrType),
- UndefValue::get(Type::getInt32Ty(Context)));
+ // This numbering assigns one state number to each catchpad and cleanuppad.
+ // It also computes two tree-like relations over states:
+ // 1) Each state has a "HandlerParentState", which is the state of the next
+ // outer handler enclosing this state's handler (same as nearest ancestor
+ // per the ParentPad linkage on EH pads, but skipping over catchswitches).
+ // 2) Each state has a "TryParentState", which:
+ // a) for a catchpad that's not the last handler on its catchswitch, is
+ // the state of the next catchpad on that catchswitch
+ // b) for all other pads, is the state of the pad whose try region is the
+ // next outer try region enclosing this state's try region. The "try
+ // regions" are not present as such in the IR, but will be inferred
+ // based on the placement of invokes and pads which reach each other
+ // by exceptional exits
+ // Catchswitches do not get their own states, but each gets mapped to the
+ // state of its first catchpad.
+
+ // Step one: walk down from outermost to innermost funclets, assigning each
+ // catchpad and cleanuppad a state number. Add an entry to the
+ // ClrEHUnwindMap for each state, recording its HandlerParentState and
+ // handler attributes. Record the TryParentState as well for each catchpad
+ // that's not the last on its catchswitch, but initialize all other entries'
+ // TryParentStates to a sentinel -1 value that the next pass will update.
+
+ // Seed a worklist with pads that have no parent.
+ SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
+ for (const BasicBlock &BB : *Fn) {
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ const Value *ParentPad;
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+ ParentPad = CPI->getParentPad();
+ else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+ ParentPad = CSI->getParentPad();
+ else
+ continue;
+ if (isa<ConstantTokenNone>(ParentPad))
+ Worklist.emplace_back(FirstNonPHI, -1);
}
- SmallVector<ReturnInst *, 8> Returns;
- ClonedCodeInfo OutlinedFunctionInfo;
-
- // If the start block contains PHI nodes, we need to map them.
- BasicBlock::iterator II = StartBB->begin();
- while (auto *PN = dyn_cast<PHINode>(II)) {
- bool Mapped = false;
- // Look for PHI values that we have already mapped (such as the selector).
- for (Value *Val : PN->incoming_values()) {
- if (VMap.count(Val)) {
- VMap[PN] = VMap[Val];
- Mapped = true;
+ // Use the worklist to visit all pads, from outer to inner. Record
+ // HandlerParentState for all pads. Record TryParentState only for catchpads
+ // that aren't the last on their catchswitch (setting all other entries'
+ // TryParentStates to an initial value of -1).
This loop is also responsible
+ // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+ // catchswitches.
+ while (!Worklist.empty()) {
+ const Instruction *Pad;
+ int HandlerParentState;
+ std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();
+
+ if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+ // Create the entry for this cleanup with the appropriate handler
+ // properties. Finally and fault handlers are distinguished by arity.
+ ClrHandlerType HandlerType =
+ (Cleanup->getNumArgOperands() ? ClrHandlerType::Fault
+ : ClrHandlerType::Finally);
+ int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+ HandlerType, 0, Pad->getParent());
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Cleanup->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CleanupState);
+ // Remember this pad's state.
+ FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+ } else {
+ // Walk the handlers of this catchswitch in reverse order since all but
+ // the last need to set the following one as its TryParentState.
+ const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+ int CatchState = -1, FollowerState = -1;
+ SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+ for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend();
+ CBI != CBE; ++CBI, FollowerState = CatchState) {
+ const BasicBlock *CatchBlock = *CBI;
+ // Create the entry for this catch with the appropriate handler
+ // properties.
+ const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
+ uint32_t TypeToken = static_cast<uint32_t>(
+ cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
+ CatchState =
+ addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+ ClrHandlerType::Catch, TypeToken, CatchBlock);
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Catch->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CatchState);
+ // Remember this catch's state.
+ FuncInfo.EHPadStateMap[Catch] = CatchState;
}
+ // Associate the catchswitch with the state of its first catch.
+ assert(CatchSwitch->getNumHandlers());
+ FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
}
- // If we didn't find a match for this value, map it as an undef.
- if (!Mapped) {
- VMap[PN] = UndefValue::get(PN->getType());
- }
- ++II;
}
- // The landing pad value may be used by PHI nodes. It will ultimately be
- // eliminated, but we need it in the map for intermediate handling.
- VMap[LPad] = UndefValue::get(LPad->getType());
-
- // Skip over PHIs and, if applicable, landingpad instructions.
- II = StartBB->getFirstInsertionPt();
-
- CloneAndPruneIntoFromInst(Handler, SrcFn, II, VMap,
- /*ModuleLevelChanges=*/false, Returns, "",
- &OutlinedFunctionInfo, Director.get());
-
- // Move all the instructions in the cloned "entry" block into our entry block.
- // Depending on how the parent function was laid out, the block that will
- // correspond to the outlined entry block may not be the first block in the
- // list. We can recognize it, however, as the cloned block which has no
- // predecessors. Any other block wouldn't have been cloned if it didn't
- // have a predecessor which was also cloned.
- Function::iterator ClonedIt = std::next(Function::iterator(Entry)); - while (!pred_empty(ClonedIt)) - ++ClonedIt; - BasicBlock *ClonedEntryBB = ClonedIt; - assert(ClonedEntryBB); - Entry->getInstList().splice(Entry->end(), ClonedEntryBB->getInstList()); - ClonedEntryBB->eraseFromParent(); - - // Make sure we can identify the handler's personality later. - addStubInvokeToHandlerIfNeeded(Handler); - - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - WinEHCatchDirector *CatchDirector = - reinterpret_cast<WinEHCatchDirector *>(Director.get()); - CatchAction->setExceptionVar(CatchDirector->getExceptionVar()); - CatchAction->setReturnTargets(CatchDirector->getReturnTargets()); - - // Look for blocks that are not part of the landing pad that we just - // outlined but terminate with a call to llvm.eh.endcatch and a - // branch to a block that is in the handler we just outlined. - // These blocks will be part of a nested landing pad that intends to - // return to an address in this handler. This case is best handled - // after both landing pads have been outlined, so for now we'll just - // save the association of the blocks in LPadTargetBlocks. The - // return instructions which are created from these branches will be - // replaced after all landing pads have been outlined. - for (const auto MapEntry : VMap) { - // VMap maps all values and blocks that were just cloned, but dead - // blocks which were pruned will map to nullptr. - if (!isa<BasicBlock>(MapEntry.first) || MapEntry.second == nullptr) + // Step two: record the TryParentState of each state. For cleanuppads that + // don't have cleanuprets, we may need to infer this from their child pads, + // so visit pads in descendant-most to ancestor-most order. + for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(), + End = FuncInfo.ClrEHUnwindMap.rend(); + Entry != End; ++Entry) { + const Instruction *Pad = + Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI(); + // For most pads, the TryParentState is the state associated with the + // unwind dest of exceptional exits from it. + const BasicBlock *UnwindDest; + if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) { + // If a catch is not the last in its catchswitch, its TryParentState is + // the state associated with the next catch in the switch, even though + // that's not the unwind dest of exceptions escaping the catch. Those + // cases were already assigned a TryParentState in the first pass, so + // skip them. + if (Entry->TryParentState != -1) continue; - const BasicBlock *MappedBB = cast<BasicBlock>(MapEntry.first); - for (auto *Pred : predecessors(const_cast<BasicBlock *>(MappedBB))) { - auto *Branch = dyn_cast<BranchInst>(Pred->getTerminator()); - if (!Branch || !Branch->isUnconditional() || Pred->size() <= 1) - continue; - BasicBlock::iterator II = const_cast<BranchInst *>(Branch); - --II; - if (match(cast<Value>(II), m_Intrinsic<Intrinsic::eh_endcatch>())) { - // This would indicate that a nested landing pad wants to return - // to a block that is outlined into two different handlers. - assert(!LPadTargetBlocks.count(MappedBB)); - LPadTargetBlocks[MappedBB] = cast<BasicBlock>(MapEntry.second); + // Otherwise, get the unwind dest from the catchswitch. 
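+ // (A catchpad carries no unwind edge of its own; exceptions escaping a
+ // catch handler follow the unwind dest recorded on its owning catchswitch.)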
+ UnwindDest = Catch->getCatchSwitch()->getUnwindDest(); + } else { + const auto *Cleanup = cast<CleanupPadInst>(Pad); + UnwindDest = nullptr; + for (const User *U : Cleanup->users()) { + if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) { + // Common and unambiguous case -- cleanupret indicates cleanup's + // unwind dest. + UnwindDest = CleanupRet->getUnwindDest(); + break; } - } - } - } // End if (CatchAction) - - Action->setHandlerBlockOrFunc(Handler); - - return true; -} -/// This BB must end in a selector dispatch. All we need to do is pass the -/// handler block to llvm.eh.actions and list it as a possible indirectbr -/// target. -void WinEHPrepare::processSEHCatchHandler(CatchHandler *CatchAction, - BasicBlock *StartBB) { - BasicBlock *HandlerBB; - BasicBlock *NextBB; - Constant *Selector; - bool Res = isSelectorDispatch(StartBB, HandlerBB, Selector, NextBB); - if (Res) { - // If this was EH dispatch, this must be a conditional branch to the handler - // block. - // FIXME: Handle instructions in the dispatch block. Currently we drop them, - // leading to crashes if some optimization hoists stuff here. - assert(CatchAction->getSelector() && HandlerBB && - "expected catch EH dispatch"); - } else { - // This must be a catch-all. Split the block after the landingpad. - assert(CatchAction->getSelector()->isNullValue() && "expected catch-all"); - HandlerBB = SplitBlock(StartBB, StartBB->getFirstInsertionPt(), DT); - } - IRBuilder<> Builder(HandlerBB->getFirstInsertionPt()); - Function *EHCodeFn = Intrinsic::getDeclaration( - StartBB->getParent()->getParent(), Intrinsic::eh_exceptioncode); - Value *Code = Builder.CreateCall(EHCodeFn, {}, "sehcode"); - Code = Builder.CreateIntToPtr(Code, SEHExceptionCodeSlot->getAllocatedType()); - Builder.CreateStore(Code, SEHExceptionCodeSlot); - CatchAction->setHandlerBlockOrFunc(BlockAddress::get(HandlerBB)); - TinyPtrVector<BasicBlock *> Targets(HandlerBB); - CatchAction->setReturnTargets(Targets); -} + // Get an unwind dest for the user + const BasicBlock *UserUnwindDest = nullptr; + if (auto *Invoke = dyn_cast<InvokeInst>(U)) { + UserUnwindDest = Invoke->getUnwindDest(); + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) { + UserUnwindDest = CatchSwitch->getUnwindDest(); + } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) { + int UserState = FuncInfo.EHPadStateMap[ChildCleanup]; + int UserUnwindState = + FuncInfo.ClrEHUnwindMap[UserState].TryParentState; + if (UserUnwindState != -1) + UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState] + .Handler.get<const BasicBlock *>(); + } -void LandingPadMap::mapLandingPad(const LandingPadInst *LPad) { - // Each instance of this class should only ever be used to map a single - // landing pad. - assert(OriginLPad == nullptr || OriginLPad == LPad); + // Not having an unwind dest for this user might indicate that it + // doesn't unwind, so can't be taken as proof that the cleanup itself + // may unwind to caller (see e.g. SimplifyUnreachable and + // RemoveUnwindEdge). + if (!UserUnwindDest) + continue; - // If the landing pad has already been mapped, there's nothing more to do. - if (OriginLPad == LPad) - return; + // Now we have an unwind dest for the user, but we need to see if it + // unwinds all the way out of the cleanup or if it stays within it. 
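+ // (This is determined below from the parent-pad linkage of the EH pad at
+ // the head of the unwind dest block.)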
+ const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI(); + const Value *UserUnwindParent; + if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad)) + UserUnwindParent = CSI->getParentPad(); + else + UserUnwindParent = + cast<CleanupPadInst>(UserUnwindPad)->getParentPad(); - OriginLPad = LPad; + // The unwind stays within the cleanup iff it targets a child of the + // cleanup. + if (UserUnwindParent == Cleanup) + continue; - // The landingpad instruction returns an aggregate value. Typically, its - // value will be passed to a pair of extract value instructions and the - // results of those extracts will have been promoted to reg values before - // this routine is called. - for (auto *U : LPad->users()) { - const ExtractValueInst *Extract = dyn_cast<ExtractValueInst>(U); - if (!Extract) - continue; - assert(Extract->getNumIndices() == 1 && - "Unexpected operation: extracting both landing pad values"); - unsigned int Idx = *(Extract->idx_begin()); - assert((Idx == 0 || Idx == 1) && - "Unexpected operation: extracting an unknown landing pad element"); - if (Idx == 0) { - ExtractedEHPtrs.push_back(Extract); - } else if (Idx == 1) { - ExtractedSelectors.push_back(Extract); + // This unwind exits the cleanup, so its dest is the cleanup's dest. + UnwindDest = UserUnwindDest; + break; + } } - } -} - -bool LandingPadMap::isOriginLandingPadBlock(const BasicBlock *BB) const { - return BB->getLandingPadInst() == OriginLPad; -} -bool LandingPadMap::isLandingPadSpecificInst(const Instruction *Inst) const { - if (Inst == OriginLPad) - return true; - for (auto *Extract : ExtractedEHPtrs) { - if (Inst == Extract) - return true; - } - for (auto *Extract : ExtractedSelectors) { - if (Inst == Extract) - return true; - } - return false; -} - -void LandingPadMap::remapEHValues(ValueToValueMapTy &VMap, Value *EHPtrValue, - Value *SelectorValue) const { - // Remap all landing pad extract instructions to the specified values. - for (auto *Extract : ExtractedEHPtrs) - VMap[Extract] = EHPtrValue; - for (auto *Extract : ExtractedSelectors) - VMap[Extract] = SelectorValue; -} - -static bool isLocalAddressCall(const Value *V) { - return match(const_cast<Value *>(V), m_Intrinsic<Intrinsic::localaddress>()); -} - -CloningDirector::CloningAction WinEHCloningDirectorBase::handleInstruction( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // If this is one of the boilerplate landing pad instructions, skip it. - // The instruction will have already been remapped in VMap. - if (LPadMap.isLandingPadSpecificInst(Inst)) - return CloningDirector::SkipInstruction; - - // Nested landing pads that have not already been outlined will be cloned as - // stubs, with just the landingpad instruction and an unreachable instruction. - // When all landingpads have been outlined, we'll replace this with the - // llvm.eh.actions call and indirect branch created when the landing pad was - // outlined. - if (auto *LPad = dyn_cast<LandingPadInst>(Inst)) { - return handleLandingPad(VMap, LPad, NewBB); - } - - // Nested landing pads that have already been outlined will be cloned in their - // outlined form, but we need to intercept the ibr instruction to filter out - // targets that do not return to the handler we are outlining. 
- if (auto *IBr = dyn_cast<IndirectBrInst>(Inst)) { - return handleIndirectBr(VMap, IBr, NewBB); - } - - if (auto *Invoke = dyn_cast<InvokeInst>(Inst)) - return handleInvoke(VMap, Invoke, NewBB); - - if (auto *Resume = dyn_cast<ResumeInst>(Inst)) - return handleResume(VMap, Resume, NewBB); - - if (auto *Cmp = dyn_cast<CmpInst>(Inst)) - return handleCompare(VMap, Cmp, NewBB); - - if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) - return handleBeginCatch(VMap, Inst, NewBB); - if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) - return handleEndCatch(VMap, Inst, NewBB); - if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) - return handleTypeIdFor(VMap, Inst, NewBB); - - // When outlining llvm.localaddress(), remap that to the second argument, - // which is the FP of the parent. - if (isLocalAddressCall(Inst)) { - VMap[Inst] = ParentFP; - return CloningDirector::SkipInstruction; - } - - // Continue with the default cloning behavior. - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCatchDirector::handleLandingPad( - ValueToValueMapTy &VMap, const LandingPadInst *LPad, BasicBlock *NewBB) { - // If the instruction after the landing pad is a call to llvm.eh.actions - // the landing pad has already been outlined. In this case, we should - // clone it because it may return to a block in the handler we are - // outlining now that would otherwise be unreachable. The landing pads - // are sorted before outlining begins to enable this case to work - // properly. - const Instruction *NextI = LPad->getNextNode(); - if (match(NextI, m_Intrinsic<Intrinsic::eh_actions>())) - return CloningDirector::CloneInstruction; - - // If the landing pad hasn't been outlined yet, the landing pad we are - // outlining now does not dominate it and so it cannot return to a block - // in this handler. In that case, we can just insert a stub landing - // pad now and patch it up later. - Instruction *NewInst = LPad->clone(); - if (LPad->hasName()) - NewInst->setName(LPad->getName()); - // Save this correlation for later processing. - NestedLPtoOriginalLP[cast<LandingPadInst>(NewInst)] = LPad; - VMap[LPad] = NewInst; - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(NewInst); - InstList.push_back(new UnreachableInst(NewBB->getContext())); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCatchDirector::handleBeginCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // The argument to the call is some form of the first element of the - // landingpad aggregate value, but that doesn't matter. It isn't used - // here. - // The second argument is an outparameter where the exception object will be - // stored. Typically the exception object is a scalar, but it can be an - // aggregate when catching by value. - // FIXME: Leave something behind to indicate where the exception object lives - // for this handler. Should it be part of llvm.eh.actions? 
- assert(ExceptionObjectVar == nullptr && "Multiple calls to "
- "llvm.eh.begincatch found while "
- "outlining catch handler.");
- ExceptionObjectVar = Inst->getOperand(1)->stripPointerCasts();
- if (isa<ConstantPointerNull>(ExceptionObjectVar))
- return CloningDirector::SkipInstruction;
- assert(cast<AllocaInst>(ExceptionObjectVar)->isStaticAlloca() &&
- "catch parameter is not static alloca");
- Materializer.escapeCatchObject(ExceptionObjectVar);
- return CloningDirector::SkipInstruction;
-}
+ // Record the state of the unwind dest as the TryParentState.
+ int UnwindDestState;
+
+ // If UnwindDest is null at this point, either the pad in question can
+ // be exited by unwind to caller, or it cannot be exited by unwind. In
+ // either case, reporting such cases as unwinding to caller is correct.
+ // This can lead to EH tables that "look strange" -- if this pad is in
+ // a parent funclet which has other children that do unwind to an enclosing
+ // pad, the try region for this pad will be missing the "duplicate" EH
+ // clause entries that you'd expect to see covering the whole parent. That
+ // should be benign, since the unwind never actually happens. If it were
+ // an issue, we could add a subsequent pass that pushes unwind dests down
+ // from parents that have them to children that appear to unwind to caller.
+ if (!UnwindDest) {
+ UnwindDestState = -1;
+ } else {
+ UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
+ }

-CloningDirector::CloningAction
-WinEHCatchDirector::handleEndCatch(ValueToValueMapTy &VMap,
- const Instruction *Inst, BasicBlock *NewBB) {
- auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
- // It might be interesting to track whether or not we are inside a catch
- // function, but that might make the algorithm more brittle than it needs
- // to be.
-
- // The end catch call can occur in one of two places: either in a
- // landingpad block that is part of the catch handlers exception mechanism,
- // or at the end of the catch block. However, a catch-all handler may call
- // end catch from the original landing pad. If the call occurs in a nested
- // landing pad block, we must skip it and continue so that the landing pad
- // gets cloned.
- auto *ParentBB = IntrinCall->getParent();
- if (ParentBB->isLandingPad() && !LPadMap.isOriginLandingPadBlock(ParentBB))
- return CloningDirector::SkipInstruction;
-
- // If an end catch occurs anywhere else we want to terminate the handler
- // with a return to the code that follows the endcatch call. If the
- // next instruction is not an unconditional branch, we need to split the
- // block to provide a clear target for the return instruction.
- BasicBlock *ContinueBB;
- auto Next = std::next(BasicBlock::const_iterator(IntrinCall));
- const BranchInst *Branch = dyn_cast<BranchInst>(Next);
- if (!Branch || !Branch->isUnconditional()) {
- // We're interrupting the cloning process at this location, so the
- // const_cast we're doing here will not cause a problem.
- ContinueBB = SplitBlock(const_cast<BasicBlock *>(ParentBB),
- const_cast<Instruction *>(cast<Instruction>(Next)));
- } else {
- ContinueBB = Branch->getSuccessor(0);
+ Entry->TryParentState = UnwindDestState;
}
- ReturnInst::Create(NewBB->getContext(), BlockAddress::get(ContinueBB), NewBB);
- ReturnTargets.push_back(ContinueBB);
-
- // We just added a terminator to the cloned block.
- // Tell the caller to stop processing the current basic block so that
- // the branch instruction will be skipped.
- return CloningDirector::StopCloningBB; + // Step three: transfer information from pads to invokes. + calculateStateNumbersForInvokes(Fn, FuncInfo); } -CloningDirector::CloningAction WinEHCatchDirector::handleTypeIdFor( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst); - Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts(); - // This causes a replacement that will collapse the landing pad CFG based - // on the filter function we intend to match. - if (Selector == CurrentSelector) - VMap[Inst] = ConstantInt::get(SelectorIDType, 1); - else - VMap[Inst] = ConstantInt::get(SelectorIDType, 0); - // Tell the caller not to clone this instruction. - return CloningDirector::SkipInstruction; -} +void WinEHPrepare::colorFunclets(Function &F) { + BlockColors = colorEHFunclets(F); -CloningDirector::CloningAction WinEHCatchDirector::handleIndirectBr( - ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) { - // If this indirect branch is not part of a landing pad block, just clone it. - const BasicBlock *ParentBB = IBr->getParent(); - if (!ParentBB->isLandingPad()) - return CloningDirector::CloneInstruction; - - // If it is part of a landing pad, we want to filter out target blocks - // that are not part of the handler we are outlining. - const LandingPadInst *LPad = ParentBB->getLandingPadInst(); - - // Save this correlation for later processing. - NestedLPtoOriginalLP[cast<LandingPadInst>(VMap[LPad])] = LPad; - - // We should only get here for landing pads that have already been outlined. - assert(match(LPad->getNextNode(), m_Intrinsic<Intrinsic::eh_actions>())); - - // Copy the indirectbr, but only include targets that were previously - // identified as EH blocks and are dominated by the nested landing pad. - SetVector<const BasicBlock *> ReturnTargets; - for (int I = 0, E = IBr->getNumDestinations(); I < E; ++I) { - auto *TargetBB = IBr->getDestination(I); - if (EHBlocks.count(const_cast<BasicBlock*>(TargetBB)) && - DT->dominates(ParentBB, TargetBB)) { - DEBUG(dbgs() << " Adding destination " << TargetBB->getName() << "\n"); - ReturnTargets.insert(TargetBB); - } + // Invert the map from BB to colors to color to BBs. + for (BasicBlock &BB : F) { + ColorVector &Colors = BlockColors[&BB]; + for (BasicBlock *Color : Colors) + FuncletBlocks[Color].push_back(&BB); } - IndirectBrInst *NewBranch = - IndirectBrInst::Create(const_cast<Value *>(IBr->getAddress()), - ReturnTargets.size(), NewBB); - for (auto *Target : ReturnTargets) - NewBranch->addDestination(const_cast<BasicBlock*>(Target)); - - // The operands and targets of the branch instruction are remapped later - // because it is a terminator. Tell the cloning code to clone the - // blocks we just added to the target list. - return CloningDirector::CloneSuccessors; } -CloningDirector::CloningAction -WinEHCatchDirector::handleInvoke(ValueToValueMapTy &VMap, - const InvokeInst *Invoke, BasicBlock *NewBB) { - return CloningDirector::CloneInstruction; -} +void WinEHPrepare::demotePHIsOnFunclets(Function &F) { + // Strip PHI nodes off of EH pads. + SmallVector<PHINode *, 16> PHINodes; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { + BasicBlock *BB = &*FI++; + if (!BB->isEHPad()) + continue; + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = &*BI++; + auto *PN = dyn_cast<PHINode>(I); + // Stop at the first non-PHI. 
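+ // (PHI nodes must be grouped at the head of a basic block, so the first
+ // non-PHI marks the end of the nodes this pass needs to demote.)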
+ if (!PN) + break; -CloningDirector::CloningAction -WinEHCatchDirector::handleResume(ValueToValueMapTy &VMap, - const ResumeInst *Resume, BasicBlock *NewBB) { - // Resume instructions shouldn't be reachable from catch handlers. - // We still need to handle it, but it will be pruned. - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(new UnreachableInst(NewBB->getContext())); - return CloningDirector::StopCloningBB; -} + AllocaInst *SpillSlot = insertPHILoads(PN, F); + if (SpillSlot) + insertPHIStores(PN, SpillSlot); -CloningDirector::CloningAction -WinEHCatchDirector::handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, BasicBlock *NewBB) { - const IntrinsicInst *IntrinCall = nullptr; - if (match(Compare->getOperand(0), m_Intrinsic<Intrinsic::eh_typeid_for>())) { - IntrinCall = dyn_cast<IntrinsicInst>(Compare->getOperand(0)); - } else if (match(Compare->getOperand(1), - m_Intrinsic<Intrinsic::eh_typeid_for>())) { - IntrinCall = dyn_cast<IntrinsicInst>(Compare->getOperand(1)); - } - if (IntrinCall) { - Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts(); - // This causes a replacement that will collapse the landing pad CFG based - // on the filter function we intend to match. - if (Selector == CurrentSelector->stripPointerCasts()) { - VMap[Compare] = ConstantInt::get(SelectorIDType, 1); - } else { - VMap[Compare] = ConstantInt::get(SelectorIDType, 0); + PHINodes.push_back(PN); } - return CloningDirector::SkipInstruction; - } - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleLandingPad( - ValueToValueMapTy &VMap, const LandingPadInst *LPad, BasicBlock *NewBB) { - // The MS runtime will terminate the process if an exception occurs in a - // cleanup handler, so we shouldn't encounter landing pads in the actual - // cleanup code, but they may appear in catch blocks. Depending on where - // we started cloning we may see one, but it will get dropped during dead - // block pruning. - Instruction *NewInst = new UnreachableInst(NewBB->getContext()); - VMap[LPad] = NewInst; - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(NewInst); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleBeginCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // Cleanup code may flow into catch blocks or the catch block may be part - // of a branch that will be optimized away. We'll insert a return - // instruction now, but it may be pruned before the cloning process is - // complete. - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleEndCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // Cleanup handlers nested within catch handlers may begin with a call to - // eh.endcatch. We can just ignore that instruction. - return CloningDirector::SkipInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleTypeIdFor( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // If we encounter a selector comparison while cloning a cleanup handler, - // we want to stop cloning immediately. Anything after the dispatch - // will be outlined into a different handler. 
- BasicBlock *CatchHandler; - Constant *Selector; - BasicBlock *NextBB; - if (isSelectorDispatch(const_cast<BasicBlock *>(Inst->getParent()), - CatchHandler, Selector, NextBB)) { - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - return CloningDirector::StopCloningBB; } - // If eg.typeid.for is called for any other reason, it can be ignored. - VMap[Inst] = ConstantInt::get(SelectorIDType, 0); - return CloningDirector::SkipInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleIndirectBr( - ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) { - // No special handling is required for cleanup cloning. - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleInvoke( - ValueToValueMapTy &VMap, const InvokeInst *Invoke, BasicBlock *NewBB) { - // All invokes in cleanup handlers can be replaced with calls. - SmallVector<Value *, 16> CallArgs(Invoke->op_begin(), Invoke->op_end() - 3); - // Insert a normal call instruction... - CallInst *NewCall = - CallInst::Create(const_cast<Value *>(Invoke->getCalledValue()), CallArgs, - Invoke->getName(), NewBB); - NewCall->setCallingConv(Invoke->getCallingConv()); - NewCall->setAttributes(Invoke->getAttributes()); - NewCall->setDebugLoc(Invoke->getDebugLoc()); - VMap[Invoke] = NewCall; - - // Remap the operands. - llvm::RemapInstruction(NewCall, VMap, RF_None, nullptr, &Materializer); - - // Insert an unconditional branch to the normal destination. - BranchInst::Create(Invoke->getNormalDest(), NewBB); - - // The unwind destination won't be cloned into the new function, so - // we don't need to clean up its phi nodes. - - // We just added a terminator to the cloned block. - // Tell the caller to stop processing the current basic block. - return CloningDirector::CloneSuccessors; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleResume( - ValueToValueMapTy &VMap, const ResumeInst *Resume, BasicBlock *NewBB) { - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - - // We just added a terminator to the cloned block. - // Tell the caller to stop processing the current basic block so that - // the branch instruction will be skipped. - return CloningDirector::StopCloningBB; -} -CloningDirector::CloningAction -WinEHCleanupDirector::handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, BasicBlock *NewBB) { - if (match(Compare->getOperand(0), m_Intrinsic<Intrinsic::eh_typeid_for>()) || - match(Compare->getOperand(1), m_Intrinsic<Intrinsic::eh_typeid_for>())) { - VMap[Compare] = ConstantInt::get(SelectorIDType, 1); - return CloningDirector::SkipInstruction; + for (auto *PN : PHINodes) { + // There may be lingering uses on other EH PHIs being removed + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); } - return CloningDirector::CloneInstruction; } -WinEHFrameVariableMaterializer::WinEHFrameVariableMaterializer( - Function *OutlinedFn, Value *ParentFP, FrameVarInfoMap &FrameVarInfo) - : FrameVarInfo(FrameVarInfo), Builder(OutlinedFn->getContext()) { - BasicBlock *EntryBB = &OutlinedFn->getEntryBlock(); - - // New allocas should be inserted in the entry block, but after the parent FP - // is established if it is an instruction. 
- Instruction *InsertPoint = EntryBB->getFirstInsertionPt();
- if (auto *FPInst = dyn_cast<Instruction>(ParentFP))
- InsertPoint = FPInst->getNextNode();
- Builder.SetInsertPoint(EntryBB, InsertPoint);
-}
+void WinEHPrepare::cloneCommonBlocks(Function &F) {
+ // We need to clone all blocks which belong to multiple funclets. Values are
+ // remapped throughout the funclet to propagate both the new instructions
+ // *and* the new basic blocks themselves.
+ for (auto &Funclets : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclets.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+ Value *FuncletToken;
+ if (FuncletPadBB == &F.getEntryBlock())
+ FuncletToken = ConstantTokenNone::get(F.getContext());
+ else
+ FuncletToken = FuncletPadBB->getFirstNonPHI();
+
+ std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
+ ValueToValueMapTy VMap;
+ for (BasicBlock *BB : BlocksInFunclet) {
+ ColorVector &ColorsForBB = BlockColors[BB];
+ // We don't need to do anything if the block is monochromatic.
+ size_t NumColorsForBB = ColorsForBB.size();
+ if (NumColorsForBB == 1)
+ continue;

-Value *WinEHFrameVariableMaterializer::materializeValueFor(Value *V) {
- // If we're asked to materialize a static alloca, we temporarily create an
- // alloca in the outlined function and add this to the FrameVarInfo map. When
- // all the outlining is complete, we'll replace these temporary allocas with
- // calls to llvm.localrecover.
- if (auto *AV = dyn_cast<AllocaInst>(V)) {
- assert(AV->isStaticAlloca() &&
- "cannot materialize un-demoted dynamic alloca");
- AllocaInst *NewAlloca = dyn_cast<AllocaInst>(AV->clone());
- Builder.Insert(NewAlloca, AV->getName());
- FrameVarInfo[AV].push_back(NewAlloca);
- return NewAlloca;
- }
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Cloning block \'" << BB->getName()
+ << "\' for funclet \'" << FuncletPadBB->getName()
+ << "\'.\n");

- if (isa<Instruction>(V) || isa<Argument>(V)) {
- Function *Parent = isa<Instruction>(V)
- ? cast<Instruction>(V)->getParent()->getParent()
- : cast<Argument>(V)->getParent();
- errs()
- << "Failed to demote instruction used in exception handler of function "
- << GlobalValue::getRealLinkageName(Parent->getName()) << ":\n";
- errs() << " " << *V << '\n';
- report_fatal_error("WinEHPrepare failed to demote instruction");
- }
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB =
+ CloneBasicBlock(BB, VMap, Twine(".for.", FuncletPadBB->getName()));
+ // Insert the clone immediately after the original to ensure determinism
+ // and to keep the same relative ordering of any funclet's blocks.
+ CBB->insertInto(&F, BB->getNextNode());

- // Don't materialize other values.
- return nullptr;
-}
+ // Add basic block mapping.
+ VMap[BB] = CBB;

-void WinEHFrameVariableMaterializer::escapeCatchObject(Value *V) {
- // Catch parameter objects have to live in the parent frame. When we see a use
- // of a catch parameter, add a sentinel to the multimap to indicate that it's
- // used from another handler. This will prevent us from trying to sink the
- // alloca into the handler and ensure that the catch parameter is present in
- // the call to llvm.localescape.
- FrameVarInfo[V].push_back(getCatchObjectSentinel());
-}
-
-// This function maps the catch and cleanup handlers that are reachable from the
-// specified landing pad.
The landing pad sequence will have this basic shape: -// -// <cleanup handler> -// <selector comparison> -// <catch handler> -// <cleanup handler> -// <selector comparison> -// <catch handler> -// <cleanup handler> -// ... -// -// Any of the cleanup slots may be absent. The cleanup slots may be occupied by -// any arbitrary control flow, but all paths through the cleanup code must -// eventually reach the next selector comparison and no path can skip to a -// different selector comparisons, though some paths may terminate abnormally. -// Therefore, we will use a depth first search from the start of any given -// cleanup block and stop searching when we find the next selector comparison. -// -// If the landingpad instruction does not have a catch clause, we will assume -// that any instructions other than selector comparisons and catch handlers can -// be ignored. In practice, these will only be the boilerplate instructions. -// -// The catch handlers may also have any control structure, but we are only -// interested in the start of the catch handlers, so we don't need to actually -// follow the flow of the catch handlers. The start of the catch handlers can -// be located from the compare instructions, but they can be skipped in the -// flow by following the contrary branch. -void WinEHPrepare::mapLandingPadBlocks(LandingPadInst *LPad, - LandingPadActions &Actions) { - unsigned int NumClauses = LPad->getNumClauses(); - unsigned int HandlersFound = 0; - BasicBlock *BB = LPad->getParent(); - - DEBUG(dbgs() << "Mapping landing pad: " << BB->getName() << "\n"); - - if (NumClauses == 0) { - findCleanupHandlers(Actions, BB, nullptr); - return; - } - - VisitedBlockSet VisitedBlocks; - - while (HandlersFound != NumClauses) { - BasicBlock *NextBB = nullptr; + // Record delta operations that we need to perform to our color mappings. + Orig2Clone.emplace_back(BB, CBB); + } - // Skip over filter clauses. - if (LPad->isFilter(HandlersFound)) { - ++HandlersFound; + // If nothing was cloned, we're done cloning in this funclet. + if (Orig2Clone.empty()) continue; + + // Update our color mappings to reflect that one block has lost a color and + // another has gained a color. + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + + BlocksInFunclet.push_back(NewBlock); + ColorVector &NewColors = BlockColors[NewBlock]; + assert(NewColors.empty() && "A new block should only have one color!"); + NewColors.push_back(FuncletPadBB); + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Assigned color \'" << FuncletPadBB->getName() + << "\' to block \'" << NewBlock->getName() + << "\'.\n"); + + BlocksInFunclet.erase( + std::remove(BlocksInFunclet.begin(), BlocksInFunclet.end(), OldBlock), + BlocksInFunclet.end()); + ColorVector &OldColors = BlockColors[OldBlock]; + OldColors.erase( + std::remove(OldColors.begin(), OldColors.end(), FuncletPadBB), + OldColors.end()); + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Removed color \'" << FuncletPadBB->getName() + << "\' from block \'" << OldBlock->getName() + << "\'.\n"); } - // See if the clause we're looking for is a catch-all. - // If so, the catch begins immediately. - Constant *ExpectedSelector = - LPad->getClause(HandlersFound)->stripPointerCasts(); - if (isa<ConstantPointerNull>(ExpectedSelector)) { - // The catch all must occur last. - assert(HandlersFound == NumClauses - 1); - - // There can be additional selector dispatches in the call chain that we - // need to ignore. 
- BasicBlock *CatchBlock = nullptr; - Constant *Selector; - while (BB && isSelectorDispatch(BB, CatchBlock, Selector, NextBB)) { - DEBUG(dbgs() << " Found extra catch dispatch in block " - << CatchBlock->getName() << "\n"); - BB = NextBB; - } + // Loop over all of the instructions in this funclet, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (BasicBlock *BB : BlocksInFunclet) + // Loop over all instructions, fixing each one as we find it... + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_IgnoreMissingEntries | RF_NoModuleLevelChanges); + + // Catchrets targeting cloned blocks need to be updated separately from + // the loop above because they are not in the current funclet. + SmallVector<CatchReturnInst *, 2> FixupCatchrets; + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + + FixupCatchrets.clear(); + for (BasicBlock *Pred : predecessors(OldBlock)) + if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator())) + if (CatchRet->getParentPad() == FuncletToken) + FixupCatchrets.push_back(CatchRet); + + for (CatchReturnInst *CatchRet : FixupCatchrets) + CatchRet->setSuccessor(NewBlock); + } - // Add the catch handler to the action list. - CatchHandler *Action = nullptr; - if (CatchHandlerMap.count(BB) && CatchHandlerMap[BB] != nullptr) { - // If the CatchHandlerMap already has an entry for this BB, re-use it. - Action = CatchHandlerMap[BB]; - assert(Action->getSelector() == ExpectedSelector); - } else { - // We don't expect a selector dispatch, but there may be a call to - // llvm.eh.begincatch, which separates catch handling code from - // cleanup code in the same control flow. This call looks for the - // begincatch intrinsic. - Action = findCatchHandler(BB, NextBB, VisitedBlocks); - if (Action) { - // For C++ EH, check if there is any interesting cleanup code before - // we begin the catch. This is important because cleanups cannot - // rethrow exceptions but code called from catches can. For SEH, it - // isn't important if some finally code before a catch-all is executed - // out of line or after recovering from the exception. - if (Personality == EHPersonality::MSVC_CXX) - findCleanupHandlers(Actions, BB, BB); + auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) { + unsigned NumPreds = PN->getNumIncomingValues(); + for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd; + ++PredIdx) { + BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx); + bool EdgeTargetsFunclet; + if (auto *CRI = + dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) { + EdgeTargetsFunclet = (CRI->getParentPad() == FuncletToken); } else { - // If an action was not found, it means that the control flows - // directly into the catch-all handler and there is no cleanup code. - // That's an expected situation and we must create a catch action. - // Since this is a catch-all handler, the selector won't actually - // appear in the code anywhere. ExpectedSelector here is the constant - // null ptr that we got from the landing pad instruction. 
- Action = new CatchHandler(BB, ExpectedSelector, nullptr); - CatchHandlerMap[BB] = Action; + ColorVector &IncomingColors = BlockColors[IncomingBlock]; + assert(!IncomingColors.empty() && "Block not colored!"); + assert((IncomingColors.size() == 1 || + llvm::all_of(IncomingColors, + [&](BasicBlock *Color) { + return Color != FuncletPadBB; + })) && + "Cloning should leave this funclet's blocks monochromatic"); + EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB); } + if (IsForOldBlock != EdgeTargetsFunclet) + continue; + PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false); + // Revisit the next entry. + --PredIdx; + --PredEnd; } - Actions.insertCatchHandler(Action); - DEBUG(dbgs() << " Catch all handler at block " << BB->getName() << "\n"); - ++HandlersFound; - - // Once we reach a catch-all, don't expect to hit a resume instruction. - BB = nullptr; - break; - } - - CatchHandler *CatchAction = findCatchHandler(BB, NextBB, VisitedBlocks); - assert(CatchAction); - - // See if there is any interesting code executed before the dispatch. - findCleanupHandlers(Actions, BB, CatchAction->getStartBlock()); - - // When the source program contains multiple nested try blocks the catch - // handlers can get strung together in such a way that we can encounter - // a dispatch for a selector that we've already had a handler for. - if (CatchAction->getSelector()->stripPointerCasts() == ExpectedSelector) { - ++HandlersFound; - - // Add the catch handler to the action list. - DEBUG(dbgs() << " Found catch dispatch in block " - << CatchAction->getStartBlock()->getName() << "\n"); - Actions.insertCatchHandler(CatchAction); - } else { - // Under some circumstances optimized IR will flow unconditionally into a - // handler block without checking the selector. This can only happen if - // the landing pad has a catch-all handler and the handler for the - // preceeding catch clause is identical to the catch-call handler - // (typically an empty catch). In this case, the handler must be shared - // by all remaining clauses. - if (isa<ConstantPointerNull>( - CatchAction->getSelector()->stripPointerCasts())) { - DEBUG(dbgs() << " Applying early catch-all handler in block " - << CatchAction->getStartBlock()->getName() - << " to all remaining clauses.\n"); - Actions.insertCatchHandler(CatchAction); - return; + }; + + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + for (Instruction &OldI : *OldBlock) { + auto *OldPN = dyn_cast<PHINode>(&OldI); + if (!OldPN) + break; + UpdatePHIOnClonedBlock(OldPN, /*IsForOldBlock=*/true); + } + for (Instruction &NewI : *NewBlock) { + auto *NewPN = dyn_cast<PHINode>(&NewI); + if (!NewPN) + break; + UpdatePHIOnClonedBlock(NewPN, /*IsForOldBlock=*/false); } - - DEBUG(dbgs() << " Found extra catch dispatch in block " - << CatchAction->getStartBlock()->getName() << "\n"); } - // Move on to the block after the catch handler. - BB = NextBB; - } - - // If we didn't wind up in a catch-all, see if there is any interesting code - // executed before the resume. - findCleanupHandlers(Actions, BB, BB); - - // It's possible that some optimization moved code into a landingpad that - // wasn't - // previously being used for cleanup. If that happens, we need to execute - // that - // extra code from a cleanup handler. 
- if (Actions.includesCleanup() && !LPad->isCleanup())
- LPad->setCleanup(true);
-}
-
-// This function searches starting with the input block for the next
-// block that terminates with a branch whose condition is based on a selector
-// comparison. This may be the input block. See the mapLandingPadBlocks
-// comments for a discussion of control flow assumptions.
-//
-CatchHandler *WinEHPrepare::findCatchHandler(BasicBlock *BB,
- BasicBlock *&NextBB,
- VisitedBlockSet &VisitedBlocks) {
- // See if we've already found a catch handler use it.
- // Call count() first to avoid creating a null entry for blocks
- // we haven't seen before.
- if (CatchHandlerMap.count(BB) && CatchHandlerMap[BB] != nullptr) {
- CatchHandler *Action = cast<CatchHandler>(CatchHandlerMap[BB]);
- NextBB = Action->getNextBB();
- return Action;
- }
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to
+ // the PHI nodes for NewBB now.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (BasicBlock *SuccBB : successors(NewBlock)) {
+ for (Instruction &SuccI : *SuccBB) {
+ auto *SuccPN = dyn_cast<PHINode>(&SuccI);
+ if (!SuccPN)
+ break;
+
+ // Ok, we have a PHI node. Figure out what the incoming value was for
+ // the OldBlock.
+ int OldBlockIdx = SuccPN->getBasicBlockIndex(OldBlock);
+ if (OldBlockIdx == -1)
+ break;
+ Value *IV = SuccPN->getIncomingValue(OldBlockIdx);
+
+ // Remap the value if necessary.
+ if (auto *Inst = dyn_cast<Instruction>(IV)) {
+ ValueToValueMapTy::iterator I = VMap.find(Inst);
+ if (I != VMap.end())
+ IV = I->second;
+ }

- // VisitedBlocks applies only to the current search. We still
- // need to consider blocks that we've visited while mapping other
- // landing pads.
- VisitedBlocks.insert(BB);
-
- BasicBlock *CatchBlock = nullptr;
- Constant *Selector = nullptr;
-
- // If this is the first time we've visited this block from any landing pad
- // look to see if it is a selector dispatch block.
- if (!CatchHandlerMap.count(BB)) {
- if (isSelectorDispatch(BB, CatchBlock, Selector, NextBB)) {
- CatchHandler *Action = new CatchHandler(BB, Selector, NextBB);
- CatchHandlerMap[BB] = Action;
- return Action;
- }
- // If we encounter a block containing an llvm.eh.begincatch before we
- // find a selector dispatch block, the handler is assumed to be
- // reached unconditionally. This happens for catch-all blocks, but
- // it can also happen for other catch handlers that have been combined
- // with the catch-all handler during optimization.
- if (isCatchBlock(BB)) {
- PointerType *Int8PtrTy = Type::getInt8PtrTy(BB->getContext());
- Constant *NullSelector = ConstantPointerNull::get(Int8PtrTy);
- CatchHandler *Action = new CatchHandler(BB, NullSelector, nullptr);
- CatchHandlerMap[BB] = Action;
- return Action;
- }
- }
+ SuccPN->addIncoming(IV, NewBlock);
+ }
+ }
}

- // Visit each successor, looking for the dispatch.
- // FIXME: We expect to find the dispatch quickly, so this will probably
- // work better as a breadth first search.
- for (BasicBlock *Succ : successors(BB)) {
- if (VisitedBlocks.count(Succ))
- continue;
+ for (ValueToValueMapTy::value_type VT : VMap) {
+ // If there were values defined in BB that are used outside the funclet,
+ // then we now have to update all uses of the value to use either the
+ // original value, the cloned value, or some PHI derived value. This can
+ // require arbitrary PHI insertion, which we are prepared to do; clean
+ // these up now.
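+ // (SSAUpdater, seeded below with both the original and the cloned
+ // definitions, synthesizes any PHIs required while rewriting each use.)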
+ SmallVector<Use *, 16> UsesToRename; - CatchHandler *Action = findCatchHandler(Succ, NextBB, VisitedBlocks); - if (Action) - return Action; - } - return nullptr; -} - -// These are helper functions to combine repeated code from findCleanupHandlers. -static void createCleanupHandler(LandingPadActions &Actions, - CleanupHandlerMapTy &CleanupHandlerMap, - BasicBlock *BB) { - CleanupHandler *Action = new CleanupHandler(BB); - CleanupHandlerMap[BB] = Action; - Actions.insertCleanupHandler(Action); - DEBUG(dbgs() << " Found cleanup code in block " - << Action->getStartBlock()->getName() << "\n"); -} - -static CallSite matchOutlinedFinallyCall(BasicBlock *BB, - Instruction *MaybeCall) { - // Look for finally blocks that Clang has already outlined for us. - // %fp = call i8* @llvm.localaddress() - // call void @"fin$parent"(iN 1, i8* %fp) - if (isLocalAddressCall(MaybeCall) && MaybeCall != BB->getTerminator()) - MaybeCall = MaybeCall->getNextNode(); - CallSite FinallyCall(MaybeCall); - if (!FinallyCall || FinallyCall.arg_size() != 2) - return CallSite(); - if (!match(FinallyCall.getArgument(0), m_SpecificInt(1))) - return CallSite(); - if (!isLocalAddressCall(FinallyCall.getArgument(1))) - return CallSite(); - return FinallyCall; -} - -static BasicBlock *followSingleUnconditionalBranches(BasicBlock *BB) { - // Skip single ubr blocks. - while (BB->getFirstNonPHIOrDbg() == BB->getTerminator()) { - auto *Br = dyn_cast<BranchInst>(BB->getTerminator()); - if (Br && Br->isUnconditional()) - BB = Br->getSuccessor(0); - else - return BB; - } - return BB; -} - -// This function searches starting with the input block for the next block that -// contains code that is not part of a catch handler and would not be eliminated -// during handler outlining. -// -void WinEHPrepare::findCleanupHandlers(LandingPadActions &Actions, - BasicBlock *StartBB, BasicBlock *EndBB) { - // Here we will skip over the following: - // - // landing pad prolog: - // - // Unconditional branches - // - // Selector dispatch - // - // Resume pattern - // - // Anything else marks the start of an interesting block - - BasicBlock *BB = StartBB; - // Anything other than an unconditional branch will kick us out of this loop - // one way or another. - while (BB) { - BB = followSingleUnconditionalBranches(BB); - // If we've already scanned this block, don't scan it again. If it is - // a cleanup block, there will be an action in the CleanupHandlerMap. - // If we've scanned it and it is not a cleanup block, there will be a - // nullptr in the CleanupHandlerMap. If we have not scanned it, there will - // be no entry in the CleanupHandlerMap. We must call count() first to - // avoid creating a null entry for blocks we haven't scanned. - if (CleanupHandlerMap.count(BB)) { - if (auto *Action = CleanupHandlerMap[BB]) { - Actions.insertCleanupHandler(Action); - DEBUG(dbgs() << " Found cleanup code in block " - << Action->getStartBlock()->getName() << "\n"); - // FIXME: This cleanup might chain into another, and we need to discover - // that. - return; - } else { - // Here we handle the case where the cleanup handler map contains a - // value for this block but the value is a nullptr. This means that - // we have previously analyzed the block and determined that it did - // not contain any cleanup code. Based on the earlier analysis, we - // know the block must end in either an unconditional branch, a - // resume or a conditional branch that is predicated on a comparison - // with a selector. 
Either the resume or the selector dispatch - // would terminate the search for cleanup code, so the unconditional - // branch is the only case for which we might need to continue - // searching. - BasicBlock *SuccBB = followSingleUnconditionalBranches(BB); - if (SuccBB == BB || SuccBB == EndBB) - return; - BB = SuccBB; + auto *OldI = dyn_cast<Instruction>(const_cast<Value *>(VT.first)); + if (!OldI) continue; + auto *NewI = cast<Instruction>(VT.second); + // Scan all uses of this instruction to see if it is used outside of its + // funclet, and if so, record them in UsesToRename. + for (Use &U : OldI->uses()) { + Instruction *UserI = cast<Instruction>(U.getUser()); + BasicBlock *UserBB = UserI->getParent(); + ColorVector &ColorsForUserBB = BlockColors[UserBB]; + assert(!ColorsForUserBB.empty()); + if (ColorsForUserBB.size() > 1 || + *ColorsForUserBB.begin() != FuncletPadBB) + UsesToRename.push_back(&U); } - } - // Create an entry in the cleanup handler map for this block. Initially - // we create an entry that says this isn't a cleanup block. If we find - // cleanup code, the caller will replace this entry. - CleanupHandlerMap[BB] = nullptr; + // If there are no uses outside the block, we're done with this + // instruction. + if (UsesToRename.empty()) + continue; - TerminatorInst *Terminator = BB->getTerminator(); + // We found a use of OldI outside of the funclet. Rename all uses of OldI + // that are outside its funclet to be uses of the appropriate PHI node + // etc. + SSAUpdater SSAUpdate; + SSAUpdate.Initialize(OldI->getType(), OldI->getName()); + SSAUpdate.AddAvailableValue(OldI->getParent(), OldI); + SSAUpdate.AddAvailableValue(NewI->getParent(), NewI); - // Landing pad blocks have extra instructions we need to accept. - LandingPadMap *LPadMap = nullptr; - if (BB->isLandingPad()) { - LandingPadInst *LPad = BB->getLandingPadInst(); - LPadMap = &LPadMaps[LPad]; - if (!LPadMap->isInitialized()) - LPadMap->mapLandingPad(LPad); + while (!UsesToRename.empty()) + SSAUpdate.RewriteUseAfterInsertions(*UsesToRename.pop_back_val()); } + } +} - // Look for the bare resume pattern: - // %lpad.val1 = insertvalue { i8*, i32 } undef, i8* %exn, 0 - // %lpad.val2 = insertvalue { i8*, i32 } %lpad.val1, i32 %sel, 1 - // resume { i8*, i32 } %lpad.val2 - if (auto *Resume = dyn_cast<ResumeInst>(Terminator)) { - InsertValueInst *Insert1 = nullptr; - InsertValueInst *Insert2 = nullptr; - Value *ResumeVal = Resume->getOperand(0); - // If the resume value isn't a phi or landingpad value, it should be a - // series of insertions. Identify them so we can avoid them when scanning - // for cleanups. - if (!isa<PHINode>(ResumeVal) && !isa<LandingPadInst>(ResumeVal)) { - Insert2 = dyn_cast<InsertValueInst>(ResumeVal); - if (!Insert2) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - Insert1 = dyn_cast<InsertValueInst>(Insert2->getAggregateOperand()); - if (!Insert1) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) +void WinEHPrepare::removeImplausibleInstructions(Function &F) { + // Remove implausible terminators and replace them with UnreachableInst. 
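// Stray call sites are vetted first: a call or invoke whose "funclet" operand
// bundle does not name this funclet's pad cannot execute here, so it is
// rewritten to unreachable as well (after dropping the unwind edge when the
// call site is an invoke).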
+ for (auto &Funclet : FuncletBlocks) { + BasicBlock *FuncletPadBB = Funclet.first; + std::vector<BasicBlock *> &BlocksInFunclet = Funclet.second; + Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI(); + auto *FuncletPad = dyn_cast<FuncletPadInst>(FirstNonPHI); + auto *CatchPad = dyn_cast_or_null<CatchPadInst>(FuncletPad); + auto *CleanupPad = dyn_cast_or_null<CleanupPadInst>(FuncletPad); + + for (BasicBlock *BB : BlocksInFunclet) { + for (Instruction &I : *BB) { + CallSite CS(&I); + if (!CS) continue; - if (Inst == Insert1 || Inst == Insert2 || Inst == Resume) - continue; - if (!Inst->hasOneUse() || - (Inst->user_back() != Insert1 && Inst->user_back() != Insert2)) { - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - } - return; - } - BranchInst *Branch = dyn_cast<BranchInst>(Terminator); - if (Branch && Branch->isConditional()) { - // Look for the selector dispatch. - // %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*)) - // %matches = icmp eq i32 %sel, %2 - // br i1 %matches, label %catch14, label %eh.resume - CmpInst *Compare = dyn_cast<CmpInst>(Branch->getCondition()); - if (!Compare || !Compare->isEquality()) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) - continue; - if (Inst == Compare || Inst == Branch) + Value *FuncletBundleOperand = nullptr; + if (auto BU = CS.getOperandBundle(LLVMContext::OB_funclet)) + FuncletBundleOperand = BU->Inputs.front(); + + if (FuncletBundleOperand == FuncletPad) continue; - if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) + + // Skip call sites which are nounwind intrinsics. + auto *CalledFn = + dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) continue; - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - // The selector dispatch block should always terminate our search. - assert(BB == EndBB); - return; - } - if (isAsynchronousEHPersonality(Personality)) { - // If this is a landingpad block, split the block at the first non-landing - // pad instruction. - Instruction *MaybeCall = BB->getFirstNonPHIOrDbg(); - if (LPadMap) { - while (MaybeCall != BB->getTerminator() && - LPadMap->isLandingPadSpecificInst(MaybeCall)) - MaybeCall = MaybeCall->getNextNode(); + // This call site was not part of this funclet, remove it. + if (CS.isInvoke()) { + // Remove the unwind edge if it was an invoke. + removeUnwindEdge(BB); + // Get a pointer to the new call. + BasicBlock::iterator CallI = + std::prev(BB->getTerminator()->getIterator()); + auto *CI = cast<CallInst>(&*CallI); + changeToUnreachable(CI, /*UseLLVMTrap=*/false); + } else { + changeToUnreachable(&I, /*UseLLVMTrap=*/false); + } + + // There are no more instructions in the block (except for unreachable), + // we are done. + break; } - // Look for outlined finally calls on x64, since those happen to match the - // prototype provided by the runtime. 
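// A minimal model of the operand-bundle test above, with illustrative types
// rather than LLVM's: a call inside a funclet is plausible only if its
// "funclet" bundle carries the enclosing pad's token; nounwind intrinsics are
// exempt because they may appear in any funclet.
struct Token {};
struct Call {
  const Token *FuncletBundle = nullptr; // nullptr: no "funclet" bundle
  bool IsNoUnwindIntrinsic = false;
};

bool isImplausibleHere(const Call &C, const Token *EnclosingPad) {
  if (C.FuncletBundle == EnclosingPad)
    return false; // belongs to this funclet
  if (C.IsNoUnwindIntrinsic)
    return false; // cannot throw; allowed anywhere
  return true;    // candidate for changeToUnreachable
}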
- if (TheTriple.getArch() == Triple::x86_64) { - if (CallSite FinallyCall = matchOutlinedFinallyCall(BB, MaybeCall)) { - Function *Fin = FinallyCall.getCalledFunction(); - assert(Fin && "outlined finally call should be direct"); - auto *Action = new CleanupHandler(BB); - Action->setHandlerBlockOrFunc(Fin); - Actions.insertCleanupHandler(Action); - CleanupHandlerMap[BB] = Action; - DEBUG(dbgs() << " Found frontend-outlined finally call to " - << Fin->getName() << " in block " - << Action->getStartBlock()->getName() << "\n"); - - // Split the block if there were more interesting instructions and - // look for finally calls in the normal successor block. - BasicBlock *SuccBB = BB; - if (FinallyCall.getInstruction() != BB->getTerminator() && - FinallyCall.getInstruction()->getNextNode() != - BB->getTerminator()) { - SuccBB = - SplitBlock(BB, FinallyCall.getInstruction()->getNextNode(), DT); - } else { - if (FinallyCall.isInvoke()) { - SuccBB = cast<InvokeInst>(FinallyCall.getInstruction()) - ->getNormalDest(); - } else { - SuccBB = BB->getUniqueSuccessor(); - assert(SuccBB && - "splitOutlinedFinallyCalls didn't insert a branch"); - } - } - BB = SuccBB; - if (BB == EndBB) - return; - continue; + TerminatorInst *TI = BB->getTerminator(); + // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst. + bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad; + // The token consumed by a CatchReturnInst must match the funclet token. + bool IsUnreachableCatchret = false; + if (auto *CRI = dyn_cast<CatchReturnInst>(TI)) + IsUnreachableCatchret = CRI->getCatchPad() != CatchPad; + // The token consumed by a CleanupReturnInst must match the funclet token. + bool IsUnreachableCleanupret = false; + if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) + IsUnreachableCleanupret = CRI->getCleanupPad() != CleanupPad; + if (IsUnreachableRet || IsUnreachableCatchret || + IsUnreachableCleanupret) { + changeToUnreachable(TI, /*UseLLVMTrap=*/false); + } else if (isa<InvokeInst>(TI)) { + if (Personality == EHPersonality::MSVC_CXX && CleanupPad) { + // Invokes within a cleanuppad for the MSVC++ personality never + // transfer control to their unwind edge: the personality will + // terminate the program. + removeUnwindEdge(BB); } } } - - // Anything else is either a catch block or interesting cleanup code. - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) - continue; - // Unconditional branches fall through to this loop. - if (Inst == Branch) - continue; - // If this is a catch block, there is no cleanup code to be found. - if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) - return; - // If this a nested landing pad, it may contain an endcatch call. - if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) - return; - // Anything else makes this interesting cleanup code. - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - - // Only unconditional branches in empty blocks should get this far. - assert(Branch && Branch->isUnconditional()); - if (BB == EndBB) - return; - BB = Branch->getSuccessor(0); } } -// This is a public function, declared in WinEHFuncInfo.h and is also -// referenced by WinEHNumbering in FunctionLoweringInfo.cpp. 
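// The terminator rules above, restated over a toy data model (illustrative,
// not LLVM's API): a funclet pad cannot return normally, and a catchret or
// cleanupret must consume the token of the very pad it exits.
enum class TermKind { Ret, CatchRet, CleanupRet, Branch };
struct Terminator {
  TermKind Kind;
  const void *PadToken = nullptr; // token consumed by catchret/cleanupret
};

bool isImplausible(const Terminator &T, const void *EnclosingPadToken) {
  switch (T.Kind) {
  case TermKind::Ret:
    return EnclosingPadToken != nullptr; // ret inside a funclet pad
  case TermKind::CatchRet:
  case TermKind::CleanupRet:
    return T.PadToken != EnclosingPadToken; // token mismatch
  case TermKind::Branch:
    return false;
  }
  return false;
}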
-void llvm::parseEHActions(
- const IntrinsicInst *II,
- SmallVectorImpl<std::unique_ptr<ActionHandler>> &Actions) {
- assert(II->getIntrinsicID() == Intrinsic::eh_actions &&
- "attempted to parse non eh.actions intrinsic");
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E;) {
- uint64_t ActionKind =
- cast<ConstantInt>(II->getArgOperand(I))->getZExtValue();
- if (ActionKind == /*catch=*/1) {
- auto *Selector = cast<Constant>(II->getArgOperand(I + 1));
- ConstantInt *EHObjIndex = cast<ConstantInt>(II->getArgOperand(I + 2));
- int64_t EHObjIndexVal = EHObjIndex->getSExtValue();
- Constant *Handler = cast<Constant>(II->getArgOperand(I + 3));
- I += 4;
- auto CH = make_unique<CatchHandler>(/*BB=*/nullptr, Selector,
- /*NextBB=*/nullptr);
- CH->setHandlerBlockOrFunc(Handler);
- CH->setExceptionVarIndex(EHObjIndexVal);
- Actions.push_back(std::move(CH));
- } else if (ActionKind == 0) {
- Constant *Handler = cast<Constant>(II->getArgOperand(I + 1));
- I += 2;
- auto CH = make_unique<CleanupHandler>(/*BB=*/nullptr);
- CH->setHandlerBlockOrFunc(Handler);
- Actions.push_back(std::move(CH));
- } else {
- llvm_unreachable("Expected either a catch or cleanup handler!");
- }
+void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
+ // Clean up some of the mess we made by removing useless PHI nodes, trivial
+ // branches, etc.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+ SimplifyInstructionsInBlock(BB);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true);
+ MergeBlockIntoPredecessor(BB);
}
- std::reverse(Actions.begin(), Actions.end());
+
+ // We might have some unreachable blocks after cleaning up some impossible
+ // control flow.
+ removeUnreachableBlocks(F);
}
-namespace {
-struct WinEHNumbering {
- WinEHNumbering(WinEHFuncInfo &FuncInfo) : FuncInfo(FuncInfo),
- CurrentBaseState(-1), NextState(0) {}
+void WinEHPrepare::verifyPreparedFunclets(Function &F) {
+ for (BasicBlock &BB : F) {
+ size_t NumColors = BlockColors[&BB].size();
+ assert(NumColors == 1 && "Expected monochromatic BB!");
+ if (NumColors == 0)
+ report_fatal_error("Uncolored BB!");
+ if (NumColors > 1)
+ report_fatal_error("Multicolor BB!");
+ assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+ "EH Pad still has a PHI!");
+ }
+}

- WinEHFuncInfo &FuncInfo;
- int CurrentBaseState;
- int NextState;
+bool WinEHPrepare::prepareExplicitEH(Function &F) {
+ // Remove unreachable blocks. It is not valuable to assign them a color and
+ // their existence can trick us into thinking values are alive when they are
+ // not.
+ removeUnreachableBlocks(F);

- SmallVector<std::unique_ptr<ActionHandler>, 4> HandlerStack;
- SmallPtrSet<const Function *, 4> VisitedHandlers;
+ // Determine which blocks are reachable from which funclet entries.
+ colorFunclets(F);

- int currentEHNumber() const {
- return HandlerStack.empty() ?
CurrentBaseState : HandlerStack.back()->getEHState(); - } + cloneCommonBlocks(F); - void createUnwindMapEntry(int ToState, ActionHandler *AH); - void createTryBlockMapEntry(int TryLow, int TryHigh, - ArrayRef<CatchHandler *> Handlers); - void processCallSite(MutableArrayRef<std::unique_ptr<ActionHandler>> Actions, - ImmutableCallSite CS); - void popUnmatchedActions(int FirstMismatch); - void calculateStateNumbers(const Function &F); - void findActionRootLPads(const Function &F); -}; -} + if (!DisableDemotion) + demotePHIsOnFunclets(F); -void WinEHNumbering::createUnwindMapEntry(int ToState, ActionHandler *AH) { - WinEHUnwindMapEntry UME; - UME.ToState = ToState; - if (auto *CH = dyn_cast_or_null<CleanupHandler>(AH)) - UME.Cleanup = cast<Function>(CH->getHandlerBlockOrFunc()); - else - UME.Cleanup = nullptr; - FuncInfo.UnwindMap.push_back(UME); -} + if (!DisableCleanups) { + DEBUG(verifyFunction(F)); + removeImplausibleInstructions(F); -void WinEHNumbering::createTryBlockMapEntry(int TryLow, int TryHigh, - ArrayRef<CatchHandler *> Handlers) { - // See if we already have an entry for this set of handlers. - // This is using iterators rather than a range-based for loop because - // if we find the entry we're looking for we'll need the iterator to erase it. - int NumHandlers = Handlers.size(); - auto I = FuncInfo.TryBlockMap.begin(); - auto E = FuncInfo.TryBlockMap.end(); - for ( ; I != E; ++I) { - auto &Entry = *I; - if (Entry.HandlerArray.size() != (size_t)NumHandlers) - continue; - int N; - for (N = 0; N < NumHandlers; ++N) { - if (Entry.HandlerArray[N].Handler != Handlers[N]->getHandlerBlockOrFunc()) - break; // breaks out of inner loop - } - // If all the handlers match, this is what we were looking for. - if (N == NumHandlers) { - break; - } + DEBUG(verifyFunction(F)); + cleanupPreparedFunclets(F); } - // If we found an existing entry for this set of handlers, extend the range - // but move the entry to the end of the map vector. The order of entries - // in the map is critical to the way that the runtime finds handlers. - // FIXME: Depending on what has happened with block ordering, this may - // incorrectly combine entries that should remain separate. - if (I != E) { - // Copy the existing entry. - WinEHTryBlockMapEntry Entry = *I; - Entry.TryLow = std::min(TryLow, Entry.TryLow); - Entry.TryHigh = std::max(TryHigh, Entry.TryHigh); - assert(Entry.TryLow <= Entry.TryHigh); - // Erase the old entry and add this one to the back. - FuncInfo.TryBlockMap.erase(I); - FuncInfo.TryBlockMap.push_back(Entry); - return; - } + DEBUG(verifyPreparedFunclets(F)); + // Recolor the CFG to verify that all is well. + DEBUG(colorFunclets(F)); + DEBUG(verifyPreparedFunclets(F)); - // If we didn't find an entry, create a new one. - WinEHTryBlockMapEntry TBME; - TBME.TryLow = TryLow; - TBME.TryHigh = TryHigh; - assert(TBME.TryLow <= TBME.TryHigh); - for (CatchHandler *CH : Handlers) { - WinEHHandlerType HT; - if (CH->getSelector()->isNullValue()) { - HT.Adjectives = 0x40; - HT.TypeDescriptor = nullptr; - } else { - auto *GV = cast<GlobalVariable>(CH->getSelector()->stripPointerCasts()); - // Selectors are always pointers to GlobalVariables with 'struct' type. - // The struct has two fields, adjectives and a type descriptor. 
- auto *CS = cast<ConstantStruct>(GV->getInitializer()); - HT.Adjectives = - cast<ConstantInt>(CS->getAggregateElement(0U))->getZExtValue(); - HT.TypeDescriptor = - cast<GlobalVariable>(CS->getAggregateElement(1)->stripPointerCasts()); - } - HT.Handler = cast<Function>(CH->getHandlerBlockOrFunc()); - HT.CatchObjRecoverIdx = CH->getExceptionVarIndex(); - TBME.HandlerArray.push_back(HT); - } - FuncInfo.TryBlockMap.push_back(TBME); -} + BlockColors.clear(); + FuncletBlocks.clear(); -static void print_name(const Value *V) { -#ifndef NDEBUG - if (!V) { - DEBUG(dbgs() << "null"); - return; - } - - if (const auto *F = dyn_cast<Function>(V)) - DEBUG(dbgs() << F->getName()); - else - DEBUG(V->dump()); -#endif + return true; } -void WinEHNumbering::processCallSite( - MutableArrayRef<std::unique_ptr<ActionHandler>> Actions, - ImmutableCallSite CS) { - DEBUG(dbgs() << "processCallSite (EH state = " << currentEHNumber() - << ") for: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); - - DEBUG(dbgs() << "HandlerStack: \n"); - for (int I = 0, E = HandlerStack.size(); I < E; ++I) { - DEBUG(dbgs() << " "); - print_name(HandlerStack[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << '\n'); - } - DEBUG(dbgs() << "Actions: \n"); - for (int I = 0, E = Actions.size(); I < E; ++I) { - DEBUG(dbgs() << " "); - print_name(Actions[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << '\n'); - } - int FirstMismatch = 0; - for (int E = std::min(HandlerStack.size(), Actions.size()); FirstMismatch < E; - ++FirstMismatch) { - if (HandlerStack[FirstMismatch]->getHandlerBlockOrFunc() != - Actions[FirstMismatch]->getHandlerBlockOrFunc()) - break; - } - - // Remove unmatched actions from the stack and process their EH states. - popUnmatchedActions(FirstMismatch); - - DEBUG(dbgs() << "Pushing actions for CallSite: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); - - bool LastActionWasCatch = false; - const LandingPadInst *LastRootLPad = nullptr; - for (size_t I = FirstMismatch; I != Actions.size(); ++I) { - // We can reuse eh states when pushing two catches for the same invoke. - bool CurrActionIsCatch = isa<CatchHandler>(Actions[I].get()); - auto *Handler = cast<Function>(Actions[I]->getHandlerBlockOrFunc()); - // Various conditions can lead to a handler being popped from the - // stack and re-pushed later. That shouldn't create a new state. - // FIXME: Can code optimization lead to re-used handlers? - if (FuncInfo.HandlerEnclosedState.count(Handler)) { - // If we already assigned the state enclosed by this handler re-use it. - Actions[I]->setEHState(FuncInfo.HandlerEnclosedState[Handler]); +// TODO: Share loads when one use dominates another, or when a catchpad exit +// dominates uses (needs dominators). +AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) { + BasicBlock *PHIBlock = PN->getParent(); + AllocaInst *SpillSlot = nullptr; + Instruction *EHPad = PHIBlock->getFirstNonPHI(); + + if (!isa<TerminatorInst>(EHPad)) { + // If the EHPad isn't a terminator, then we can insert a load in this block + // that will dominate all uses. + SpillSlot = new AllocaInst(PN->getType(), nullptr, + Twine(PN->getName(), ".wineh.spillslot"), + &F.getEntryBlock().front()); + Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), + &*PHIBlock->getFirstInsertionPt()); + PN->replaceAllUsesWith(V); + return SpillSlot; + } + + // Otherwise, we have a PHI on a terminator EHPad, and we give up and insert + // loads of the slot before every use. 
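// (The only EH pad that is also a terminator is a catchswitch, whose block
// cannot hold any other instruction; that is what forces the per-use reloads
// handled by replaceUseWithLoad further down.) A small sketch of the
// memoization the Loads map provides, over plain containers with illustrative
// names: at most one reload is materialized per incoming block, so a PHI
// never receives two different loads along edges from the same predecessor,
// which would be invalid SSA.
#include <map>
#include <string>

struct Reload { int Id; };

Reload *getOrCreateReload(std::map<std::string, Reload *> &Loads,
                          const std::string &IncomingBlock) {
  Reload *&Slot = Loads[IncomingBlock]; // default-initialized to nullptr
  if (!Slot)
    Slot = new Reload{static_cast<int>(Loads.size())}; // owned elsewhere in
                                                       // the real pass
  return Slot;
}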
+ DenseMap<BasicBlock *, Value *> Loads; + for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *UsingInst = cast<Instruction>(U.getUser()); + if (isa<PHINode>(UsingInst) && UsingInst->getParent()->isEHPad()) { + // Use is on an EH pad phi. Leave it alone; we'll insert loads and + // stores for it separately. continue; } - const LandingPadInst* RootLPad = FuncInfo.RootLPad[Handler]; - if (CurrActionIsCatch && LastActionWasCatch && RootLPad == LastRootLPad) { - DEBUG(dbgs() << "setEHState for handler to " << currentEHNumber() << "\n"); - Actions[I]->setEHState(currentEHNumber()); - } else { - DEBUG(dbgs() << "createUnwindMapEntry(" << currentEHNumber() << ", "); - print_name(Actions[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << ") with EH state " << NextState << "\n"); - createUnwindMapEntry(currentEHNumber(), Actions[I].get()); - DEBUG(dbgs() << "setEHState for handler to " << NextState << "\n"); - Actions[I]->setEHState(NextState); - NextState++; - } - HandlerStack.push_back(std::move(Actions[I])); - LastActionWasCatch = CurrActionIsCatch; - LastRootLPad = RootLPad; + replaceUseWithLoad(PN, U, SpillSlot, Loads, F); } - - // This is used to defer numbering states for a handler until after the - // last time it appears in an invoke action list. - if (CS.isInvoke()) { - for (int I = 0, E = HandlerStack.size(); I < E; ++I) { - auto *Handler = cast<Function>(HandlerStack[I]->getHandlerBlockOrFunc()); - if (FuncInfo.LastInvoke[Handler] != cast<InvokeInst>(CS.getInstruction())) - continue; - FuncInfo.LastInvokeVisited[Handler] = true; - DEBUG(dbgs() << "Last invoke of "); - print_name(Handler); - DEBUG(dbgs() << " has been visited.\n"); - } - } - - DEBUG(dbgs() << "In EHState " << currentEHNumber() << " for CallSite: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); + return SpillSlot; } -void WinEHNumbering::popUnmatchedActions(int FirstMismatch) { - // Don't recurse while we are looping over the handler stack. Instead, defer - // the numbering of the catch handlers until we are done popping. - SmallVector<CatchHandler *, 4> PoppedCatches; - for (int I = HandlerStack.size() - 1; I >= FirstMismatch; --I) { - std::unique_ptr<ActionHandler> Handler = HandlerStack.pop_back_val(); - if (isa<CatchHandler>(Handler.get())) - PoppedCatches.push_back(cast<CatchHandler>(Handler.release())); - } +// TODO: improve store placement. Inserting at def is probably good, but need +// to be careful not to introduce interfering stores (needs liveness analysis). +// TODO: identify related phi nodes that can share spill slots, and share them +// (also needs liveness). +void WinEHPrepare::insertPHIStores(PHINode *OriginalPHI, + AllocaInst *SpillSlot) { + // Use a worklist of (Block, Value) pairs -- the given Value needs to be + // stored to the spill slot by the end of the given Block. 
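// For example, when OriginalPHI lives in a catchswitch block and one of its
// incoming values is itself a PHI being demoted, neither block can hold a
// store, so the obligation keeps moving to predecessors until it reaches a
// block with an ordinary terminator.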
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Worklist; - int TryHigh = NextState - 1; - int LastTryLowIdx = 0; - for (int I = 0, E = PoppedCatches.size(); I != E; ++I) { - CatchHandler *CH = PoppedCatches[I]; - DEBUG(dbgs() << "Popped handler with state " << CH->getEHState() << "\n"); - if (I + 1 == E || CH->getEHState() != PoppedCatches[I + 1]->getEHState()) { - int TryLow = CH->getEHState(); - auto Handlers = - makeArrayRef(&PoppedCatches[LastTryLowIdx], I - LastTryLowIdx + 1); - DEBUG(dbgs() << "createTryBlockMapEntry(" << TryLow << ", " << TryHigh); - for (size_t J = 0; J < Handlers.size(); ++J) { - DEBUG(dbgs() << ", "); - print_name(Handlers[J]->getHandlerBlockOrFunc()); - } - DEBUG(dbgs() << ")\n"); - createTryBlockMapEntry(TryLow, TryHigh, Handlers); - LastTryLowIdx = I + 1; - } - } + Worklist.push_back({OriginalPHI->getParent(), OriginalPHI}); - for (CatchHandler *CH : PoppedCatches) { - if (auto *F = dyn_cast<Function>(CH->getHandlerBlockOrFunc())) { - if (FuncInfo.LastInvokeVisited[F]) { - DEBUG(dbgs() << "Assigning base state " << NextState << " to "); - print_name(F); - DEBUG(dbgs() << '\n'); - FuncInfo.HandlerBaseState[F] = NextState; - DEBUG(dbgs() << "createUnwindMapEntry(" << currentEHNumber() - << ", null)\n"); - createUnwindMapEntry(currentEHNumber(), nullptr); - ++NextState; - calculateStateNumbers(*F); + while (!Worklist.empty()) { + BasicBlock *EHBlock; + Value *InVal; + std::tie(EHBlock, InVal) = Worklist.pop_back_val(); + + PHINode *PN = dyn_cast<PHINode>(InVal); + if (PN && PN->getParent() == EHBlock) { + // The value is defined by another PHI we need to remove, with no room to + // insert a store after the PHI, so each predecessor needs to store its + // incoming value. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) { + Value *PredVal = PN->getIncomingValue(i); + + // Undef can safely be skipped. + if (isa<UndefValue>(PredVal)) + continue; + + insertPHIStore(PN->getIncomingBlock(i), PredVal, SpillSlot, Worklist); } - else { - DEBUG(dbgs() << "Deferring handling of "); - print_name(F); - DEBUG(dbgs() << " until last invoke visited.\n"); + } else { + // We need to store InVal, which dominates EHBlock, but can't put a store + // in EHBlock, so need to put stores in each predecessor. + for (BasicBlock *PredBlock : predecessors(EHBlock)) { + insertPHIStore(PredBlock, InVal, SpillSlot, Worklist); } } - delete CH; } } -void WinEHNumbering::calculateStateNumbers(const Function &F) { - auto I = VisitedHandlers.insert(&F); - if (!I.second) - return; // We've already visited this handler, don't renumber it. 
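// A compilable sketch of the worklist scheme implemented by insertPHIStores
// and insertPHIStore above, over plain containers with illustrative names; it
// ignores the nested-PHI chasing. An entry (Block, Value) means "Value must be
// stored to the spill slot by the end of Block"; blocks that cannot hold a
// store (unsplittable EH pads) push the obligation to their predecessors.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

struct CFG {
  std::map<std::string, std::vector<std::string>> Preds;
  std::set<std::string> UnsplittablePads; // e.g. catchswitch blocks
};

void insertStores(const CFG &G, const std::string &DefBlock,
                  const std::string &Val) {
  std::vector<std::pair<std::string, std::string>> Worklist{{DefBlock, Val}};
  while (!Worklist.empty()) {
    auto [Block, V] = Worklist.back();
    Worklist.pop_back();
    for (const std::string &Pred : G.Preds.at(Block)) {
      if (G.UnsplittablePads.count(Pred))
        Worklist.push_back({Pred, V}); // defer, as insertPHIStore does
      else
        std::cout << "store " << V << " at end of " << Pred << "\n";
    }
  }
}

int main() {
  CFG G;
  G.Preds = {{"catchswitch", {"left", "pad2"}},
             {"pad2", {"entry"}},
             {"left", {}},
             {"entry", {}}};
  G.UnsplittablePads = {"catchswitch", "pad2"};
  insertStores(G, "catchswitch", "%x"); // stores land in "left" and "entry"
}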
+void WinEHPrepare::insertPHIStore(
+ BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist) {

- int OldBaseState = CurrentBaseState;
- if (FuncInfo.HandlerBaseState.count(&F)) {
- CurrentBaseState = FuncInfo.HandlerBaseState[&F];
- }
-
- size_t SavedHandlerStackSize = HandlerStack.size();
-
- DEBUG(dbgs() << "Calculating state numbers for: " << F.getName() << '\n');
- SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
- const auto *CI = dyn_cast<CallInst>(&I);
- if (!CI || CI->doesNotThrow())
- continue;
- processCallSite(None, CI);
- }
- const auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
- if (!II)
- continue;
- const LandingPadInst *LPI = II->getLandingPadInst();
- auto *ActionsCall = dyn_cast<IntrinsicInst>(LPI->getNextNode());
- if (!ActionsCall)
- continue;
- parseEHActions(ActionsCall, ActionList);
- if (ActionList.empty())
- continue;
- processCallSite(ActionList, II);
- ActionList.clear();
- FuncInfo.LandingPadStateMap[LPI] = currentEHNumber();
- DEBUG(dbgs() << "Assigning state " << currentEHNumber()
- << " to landing pad at " << LPI->getParent()->getName()
- << '\n');
+ if (PredBlock->isEHPad() &&
+ isa<TerminatorInst>(PredBlock->getFirstNonPHI())) {
+ // Pred is unsplittable, so we need to queue it on the worklist.
+ Worklist.push_back({PredBlock, PredVal});
+ return;
}

- // Pop any actions that were pushed on the stack for this function.
- popUnmatchedActions(SavedHandlerStackSize);
-
- DEBUG(dbgs() << "Assigning max state " << NextState - 1
- << " to " << F.getName() << '\n');
- FuncInfo.CatchHandlerMaxState[&F] = NextState - 1;
-
- CurrentBaseState = OldBaseState;
+ // Otherwise, insert the store at the end of the basic block.
+ new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
}

-// This function follows the same basic traversal as calculateStateNumbers
-// but it is necessary to identify the root landing pad associated
-// with each action before we start assigning state numbers.
-void WinEHNumbering::findActionRootLPads(const Function &F) {
- auto I = VisitedHandlers.insert(&F);
- if (!I.second)
- return; // We've already visited this handler, don't revisit it.
-
- SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
- for (const BasicBlock &BB : F) {
- const auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
- if (!II)
- continue;
- const LandingPadInst *LPI = II->getLandingPadInst();
- auto *ActionsCall = dyn_cast<IntrinsicInst>(LPI->getNextNode());
- if (!ActionsCall)
- continue;
-
- assert(ActionsCall->getIntrinsicID() == Intrinsic::eh_actions);
- parseEHActions(ActionsCall, ActionList);
- if (ActionList.empty())
- continue;
- for (int I = 0, E = ActionList.size(); I < E; ++I) {
- if (auto *Handler
- = dyn_cast<Function>(ActionList[I]->getHandlerBlockOrFunc())) {
- FuncInfo.LastInvoke[Handler] = II;
- // Don't replace the root landing pad if we previously saw this
- // handler in a different function.
- if (FuncInfo.RootLPad.count(Handler) &&
- FuncInfo.RootLPad[Handler]->getParent()->getParent() != &F)
- continue;
- DEBUG(dbgs() << "Setting root lpad for ");
- print_name(Handler);
- DEBUG(dbgs() << " to " << LPI->getParent()->getName() << '\n');
- FuncInfo.RootLPad[Handler] = LPI;
- }
+void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads,
+ Function &F) {
+ // Lazily create the spill slot.
+ if (!SpillSlot)
+ SpillSlot = new AllocaInst(V->getType(), nullptr,
+ Twine(V->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
+ // If this is a PHI node, we can't insert a load of the value before
+ // the use. Instead insert the load in the predecessor block
+ // corresponding to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this
+ // PHI node, we cannot have multiple loads. The problem is that
+ // the resulting PHI node will have multiple values (from each load)
+ // coming in from the same block, which is illegal SSA form.
+ // For this reason, we keep track of and reuse loads we insert.
+ BasicBlock *IncomingBlock = UsingPHI->getIncomingBlock(U);
+ if (auto *CatchRet =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ // Putting a load above a catchret and a use on the phi would still leave
+ // a cross-funclet def/use. We need to split the edge, change the
+ // catchret to target the new block, and put the load there.
+ BasicBlock *PHIBlock = UsingInst->getParent();
+ BasicBlock *NewBlock = SplitEdge(IncomingBlock, PHIBlock);
+ // SplitEdge gives us:
+ // IncomingBlock:
+ // ...
+ // br label %NewBlock
+ // NewBlock:
+ // catchret label %PHIBlock
+ // But we need:
+ // IncomingBlock:
+ // ...
+ // catchret label %NewBlock
+ // NewBlock:
+ // br label %PHIBlock
+ // So move the terminators to each others' blocks and swap their
+ // successors.
+ BranchInst *Goto = cast<BranchInst>(IncomingBlock->getTerminator());
+ Goto->removeFromParent();
+ CatchRet->removeFromParent();
+ IncomingBlock->getInstList().push_back(CatchRet);
+ NewBlock->getInstList().push_back(Goto);
+ Goto->setSuccessor(0, PHIBlock);
+ CatchRet->setSuccessor(NewBlock);
+ // Update the color mapping for the newly split edge.
+ ColorVector &ColorsForPHIBlock = BlockColors[PHIBlock];
+ BlockColors[NewBlock] = ColorsForPHIBlock;
+ for (BasicBlock *FuncletPad : ColorsForPHIBlock)
+ FuncletBlocks[FuncletPad].push_back(NewBlock);
+ // Treat the new block as incoming for load insertion.
+ IncomingBlock = NewBlock;
}
- // Walk the actions again and look for nested handlers. This has to
- // happen after all of the actions have been processed in the current
- // function.
- for (int I = 0, E = ActionList.size(); I < E; ++I)
- if (auto *Handler
- = dyn_cast<Function>(ActionList[I]->getHandlerBlockOrFunc()))
- findActionRootLPads(*Handler);
- ActionList.clear();
}
+ Value *&Load = Loads[IncomingBlock];
+ // Insert the load into the predecessor block.
+ if (!Load)
+ Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, IncomingBlock->getTerminator());
+
+ U.set(Load);
+ } else {
+ // Reload right before the old use.
+ auto *Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, UsingInst);
+ U.set(Load);
+ }
}

-void llvm::calculateWinCXXEHStateNumbers(const Function *ParentFn,
- WinEHFuncInfo &FuncInfo) {
- // Return if it's already been done.
- if (!FuncInfo.LandingPadStateMap.empty())
- return;
-
- WinEHNumbering Num(FuncInfo);
- Num.findActionRootLPads(*ParentFn);
- // The VisitedHandlers list is used by both findActionRootLPads and
- // calculateStateNumbers, but both functions need to visit all handlers.
- Num.VisitedHandlers.clear();
- Num.calculateStateNumbers(*ParentFn);
- // Pop everything on the handler stack.
- // It may be necessary to call this more than once because a handler can - // be pushed on the stack as a result of clearing the stack. - while (!Num.HandlerStack.empty()) - Num.processCallSite(None, ImmutableCallSite()); +void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II, + MCSymbol *InvokeBegin, + MCSymbol *InvokeEnd) { + assert(InvokeStateMap.count(II) && + "should get invoke with precomputed state"); + LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd); } + +WinEHFuncInfo::WinEHFuncInfo() {} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp new file mode 100644 index 0000000..91b71cc --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp @@ -0,0 +1,165 @@ +//===-- FieldListRecordBuilder.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +FieldListRecordBuilder::FieldListRecordBuilder() + : ListRecordBuilder(TypeRecordKind::FieldList) {} + +void FieldListRecordBuilder::writeBaseClass(MemberAccess Access, TypeIndex Type, + uint64_t Offset) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::BaseClass); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeEncodedUnsignedInteger(Offset); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeEnumerate(MemberAccess Access, uint64_t Value, + StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Enumerate); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeEncodedUnsignedInteger(Value); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeMember(MemberAccess Access, TypeIndex Type, + uint64_t Offset, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Member); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeEncodedUnsignedInteger(Offset); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeMethod(uint16_t OverloadCount, + TypeIndex MethodList, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Method); + Builder.writeUInt16(OverloadCount); + Builder.writeTypeIndex(MethodList); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeOneMethod( + MemberAccess Access, MethodKind Kind, MethodOptions Options, TypeIndex Type, + int32_t VTableSlotOffset, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + uint16_t Flags = static_cast<uint16_t>(Access); + Flags |= static_cast<uint16_t>(Kind) << MethodKindShift; + Flags |= static_cast<uint16_t>(Options); + + Builder.writeTypeRecordKind(TypeRecordKind::OneMethod); + Builder.writeUInt16(Flags); + Builder.writeTypeIndex(Type); + switch (Kind) { + case MethodKind::IntroducingVirtual: + case MethodKind::PureIntroducingVirtual: + assert(VTableSlotOffset >= 0); + 
Builder.writeInt32(VTableSlotOffset); + break; + + default: + assert(VTableSlotOffset == -1); + break; + } + + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeOneMethod(const MethodInfo &Method, + StringRef Name) { + writeOneMethod(Method.getAccess(), Method.getKind(), Method.getOptions(), + Method.getType(), Method.getVTableSlotOffset(), Name); +} + +void FieldListRecordBuilder::writeNestedType(TypeIndex Type, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::NestedType); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeStaticMember(MemberAccess Access, + TypeIndex Type, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::StaticMember); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeIndirectVirtualBaseClass( + MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType, + int64_t VirtualBasePointerOffset, uint64_t SlotIndex) { + writeVirtualBaseClass(TypeRecordKind::IndirectVirtualBaseClass, Access, Type, + VirtualBasePointerType, VirtualBasePointerOffset, + SlotIndex); +} + +void FieldListRecordBuilder::writeVirtualBaseClass( + MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType, + int64_t VirtualBasePointerOffset, uint64_t SlotIndex) { + writeVirtualBaseClass(TypeRecordKind::VirtualBaseClass, Access, Type, + VirtualBasePointerType, VirtualBasePointerOffset, + SlotIndex); +} + +void FieldListRecordBuilder::writeVirtualBaseClass( + TypeRecordKind Kind, MemberAccess Access, TypeIndex Type, + TypeIndex VirtualBasePointerType, int64_t VirtualBasePointerOffset, + uint64_t SlotIndex) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(Kind); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeTypeIndex(VirtualBasePointerType); + Builder.writeEncodedInteger(VirtualBasePointerOffset); + Builder.writeEncodedUnsignedInteger(SlotIndex); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeVirtualFunctionTablePointer(TypeIndex Type) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::VirtualFunctionTablePointer); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + + finishSubRecord(); +}
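// A standalone sketch of the attribute packing used by writeOneMethod above.
// The shift and enum values mirror the CodeView method-attribute layout the
// builder assumes (two access bits, then three method-kind bits); they are
// spelled out here so the example is self-contained.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class MemberAccess : uint16_t { Private = 1, Protected = 2, Public = 3 };
enum class MethodKind : uint16_t {
  Vanilla = 0,
  Virtual = 1,
  Static = 2,
  Friend = 3,
  IntroducingVirtual = 4,
  PureVirtual = 5,
  PureIntroducingVirtual = 6
};
constexpr unsigned MethodKindShift = 2;

uint16_t packMethodAttributes(MemberAccess Access, MethodKind Kind,
                              uint16_t Options) {
  uint16_t Flags = static_cast<uint16_t>(Access);
  Flags |= static_cast<uint16_t>(Kind) << MethodKindShift;
  Flags |= Options;
  return Flags;
}

int main() {
  // Public (3) | IntroducingVirtual (4 << 2) == 0x13.
  assert(packMethodAttributes(MemberAccess::Public,
                              MethodKind::IntroducingVirtual, 0) == 0x13);
  std::puts("ok");
}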
\ No newline at end of file diff --git a/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp b/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp new file mode 100644 index 0000000..4cb766b --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp @@ -0,0 +1,22 @@ +//===-- Line.cpp ----------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/Line.h" + +using namespace llvm; +using namespace codeview; + +LineInfo::LineInfo(uint32_t StartLine, uint32_t EndLine, bool IsStatement) { + LineData = StartLine & StartLineMask; + uint32_t LineDelta = EndLine - StartLine; + LineData |= (LineDelta << EndLineDeltaShift) & EndLineDeltaMask; + if (IsStatement) { + LineData |= StatementFlag; + } +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp new file mode 100644 index 0000000..69c7e87 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp @@ -0,0 +1,31 @@ +//===-- ListRecordBuilder.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/ListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +ListRecordBuilder::ListRecordBuilder(TypeRecordKind Kind) : Builder(Kind) {} + +void ListRecordBuilder::finishSubRecord() { + // The builder starts at offset 2 in the actual CodeView buffer, so add an + // additional offset of 2 before computing the alignment. + uint32_t Remainder = (Builder.size() + 2) % 4; + if (Remainder != 0) { + for (int32_t PaddingBytesLeft = 4 - Remainder; PaddingBytesLeft > 0; + --PaddingBytesLeft) { + Builder.writeUInt8(0xf0 + PaddingBytesLeft); + } + } + + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. + assert(Builder.size() < 65536); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp new file mode 100644 index 0000000..9afce92 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp @@ -0,0 +1,35 @@ +//===-- MemoryTypeTableBuilder.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/MemoryTypeTableBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" + +using namespace llvm; +using namespace codeview; + +MemoryTypeTableBuilder::Record::Record(StringRef RData) + : Size(RData.size()), Data(new char[RData.size()]) { + memcpy(Data.get(), RData.data(), RData.size()); +} + +TypeIndex MemoryTypeTableBuilder::writeRecord(StringRef Data) { + auto I = HashedRecords.find(Data); + if (I != HashedRecords.end()) { + return I->second; + } + + std::unique_ptr<Record> R(new Record(Data)); + + TypeIndex TI(static_cast<uint32_t>(Records.size()) + + TypeIndex::FirstNonSimpleIndex); + HashedRecords.insert(std::make_pair(StringRef(R->data(), R->size()), TI)); + Records.push_back(std::move(R)); + + return TI; +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp new file mode 100644 index 0000000..8893025 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp @@ -0,0 +1,49 @@ +//===-- MethodListRecordBuilder.cpp ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/MethodListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +MethodListRecordBuilder::MethodListRecordBuilder() + : ListRecordBuilder(TypeRecordKind::MethodList) {} + +void MethodListRecordBuilder::writeMethod(MemberAccess Access, MethodKind Kind, + MethodOptions Options, TypeIndex Type, + int32_t VTableSlotOffset) { + TypeRecordBuilder &Builder = getBuilder(); + + uint16_t Flags = static_cast<uint16_t>(Access); + Flags |= static_cast<uint16_t>(Kind) << MethodKindShift; + Flags |= static_cast<uint16_t>(Options); + + Builder.writeUInt16(Flags); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + switch (Kind) { + case MethodKind::IntroducingVirtual: + case MethodKind::PureIntroducingVirtual: + assert(VTableSlotOffset >= 0); + Builder.writeInt32(VTableSlotOffset); + break; + + default: + assert(VTableSlotOffset == -1); + break; + } + + // TODO: Fail if too big? +} + +void MethodListRecordBuilder::writeMethod(const MethodInfo &Method) { + writeMethod(Method.getAccess(), Method.getKind(), Method.getOptions(), + Method.getType(), Method.getVTableSlotOffset()); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp new file mode 100644 index 0000000..cbf464f --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp @@ -0,0 +1,113 @@ +//===-- TypeRecordBuilder.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +TypeRecordBuilder::TypeRecordBuilder(TypeRecordKind Kind) : Stream(Buffer), + Writer(Stream) { + writeTypeRecordKind(Kind); +} + +StringRef TypeRecordBuilder::str() { + return StringRef(Buffer.data(), Buffer.size()); +} + +void TypeRecordBuilder::writeUInt8(uint8_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt16(int16_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt16(uint16_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt32(int32_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt32(uint32_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt64(int64_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt64(uint64_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeEncodedInteger(int64_t Value) { + if (Value >= 0) { + writeEncodedUnsignedInteger(static_cast<uint64_t>(Value)); + } else { + writeEncodedSignedInteger(Value); + } +} + +void TypeRecordBuilder::writeEncodedSignedInteger(int64_t Value) { + if (Value >= std::numeric_limits<int8_t>::min() && + Value <= std::numeric_limits<int8_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::SByte)); + writeInt16(static_cast<int8_t>(Value)); + } else if (Value >= std::numeric_limits<int16_t>::min() && + Value <= std::numeric_limits<int16_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int16)); + writeInt16(static_cast<int16_t>(Value)); + } else if (Value >= std::numeric_limits<int32_t>::min() && + Value <= std::numeric_limits<int32_t>::max()) { + writeUInt16(static_cast<uint32_t>(TypeRecordKind::Int32)); + writeInt32(static_cast<int32_t>(Value)); + } else { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int64)); + writeInt64(Value); + } +} + +void TypeRecordBuilder::writeEncodedUnsignedInteger(uint64_t Value) { + if (Value < static_cast<uint16_t>(TypeRecordKind::SByte)) { + writeUInt16(static_cast<uint16_t>(Value)); + } else if (Value <= std::numeric_limits<uint16_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt16)); + writeUInt16(static_cast<uint16_t>(Value)); + } else if (Value <= std::numeric_limits<uint32_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt32)); + writeUInt32(static_cast<uint32_t>(Value)); + } else { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt64)); + writeUInt64(Value); + } +} + +void TypeRecordBuilder::writeNullTerminatedString(const char *Value) { + assert(Value != nullptr); + + size_t Length = strlen(Value); + Stream.write(Value, Length); + writeUInt8(0); +} + +void TypeRecordBuilder::writeNullTerminatedString(StringRef Value) { + Stream.write(Value.data(), Value.size()); + writeUInt8(0); +} + +void TypeRecordBuilder::writeTypeIndex(TypeIndex TypeInd) { + writeUInt32(TypeInd.getIndex()); +} + +void TypeRecordBuilder::writeTypeRecordKind(TypeRecordKind Kind) { + writeUInt16(static_cast<uint16_t>(Kind)); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp new file mode 100644 index 0000000..4af5dca --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp @@ -0,0 +1,217 @@ +//===-- TypeTableBuilder.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// 
+// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/MethodListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecordBuilder.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace codeview; + +namespace { + +const int PointerKindShift = 0; +const int PointerModeShift = 5; +const int PointerSizeShift = 13; + +const int ClassHfaKindShift = 11; +const int ClassWindowsRTClassKindShift = 14; + +void writePointerBase(TypeRecordBuilder &Builder, + const PointerRecordBase &Record) { + Builder.writeTypeIndex(Record.getReferentType()); + uint32_t flags = + static_cast<uint32_t>(Record.getOptions()) | + (Record.getSize() << PointerSizeShift) | + (static_cast<uint32_t>(Record.getMode()) << PointerModeShift) | + (static_cast<uint32_t>(Record.getPointerKind()) << PointerKindShift); + Builder.writeUInt32(flags); +} +} + +TypeTableBuilder::TypeTableBuilder() {} + +TypeTableBuilder::~TypeTableBuilder() {} + +TypeIndex TypeTableBuilder::writeModifier(const ModifierRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Modifier); + + Builder.writeTypeIndex(Record.getModifiedType()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions())); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeProcedure(const ProcedureRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Procedure); + + Builder.writeTypeIndex(Record.getReturnType()); + Builder.writeUInt8(static_cast<uint8_t>(Record.getCallConv())); + Builder.writeUInt8(static_cast<uint8_t>(Record.getOptions())); + Builder.writeUInt16(Record.getParameterCount()); + Builder.writeTypeIndex(Record.getArgumentList()); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writeMemberFunction(const MemberFunctionRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::MemberFunction); + + Builder.writeTypeIndex(Record.getReturnType()); + Builder.writeTypeIndex(Record.getClassType()); + Builder.writeTypeIndex(Record.getThisType()); + Builder.writeUInt8(static_cast<uint8_t>(Record.getCallConv())); + Builder.writeUInt8(static_cast<uint8_t>(Record.getOptions())); + Builder.writeUInt16(Record.getParameterCount()); + Builder.writeTypeIndex(Record.getArgumentList()); + Builder.writeInt32(Record.getThisPointerAdjustment()); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writeArgumentList(const ArgumentListRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::ArgumentList); + + Builder.writeUInt32(Record.getArgumentTypes().size()); + for (TypeIndex TI : Record.getArgumentTypes()) { + Builder.writeTypeIndex(TI); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writePointer(const PointerRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Pointer); + + writePointerBase(Builder, Record); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writePointerToMember(const PointerToMemberRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Pointer); + + writePointerBase(Builder, Record); + + Builder.writeTypeIndex(Record.getContainingType()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getRepresentation())); + + return 
writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeArray(const ArrayRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Array); + + Builder.writeTypeIndex(Record.getElementType()); + Builder.writeTypeIndex(Record.getIndexType()); + Builder.writeEncodedUnsignedInteger(Record.getSize()); + Builder.writeNullTerminatedString(Record.getName()); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeAggregate(const AggregateRecord &Record) { + assert((Record.getKind() == TypeRecordKind::Structure) || + (Record.getKind() == TypeRecordKind::Class) || + (Record.getKind() == TypeRecordKind::Union)); + + TypeRecordBuilder Builder(Record.getKind()); + + Builder.writeUInt16(Record.getMemberCount()); + uint16_t Flags = + static_cast<uint16_t>(Record.getOptions()) | + (static_cast<uint16_t>(Record.getHfa()) << ClassHfaKindShift) | + (static_cast<uint16_t>(Record.getWinRTKind()) + << ClassWindowsRTClassKindShift); + Builder.writeUInt16(Flags); + Builder.writeTypeIndex(Record.getFieldList()); + if (Record.getKind() != TypeRecordKind::Union) { + Builder.writeTypeIndex(Record.getDerivationList()); + Builder.writeTypeIndex(Record.getVTableShape()); + } else { + assert(Record.getDerivationList() == TypeIndex()); + assert(Record.getVTableShape() == TypeIndex()); + } + Builder.writeEncodedUnsignedInteger(Record.getSize()); + Builder.writeNullTerminatedString(Record.getName()); + if ((Record.getOptions() & ClassOptions::HasUniqueName) != + ClassOptions::None) { + Builder.writeNullTerminatedString(Record.getUniqueName()); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeEnum(const EnumRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Enum); + + Builder.writeUInt16(Record.getMemberCount()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions())); + Builder.writeTypeIndex(Record.getUnderlyingType()); + Builder.writeTypeIndex(Record.getFieldList()); + Builder.writeNullTerminatedString(Record.getName()); + if ((Record.getOptions() & ClassOptions::HasUniqueName) != + ClassOptions::None) { + Builder.writeNullTerminatedString(Record.getUniqueName()); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeBitField(const BitFieldRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::BitField); + + Builder.writeTypeIndex(Record.getType()); + Builder.writeUInt8(Record.getBitSize()); + Builder.writeUInt8(Record.getBitOffset()); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeVirtualTableShape( + const VirtualTableShapeRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::VirtualTableShape); + + ArrayRef<VirtualTableSlotKind> Slots = Record.getSlots(); + + Builder.writeUInt16(Slots.size()); + for (size_t SlotIndex = 0; SlotIndex < Slots.size(); SlotIndex += 2) { + uint8_t Byte = static_cast<uint8_t>(Slots[SlotIndex]) << 4; + if ((SlotIndex + 1) < Slots.size()) { + Byte |= static_cast<uint8_t>(Slots[SlotIndex + 1]); + } + Builder.writeUInt8(Byte); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeRecord(TypeRecordBuilder &Builder) { + return writeRecord(Builder.str()); +} + +TypeIndex TypeTableBuilder::writeFieldList(FieldListRecordBuilder &FieldList) { + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. 
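// (In the on-disk CodeView format, the continuation member referred to by
// this TODO is LF_INDEX, whose payload is simply the TypeIndex of the next
// record in the chain.)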
+ return writeRecord(FieldList.str()); +} + +TypeIndex +TypeTableBuilder::writeMethodList(MethodListRecordBuilder &MethodList) { + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. + return writeRecord(MethodList.str()); +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 96bcf15..a4195b7 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Format.h" @@ -126,6 +127,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { getDebugFrame()->dump(OS); } + if (DumpType == DIDT_All || DumpType == DIDT_Macro) { + OS << "\n.debug_macinfo contents:\n"; + getDebugMacro()->dump(OS); + } + uint32_t offset = 0; if (DumpType == DIDT_All || DumpType == DIDT_Aranges) { OS << "\n.debug_aranges contents:\n"; @@ -155,6 +161,16 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { } } + if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) { + OS << "\n.debug_cu_index contents:\n"; + getCUIndex().dump(OS); + } + + if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) { + OS << "\n.debug_tu_index contents:\n"; + getTUIndex().dump(OS); + } + if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { OS << "\n.debug_line.dwo contents:\n"; unsigned stmtOffset = 0; @@ -250,6 +266,28 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { getStringSection(), isLittleEndian()); } +const DWARFUnitIndex &DWARFContext::getCUIndex() { + if (CUIndex) + return *CUIndex; + + DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0); + + CUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_INFO); + CUIndex->parse(CUIndexData); + return *CUIndex; +} + +const DWARFUnitIndex &DWARFContext::getTUIndex() { + if (TUIndex) + return *TUIndex; + + DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0); + + TUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_TYPES); + TUIndex->parse(TUIndexData); + return *TUIndex; +} + const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() { if (Abbrev) return Abbrev.get(); @@ -322,24 +360,37 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() { return DebugFrame.get(); } +const DWARFDebugMacro *DWARFContext::getDebugMacro() { + if (Macro) + return Macro.get(); + + DataExtractor MacinfoData(getMacinfoSection(), isLittleEndian(), 0); + Macro.reset(new DWARFDebugMacro()); + Macro->parse(MacinfoData); + return Macro.get(); +} + const DWARFLineTable * DWARFContext::getLineTableForUnit(DWARFUnit *U) { if (!Line) Line.reset(new DWARFDebugLine(&getLineSection().Relocs)); + const auto *UnitDIE = U->getUnitDIE(); if (UnitDIE == nullptr) return nullptr; + unsigned stmtOffset = UnitDIE->getAttributeValueAsSectionOffset(U, DW_AT_stmt_list, -1U); if (stmtOffset == -1U) return nullptr; // No line table for this compile unit. + stmtOffset += U->getLineTableOffset(); // See if the line table is cached. if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset)) return lt; // We have to parse it first. 
- DataExtractor lineData(getLineSection().Data, isLittleEndian(), + DataExtractor lineData(U->getLineSection(), isLittleEndian(), U->getAddressByteSize()); return Line->getOrParseLineTable(lineData, stmtOffset); } @@ -556,10 +607,11 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, continue; StringRef data; + section_iterator RelocatedSection = Section.getRelocatedSection(); // Try to obtain an already relocated version of this section. // Else use the unrelocated section from the object file. We'll have to // apply relocations ourselves later. - if (!L || !L->getLoadedSectionContents(name,data)) + if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) Section.getContents(data); name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes. @@ -591,6 +643,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, .Case("debug_frame", &DebugFrameSection) .Case("debug_str", &StringSection) .Case("debug_ranges", &RangeSection) + .Case("debug_macinfo", &MacinfoSection) .Case("debug_pubnames", &PubNamesSection) .Case("debug_pubtypes", &PubTypesSection) .Case("debug_gnu_pubnames", &GnuPubNamesSection) @@ -607,6 +660,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, .Case("apple_namespaces", &AppleNamespacesSection.Data) .Case("apple_namespac", &AppleNamespacesSection.Data) .Case("apple_objc", &AppleObjCSection.Data) + .Case("debug_cu_index", &CUIndexSection) + .Case("debug_tu_index", &TUIndexSection) // Any more debug info sections go here. .Default(nullptr); if (SectionData) { @@ -623,7 +678,6 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, TypesDWOSections[Section].Data = data; } - section_iterator RelocatedSection = Section.getRelocatedSection(); if (RelocatedSection == Obj.section_end()) continue; @@ -634,7 +688,15 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // If the section we're relocating was relocated already by the JIT, // then we used the relocated version above, so we do not need to process // relocations for it now. - if (L && L->getLoadedSectionContents(RelSecName,RelSecData)) + if (L && L->getLoadedSectionContents(*RelocatedSection,RelSecData)) + continue; + + // In Mach-o files, the relocations do not need to be applied if + // there is no load offset to apply. The value read at the + // relocation point already factors in the section address + // (actually applying the relocations will produce wrong results + // as the section address will be added twice). + if (!L && isa<MachOObjectFile>(&Obj)) continue; RelSecName = RelSecName.substr( @@ -685,13 +747,19 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, } SymAddr = *SymAddrOrErr; // Also remember what section this symbol is in for later - Sym->getSection(RSec); + RSec = *Sym->getSection(); } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) { // MachO also has relocations that point to sections and // scattered relocations. - // FIXME: We are not handling scattered relocations, do we have to? - RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); - SymAddr = RSec->getAddress(); + auto RelocInfo = MObj->getRelocation(Reloc.getRawDataRefImpl()); + if (MObj->isRelocationScattered(RelocInfo)) { + // FIXME: it's not clear how to correctly handle scattered + // relocations. 
+ continue; + } else { + RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); + SymAddr = RSec->getAddress(); + } } // If we are given load addresses for the sections, we need to adjust: @@ -699,12 +767,15 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // (Address of Section in File) + // (Load Address of Section) if (L != nullptr && RSec != Obj.section_end()) { - // RSec is now either the section being targetted or the section - // containing the symbol being targetted. In either case, + // RSec is now either the section being targeted or the section + // containing the symbol being targeted. In either case, // we need to perform the same computation. StringRef SecName; RSec->getName(SecName); - SectionLoadAddress = L->getSectionLoadAddress(SecName); +// llvm::dbgs() << "Name: '" << SecName +// << "', RSec: " << RSec->getRawDataRefImpl() +// << ", Section: " << Section.getRawDataRefImpl() << "\n"; + SectionLoadAddress = L->getSectionLoadAddress(*RSec); if (SectionLoadAddress != 0) SymAddr += SectionLoadAddress - RSec->getAddress(); } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 5abbde4..62d5e66 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -139,7 +139,7 @@ void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS, std::string File; auto Color = syntax::Enumerator; if (attr == DW_AT_decl_file || attr == DW_AT_call_file) { - Color = syntax::String; + Color = syntax::String; if (const auto *LT = u->getContext().getLineTableForUnit(u)) if (LT->getFileNameByIndex( formValue.getAsUnsignedConstant().getValue(), diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp new file mode 100644 index 0000000..b6555fa --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -0,0 +1,103 @@ +//===-- DWARFDebugMacro.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SyntaxHighlighting.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace dwarf;
+using namespace syntax;
+
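+// The ".debug_macinfo" section is a flat series of entries: each entry
+// starts with a ULEB128 macinfo type code followed by operands determined
+// by that code, and the series is terminated by an entry whose type code
+// is zero (see parse() below).
+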
+void DWARFDebugMacro::dump(raw_ostream &OS) const {
+ unsigned IndLevel = 0;
+ for (const Entry &E : Macros) {
+ // There should not be a DW_MACINFO_end_file entry when IndLevel is zero;
+ // this check handles the case of a corrupted ".debug_macinfo" section.
+ if (IndLevel > 0)
+ IndLevel -= (E.Type == DW_MACINFO_end_file);
+ // Print indentation.
+ for (unsigned I = 0; I < IndLevel; I++)
+ OS << " ";
+ IndLevel += (E.Type == DW_MACINFO_start_file);
+
+ WithColor(OS, syntax::Macro).get() << MacinfoString(E.Type);
+ switch (E.Type) {
+ default:
+ // Got a corrupted ".debug_macinfo" section (invalid macinfo type).
+ break;
+ case DW_MACINFO_define:
+ case DW_MACINFO_undef:
+ OS << " - lineno: " << E.Line;
+ OS << " macro: " << E.MacroStr;
+ break;
+ case DW_MACINFO_start_file:
+ OS << " - lineno: " << E.Line;
+ OS << " filenum: " << E.File;
+ break;
+ case DW_MACINFO_end_file:
+ break;
+ case DW_MACINFO_vendor_ext:
+ OS << " - constant: " << E.ExtConstant;
+ OS << " string: " << E.ExtStr;
+ break;
+ }
+ OS << "\n";
+ }
+}
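+
+// For illustration, a sequence of start_file (filenum 1), define ("X 3" at
+// line 7), and end_file entries dumps roughly as:
+//
+//   DW_MACINFO_start_file - lineno: 0 filenum: 1
+//     DW_MACINFO_define - lineno: 7 macro: X 3
+//   DW_MACINFO_end_file
+//
+// since IndLevel grows after each start_file is printed and shrinks before
+// the matching end_file is printed.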
+
+void DWARFDebugMacro::parse(DataExtractor data) {
+ uint32_t Offset = 0;
+ while (data.isValidOffset(Offset)) {
+ // A macro list entry consists of:
+ Entry E;
+ // 1. Macinfo type
+ E.Type = data.getULEB128(&Offset);
+
+ if (E.Type == 0) {
+ // Reached end of ".debug_macinfo" section.
+ return;
+ }
+
+ switch (E.Type) {
+ default:
+ // Got a corrupted ".debug_macinfo" section (invalid macinfo type).
+ // Push the corrupted entry to the list and halt parsing.
+ E.Type = DW_MACINFO_invalid;
+ Macros.push_back(E);
+ return;
+ case DW_MACINFO_define:
+ case DW_MACINFO_undef:
+ // 2. Source line
+ E.Line = data.getULEB128(&Offset);
+ // 3. Macro string
+ E.MacroStr = data.getCStr(&Offset);
+ break;
+ case DW_MACINFO_start_file:
+ // 2. Source line
+ E.Line = data.getULEB128(&Offset);
+ // 3. Source file id
+ E.File = data.getULEB128(&Offset);
+ break;
+ case DW_MACINFO_end_file:
+ break;
+ case DW_MACINFO_vendor_ext:
+ // 2. Vendor extension constant
+ E.ExtConstant = data.getULEB128(&Offset);
+ // 3. Vendor extension string
+ E.ExtStr = data.getCStr(&Offset);
+ break;
+ }
+
+ Macros.push_back(E);
+ }
+}
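As a minimal sketch of the byte stream that parse() above consumes: a DW_MACINFO_define entry is a ULEB128 type code (0x01), a ULEB128 line number, and a null-terminated macro string, and the whole list ends with a zero type code. The buffer and driver below are hand-built for illustration only and assume nothing beyond LLVM's DataExtractor API:

    #include "llvm/Support/DataExtractor.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      // DW_MACINFO_define (0x01), line 7, macro "X 3", then the 0 terminator.
      const char Bytes[] = {0x01, 0x07, 'X', ' ', '3', '\0', 0x00};
      DataExtractor Data(StringRef(Bytes, sizeof(Bytes)),
                         /*IsLittleEndian=*/true, /*AddressSize=*/0);
      uint32_t Offset = 0;
      uint64_t Type = Data.getULEB128(&Offset);  // 0x01 == DW_MACINFO_define
      uint64_t Line = Data.getULEB128(&Offset);  // 7
      const char *Macro = Data.getCStr(&Offset); // "X 3"
      outs() << Type << " " << Line << " " << Macro << "\n";
    }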
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 53a676e..3dc5842 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -18,7 +18,7 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <climits> +#include <limits> using namespace llvm; using namespace dwarf; using namespace syntax; @@ -110,7 +110,7 @@ static const DWARFFormValue::FormClass DWARF4FormClasses[] = { bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { // First, check DWARF4 form classes. - if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() && + if (Form < makeArrayRef(DWARF4FormClasses).size() && DWARF4FormClasses[Form] == FC) return true; // Check more forms from DWARF4 and DWARF5 proposals. @@ -261,6 +261,12 @@ DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr, bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *cu) { + return skipValue(form, debug_info_data, offset_ptr, cu->getVersion(), + cu->getAddressByteSize()); +} +bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize) { bool indirect = false; do { switch (form) { @@ -295,10 +301,10 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, // Compile unit address sized values case DW_FORM_addr: - *offset_ptr += cu->getAddressByteSize(); + *offset_ptr += AddrSize; return true; case DW_FORM_ref_addr: - *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion()); + *offset_ptr += getRefAddrSize(AddrSize, Version); return true; // 0 byte values - implied from the form. 
@@ -565,7 +571,7 @@ Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const { Optional<int64_t> DWARFFormValue::getAsSignedConstant() const { if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag)) || - (Form == DW_FORM_udata && uint64_t(LLONG_MAX) < Value.uval)) + (Form == DW_FORM_udata && uint64_t(std::numeric_limits<int64_t>::max()) < Value.uval)) return None; switch (Form) { case DW_FORM_data4: @@ -584,6 +590,6 @@ Optional<int64_t> DWARFFormValue::getAsSignedConstant() const { Optional<ArrayRef<uint8_t>> DWARFFormValue::getAsBlock() const { if (!isFormClass(FC_Block) && !isFormClass(FC_Exprloc)) return None; - return ArrayRef<uint8_t>(Value.data, Value.uval); + return makeArrayRef(Value.data, Value.uval); } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index 348476d..92ca2d4 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -14,29 +14,37 @@ #include "llvm/Support/Path.h" #include <cstdio> -using namespace llvm; +namespace llvm { using namespace dwarf; void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(), C.getStringSection(), StringRef(), C.getAddrSection(), - C.isLittleEndian()); + C.getLineSection().Data, C.isLittleEndian()); } void DWARFUnitSectionBase::parseDWO(DWARFContext &C, - const DWARFSection &DWOSection) { + const DWARFSection &DWOSection, + DWARFUnitIndex *Index) { parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(), C.getStringDWOSection(), C.getStringOffsetDWOSection(), - C.getAddrSection(), C.isLittleEndian()); + C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian()); } DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, bool LE, - const DWARFUnitSectionBase &UnitSection) + StringRef SOS, StringRef AOS, StringRef LS, bool LE, + const DWARFUnitSectionBase &UnitSection, + const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), - StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS), - isLittleEndian(LE), UnitSection(UnitSection) { + LineSection(LS), StringSection(SS), StringOffsetSection([&]() { + if (IndexEntry) + if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) + return SOS.slice(C->Offset, C->Offset + C->Length); + return SOS; + }()), + AddrOffsetSection(AOS), isLittleEndian(LE), UnitSection(UnitSection), + IndexEntry(IndexEntry) { clear(); } @@ -69,6 +77,17 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { Length = debug_info.getU32(offset_ptr); Version = debug_info.getU16(offset_ptr); uint64_t AbbrOffset = debug_info.getU32(offset_ptr); + if (IndexEntry) { + if (AbbrOffset) + return false; + auto *UnitContrib = IndexEntry->getOffset(); + if (!UnitContrib || UnitContrib->Length != (Length + 4)) + return false; + auto *AbbrEntry = IndexEntry->getOffset(DW_SECT_ABBREV); + if (!AbbrEntry) + return false; + AbbrOffset = AbbrEntry->Offset; + } AddrSize = debug_info.getU8(offset_ptr); bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1); @@ -375,3 +394,12 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address) { return DWARFDebugInfoEntryInlinedChain(); return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address); } + +const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, + 
DWARFSectionKind Kind) { + if (Kind == DW_SECT_INFO) + return Context.getCUIndex(); + assert(Kind == DW_SECT_TYPES); + return Context.getTUIndex(); +} +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp new file mode 100644 index 0000000..96b3169 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -0,0 +1,168 @@ +//===-- DWARFUnitIndex.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +bool DWARFUnitIndex::Header::parse(DataExtractor IndexData, + uint32_t *OffsetPtr) { + if (!IndexData.isValidOffsetForDataOfSize(*OffsetPtr, 16)) + return false; + Version = IndexData.getU32(OffsetPtr); + NumColumns = IndexData.getU32(OffsetPtr); + NumUnits = IndexData.getU32(OffsetPtr); + NumBuckets = IndexData.getU32(OffsetPtr); + return Version <= 2; +} + +void DWARFUnitIndex::Header::dump(raw_ostream &OS) const { + OS << format("version = %u slots = %u\n\n", Version, NumBuckets); +} + +bool DWARFUnitIndex::parse(DataExtractor IndexData) { + bool b = parseImpl(IndexData); + if (!b) { + // Make sure we don't try to dump anything + Header.NumBuckets = 0; + // Release any partially initialized data. + ColumnKinds.reset(); + Rows.reset(); + } + return b; +} + +bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { + uint32_t Offset = 0; + if (!Header.parse(IndexData, &Offset)) + return false; + + if (!IndexData.isValidOffsetForDataOfSize( + Offset, Header.NumBuckets * (8 + 4) + + (2 * Header.NumUnits + 1) * 4 * Header.NumColumns)) + return false; + + Rows = llvm::make_unique<Entry[]>(Header.NumBuckets); + auto Contribs = + llvm::make_unique<Entry::SectionContribution *[]>(Header.NumUnits); + ColumnKinds = llvm::make_unique<DWARFSectionKind[]>(Header.NumColumns); + + // Read Hash Table of Signatures + for (unsigned i = 0; i != Header.NumBuckets; ++i) + Rows[i].Signature = IndexData.getU64(&Offset); + + // Read Parallel Table of Indexes + for (unsigned i = 0; i != Header.NumBuckets; ++i) { + auto Index = IndexData.getU32(&Offset); + if (!Index) + continue; + Rows[i].Index = this; + Rows[i].Contributions = + llvm::make_unique<Entry::SectionContribution[]>(Header.NumColumns); + Contribs[Index - 1] = Rows[i].Contributions.get(); + } + + // Read the Column Headers + for (unsigned i = 0; i != Header.NumColumns; ++i) { + ColumnKinds[i] = static_cast<DWARFSectionKind>(IndexData.getU32(&Offset)); + if (ColumnKinds[i] == InfoColumnKind) { + if (InfoColumn != -1) + return false; + InfoColumn = i; + } + } + + if (InfoColumn == -1) + return false; + + // Read Table of Section Offsets + for (unsigned i = 0; i != Header.NumUnits; ++i) { + auto *Contrib = Contribs[i]; + for (unsigned i = 0; i != Header.NumColumns; ++i) + Contrib[i].Offset = IndexData.getU32(&Offset); + } + + // Read Table of Section Sizes + for (unsigned i = 0; i != Header.NumUnits; ++i) { + auto *Contrib = Contribs[i]; + for (unsigned i = 0; i != Header.NumColumns; ++i) + Contrib[i].Length = IndexData.getU32(&Offset); + } + + return true; +} + +StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) { +#define CASE(DS) \ + case DW_SECT_##DS: \ + return #DS; + 
switch (DS) { + CASE(INFO); + CASE(TYPES); + CASE(ABBREV); + CASE(LINE); + CASE(LOC); + CASE(STR_OFFSETS); + CASE(MACINFO); + CASE(MACRO); + } + llvm_unreachable("unknown DWARFSectionKind"); +} + +void DWARFUnitIndex::dump(raw_ostream &OS) const { + if (!Header.NumBuckets) + return; + + Header.dump(OS); + OS << "Index Signature "; + for (unsigned i = 0; i != Header.NumColumns; ++i) + OS << ' ' << left_justify(getColumnHeader(ColumnKinds[i]), 24); + OS << "\n----- ------------------"; + for (unsigned i = 0; i != Header.NumColumns; ++i) + OS << " ------------------------"; + OS << '\n'; + for (unsigned i = 0; i != Header.NumBuckets; ++i) { + auto &Row = Rows[i]; + if (auto *Contribs = Row.Contributions.get()) { + OS << format("%5u 0x%016" PRIx64 " ", i + 1, Row.Signature); + for (unsigned i = 0; i != Header.NumColumns; ++i) { + auto &Contrib = Contribs[i]; + OS << format("[0x%08x, 0x%08x) ", Contrib.Offset, + Contrib.Offset + Contrib.Length); + } + OS << '\n'; + } + } +} + +const DWARFUnitIndex::Entry::SectionContribution * +DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const { + uint32_t i = 0; + for (; i != Index->Header.NumColumns; ++i) + if (Index->ColumnKinds[i] == Sec) + return &Contributions[i]; + return nullptr; +} +const DWARFUnitIndex::Entry::SectionContribution * +DWARFUnitIndex::Entry::getOffset() const { + return &Contributions[Index->InfoColumn]; +} + +const DWARFUnitIndex::Entry * +DWARFUnitIndex::getFromOffset(uint32_t Offset) const { + for (uint32_t i = 0; i != Header.NumBuckets; ++i) + if (const auto &Contribs = Rows[i].Contributions) + if (Contribs[InfoColumn].Offset == Offset) + return &Rows[i]; + return nullptr; +} +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp index a6b4c65..4f561d0 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp @@ -27,6 +27,7 @@ WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) { case Tag: OS.changeColor(llvm::raw_ostream::BLUE); break; case Attribute: OS.changeColor(llvm::raw_ostream::CYAN); break; case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break; + case Macro: OS.changeColor(llvm::raw_ostream::RED); break; } } } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h index 946a313..16e6835 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h +++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h @@ -17,7 +17,7 @@ namespace dwarf { namespace syntax { // Symbolic names for various syntax elements. -enum HighlightColor { Address, String, Tag, Attribute, Enumerator }; +enum HighlightColor { Address, String, Tag, Attribute, Enumerator, Macro }; /// An RAII object that temporarily switches an output stream to a /// specific color. diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp index 13201bb..613407e 100644 --- a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp +++ b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp @@ -31,7 +31,7 @@ PDB_ErrorCode llvm::loadDataForPDB(PDB_ReaderType Type, StringRef Path, PDB_ErrorCode llvm::loadDataForEXE(PDB_ReaderType Type, StringRef Path, std::unique_ptr<IPDBSession> &Session) { -// Create the correct concrete instance type based on the value of Type. + // Create the correct concrete instance type based on the value of Type. 
#if HAVE_DIA_SDK
return DIASession::createFromExe(Path, Session);
#endif
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
index 83f27c7..ca2ae66 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
@@ -21,24 +21,11 @@ using namespace llvm;
using namespace llvm::object;
PDBContext::PDBContext(const COFFObjectFile &Object,
- std::unique_ptr<IPDBSession> PDBSession,
- bool RelativeAddress)
+ std::unique_ptr<IPDBSession> PDBSession)
: DIContext(CK_PDB), Session(std::move(PDBSession)) {
- if (!RelativeAddress) {
- uint64_t ImageBase = 0;
- if (Object.is64()) {
- const pe32plus_header *Header = nullptr;
- Object.getPE32PlusHeader(Header);
- if (Header)
- ImageBase = Header->ImageBase;
- } else {
- const pe32_header *Header = nullptr;
- Object.getPE32Header(Header);
- if (Header)
- ImageBase = static_cast<uint64_t>(Header->ImageBase);
- }
- Session->setLoadAddress(ImageBase);
- }
+ ErrorOr<uint64_t> ImageBase = Object.getImageBase();
+ if (ImageBase)
+ Session->setLoadAddress(ImageBase.get());
}
void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType) {}
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
new file mode 100644
index 0000000..a9dee7a
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -0,0 +1,101 @@
+//===- lib/DebugInfo/Symbolize/DIPrinter.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DIPrinter class, which is responsible for printing
+// structures defined in DebugInfo/DIContext.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/DIPrinter.h"
+
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/Support/LineIterator.h"
+
+namespace llvm {
+namespace symbolize {
+
+// By default, DILineInfo contains "<invalid>" for the function/filename it
+// cannot fetch. We replace it with "??" to make our output closer to addr2line.
+static const char kDILineInfoBadString[] = "<invalid>";
+static const char kBadString[] = "??";
+
+// Prints the source code in FileName around the given Line.
+void DIPrinter::printContext(std::string FileName, int64_t Line) {
+ if (PrintSourceContext <= 0)
+ return;
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+ MemoryBuffer::getFile(FileName);
+ if (!BufOrErr)
+ return;
+
+ std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
+ int64_t FirstLine =
+ std::max(static_cast<int64_t>(1), Line - PrintSourceContext / 2);
+ int64_t LastLine = FirstLine + PrintSourceContext;
+ size_t MaxLineNumberWidth = std::ceil(std::log10(LastLine));
+
+ for (line_iterator I = line_iterator(*Buf, false);
+ !I.is_at_eof() && I.line_number() <= LastLine; ++I) {
+ int64_t L = I.line_number();
+ if (L >= FirstLine && L <= LastLine) {
+ OS << format_decimal(L, MaxLineNumberWidth);
+ if (L == Line)
+ OS << " >: ";
+ else
+ OS << " : ";
+ OS << *I << "\n";
+ }
+ }
+}
+
+void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
+ if (PrintFunctionNames) {
+ std::string FunctionName = Info.FunctionName;
+ if (FunctionName == kDILineInfoBadString)
+ FunctionName = kBadString;
+
+ StringRef Delimiter = (PrintPretty == true) ?
" at " : "\n"; + StringRef Prefix = (PrintPretty && Inlined) ? " (inlined by) " : ""; + OS << Prefix << FunctionName << Delimiter; + } + std::string Filename = Info.FileName; + if (Filename == kDILineInfoBadString) + Filename = kBadString; + OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n"; + printContext(Filename, Info.Line); +} + +DIPrinter &DIPrinter::operator<<(const DILineInfo &Info) { + print(Info, false); + return *this; +} + +DIPrinter &DIPrinter::operator<<(const DIInliningInfo &Info) { + uint32_t FramesNum = Info.getNumberOfFrames(); + if (FramesNum == 0) { + print(DILineInfo(), false); + return *this; + } + for (uint32_t i = 0; i < FramesNum; i++) + print(Info.getFrame(i), i > 0); + return *this; +} + +DIPrinter &DIPrinter::operator<<(const DIGlobal &Global) { + std::string Name = Global.Name; + if (Name == kDILineInfoBadString) + Name = kBadString; + OS << Name << "\n"; + OS << Global.Start << " " << Global.Size << "\n"; + return *this; +} + +} +} diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp new file mode 100644 index 0000000..e314624 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -0,0 +1,254 @@ +//===-- SymbolizableObjectFile.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of SymbolizableObjectFile class. +// +//===----------------------------------------------------------------------===// + +#include "SymbolizableObjectFile.h" +#include "llvm/Object/SymbolSize.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" + +namespace llvm { +namespace symbolize { + +using namespace object; + +static DILineInfoSpecifier +getDILineInfoSpecifier(FunctionNameKind FNKind) { + return DILineInfoSpecifier( + DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FNKind); +} + +ErrorOr<std::unique_ptr<SymbolizableObjectFile>> +SymbolizableObjectFile::create(object::ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx) { + std::unique_ptr<SymbolizableObjectFile> res( + new SymbolizableObjectFile(Obj, std::move(DICtx))); + std::unique_ptr<DataExtractor> OpdExtractor; + uint64_t OpdAddress = 0; + // Find the .opd (function descriptor) section if any, for big-endian + // PowerPC64 ELF. + if (Obj->getArch() == Triple::ppc64) { + for (section_iterator Section : Obj->sections()) { + StringRef Name; + StringRef Data; + if (auto EC = Section->getName(Name)) + return EC; + if (Name == ".opd") { + if (auto EC = Section->getContents(Data)) + return EC; + OpdExtractor.reset(new DataExtractor(Data, Obj->isLittleEndian(), + Obj->getBytesInAddress())); + OpdAddress = Section->getAddress(); + break; + } + } + } + std::vector<std::pair<SymbolRef, uint64_t>> Symbols = + computeSymbolSizes(*Obj); + for (auto &P : Symbols) + res->addSymbol(P.first, P.second, OpdExtractor.get(), OpdAddress); + + // If this is a COFF object and we didn't find any symbols, try the export + // table. 
+ if (Symbols.empty()) { + if (auto *CoffObj = dyn_cast<COFFObjectFile>(Obj)) + if (auto EC = res->addCoffExportSymbols(CoffObj)) + return EC; + } + return std::move(res); +} + +SymbolizableObjectFile::SymbolizableObjectFile(ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx) + : Module(Obj), DebugInfoContext(std::move(DICtx)) {} + +namespace { +struct OffsetNamePair { + uint32_t Offset; + StringRef Name; + bool operator<(const OffsetNamePair &R) const { + return Offset < R.Offset; + } +}; +} + +std::error_code SymbolizableObjectFile::addCoffExportSymbols( + const COFFObjectFile *CoffObj) { + // Get all export names and offsets. + std::vector<OffsetNamePair> ExportSyms; + for (const ExportDirectoryEntryRef &Ref : CoffObj->export_directories()) { + StringRef Name; + uint32_t Offset; + if (auto EC = Ref.getSymbolName(Name)) + return EC; + if (auto EC = Ref.getExportRVA(Offset)) + return EC; + ExportSyms.push_back(OffsetNamePair{Offset, Name}); + } + if (ExportSyms.empty()) + return std::error_code(); + + // Sort by ascending offset. + array_pod_sort(ExportSyms.begin(), ExportSyms.end()); + + // Approximate the symbol sizes by assuming they run to the next symbol. + // FIXME: This assumes all exports are functions. + uint64_t ImageBase = CoffObj->getImageBase(); + for (auto I = ExportSyms.begin(), E = ExportSyms.end(); I != E; ++I) { + OffsetNamePair &Export = *I; + // FIXME: The last export has a one byte size now. + uint32_t NextOffset = I != E ? I->Offset : Export.Offset + 1; + uint64_t SymbolStart = ImageBase + Export.Offset; + uint64_t SymbolSize = NextOffset - Export.Offset; + SymbolDesc SD = {SymbolStart, SymbolSize}; + Functions.insert(std::make_pair(SD, Export.Name)); + } + return std::error_code(); +} + +std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol, + uint64_t SymbolSize, + DataExtractor *OpdExtractor, + uint64_t OpdAddress) { + SymbolRef::Type SymbolType = Symbol.getType(); + if (SymbolType != SymbolRef::ST_Function && SymbolType != SymbolRef::ST_Data) + return std::error_code(); + ErrorOr<uint64_t> SymbolAddressOrErr = Symbol.getAddress(); + if (auto EC = SymbolAddressOrErr.getError()) + return EC; + uint64_t SymbolAddress = *SymbolAddressOrErr; + if (OpdExtractor) { + // For big-endian PowerPC64 ELF, symbols in the .opd section refer to + // function descriptors. The first word of the descriptor is a pointer to + // the function's code. + // For the purposes of symbolization, pretend the symbol's address is that + // of the function's code, not the descriptor. + uint64_t OpdOffset = SymbolAddress - OpdAddress; + uint32_t OpdOffset32 = OpdOffset; + if (OpdOffset == OpdOffset32 && + OpdExtractor->isValidOffsetForAddress(OpdOffset32)) + SymbolAddress = OpdExtractor->getAddress(&OpdOffset32); + } + ErrorOr<StringRef> SymbolNameOrErr = Symbol.getName(); + if (auto EC = SymbolNameOrErr.getError()) + return EC; + StringRef SymbolName = *SymbolNameOrErr; + // Mach-O symbol table names have leading underscore, skip it. + if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_') + SymbolName = SymbolName.drop_front(); + // FIXME: If a function has alias, there are two entries in symbol table + // with same address size. Make sure we choose the correct one. + auto &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects; + SymbolDesc SD = { SymbolAddress, SymbolSize }; + M.insert(std::make_pair(SD, SymbolName)); + return std::error_code(); +} + +// Return true if this is a 32-bit x86 PE COFF module. 
+bool SymbolizableObjectFile::isWin32Module() const { + auto *CoffObject = dyn_cast<COFFObjectFile>(Module); + return CoffObject && CoffObject->getMachine() == COFF::IMAGE_FILE_MACHINE_I386; +} + +uint64_t SymbolizableObjectFile::getModulePreferredBase() const { + if (auto *CoffObject = dyn_cast<COFFObjectFile>(Module)) + return CoffObject->getImageBase(); + return 0; +} + +bool SymbolizableObjectFile::getNameFromSymbolTable(SymbolRef::Type Type, + uint64_t Address, + std::string &Name, + uint64_t &Addr, + uint64_t &Size) const { + const auto &SymbolMap = Type == SymbolRef::ST_Function ? Functions : Objects; + if (SymbolMap.empty()) + return false; + SymbolDesc SD = { Address, Address }; + auto SymbolIterator = SymbolMap.upper_bound(SD); + if (SymbolIterator == SymbolMap.begin()) + return false; + --SymbolIterator; + if (SymbolIterator->first.Size != 0 && + SymbolIterator->first.Addr + SymbolIterator->first.Size <= Address) + return false; + Name = SymbolIterator->second.str(); + Addr = SymbolIterator->first.Addr; + Size = SymbolIterator->first.Size; + return true; +} + +bool SymbolizableObjectFile::shouldOverrideWithSymbolTable( + FunctionNameKind FNKind, bool UseSymbolTable) const { + // When DWARF is used with -gline-tables-only / -gmlt, the symbol table gives + // better answers for linkage names than the DIContext. Otherwise, we are + // probably using PEs and PDBs, and we shouldn't do the override. PE files + // generally only contain the names of exported symbols. + return FNKind == FunctionNameKind::LinkageName && UseSymbolTable && + isa<DWARFContext>(DebugInfoContext.get()); +} + +DILineInfo SymbolizableObjectFile::symbolizeCode(uint64_t ModuleOffset, + FunctionNameKind FNKind, + bool UseSymbolTable) const { + DILineInfo LineInfo; + if (DebugInfoContext) { + LineInfo = DebugInfoContext->getLineInfoForAddress( + ModuleOffset, getDILineInfoSpecifier(FNKind)); + } + // Override function name from symbol table if necessary. + if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + LineInfo.FunctionName = FunctionName; + } + } + return LineInfo; +} + +DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode( + uint64_t ModuleOffset, FunctionNameKind FNKind, bool UseSymbolTable) const { + DIInliningInfo InlinedContext; + + if (DebugInfoContext) + InlinedContext = DebugInfoContext->getInliningInfoForAddress( + ModuleOffset, getDILineInfoSpecifier(FNKind)); + // Make sure there is at least one frame in context. + if (InlinedContext.getNumberOfFrames() == 0) + InlinedContext.addFrame(DILineInfo()); + + // Override the function name in lower frame with name from symbol table. 
+ if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + InlinedContext.getMutableFrame(InlinedContext.getNumberOfFrames() - 1) + ->FunctionName = FunctionName; + } + } + + return InlinedContext; +} + +DIGlobal SymbolizableObjectFile::symbolizeData(uint64_t ModuleOffset) const { + DIGlobal Res; + getNameFromSymbolTable(SymbolRef::ST_Data, ModuleOffset, Res.Name, Res.Start, + Res.Size); + return Res; +} + +} // namespace symbolize +} // namespace llvm + diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h new file mode 100644 index 0000000..8583b6a --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h @@ -0,0 +1,82 @@ +//===-- SymbolizableObjectFile.h -------------------------------- C++ -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the SymbolizableObjectFile class. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H +#define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H + +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include <map> + +namespace llvm { +class DataExtractor; +} + +namespace llvm { +namespace symbolize { + +class SymbolizableObjectFile : public SymbolizableModule { +public: + static ErrorOr<std::unique_ptr<SymbolizableObjectFile>> + create(object::ObjectFile *Obj, std::unique_ptr<DIContext> DICtx); + + DILineInfo symbolizeCode(uint64_t ModuleOffset, FunctionNameKind FNKind, + bool UseSymbolTable) const override; + DIInliningInfo symbolizeInlinedCode(uint64_t ModuleOffset, + FunctionNameKind FNKind, + bool UseSymbolTable) const override; + DIGlobal symbolizeData(uint64_t ModuleOffset) const override; + + // Return true if this is a 32-bit x86 PE COFF module. + bool isWin32Module() const override; + + // Returns the preferred base of the module, i.e. where the loader would place + // it in memory assuming there were no conflicts. + uint64_t getModulePreferredBase() const override; + +private: + bool shouldOverrideWithSymbolTable(FunctionNameKind FNKind, + bool UseSymbolTable) const; + + bool getNameFromSymbolTable(object::SymbolRef::Type Type, uint64_t Address, + std::string &Name, uint64_t &Addr, + uint64_t &Size) const; + // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd + // (function descriptor) section and OpdExtractor refers to its contents. + std::error_code addSymbol(const object::SymbolRef &Symbol, + uint64_t SymbolSize, + DataExtractor *OpdExtractor = nullptr, + uint64_t OpdAddress = 0); + std::error_code addCoffExportSymbols(const object::COFFObjectFile *CoffObj); + + object::ObjectFile *Module; + std::unique_ptr<DIContext> DebugInfoContext; + + struct SymbolDesc { + uint64_t Addr; + // If size is 0, assume that symbol occupies the whole memory range up to + // the following symbol. 
+ uint64_t Size; + friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) { + return s1.Addr < s2.Addr; + } + }; + std::map<SymbolDesc, StringRef> Functions; + std::map<SymbolDesc, StringRef> Objects; + + SymbolizableObjectFile(object::ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx); +}; + +} // namespace symbolize +} // namespace llvm + +#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp new file mode 100644 index 0000000..3da1963 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -0,0 +1,456 @@ +//===-- LLVMSymbolize.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation for LLVM symbolization library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/Symbolize/Symbolize.h" + +#include "SymbolizableObjectFile.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Config/config.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/PDB/PDB.h" +#include "llvm/DebugInfo/PDB/PDBContext.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include <stdlib.h> + +#if defined(_MSC_VER) +#include <Windows.h> +#include <DbgHelp.h> +#pragma comment(lib, "dbghelp.lib") + +// Windows.h conflicts with our COFF header definitions. +#ifdef IMAGE_FILE_MACHINE_I386 +#undef IMAGE_FILE_MACHINE_I386 +#endif +#endif + +namespace llvm { +namespace symbolize { + +ErrorOr<DILineInfo> LLVMSymbolizer::symbolizeCode(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of the + // object to the offset before we do the query. It's what DIContext expects. + if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DILineInfo LineInfo = Info->symbolizeCode(ModuleOffset, Opts.PrintFunctions, + Opts.UseSymbolTable); + if (Opts.Demangle) + LineInfo.FunctionName = DemangleName(LineInfo.FunctionName, Info); + return LineInfo; +} + +ErrorOr<DIInliningInfo> +LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of the + // object to the offset before we do the query. It's what DIContext expects. 
+ if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DIInliningInfo InlinedContext = Info->symbolizeInlinedCode( + ModuleOffset, Opts.PrintFunctions, Opts.UseSymbolTable); + if (Opts.Demangle) { + for (int i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) { + auto *Frame = InlinedContext.getMutableFrame(i); + Frame->FunctionName = DemangleName(Frame->FunctionName, Info); + } + } + return InlinedContext; +} + +ErrorOr<DIGlobal> LLVMSymbolizer::symbolizeData(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of + // the object to the offset before we do the query. It's what DIContext + // expects. + if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DIGlobal Global = Info->symbolizeData(ModuleOffset); + if (Opts.Demangle) + Global.Name = DemangleName(Global.Name, Info); + return Global; +} + +void LLVMSymbolizer::flush() { + ObjectForUBPathAndArch.clear(); + BinaryForPath.clear(); + ObjectPairForPathArch.clear(); + Modules.clear(); +} + +// For Path="/path/to/foo" and Basename="foo" assume that debug info is in +// /path/to/foo.dSYM/Contents/Resources/DWARF/foo. +// For Path="/path/to/bar.dSYM" and Basename="foo" assume that debug info is in +// /path/to/bar.dSYM/Contents/Resources/DWARF/foo. +static +std::string getDarwinDWARFResourceForPath( + const std::string &Path, const std::string &Basename) { + SmallString<16> ResourceName = StringRef(Path); + if (sys::path::extension(Path) != ".dSYM") { + ResourceName += ".dSYM"; + } + sys::path::append(ResourceName, "Contents", "Resources", "DWARF"); + sys::path::append(ResourceName, Basename); + return ResourceName.str(); +} + +static bool checkFileCRC(StringRef Path, uint32_t CRCHash) { + ErrorOr<std::unique_ptr<MemoryBuffer>> MB = + MemoryBuffer::getFileOrSTDIN(Path); + if (!MB) + return false; + return !zlib::isAvailable() || CRCHash == zlib::crc32(MB.get()->getBuffer()); +} + +static bool findDebugBinary(const std::string &OrigPath, + const std::string &DebuglinkName, uint32_t CRCHash, + std::string &Result) { + std::string OrigRealPath = OrigPath; +#if defined(HAVE_REALPATH) + if (char *RP = realpath(OrigPath.c_str(), nullptr)) { + OrigRealPath = RP; + free(RP); + } +#endif + SmallString<16> OrigDir(OrigRealPath); + llvm::sys::path::remove_filename(OrigDir); + SmallString<16> DebugPath = OrigDir; + // Try /path/to/original_binary/debuglink_name + llvm::sys::path::append(DebugPath, DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + // Try /path/to/original_binary/.debug/debuglink_name + DebugPath = OrigRealPath; + llvm::sys::path::append(DebugPath, ".debug", DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + // Try /usr/lib/debug/path/to/original_binary/debuglink_name + DebugPath = "/usr/lib/debug"; + llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir), + DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + return false; +} + +static bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName, + uint32_t &CRCHash) { + if (!Obj) + return false; + for (const SectionRef &Section : Obj->sections()) { + StringRef Name; + Section.getName(Name); + Name 
= Name.substr(Name.find_first_not_of("._")); + if (Name == "gnu_debuglink") { + StringRef Data; + Section.getContents(Data); + DataExtractor DE(Data, Obj->isLittleEndian(), 0); + uint32_t Offset = 0; + if (const char *DebugNameStr = DE.getCStr(&Offset)) { + // 4-byte align the offset. + Offset = (Offset + 3) & ~0x3; + if (DE.isValidOffsetForDataOfSize(Offset, 4)) { + DebugName = DebugNameStr; + CRCHash = DE.getU32(&Offset); + return true; + } + } + break; + } + } + return false; +} + +static +bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj, + const MachOObjectFile *Obj) { + ArrayRef<uint8_t> dbg_uuid = DbgObj->getUuid(); + ArrayRef<uint8_t> bin_uuid = Obj->getUuid(); + if (dbg_uuid.empty() || bin_uuid.empty()) + return false; + return !memcmp(dbg_uuid.data(), bin_uuid.data(), dbg_uuid.size()); +} + +ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath, + const MachOObjectFile *MachExeObj, const std::string &ArchName) { + // On Darwin we may find DWARF in separate object file in + // resource directory. + std::vector<std::string> DsymPaths; + StringRef Filename = sys::path::filename(ExePath); + DsymPaths.push_back(getDarwinDWARFResourceForPath(ExePath, Filename)); + for (const auto &Path : Opts.DsymHints) { + DsymPaths.push_back(getDarwinDWARFResourceForPath(Path, Filename)); + } + for (const auto &Path : DsymPaths) { + auto DbgObjOrErr = getOrCreateObject(Path, ArchName); + if (!DbgObjOrErr) + continue; + ObjectFile *DbgObj = DbgObjOrErr.get(); + const MachOObjectFile *MachDbgObj = dyn_cast<const MachOObjectFile>(DbgObj); + if (!MachDbgObj) + continue; + if (darwinDsymMatchesBinary(MachDbgObj, MachExeObj)) + return DbgObj; + } + return nullptr; +} + +ObjectFile *LLVMSymbolizer::lookUpDebuglinkObject(const std::string &Path, + const ObjectFile *Obj, + const std::string &ArchName) { + std::string DebuglinkName; + uint32_t CRCHash; + std::string DebugBinaryPath; + if (!getGNUDebuglinkContents(Obj, DebuglinkName, CRCHash)) + return nullptr; + if (!findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath)) + return nullptr; + auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName); + if (!DbgObjOrErr) + return nullptr; + return DbgObjOrErr.get(); +} + +ErrorOr<LLVMSymbolizer::ObjectPair> +LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, + const std::string &ArchName) { + const auto &I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName)); + if (I != ObjectPairForPathArch.end()) + return I->second; + + auto ObjOrErr = getOrCreateObject(Path, ArchName); + if (auto EC = ObjOrErr.getError()) { + ObjectPairForPathArch.insert( + std::make_pair(std::make_pair(Path, ArchName), EC)); + return EC; + } + + ObjectFile *Obj = ObjOrErr.get(); + assert(Obj != nullptr); + ObjectFile *DbgObj = nullptr; + + if (auto MachObj = dyn_cast<const MachOObjectFile>(Obj)) + DbgObj = lookUpDsymFile(Path, MachObj, ArchName); + if (!DbgObj) + DbgObj = lookUpDebuglinkObject(Path, Obj, ArchName); + if (!DbgObj) + DbgObj = Obj; + ObjectPair Res = std::make_pair(Obj, DbgObj); + ObjectPairForPathArch.insert( + std::make_pair(std::make_pair(Path, ArchName), Res)); + return Res; +} + +ErrorOr<ObjectFile *> +LLVMSymbolizer::getOrCreateObject(const std::string &Path, + const std::string &ArchName) { + const auto &I = BinaryForPath.find(Path); + Binary *Bin = nullptr; + if (I == BinaryForPath.end()) { + ErrorOr<OwningBinary<Binary>> BinOrErr = createBinary(Path); + if (auto EC = BinOrErr.getError()) { + BinaryForPath.insert(std::make_pair(Path, EC)); + return EC; + } + Bin 
= BinOrErr->getBinary(); + BinaryForPath.insert(std::make_pair(Path, std::move(BinOrErr.get()))); + } else if (auto EC = I->second.getError()) { + return EC; + } else { + Bin = I->second->getBinary(); + } + + assert(Bin != nullptr); + + if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin)) { + const auto &I = ObjectForUBPathAndArch.find(std::make_pair(Path, ArchName)); + if (I != ObjectForUBPathAndArch.end()) { + if (auto EC = I->second.getError()) + return EC; + return I->second->get(); + } + ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = + UB->getObjectForArch(ArchName); + if (auto EC = ObjOrErr.getError()) { + ObjectForUBPathAndArch.insert( + std::make_pair(std::make_pair(Path, ArchName), EC)); + return EC; + } + ObjectFile *Res = ObjOrErr->get(); + ObjectForUBPathAndArch.insert(std::make_pair(std::make_pair(Path, ArchName), + std::move(ObjOrErr.get()))); + return Res; + } + if (Bin->isObject()) { + return cast<ObjectFile>(Bin); + } + return object_error::arch_not_found; +} + +ErrorOr<SymbolizableModule *> +LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { + const auto &I = Modules.find(ModuleName); + if (I != Modules.end()) { + auto &InfoOrErr = I->second; + if (auto EC = InfoOrErr.getError()) + return EC; + return InfoOrErr->get(); + } + std::string BinaryName = ModuleName; + std::string ArchName = Opts.DefaultArch; + size_t ColonPos = ModuleName.find_last_of(':'); + // Verify that substring after colon form a valid arch name. + if (ColonPos != std::string::npos) { + std::string ArchStr = ModuleName.substr(ColonPos + 1); + if (Triple(ArchStr).getArch() != Triple::UnknownArch) { + BinaryName = ModuleName.substr(0, ColonPos); + ArchName = ArchStr; + } + } + auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName); + if (auto EC = ObjectsOrErr.getError()) { + // Failed to find valid object file. + Modules.insert(std::make_pair(ModuleName, EC)); + return EC; + } + ObjectPair Objects = ObjectsOrErr.get(); + + std::unique_ptr<DIContext> Context; + if (auto CoffObject = dyn_cast<COFFObjectFile>(Objects.first)) { + // If this is a COFF object, assume it contains PDB debug information. If + // we don't find any we will fall back to the DWARF case. + std::unique_ptr<IPDBSession> Session; + PDB_ErrorCode Error = loadDataForEXE(PDB_ReaderType::DIA, + Objects.first->getFileName(), Session); + if (Error == PDB_ErrorCode::Success) { + Context.reset(new PDBContext(*CoffObject, std::move(Session))); + } + } + if (!Context) + Context.reset(new DWARFContextInMemory(*Objects.second)); + assert(Context); + auto InfoOrErr = + SymbolizableObjectFile::create(Objects.first, std::move(Context)); + auto InsertResult = + Modules.insert(std::make_pair(ModuleName, std::move(InfoOrErr))); + assert(InsertResult.second); + if (auto EC = InsertResult.first->second.getError()) + return EC; + return InsertResult.first->second->get(); +} + +// Undo these various manglings for Win32 extern "C" functions: +// cdecl - _foo +// stdcall - _foo@12 +// fastcall - @foo@12 +// vectorcall - foo@@12 +// These are all different linkage names for 'foo'. +static StringRef demanglePE32ExternCFunc(StringRef SymbolName) { + // Remove any '_' or '@' prefix. + char Front = SymbolName.empty() ? '\0' : SymbolName[0]; + if (Front == '_' || Front == '@') + SymbolName = SymbolName.drop_front(); + + // Remove any '@[0-9]+' suffix. 
+ if (Front != '?') { + size_t AtPos = SymbolName.rfind('@'); + if (AtPos != StringRef::npos && + std::all_of(SymbolName.begin() + AtPos + 1, SymbolName.end(), + [](char C) { return C >= '0' && C <= '9'; })) { + SymbolName = SymbolName.substr(0, AtPos); + } + } + + // Remove any ending '@' for vectorcall. + if (SymbolName.endswith("@")) + SymbolName = SymbolName.drop_back(); + + return SymbolName; +} + +#if !defined(_MSC_VER) +// Assume that __cxa_demangle is provided by libcxxabi (except for Windows). +extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer, + size_t *length, int *status); +#endif + +std::string LLVMSymbolizer::DemangleName(const std::string &Name, + const SymbolizableModule *ModInfo) { +#if !defined(_MSC_VER) + // We can spoil names of symbols with C linkage, so use an heuristic + // approach to check if the name should be demangled. + if (Name.substr(0, 2) == "_Z") { + int status = 0; + char *DemangledName = __cxa_demangle(Name.c_str(), nullptr, nullptr, &status); + if (status != 0) + return Name; + std::string Result = DemangledName; + free(DemangledName); + return Result; + } +#else + if (!Name.empty() && Name.front() == '?') { + // Only do MSVC C++ demangling on symbols starting with '?'. + char DemangledName[1024] = {0}; + DWORD result = ::UnDecorateSymbolName( + Name.c_str(), DemangledName, 1023, + UNDNAME_NO_ACCESS_SPECIFIERS | // Strip public, private, protected + UNDNAME_NO_ALLOCATION_LANGUAGE | // Strip __thiscall, __stdcall, etc + UNDNAME_NO_THROW_SIGNATURES | // Strip throw() specifications + UNDNAME_NO_MEMBER_TYPE | // Strip virtual, static, etc specifiers + UNDNAME_NO_MS_KEYWORDS | // Strip all MS extension keywords + UNDNAME_NO_FUNCTION_RETURNS); // Strip function return types + return (result == 0) ? Name : std::string(DemangledName); + } +#endif + if (ModInfo && ModInfo->isWin32Module()) + return std::string(demanglePE32ExternCFunc(Name)); + return Name; +} + +} // namespace symbolize +} // namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index 67a1ca6..41c8da4 100644 --- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -61,8 +61,7 @@ ExecutionEngine *(*ExecutionEngine::InterpCtor)(std::unique_ptr<Module> M, void JITEventListener::anchor() {} -ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) - : LazyFunctionCreator(nullptr) { +void ExecutionEngine::Init(std::unique_ptr<Module> M) { CompilingLazily = false; GVCompilationDisabled = false; SymbolSearchingDisabled = false; @@ -79,6 +78,16 @@ ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) Modules.push_back(std::move(M)); } +ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) + : DL(M->getDataLayout()), LazyFunctionCreator(nullptr) { + Init(std::move(M)); +} + +ExecutionEngine::ExecutionEngine(DataLayout DL, std::unique_ptr<Module> M) + : DL(std::move(DL)), LazyFunctionCreator(nullptr) { + Init(std::move(M)); +} + ExecutionEngine::~ExecutionEngine() { clearAllGlobalMappings(); } @@ -86,7 +95,7 @@ ExecutionEngine::~ExecutionEngine() { namespace { /// \brief Helper class which uses a value handler to automatically deletes the /// memory block when the GlobalVariable is destroyed. 
-class GVMemoryBlock : public CallbackVH { +class GVMemoryBlock final : public CallbackVH { GVMemoryBlock(const GlobalVariable *GV) : CallbackVH(const_cast<GlobalVariable*>(GV)) {} @@ -115,7 +124,7 @@ public: } // anonymous namespace char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) { - return GVMemoryBlock::Create(GV, *getDataLayout()); + return GVMemoryBlock::Create(GV, getDataLayout()); } void ExecutionEngine::addObjectFile(std::unique_ptr<object::ObjectFile> O) { @@ -187,7 +196,7 @@ std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { const DataLayout &DL = GV->getParent()->getDataLayout().isDefault() - ? *getDataLayout() + ? getDataLayout() : GV->getParent()->getDataLayout(); Mangler::getNameWithPrefix(FullName, GV->getName(), DL); @@ -228,11 +237,10 @@ void ExecutionEngine::clearAllGlobalMappings() { void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) { MutexGuard locked(lock); - for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) - EEState.RemoveMapping(getMangledName(FI)); - for (Module::global_iterator GI = M->global_begin(), GE = M->global_end(); - GI != GE; ++GI) - EEState.RemoveMapping(getMangledName(GI)); + for (Function &FI : *M) + EEState.RemoveMapping(getMangledName(&FI)); + for (GlobalVariable &GI : M->globals()) + EEState.RemoveMapping(getMangledName(&GI)); } uint64_t ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, @@ -333,7 +341,7 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE, const std::vector<std::string> &InputArgv) { Values.clear(); // Free the old contents. Values.reserve(InputArgv.size()); - unsigned PtrSize = EE->getDataLayout()->getPointerSize(); + unsigned PtrSize = EE->getDataLayout().getPointerSize(); Array = make_unique<char[]>((InputArgv.size()+1)*PtrSize); DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array.get() << "\n"); @@ -408,7 +416,7 @@ void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) { #ifndef NDEBUG /// isTargetNullPtr - Return whether the target pointer stored at Loc is null. static bool isTargetNullPtr(ExecutionEngine *EE, void *Loc) { - unsigned PtrSize = EE->getDataLayout()->getPointerSize(); + unsigned PtrSize = EE->getDataLayout().getPointerSize(); for (unsigned i = 0; i < PtrSize; ++i) if (*(i + (uint8_t*)Loc)) return false; @@ -621,8 +629,8 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { break; case Type::VectorTyID: // if the whole vector is 'undef' just reserve memory for the value. 
- const VectorType* VTy = dyn_cast<VectorType>(C->getType()); - const Type *ElemTy = VTy->getElementType(); + auto* VTy = dyn_cast<VectorType>(C->getType()); + Type *ElemTy = VTy->getElementType(); unsigned int elemNum = VTy->getNumElements(); Result.AggregateVal.resize(elemNum); if (ElemTy->isIntegerTy()) @@ -641,8 +649,8 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { case Instruction::GetElementPtr: { // Compute the index GenericValue Result = getConstantValue(Op0); - APInt Offset(DL->getPointerSizeInBits(), 0); - cast<GEPOperator>(CE)->accumulateConstantOffset(*DL, Offset); + APInt Offset(DL.getPointerSizeInBits(), 0); + cast<GEPOperator>(CE)->accumulateConstantOffset(DL, Offset); char* tmp = (char*) Result.PointerVal; Result = PTOGV(tmp + Offset.getSExtValue()); @@ -729,16 +737,16 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { } case Instruction::PtrToInt: { GenericValue GV = getConstantValue(Op0); - uint32_t PtrWidth = DL->getTypeSizeInBits(Op0->getType()); + uint32_t PtrWidth = DL.getTypeSizeInBits(Op0->getType()); assert(PtrWidth <= 64 && "Bad pointer width"); GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal)); - uint32_t IntWidth = DL->getTypeSizeInBits(CE->getType()); + uint32_t IntWidth = DL.getTypeSizeInBits(CE->getType()); GV.IntVal = GV.IntVal.zextOrTrunc(IntWidth); return GV; } case Instruction::IntToPtr: { GenericValue GV = getConstantValue(Op0); - uint32_t PtrWidth = DL->getTypeSizeInBits(CE->getType()); + uint32_t PtrWidth = DL.getTypeSizeInBits(CE->getType()); GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth); assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width"); GV.PointerVal = PointerTy(uintptr_t(GV.IntVal.getZExtValue())); @@ -860,8 +868,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FRem: - apfLHS.mod(APFloat(Sem, RHS.IntVal), - APFloat::rmNearestTiesToEven); + apfLHS.mod(APFloat(Sem, RHS.IntVal)); GV.IntVal = apfLHS.bitcastToAPInt(); break; } @@ -1040,7 +1047,7 @@ static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst, void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, GenericValue *Ptr, Type *Ty) { - const unsigned StoreBytes = getDataLayout()->getTypeStoreSize(Ty); + const unsigned StoreBytes = getDataLayout().getTypeStoreSize(Ty); switch (Ty->getTypeID()) { default: @@ -1080,7 +1087,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, break; } - if (sys::IsLittleEndianHost != getDataLayout()->isLittleEndian()) + if (sys::IsLittleEndianHost != getDataLayout().isLittleEndian()) // Host and target are different endian - reverse the stored bytes. 
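// (Illustration, not part of the patch: storing an i32 0x11223344 from a little-endian host for a big-endian target first writes the bytes 44 33 22 11; the reverse below turns them into 11 22 33 44.)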
std::reverse((uint8_t*)Ptr, StoreBytes + (uint8_t*)Ptr); } @@ -1117,7 +1124,7 @@ static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) { void ExecutionEngine::LoadValueFromMemory(GenericValue &Result, GenericValue *Ptr, Type *Ty) { - const unsigned LoadBytes = getDataLayout()->getTypeStoreSize(Ty); + const unsigned LoadBytes = getDataLayout().getTypeStoreSize(Ty); switch (Ty->getTypeID()) { case Type::IntegerTyID: @@ -1143,8 +1150,8 @@ void ExecutionEngine::LoadValueFromMemory(GenericValue &Result, break; } case Type::VectorTyID: { - const VectorType *VT = cast<VectorType>(Ty); - const Type *ElemT = VT->getElementType(); + auto *VT = cast<VectorType>(Ty); + Type *ElemT = VT->getElementType(); const unsigned numElems = VT->getNumElements(); if (ElemT->isFloatTy()) { Result.AggregateVal.resize(numElems); @@ -1183,20 +1190,20 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) { if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) { unsigned ElementSize = - getDataLayout()->getTypeAllocSize(CP->getType()->getElementType()); + getDataLayout().getTypeAllocSize(CP->getType()->getElementType()); for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize); return; } if (isa<ConstantAggregateZero>(Init)) { - memset(Addr, 0, (size_t)getDataLayout()->getTypeAllocSize(Init->getType())); + memset(Addr, 0, (size_t)getDataLayout().getTypeAllocSize(Init->getType())); return; } if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) { unsigned ElementSize = - getDataLayout()->getTypeAllocSize(CPA->getType()->getElementType()); + getDataLayout().getTypeAllocSize(CPA->getType()->getElementType()); for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i) InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize); return; @@ -1204,7 +1211,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) { if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) { const StructLayout *SL = - getDataLayout()->getStructLayout(cast<StructType>(CPS->getType())); + getDataLayout().getStructLayout(cast<StructType>(CPS->getType())); for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i) InitializeMemory(CPS->getOperand(i), (char*)Addr+SL->getElementOffset(i)); return; @@ -1349,7 +1356,7 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) { InitializeMemory(GV->getInitializer(), GA); Type *ElTy = GV->getType()->getElementType(); - size_t GVSize = (size_t)getDataLayout()->getTypeAllocSize(ElTy); + size_t GVSize = (size_t)getDataLayout().getTypeAllocSize(ElTy); NumInitBytes += (unsigned)GVSize; ++NumGlobals; } diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp index 55ab5af..ff7c4dc 100644 --- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp +++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp @@ -29,7 +29,7 @@ using namespace llvm; DEFINE_SIMPLE_CONVERSION_FUNCTIONS(GenericValue, LLVMGenericValueRef) -inline LLVMTargetMachineRef wrap(const TargetMachine *P) { +static LLVMTargetMachineRef wrap(const TargetMachine *P) { return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P)); } @@ -210,35 +210,6 @@ LLVMBool LLVMCreateMCJITCompilerForModule( return 1; } -LLVMBool LLVMCreateExecutionEngine(LLVMExecutionEngineRef *OutEE, - LLVMModuleProviderRef MP, - char **OutError) { - /* The module provider is now actually a 
module. */ - return LLVMCreateExecutionEngineForModule(OutEE, - reinterpret_cast<LLVMModuleRef>(MP), - OutError); -} - -LLVMBool LLVMCreateInterpreter(LLVMExecutionEngineRef *OutInterp, - LLVMModuleProviderRef MP, - char **OutError) { - /* The module provider is now actually a module. */ - return LLVMCreateInterpreterForModule(OutInterp, - reinterpret_cast<LLVMModuleRef>(MP), - OutError); -} - -LLVMBool LLVMCreateJITCompiler(LLVMExecutionEngineRef *OutJIT, - LLVMModuleProviderRef MP, - unsigned OptLevel, - char **OutError) { - /* The module provider is now actually a module. */ - return LLVMCreateJITCompilerForModule(OutJIT, - reinterpret_cast<LLVMModuleRef>(MP), - OptLevel, OutError); -} - - void LLVMDisposeExecutionEngine(LLVMExecutionEngineRef EE) { delete unwrap(EE); } @@ -282,11 +253,6 @@ void LLVMAddModule(LLVMExecutionEngineRef EE, LLVMModuleRef M){ unwrap(EE)->addModule(std::unique_ptr<Module>(unwrap(M))); } -void LLVMAddModuleProvider(LLVMExecutionEngineRef EE, LLVMModuleProviderRef MP){ - /* The module provider is now actually a module. */ - LLVMAddModule(EE, reinterpret_cast<LLVMModuleRef>(MP)); -} - LLVMBool LLVMRemoveModule(LLVMExecutionEngineRef EE, LLVMModuleRef M, LLVMModuleRef *OutMod, char **OutError) { Module *Mod = unwrap(M); @@ -295,14 +261,6 @@ LLVMBool LLVMRemoveModule(LLVMExecutionEngineRef EE, LLVMModuleRef M, return 0; } -LLVMBool LLVMRemoveModuleProvider(LLVMExecutionEngineRef EE, - LLVMModuleProviderRef MP, - LLVMModuleRef *OutMod, char **OutError) { - /* The module provider is now actually a module. */ - return LLVMRemoveModule(EE, reinterpret_cast<LLVMModuleRef>(MP), OutMod, - OutError); -} - LLVMBool LLVMFindFunction(LLVMExecutionEngineRef EE, const char *Name, LLVMValueRef *OutFn) { if (Function *F = unwrap(EE)->FindFunctionNamed(Name)) { @@ -318,7 +276,7 @@ void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE, } LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) { - return wrap(unwrap(EE)->getDataLayout()); + return wrap(&unwrap(EE)->getDataLayout()); } LLVMTargetMachineRef diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp index dbfa37e..1eb4f7d 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -593,7 +593,7 @@ static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2, } static GenericValue executeFCMP_BOOL(GenericValue Src1, GenericValue Src2, - const Type *Ty, const bool val) { + Type *Ty, const bool val) { GenericValue Dest; if(Ty->isVectorTy()) { assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); @@ -788,7 +788,7 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { } static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2, - GenericValue Src3, const Type *Ty) { + GenericValue Src3, Type *Ty) { GenericValue Dest; if(Ty->isVectorTy()) { assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); @@ -805,7 +805,7 @@ static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2, void Interpreter::visitSelectInst(SelectInst &I) { ExecutionContext &SF = ECStack.back(); - const Type * Ty = I.getOperand(0)->getType(); + Type * Ty = I.getOperand(0)->getType(); GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Src3 = getOperandValue(I.getOperand(2), SF); @@ -968,7 +968,7 @@ void 
Interpreter::visitAllocaInst(AllocaInst &I) { unsigned NumElements = getOperandValue(I.getOperand(0), SF).IntVal.getZExtValue(); - unsigned TypeSize = (size_t)TD.getTypeAllocSize(Ty); + unsigned TypeSize = (size_t)getDataLayout().getTypeAllocSize(Ty); // Avoid malloc-ing zero bytes, use max()... unsigned MemToAlloc = std::max(1U, NumElements * TypeSize); @@ -1000,7 +1000,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I, for (; I != E; ++I) { if (StructType *STy = dyn_cast<StructType>(*I)) { - const StructLayout *SLO = TD.getStructLayout(STy); + const StructLayout *SLO = getDataLayout().getStructLayout(STy); const ConstantInt *CPU = cast<ConstantInt>(I.getOperand()); unsigned Index = unsigned(CPU->getZExtValue()); @@ -1020,7 +1020,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I, assert(BitWidth == 64 && "Invalid index type for getelementptr"); Idx = (int64_t)IdxGV.IntVal.getZExtValue(); } - Total += TD.getTypeAllocSize(ST->getElementType())*Idx; + Total += getDataLayout().getTypeAllocSize(ST->getElementType()) * Idx; } } @@ -1139,7 +1139,7 @@ void Interpreter::visitShl(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { uint32_t src1Size = uint32_t(Src1.AggregateVal.size()); @@ -1166,7 +1166,7 @@ void Interpreter::visitLShr(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { uint32_t src1Size = uint32_t(Src1.AggregateVal.size()); @@ -1193,7 +1193,7 @@ void Interpreter::visitAShr(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { size_t src1Size = Src1.AggregateVal.size(); @@ -1237,10 +1237,10 @@ GenericValue Interpreter::executeTruncInst(Value *SrcVal, Type *DstTy, GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, ExecutionContext &SF) { - const Type *SrcTy = SrcVal->getType(); + Type *SrcTy = SrcVal->getType(); GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->isVectorTy()) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal. 
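// (Illustrative aside, not part of the patch: each lane below reduces to APInt::sext, e.g. APInt(8, 0xFF).sext(32) gives 0xFFFFFFFF, so the i8 value -1 stays -1 as an i32.)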
@@ -1248,7 +1248,7 @@ GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, for (unsigned i = 0; i < size; i++) Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.sext(DBitWidth); } else { - const IntegerType *DITy = cast<IntegerType>(DstTy); + auto *DITy = cast<IntegerType>(DstTy); unsigned DBitWidth = DITy->getBitWidth(); Dest.IntVal = Src.IntVal.sext(DBitWidth); } @@ -1257,10 +1257,10 @@ GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy, ExecutionContext &SF) { - const Type *SrcTy = SrcVal->getType(); + Type *SrcTy = SrcVal->getType(); GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->isVectorTy()) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); @@ -1269,7 +1269,7 @@ GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy, for (unsigned i = 0; i < size; i++) Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.zext(DBitWidth); } else { - const IntegerType *DITy = cast<IntegerType>(DstTy); + auto *DITy = cast<IntegerType>(DstTy); unsigned DBitWidth = DITy->getBitWidth(); Dest.IntVal = Src.IntVal.zext(DBitWidth); } @@ -1327,8 +1327,8 @@ GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); - const Type *SrcVecTy = SrcTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); + Type *SrcVecTy = SrcTy->getScalarType(); uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal. 
@@ -1365,8 +1365,8 @@ GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); - const Type *SrcVecTy = SrcTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); + Type *SrcVecTy = SrcTy->getScalarType(); uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal @@ -1401,7 +1401,7 @@ GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcVal->getType()->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal Dest.AggregateVal.resize(size); @@ -1433,7 +1433,7 @@ GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcVal->getType()->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal Dest.AggregateVal.resize(size); @@ -1477,7 +1477,7 @@ GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); assert(DstTy->isPointerTy() && "Invalid IntToPtr instruction"); - uint32_t PtrSize = TD.getPointerSizeInBits(); + uint32_t PtrSize = getDataLayout().getPointerSizeInBits(); if (PtrSize != Src.IntVal.getBitWidth()) Src.IntVal = Src.IntVal.zextOrTrunc(PtrSize); @@ -1497,10 +1497,10 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy, (DstTy->getTypeID() == Type::VectorTyID)) { // vector src bitcast to vector dst or vector src bitcast to scalar dst or // scalar src bitcast to vector dst - bool isLittleEndian = TD.isLittleEndian(); + bool isLittleEndian = getDataLayout().isLittleEndian(); GenericValue TempDst, TempSrc, SrcVec; - const Type *SrcElemTy; - const Type *DstElemTy; + Type *SrcElemTy; + Type *DstElemTy; unsigned SrcBitSize; unsigned DstBitSize; unsigned SrcNum; @@ -2091,7 +2091,7 @@ void Interpreter::callFunction(Function *F, ArrayRef<GenericValue> ArgVals) { } // Get pointers to first LLVM BB & Instruction in function. - StackFrame.CurBB = F->begin(); + StackFrame.CurBB = &F->front(); StackFrame.CurInst = StackFrame.CurBB->begin(); // Run through the function arguments and initialize their values... @@ -2103,7 +2103,7 @@ void Interpreter::callFunction(Function *F, ArrayRef<GenericValue> ArgVals) { unsigned i = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++i) - SetValue(AI, ArgVals[i], StackFrame); + SetValue(&*AI, ArgVals[i], StackFrame); // Handle varargs arguments... StackFrame.VarArgs.assign(ArgVals.begin()+i, ArgVals.end()); @@ -2121,27 +2121,5 @@ void Interpreter::run() { DEBUG(dbgs() << "About to interpret: " << I); visit(I); // Dispatch to one of the visit* methods... -#if 0 - // This is not safe, as visiting the instruction could lower it and free I.
-DEBUG( - if (!isa<CallInst>(I) && !isa<InvokeInst>(I) && - I.getType() != Type::VoidTy) { - dbgs() << " --> "; - const GenericValue &Val = SF.Values[&I]; - switch (I.getType()->getTypeID()) { - default: llvm_unreachable("Invalid GenericValue Type"); - case Type::VoidTyID: dbgs() << "void"; break; - case Type::FloatTyID: dbgs() << "float " << Val.FloatVal; break; - case Type::DoubleTyID: dbgs() << "double " << Val.DoubleVal; break; - case Type::PointerTyID: dbgs() << "void* " << intptr_t(Val.PointerVal); - break; - case Type::IntegerTyID: - dbgs() << "i" << Val.IntVal.getBitWidth() << " " - << Val.IntVal.toStringUnsigned(10) - << " (0x" << Val.IntVal.toStringUnsigned(16) << ")\n"; - break; - } - }); -#endif } } diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 9b44042..441f0eb 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -178,7 +178,7 @@ static void *ffiValueFor(Type *Ty, const GenericValue &AV, } static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, - const DataLayout *TD, GenericValue &Result) { + const DataLayout &TD, GenericValue &Result) { ffi_cif cif; FunctionType *FTy = F->getFunctionType(); const unsigned NumArgs = F->arg_size(); @@ -198,7 +198,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, const unsigned ArgNo = A->getArgNo(); Type *ArgTy = FTy->getParamType(ArgNo); args[ArgNo] = ffiTypeFor(ArgTy); - ArgBytes += TD->getTypeStoreSize(ArgTy); + ArgBytes += TD.getTypeStoreSize(ArgTy); } SmallVector<uint8_t, 128> ArgData; @@ -210,7 +210,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, const unsigned ArgNo = A->getArgNo(); Type *ArgTy = FTy->getParamType(ArgNo); values[ArgNo] = ffiValueFor(ArgTy, ArgVals[ArgNo], ArgDataPtr); - ArgDataPtr += TD->getTypeStoreSize(ArgTy); + ArgDataPtr += TD.getTypeStoreSize(ArgTy); } Type *RetTy = FTy->getReturnType(); @@ -219,7 +219,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) { SmallVector<uint8_t, 128> ret; if (RetTy->getTypeID() != Type::VoidTyID) - ret.resize(TD->getTypeStoreSize(RetTy)); + ret.resize(TD.getTypeStoreSize(RetTy)); ffi_call(&cif, Fn, ret.data(), values.data()); switch (RetTy->getTypeID()) { case Type::IntegerTyID: @@ -368,7 +368,7 @@ static GenericValue lle_X_sprintf(FunctionType *FT, case 'x': case 'X': if (HowLong >= 1) { if (HowLong == 1 && - TheInterpreter->getDataLayout()->getPointerSizeInBits() == 64 && + TheInterpreter->getDataLayout().getPointerSizeInBits() == 64 && sizeof(long) < sizeof(int64_t)) { // Make sure we use %lld with a 64 bit argument because we might be // compiling LLI on a 32 bit compiler. diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp index f103c09..bc7da2e 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp @@ -35,7 +35,7 @@ extern "C" void LLVMLinkInInterpreter() { } ExecutionEngine *Interpreter::create(std::unique_ptr<Module> M, std::string *ErrStr) { // Tell this Module to materialize everything and release the GVMaterializer. 
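// (Background note, not part of the patch: the interpreter visits IR in memory, so every function body must be pulled out of the bitcode reader up front; that is what the renamed materializeAll() call below does.)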
- if (std::error_code EC = M->materializeAllPermanently()) { + if (std::error_code EC = M->materializeAll()) { if (ErrStr) *ErrStr = EC.message(); // We got an error, just return 0 @@ -49,16 +49,15 @@ ExecutionEngine *Interpreter::create(std::unique_ptr<Module> M, // Interpreter ctor - Initialize stuff // Interpreter::Interpreter(std::unique_ptr<Module> M) - : ExecutionEngine(std::move(M)), TD(Modules.back().get()) { + : ExecutionEngine(std::move(M)) { memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped)); - setDataLayout(&TD); // Initialize the "backend" initializeExecutionEngine(); initializeExternalFunctions(); emitGlobals(); - IL = new IntrinsicLowering(TD); + IL = new IntrinsicLowering(getDataLayout()); } Interpreter::~Interpreter() { diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h index f976641..2e5a867 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h @@ -26,7 +26,6 @@ namespace llvm { class IntrinsicLowering; -struct FunctionInfo; template<typename T> class generic_gep_type_iterator; class ConstantExpr; typedef generic_gep_type_iterator<User::const_op_iterator> gep_type_iterator; @@ -95,7 +94,6 @@ struct ExecutionContext { // class Interpreter : public ExecutionEngine, public InstVisitor<Interpreter> { GenericValue ExitValue; // The return value of the called function - DataLayout TD; IntrinsicLowering *IL; // The runtime stack of executing code. The top of the stack is the current diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index f6944ee..6cbebe9 100644 --- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -65,12 +65,13 @@ MCJIT::createJIT(std::unique_ptr<Module> M, std::move(Resolver)); } -MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm, +MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> TM, std::shared_ptr<MCJITMemoryManager> MemMgr, std::shared_ptr<RuntimeDyld::SymbolResolver> Resolver) - : ExecutionEngine(std::move(M)), TM(std::move(tm)), Ctx(nullptr), - MemMgr(std::move(MemMgr)), Resolver(*this, std::move(Resolver)), - Dyld(*this->MemMgr, this->Resolver), ObjCache(nullptr) { + : ExecutionEngine(TM->createDataLayout(), std::move(M)), TM(std::move(TM)), + Ctx(nullptr), MemMgr(std::move(MemMgr)), + Resolver(*this, std::move(Resolver)), Dyld(*this->MemMgr, this->Resolver), + ObjCache(nullptr) { // FIXME: We are managing our modules, so we do not want the base class // ExecutionEngine to manage them as well. To avoid double destruction // of the first (and only) module added in ExecutionEngine constructor @@ -85,7 +86,6 @@ MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm, Modules.clear(); OwnedModules.addModule(std::move(First)); - setDataLayout(TM->getDataLayout()); RegisterJITEventListener(JITEventListener::createGDBRegistrationListener()); } @@ -159,7 +159,6 @@ std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) { // Initialize passes. 
PM.run(*M); // Flush the output buffer to get the generated code into memory - ObjStream.flush(); std::unique_ptr<MemoryBuffer> CompiledObjBuffer( new ObjectMemoryBuffer(std::move(ObjBufferSV))); @@ -193,7 +192,11 @@ void MCJIT::generateCodeForModule(Module *M) { if (ObjCache) ObjectToLoad = ObjCache->getObject(M); - M->setDataLayout(*TM->getDataLayout()); + if (M->getDataLayout().isDefault()) { + M->setDataLayout(getDataLayout()); + } else { + assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch"); + } // If the cache did not contain a suitable object, compile the object if (!ObjectToLoad) { @@ -265,7 +268,7 @@ void MCJIT::finalizeModule(Module *M) { RuntimeDyld::SymbolInfo MCJIT::findExistingSymbol(const std::string &Name) { SmallString<128> FullName; - Mangler::getNameWithPrefix(FullName, Name, *TM->getDataLayout()); + Mangler::getNameWithPrefix(FullName, Name, getDataLayout()); if (void *Addr = getPointerToGlobalIfAvailable(FullName)) return RuntimeDyld::SymbolInfo(static_cast<uint64_t>( @@ -315,10 +318,12 @@ RuntimeDyld::SymbolInfo MCJIT::findSymbol(const std::string &Name, object::Archive *A = OB.getBinary(); // Look for our symbols in each Archive object::Archive::child_iterator ChildIt = A->findSym(Name); + if (std::error_code EC = ChildIt->getError()) + report_fatal_error(EC.message()); if (ChildIt != A->child_end()) { // FIXME: Support nested archives? ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr = - ChildIt->getAsBinary(); + (*ChildIt)->getAsBinary(); if (ChildBinOrErr.getError()) continue; std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get(); diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h index a45173c..3c9d2fd 100644 --- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -86,7 +86,7 @@ class MCJIT : public ExecutionEngine { ModulePtrSet::iterator begin_added() { return AddedModules.begin(); } ModulePtrSet::iterator end_added() { return AddedModules.end(); } iterator_range<ModulePtrSet::iterator> added() { - return iterator_range<ModulePtrSet::iterator>(begin_added(), end_added()); + return make_range(begin_added(), end_added()); } ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); } @@ -223,12 +223,13 @@ public: /// FindFunctionNamed - Search all of the active modules to find the function that /// defines FnName. This is a very slow operation and shouldn't be used for /// general code. - virtual Function *FindFunctionNamed(const char *FnName) override; + Function *FindFunctionNamed(const char *FnName) override; - /// FindGlobalVariableNamed - Search all of the active modules to find the global variable - /// that defines Name. This is a very slow operation and shouldn't be used for - /// general code. - virtual GlobalVariable *FindGlobalVariableNamed(const char *Name, bool AllowInternal = false) override; + /// FindGlobalVariableNamed - Search all of the active modules to find the + /// global variable that defines Name. This is a very slow operation and + /// shouldn't be used for general code. + GlobalVariable *FindGlobalVariableNamed(const char *Name, + bool AllowInternal = false) override; /// Sets the object manager that MCJIT should use to avoid compilation.
void setObjectCache(ObjectCache *manager) override; @@ -335,6 +336,6 @@ protected: bool CheckFunctionsOnly); }; -} // End llvm namespace +} // end llvm namespace -#endif +#endif // LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index b439810..34564e4 100644 --- a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -19,6 +19,9 @@ namespace llvm { namespace orc { +void JITCompileCallbackManager::anchor() {} +void IndirectStubsManager::anchor() {} + Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) { Constant *AddrIntVal = ConstantInt::get(Type::getInt64Ty(FT.getContext()), Addr); @@ -37,7 +40,7 @@ GlobalVariable* createImplPointer(PointerType &PT, Module &M, return IP; } -void makeStub(Function &F, GlobalVariable &ImplPointer) { +void makeStub(Function &F, Value &ImplPointer) { assert(F.isDeclaration() && "Can't turn a definition into a stub."); assert(F.getParent() && "Function isn't in a module."); Module &M = *F.getParent(); @@ -61,9 +64,7 @@ class GlobalRenamer { public: static bool needsRenaming(const Value &New) { - if (!New.hasName() || New.getName().startswith("\01L")) - return true; - return false; + return !New.hasName() || New.getName().startswith("\01L"); } const std::string& getRename(const Value &Orig) { @@ -106,6 +107,9 @@ void makeAllSymbolsExternallyAccessible(Module &M) { for (auto &GV : M.globals()) raiseVisibilityOnValue(GV, Renamer); + + for (auto &A : M.aliases()) + raiseVisibilityOnValue(A, Renamer); } Function* cloneFunctionDecl(Module &Dst, const Function &F, @@ -121,7 +125,7 @@ Function* cloneFunctionDecl(Module &Dst, const Function &F, auto NewArgI = NewF->arg_begin(); for (auto ArgI = F.arg_begin(), ArgE = F.arg_end(); ArgI != ArgE; ++ArgI, ++NewArgI) - (*VMap)[ArgI] = NewArgI; + (*VMap)[&*ArgI] = &*NewArgI; } return NewF; @@ -177,5 +181,16 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV, nullptr, Materializer)); } +GlobalAlias* cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA, + ValueToValueMapTy &VMap) { + assert(OrigA.getAliasee() && "Original alias doesn't have an aliasee?"); + auto *NewA = GlobalAlias::create(OrigA.getValueType(), + OrigA.getType()->getPointerAddressSpace(), + OrigA.getLinkage(), OrigA.getName(), &Dst); + NewA->copyAttributesFrom(&OrigA); + VMap[&OrigA] = NewA; + return NewA; +} + } // End namespace orc. } // End namespace llvm. diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp new file mode 100644 index 0000000..01e829f --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp @@ -0,0 +1,171 @@ +//===- OrcArchitectureSupport.cpp - Architecture specific support code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h" +#include "llvm/Support/Process.h" +#include <array> + +namespace llvm { +namespace orc { + +void OrcX86_64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn, + void *CallbackMgr) { + + const uint8_t ResolverCode[] = { + // resolver_entry: + 0x55, // 0x00: pushq %rbp + 0x48, 0x89, 0xe5, // 0x01: movq %rsp, %rbp + 0x50, // 0x04: pushq %rax + 0x53, // 0x05: pushq %rbx + 0x51, // 0x06: pushq %rcx + 0x52, // 0x07: pushq %rdx + 0x56, // 0x08: pushq %rsi + 0x57, // 0x09: pushq %rdi + 0x41, 0x50, // 0x0a: pushq %r8 + 0x41, 0x51, // 0x0c: pushq %r9 + 0x41, 0x52, // 0x0e: pushq %r10 + 0x41, 0x53, // 0x10: pushq %r11 + 0x41, 0x54, // 0x12: pushq %r12 + 0x41, 0x55, // 0x14: pushq %r13 + 0x41, 0x56, // 0x16: pushq %r14 + 0x41, 0x57, // 0x18: pushq %r15 + 0x48, 0x81, 0xec, 0x08, 0x02, 0x00, 0x00, // 0x1a: subq $0x208, %rsp + 0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp) + 0x48, 0x8d, 0x3d, 0x43, 0x00, 0x00, 0x00, // 0x26: leaq 67(%rip), %rdi + 0x48, 0x8b, 0x3f, // 0x2d: movq (%rdi), %rdi + 0x48, 0x8b, 0x75, 0x08, // 0x30: movq 8(%rbp), %rsi + 0x48, 0x83, 0xee, 0x06, // 0x34: subq $6, %rsi + 0x48, 0xb8, // 0x38: movabsq $0, %rax + + // 0x3a: JIT re-entry fn addr: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + 0xff, 0xd0, // 0x42: callq *%rax + 0x48, 0x89, 0x45, 0x08, // 0x44: movq %rax, 8(%rbp) + 0x48, 0x0f, 0xae, 0x0c, 0x24, // 0x48: fxrstor64 (%rsp) + 0x48, 0x81, 0xc4, 0x08, 0x02, 0x00, 0x00, // 0x4d: addq $0x208, %rsp + 0x41, 0x5f, // 0x54: popq %r15 + 0x41, 0x5e, // 0x56: popq %r14 + 0x41, 0x5d, // 0x58: popq %r13 + 0x41, 0x5c, // 0x5a: popq %r12 + 0x41, 0x5b, // 0x5c: popq %r11 + 0x41, 0x5a, // 0x5e: popq %r10 + 0x41, 0x59, // 0x60: popq %r9 + 0x41, 0x58, // 0x62: popq %r8 + 0x5f, // 0x64: popq %rdi + 0x5e, // 0x65: popq %rsi + 0x5a, // 0x66: popq %rdx + 0x59, // 0x67: popq %rcx + 0x5b, // 0x68: popq %rbx + 0x58, // 0x69: popq %rax + 0x5d, // 0x6a: popq %rbp + 0xc3, // 0x6b: retq + 0x00, 0x00, 0x00, 0x00, // 0x6c: <padding> + + // 0x70: Callback mgr address. + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + const unsigned ReentryFnAddrOffset = 0x3a; + const unsigned CallbackMgrAddrOffset = 0x70; + + memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode)); + memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn)); + memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr, + sizeof(CallbackMgr)); +} + +void OrcX86_64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr, + unsigned NumTrampolines) { + + unsigned OffsetToPtr = NumTrampolines * TrampolineSize; + + memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void*)); + + uint64_t *Trampolines = reinterpret_cast<uint64_t*>(TrampolineMem); + uint64_t CallIndirPCRel = 0xf1c40000000015ff; + + for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) + Trampolines[I] = CallIndirPCRel | ((OffsetToPtr - 6) << 16); +} + +std::error_code OrcX86_64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo, + unsigned MinStubs, + void *InitialPtrVal) { + // Stub format is: + // + // .section __orc_stubs + // stub1: + // jmpq *ptr1(%rip) + // .byte 0xC4 ; <- Invalid opcode padding. + // .byte 0xF1 + // stub2: + // jmpq *ptr2(%rip) + // + // ... + // + // .section __orc_ptrs + // ptr1: + // .quad 0x0 + // ptr2: + // .quad 0x0 + // + // ...
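+ // (Illustrative note, not from the original comment: the constant 0xF1C40000000025ff used below is little-endian for the byte sequence FF 25 <disp32=0> C4 F1, i.e. "jmpq *disp32(%rip)" followed by the two invalid-opcode padding bytes shown above; PtrOffsetField ORs the distance from the end of each 6-byte jmp to its pointer, NumPages * PageSize - 6, into the disp32 field at bit 16.)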
+ const unsigned StubSize = IndirectStubsInfo::StubSize; + + // Emit at least MinStubs, rounded up to fill the pages allocated. + unsigned PageSize = sys::Process::getPageSize(); + unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize; + unsigned NumStubs = (NumPages * PageSize) / StubSize; + + // Allocate memory for stubs and pointers in one call. + std::error_code EC; + auto StubsMem = + sys::OwningMemoryBlock( + sys::Memory::allocateMappedMemory(2 * NumPages * PageSize, nullptr, + sys::Memory::MF_READ | + sys::Memory::MF_WRITE, + EC)); + + if (EC) + return EC; + + // Create separate MemoryBlocks representing the stubs and pointers. + sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize); + sys::MemoryBlock PtrsBlock(static_cast<char*>(StubsMem.base()) + + NumPages * PageSize, + NumPages * PageSize); + + // Populate the stubs page and mark it executable. + uint64_t *Stub = reinterpret_cast<uint64_t*>(StubsBlock.base()); + uint64_t PtrOffsetField = + static_cast<uint64_t>(NumPages * PageSize - 6) << 16; + for (unsigned I = 0; I < NumStubs; ++I) + Stub[I] = 0xF1C40000000025ff | PtrOffsetField; + + if (auto EC = sys::Memory::protectMappedMemory(StubsBlock, + sys::Memory::MF_READ | + sys::Memory::MF_EXEC)) + return EC; + + // Initialize all pointers to point at InitialPtrVal. + void **Ptr = reinterpret_cast<void**>(PtrsBlock.base()); + for (unsigned I = 0; I < NumStubs; ++I) + Ptr[I] = InitialPtrVal; + + StubsInfo.NumStubs = NumStubs; + StubsInfo.StubsMem = std::move(StubsMem); + + return std::error_code(); +} + +} // End namespace orc. +} // End namespace llvm. diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp new file mode 100644 index 0000000..d2379cd --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp @@ -0,0 +1,97 @@ +//===----------- OrcCBindings.cpp - C bindings for the Orc APIs -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "OrcCBindingsStack.h" +#include "llvm-c/OrcBindings.h" + +using namespace llvm; + +LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) { + TargetMachine *TM2(unwrap(TM)); + + Triple T(TM2->getTargetTriple()); + + auto CompileCallbackMgr = OrcCBindingsStack::createCompileCallbackMgr(T); + auto IndirectStubsMgrBuilder = + OrcCBindingsStack::createIndirectStubsMgrBuilder(T); + + OrcCBindingsStack *JITStack = + new OrcCBindingsStack(*TM2, std::move(CompileCallbackMgr), + IndirectStubsMgrBuilder); + + return wrap(JITStack); +} + +void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledName, + const char *SymbolName) { + OrcCBindingsStack &J = *unwrap(JITStack); + std::string Mangled = J.mangle(SymbolName); + *MangledName = new char[Mangled.size() + 1]; + strcpy(*MangledName, Mangled.c_str()); +} + +void LLVMOrcDisposeMangledSymbol(char *MangledName) { + delete[] MangledName; +} + +LLVMOrcTargetAddress +LLVMOrcCreateLazyCompileCallback(LLVMOrcJITStackRef JITStack, + LLVMOrcLazyCompileCallbackFn Callback, + void *CallbackCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + return J.createLazyCompileCallback(Callback, CallbackCtx); +} + +void LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack, + const char *StubName, + LLVMOrcTargetAddress InitAddr) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.createIndirectStub(StubName, InitAddr); +} + +void LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack, + const char *StubName, + LLVMOrcTargetAddress NewAddr) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.setIndirectStubPointer(StubName, NewAddr); +} + +LLVMOrcModuleHandle +LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod, + LLVMOrcSymbolResolverFn SymbolResolver, + void *SymbolResolverCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + Module *M(unwrap(Mod)); + return J.addIRModuleEager(M, SymbolResolver, SymbolResolverCtx); +} + +LLVMOrcModuleHandle +LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod, + LLVMOrcSymbolResolverFn SymbolResolver, + void *SymbolResolverCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + Module *M(unwrap(Mod)); + return J.addIRModuleLazy(M, SymbolResolver, SymbolResolverCtx); +} + +void LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack, LLVMOrcModuleHandle H) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.removeModule(H); +} + +LLVMOrcTargetAddress LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack, + const char *SymbolName) { + OrcCBindingsStack &J = *unwrap(JITStack); + auto Sym = J.findSymbol(SymbolName, true); + return Sym.getAddress(); +} + +void LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack) { + delete unwrap(JITStack); +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp new file mode 100644 index 0000000..956daae --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp @@ -0,0 +1,43 @@ +//===-------- OrcCBindingsStack.cpp - Orc JIT stack for C bindings --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "OrcCBindingsStack.h" + +#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DynamicLibrary.h" +#include <cstdio> +#include <system_error> + +using namespace llvm; + +std::unique_ptr<OrcCBindingsStack::CompileCallbackMgr> +OrcCBindingsStack::createCompileCallbackMgr(Triple T) { + switch (T.getArch()) { + default: return nullptr; + + case Triple::x86_64: { + typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64> CCMgrT; + return llvm::make_unique<CCMgrT>(0); + } + } +} + +OrcCBindingsStack::IndirectStubsManagerBuilder +OrcCBindingsStack::createIndirectStubsMgrBuilder(Triple T) { + switch (T.getArch()) { + default: return nullptr; + + case Triple::x86_64: + return [](){ + return llvm::make_unique< + orc::LocalIndirectStubsManager<orc::OrcX86_64>>(); + }; + } +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h new file mode 100644 index 0000000..aae6a99 --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -0,0 +1,283 @@ +//===--- OrcCBindingsStack.h - Orc JIT stack for C bindings ---*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H +#define LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H + +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm-c/OrcBindings.h" + +namespace llvm { + +class OrcCBindingsStack; + +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcCBindingsStack, LLVMOrcJITStackRef) +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef) + +class OrcCBindingsStack { +public: + + typedef orc::JITCompileCallbackManager CompileCallbackMgr; + typedef orc::ObjectLinkingLayer<> ObjLayerT; + typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT; + typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr> CODLayerT; + + typedef std::function<std::unique_ptr<CompileCallbackMgr>()> + CallbackManagerBuilder; + + typedef CODLayerT::IndirectStubsManagerBuilderT IndirectStubsManagerBuilder; + +private: + + class GenericHandle { + public: + virtual ~GenericHandle() {} + virtual orc::JITSymbol findSymbolIn(const std::string &Name, + bool ExportedSymbolsOnly) = 0; + virtual void removeModule() = 0; + }; + + template <typename LayerT> + class GenericHandleImpl : public GenericHandle { + public: + GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle) + : Layer(Layer), Handle(std::move(Handle)) {} + + orc::JITSymbol findSymbolIn(const std::string &Name, + bool ExportedSymbolsOnly) override { + return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly); + } + + void removeModule() override { + return Layer.removeModuleSet(Handle); + } + + private: + LayerT &Layer; + typename LayerT::ModuleSetHandleT Handle; + }; + + template <typename LayerT> + std::unique_ptr<GenericHandleImpl<LayerT>> + createGenericHandle(LayerT &Layer, 
typename LayerT::ModuleSetHandleT Handle) { + return llvm::make_unique<GenericHandleImpl<LayerT>>(Layer, + std::move(Handle)); + } + +public: + + // We need a 'ModuleSetHandleT' to conform to the layer concept. + typedef unsigned ModuleSetHandleT; + + typedef unsigned ModuleHandleT; + + static std::unique_ptr<CompileCallbackMgr> createCompileCallbackMgr(Triple T); + static IndirectStubsManagerBuilder createIndirectStubsMgrBuilder(Triple T); + + OrcCBindingsStack(TargetMachine &TM, + std::unique_ptr<CompileCallbackMgr> CCMgr, + IndirectStubsManagerBuilder IndirectStubsMgrBuilder) + : DL(TM.createDataLayout()), CCMgr(std::move(CCMgr)), + ObjectLayer(), + CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)), + CODLayer(CompileLayer, + [](Function &F) { std::set<Function*> S; S.insert(&F); return S; }, + *this->CCMgr, std::move(IndirectStubsMgrBuilder), false), + IndirectStubsMgr(IndirectStubsMgrBuilder()), + CXXRuntimeOverrides([this](const std::string &S) { return mangle(S); }) {} + + ~OrcCBindingsStack() { + // Run any destructors registered with __cxa_atexit. + CXXRuntimeOverrides.runDestructors(); + // Run any IR destructors. + for (auto &DtorRunner : IRStaticDestructorRunners) + DtorRunner.runViaLayer(*this); + } + + std::string mangle(StringRef Name) { + std::string MangledName; + { + raw_string_ostream MangledNameStream(MangledName); + Mangler::getNameWithPrefix(MangledNameStream, Name, DL); + } + return MangledName; + } + + template <typename PtrTy> + static PtrTy fromTargetAddress(orc::TargetAddress Addr) { + return reinterpret_cast<PtrTy>(static_cast<uintptr_t>(Addr)); + } + + orc::TargetAddress + createLazyCompileCallback(LLVMOrcLazyCompileCallbackFn Callback, + void *CallbackCtx) { + auto CCInfo = CCMgr->getCompileCallback(); + CCInfo.setCompileAction( + [=]() -> orc::TargetAddress { + return Callback(wrap(this), CallbackCtx); + }); + return CCInfo.getAddress(); + } + + void createIndirectStub(StringRef StubName, orc::TargetAddress Addr) { + IndirectStubsMgr->createStub(StubName, Addr, JITSymbolFlags::Exported); + } + + void setIndirectStubPointer(StringRef Name, orc::TargetAddress Addr) { + IndirectStubsMgr->updatePointer(Name, Addr); + } + + std::shared_ptr<RuntimeDyld::SymbolResolver> + createResolver(LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + auto Resolver = orc::createLambdaResolver( + [this, ExternalResolver, ExternalResolverCtx](const std::string &Name) { + // Search order: + // 1. JIT'd symbols. + // 2. Runtime overrides. + // 3. External resolver (if present). + + if (auto Sym = CODLayer.findSymbol(Name, true)) + return RuntimeDyld::SymbolInfo(Sym.getAddress(), + Sym.getFlags()); + if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name)) + return Sym; + + if (ExternalResolver) + return RuntimeDyld::SymbolInfo(ExternalResolver(Name.c_str(), + ExternalResolverCtx), + llvm::JITSymbolFlags::Exported); + + return RuntimeDyld::SymbolInfo(nullptr); + }, + [](const std::string &Name) { + return RuntimeDyld::SymbolInfo(nullptr); + } + ); + + return std::shared_ptr<RuntimeDyld::SymbolResolver>(std::move(Resolver)); + } + + template <typename LayerT> + ModuleHandleT addIRModule(LayerT &Layer, + Module *M, + std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + + // Attach a data-layout if one isn't already present. + if (M->getDataLayout().isDefault()) + M->setDataLayout(DL); + + // Record the static constructors and destructors. 
We have to do this before + // we hand over ownership of the module to the JIT. + std::vector<std::string> CtorNames, DtorNames; + for (auto Ctor : orc::getConstructors(*M)) + CtorNames.push_back(mangle(Ctor.Func->getName())); + for (auto Dtor : orc::getDestructors(*M)) + DtorNames.push_back(mangle(Dtor.Func->getName())); + + // Create the resolver. + auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx); + + // Add the module to the JIT. + std::vector<Module*> S; + S.push_back(std::move(M)); + + auto LH = Layer.addModuleSet(std::move(S), std::move(MemMgr), + std::move(Resolver)); + ModuleHandleT H = createHandle(Layer, LH); + + // Run the static constructors, and save the static destructor runner for + // execution when the JIT is torn down. + orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), H); + CtorRunner.runViaLayer(*this); + + IRStaticDestructorRunners.emplace_back(std::move(DtorNames), H); + + return H; + } + + ModuleHandleT addIRModuleEager(Module* M, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + return addIRModule(CompileLayer, std::move(M), + llvm::make_unique<SectionMemoryManager>(), + std::move(ExternalResolver), ExternalResolverCtx); + } + + ModuleHandleT addIRModuleLazy(Module* M, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + return addIRModule(CODLayer, std::move(M), + llvm::make_unique<SectionMemoryManager>(), + std::move(ExternalResolver), ExternalResolverCtx); + } + + void removeModule(ModuleHandleT H) { + GenericHandles[H]->removeModule(); + GenericHandles[H] = nullptr; + FreeHandleIndexes.push_back(H); + } + + orc::JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { + if (auto Sym = IndirectStubsMgr->findStub(Name, ExportedSymbolsOnly)) + return Sym; + return CODLayer.findSymbol(mangle(Name), ExportedSymbolsOnly); + } + + orc::JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name, + bool ExportedSymbolsOnly) { + return GenericHandles[H]->findSymbolIn(Name, ExportedSymbolsOnly); + } + +private: + + template <typename LayerT> + unsigned createHandle(LayerT &Layer, + typename LayerT::ModuleSetHandleT Handle) { + unsigned NewHandle; + if (!FreeHandleIndexes.empty()) { + NewHandle = FreeHandleIndexes.back(); + FreeHandleIndexes.pop_back(); + GenericHandles[NewHandle] = createGenericHandle(Layer, std::move(Handle)); + return NewHandle; + } else { + NewHandle = GenericHandles.size(); + GenericHandles.push_back(createGenericHandle(Layer, std::move(Handle))); + } + return NewHandle; + } + + DataLayout DL; + SectionMemoryManager CCMgrMemMgr; + + std::unique_ptr<CompileCallbackMgr> CCMgr; + ObjLayerT ObjectLayer; + CompileLayerT CompileLayer; + CODLayerT CODLayer; + + std::unique_ptr<orc::IndirectStubsManager> IndirectStubsMgr; + + std::vector<std::unique_ptr<GenericHandle>> GenericHandles; + std::vector<unsigned> FreeHandleIndexes; + + orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides; + std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp new file mode 100644 index 0000000..e95115e --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp @@ -0,0 +1,57 @@ +//===---------------- OrcError.cpp - Error codes for ORC ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is 
distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Error codes for ORC. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/OrcError.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ManagedStatic.h" + +using namespace llvm; +using namespace llvm::orc; + +namespace { + +class OrcErrorCategory : public std::error_category { +public: + const char *name() const LLVM_NOEXCEPT override { return "orc"; } + + std::string message(int condition) const override { + switch (static_cast<OrcErrorCode>(condition)) { + case OrcErrorCode::RemoteAllocatorDoesNotExist: + return "Remote allocator does not exist"; + case OrcErrorCode::RemoteAllocatorIdAlreadyInUse: + return "Remote allocator Id already in use"; + case OrcErrorCode::RemoteMProtectAddrUnrecognized: + return "Remote mprotect call references unallocated memory"; + case OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist: + return "Remote indirect stubs owner does not exist"; + case OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse: + return "Remote indirect stubs owner Id already in use"; + case OrcErrorCode::UnexpectedRPCCall: + return "Unexpected RPC call"; + } + llvm_unreachable("Unhandled error code"); + } +}; + +static ManagedStatic<OrcErrorCategory> OrcErrCat; +} + +namespace llvm { +namespace orc { + +std::error_code orcError(OrcErrorCode ErrCode) { + typedef std::underlying_type<OrcErrorCode>::type UT; + return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat); +} +} +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index 7dc5164..2ab70a9 100644 --- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -54,10 +54,13 @@ class OrcMCJITReplacement : public ExecutionEngine { return Addr; } - void reserveAllocationSpace(uintptr_t CodeSize, uintptr_t DataSizeRO, - uintptr_t DataSizeRW) override { - return ClientMM->reserveAllocationSpace(CodeSize, DataSizeRO, - DataSizeRW); + void reserveAllocationSpace(uintptr_t CodeSize, uint32_t CodeAlign, + uintptr_t RODataSize, uint32_t RODataAlign, + uintptr_t RWDataSize, + uint32_t RWDataAlign) override { + return ClientMM->reserveAllocationSpace(CodeSize, CodeAlign, + RODataSize, RODataAlign, + RWDataSize, RWDataAlign); } bool needsToReserveAllocationSpace() override { @@ -74,6 +77,11 @@ class OrcMCJITReplacement : public ExecutionEngine { return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size); } + void notifyObjectLoaded(RuntimeDyld &RTDyld, + const object::ObjectFile &O) override { + return ClientMM->notifyObjectLoaded(RTDyld, O); + } + void notifyObjectLoaded(ExecutionEngine *EE, const object::ObjectFile &O) override { return ClientMM->notifyObjectLoaded(EE, O); @@ -137,25 +145,26 @@ public: } OrcMCJITReplacement( - std::shared_ptr<MCJITMemoryManager> MemMgr, - std::shared_ptr<RuntimeDyld::SymbolResolver> ClientResolver, - std::unique_ptr<TargetMachine> TM) - : TM(std::move(TM)), MemMgr(*this, std::move(MemMgr)), - Resolver(*this), ClientResolver(std::move(ClientResolver)), - NotifyObjectLoaded(*this), NotifyFinalized(*this), + std::shared_ptr<MCJITMemoryManager> MemMgr, + std::shared_ptr<RuntimeDyld::SymbolResolver> ClientResolver, + std::unique_ptr<TargetMachine> TM) + : 
ExecutionEngine(TM->createDataLayout()), TM(std::move(TM)), + MemMgr(*this, std::move(MemMgr)), Resolver(*this), + ClientResolver(std::move(ClientResolver)), NotifyObjectLoaded(*this), + NotifyFinalized(*this), ObjectLayer(NotifyObjectLoaded, NotifyFinalized), CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)), - LazyEmitLayer(CompileLayer) { - setDataLayout(this->TM->getDataLayout()); - } + LazyEmitLayer(CompileLayer) {} void addModule(std::unique_ptr<Module> M) override { // If this module doesn't have a DataLayout attached then attach the // default. - if (M->getDataLayout().isDefault()) - M->setDataLayout(*getDataLayout()); - + if (M->getDataLayout().isDefault()) { + M->setDataLayout(getDataLayout()); + } else { + assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch"); + } Modules.push_back(std::move(M)); std::vector<Module *> Ms; Ms.push_back(&*Modules.back()); @@ -174,12 +183,7 @@ public: std::tie(Obj, Buf) = O.takeBinary(); std::vector<std::unique_ptr<object::ObjectFile>> Objs; Objs.push_back(std::move(Obj)); - auto H = - ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver); - - std::vector<std::unique_ptr<MemoryBuffer>> Bufs; - Bufs.push_back(std::move(Buf)); - ObjectLayer.takeOwnershipOfBuffers(H, std::move(Bufs)); + ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver); } void addArchive(object::OwningBinary<object::Archive> A) override { @@ -234,6 +238,10 @@ public: CompileLayer.setObjectCache(NewCache); } + void setProcessAllSections(bool ProcessAllSections) override { + ObjectLayer.setProcessAllSections(ProcessAllSections); + } + private: RuntimeDyld::SymbolInfo findMangledSymbol(StringRef Name) { @@ -252,10 +260,12 @@ private: object::Archive *A = OB.getBinary(); // Look for our symbols in each Archive object::Archive::child_iterator ChildIt = A->findSym(Name); + if (std::error_code EC = ChildIt->getError()) + report_fatal_error(EC.message()); if (ChildIt != A->child_end()) { // FIXME: Support nested archives? ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr = - ChildIt->getAsBinary(); + (*ChildIt)->getAsBinary(); if (ChildBinOrErr.getError()) continue; std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get(); @@ -289,7 +299,7 @@ private: "Incorrect number of Infos for Objects."); for (unsigned I = 0; I < Objects.size(); ++I) M.MemMgr.notifyObjectLoaded(&M, *Objects[I]); - }; + } private: OrcMCJITReplacement &M; @@ -310,7 +320,7 @@ private: std::string MangledName; { raw_string_ostream MangledNameStream(MangledName); - Mang.getNameWithPrefix(MangledNameStream, Name, *TM->getDataLayout()); + Mang.getNameWithPrefix(MangledNameStream, Name, getDataLayout()); } return MangledName; } diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp new file mode 100644 index 0000000..064633b --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp @@ -0,0 +1,83 @@ +//===------- OrcRemoteTargetRPCAPI.cpp - ORC Remote API utilities ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h" + +namespace llvm { +namespace orc { +namespace remote { + +const char *OrcRemoteTargetRPCAPI::getJITProcIdName(JITProcId Id) { + switch (Id) { + case InvalidId: + return "*** Invalid JITProcId ***"; + case CallIntVoidId: + return "CallIntVoid"; + case CallIntVoidResponseId: + return "CallIntVoidResponse"; + case CallMainId: + return "CallMain"; + case CallMainResponseId: + return "CallMainResponse"; + case CallVoidVoidId: + return "CallVoidVoid"; + case CallVoidVoidResponseId: + return "CallVoidVoidResponse"; + case CreateRemoteAllocatorId: + return "CreateRemoteAllocator"; + case CreateIndirectStubsOwnerId: + return "CreateIndirectStubsOwner"; + case DestroyRemoteAllocatorId: + return "DestroyRemoteAllocator"; + case DestroyIndirectStubsOwnerId: + return "DestroyIndirectStubsOwner"; + case EmitIndirectStubsId: + return "EmitIndirectStubs"; + case EmitIndirectStubsResponseId: + return "EmitIndirectStubsResponse"; + case EmitResolverBlockId: + return "EmitResolverBlock"; + case EmitTrampolineBlockId: + return "EmitTrampolineBlock"; + case EmitTrampolineBlockResponseId: + return "EmitTrampolineBlockResponse"; + case GetSymbolAddressId: + return "GetSymbolAddress"; + case GetSymbolAddressResponseId: + return "GetSymbolAddressResponse"; + case GetRemoteInfoId: + return "GetRemoteInfo"; + case GetRemoteInfoResponseId: + return "GetRemoteInfoResponse"; + case ReadMemId: + return "ReadMem"; + case ReadMemResponseId: + return "ReadMemResponse"; + case ReserveMemId: + return "ReserveMem"; + case ReserveMemResponseId: + return "ReserveMemResponse"; + case RequestCompileId: + return "RequestCompile"; + case RequestCompileResponseId: + return "RequestCompileResponse"; + case SetProtectionsId: + return "SetProtections"; + case TerminateSessionId: + return "TerminateSession"; + case WriteMemId: + return "WriteMem"; + case WritePtrId: + return "WritePtr"; + }; + return nullptr; +} +} +} +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp deleted file mode 100644 index 258868a..0000000 --- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "llvm/ADT/Triple.h" -#include "llvm/ExecutionEngine/Orc/OrcTargetSupport.h" -#include <array> - -using namespace llvm::orc; - -namespace { - -uint64_t executeCompileCallback(JITCompileCallbackManagerBase *JCBM, - TargetAddress CallbackID) { - return JCBM->executeCompileCallback(CallbackID); -} - -} - -namespace llvm { -namespace orc { - -const char* OrcX86_64::ResolverBlockName = "orc_resolver_block"; - -void OrcX86_64::insertResolverBlock( - Module &M, JITCompileCallbackManagerBase &JCBM) { - - // Trampoline code-sequence length, used to get trampoline address from return - // address. - const unsigned X86_64_TrampolineLength = 6; - - // List of x86-64 GPRs to save. Note - RBP saved separately below. - std::array<const char *, 14> GPRs = {{ - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", - "r14", "r15" - }}; - - // Address of the executeCompileCallback function. - uint64_t CallbackAddr = - static_cast<uint64_t>( - reinterpret_cast<uintptr_t>(executeCompileCallback)); - - std::ostringstream AsmStream; - Triple TT(M.getTargetTriple()); - - // Switch to text section. 
- if (TT.getOS() == Triple::Darwin) - AsmStream << ".section __TEXT,__text,regular,pure_instructions\n" - << ".align 4, 0x90\n"; - else - AsmStream << ".text\n" - << ".align 16, 0x90\n"; - - // Bake in a pointer to the callback manager immediately before the - // start of the resolver function. - AsmStream << "jit_callback_manager_addr:\n" - << " .quad " << &JCBM << "\n"; - - // Start the resolver function. - AsmStream << ResolverBlockName << ":\n" - << " pushq %rbp\n" - << " movq %rsp, %rbp\n"; - - // Store the GPRs. - for (const auto &GPR : GPRs) - AsmStream << " pushq %" << GPR << "\n"; - - // Store floating-point state with FXSAVE. - // Note: We need to keep the stack 16-byte aligned, so if we've emitted an odd - // number of 64-bit pushes so far (GPRs.size() plus 1 for RBP) then add - // an extra 64 bits of padding to the FXSave area. - unsigned Padding = (GPRs.size() + 1) % 2 ? 8 : 0; - unsigned FXSaveSize = 512 + Padding; - AsmStream << " subq $" << FXSaveSize << ", %rsp\n" - << " fxsave64 (%rsp)\n" - - // Load callback manager address, compute trampoline address, call JIT. - << " lea jit_callback_manager_addr(%rip), %rdi\n" - << " movq (%rdi), %rdi\n" - << " movq 0x8(%rbp), %rsi\n" - << " subq $" << X86_64_TrampolineLength << ", %rsi\n" - << " movabsq $" << CallbackAddr << ", %rax\n" - << " callq *%rax\n" - - // Replace the return to the trampoline with the return address of the - // compiled function body. - << " movq %rax, 0x8(%rbp)\n" - - // Restore the floating point state. - << " fxrstor64 (%rsp)\n" - << " addq $" << FXSaveSize << ", %rsp\n"; - - for (const auto &GPR : make_range(GPRs.rbegin(), GPRs.rend())) - AsmStream << " popq %" << GPR << "\n"; - - // Restore original RBP and return to compiled function body. - AsmStream << " popq %rbp\n" - << " retq\n"; - - M.appendModuleInlineAsm(AsmStream.str()); -} - -OrcX86_64::LabelNameFtor -OrcX86_64::insertCompileCallbackTrampolines(Module &M, - TargetAddress ResolverBlockAddr, - unsigned NumCalls, - unsigned StartIndex) { - const char *ResolverBlockPtrName = "Lorc_resolve_block_addr"; - - std::ostringstream AsmStream; - Triple TT(M.getTargetTriple()); - - if (TT.getOS() == Triple::Darwin) - AsmStream << ".section __TEXT,__text,regular,pure_instructions\n" - << ".align 4, 0x90\n"; - else - AsmStream << ".text\n" - << ".align 16, 0x90\n"; - - AsmStream << ResolverBlockPtrName << ":\n" - << " .quad " << ResolverBlockAddr << "\n"; - - auto GetLabelName = - [=](unsigned I) { - std::ostringstream LabelStream; - LabelStream << "orc_jcc_" << (StartIndex + I); - return LabelStream.str(); - }; - - for (unsigned I = 0; I < NumCalls; ++I) - AsmStream << GetLabelName(I) << ":\n" - << " callq *" << ResolverBlockPtrName << "(%rip)\n"; - - M.appendModuleInlineAsm(AsmStream.str()); - - return GetLabelName; -} - -} // End namespace orc. -} // End namespace llvm. 
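
The file deleted above was the old inline-asm implementation of Orc's lazy-compilation plumbing: each trampoline is a call through a PC-relative slot into a shared resolver block, which spills register state, recovers the trampoline address from the return address, and re-enters the JIT through executeCompileCallback so the real target can be compiled on first use. A minimal C++ sketch of that indirection (CallbackManager and its map are illustrative stand-ins, not the LLVM API; the deleted assembly does the same job at the machine level):

#include <cstdint>
#include <functional>
#include <map>

using TargetAddress = uint64_t;

// Illustrative stand-in for JITCompileCallbackManagerBase: maps a
// trampoline's address to the action that compiles the real target
// and returns the compiled body's address.
class CallbackManager {
  std::map<TargetAddress, std::function<TargetAddress()>> Callbacks;

public:
  void setCallback(TargetAddress TrampolineAddr,
                   std::function<TargetAddress()> Compile) {
    Callbacks[TrampolineAddr] = std::move(Compile);
  }

  // Analogue of executeCompileCallback: the resolver block calls this
  // with the trampoline address recovered from the return address, then
  // jumps to the freshly compiled code instead of the trampoline.
  TargetAddress execute(TargetAddress TrampolineAddr) {
    auto I = Callbacks.find(TrampolineAddr);
    return I == Callbacks.end() ? 0 : I->second();
  }
};

The sketch only shows the control flow the assembly implements; its replacement lives elsewhere in this import.
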
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 93287a3..d16b2db 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -41,20 +41,21 @@ void RuntimeDyldImpl::deregisterEHFrames() {} #ifndef NDEBUG static void dumpSectionMemory(const SectionEntry &S, StringRef State) { - dbgs() << "----- Contents of section " << S.Name << " " << State << " -----"; + dbgs() << "----- Contents of section " << S.getName() << " " << State + << " -----"; - if (S.Address == nullptr) { + if (S.getAddress() == nullptr) { dbgs() << "\n <section not emitted>\n"; return; } const unsigned ColsPerRow = 16; - uint8_t *DataAddr = S.Address; - uint64_t LoadAddr = S.LoadAddress; + uint8_t *DataAddr = S.getAddress(); + uint64_t LoadAddr = S.getLoadAddress(); unsigned StartPadding = LoadAddr & (ColsPerRow - 1); - unsigned BytesRemaining = S.Size; + unsigned BytesRemaining = S.getSize(); if (StartPadding) { dbgs() << "\n" << format("0x%016" PRIx64, @@ -82,30 +83,41 @@ static void dumpSectionMemory(const SectionEntry &S, StringRef State) { void RuntimeDyldImpl::resolveRelocations() { MutexGuard locked(lock); + // Print out the sections prior to relocation. + DEBUG( + for (int i = 0, e = Sections.size(); i != e; ++i) + dumpSectionMemory(Sections[i], "before relocations"); + ); + // First, resolve relocations associated with external symbols. resolveExternalSymbols(); - // Just iterate over the sections we have and resolve all the relocations - // in them. Gross overkill, but it gets the job done. - for (int i = 0, e = Sections.size(); i != e; ++i) { + // Iterate over all outstanding relocations + for (auto it = Relocations.begin(), e = Relocations.end(); it != e; ++it) { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - uint64_t Addr = Sections[i].LoadAddress; - DEBUG(dbgs() << "Resolving relocations Section #" << i << "\t" + int Idx = it->first; + uint64_t Addr = Sections[Idx].getLoadAddress(); + DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - DEBUG(dumpSectionMemory(Sections[i], "before relocations")); - resolveRelocationList(Relocations[i], Addr); - DEBUG(dumpSectionMemory(Sections[i], "after relocations")); - Relocations.erase(i); + resolveRelocationList(it->second, Addr); } + Relocations.clear(); + + // Print out sections after relocation. + DEBUG( + for (int i = 0, e = Sections.size(); i != e; ++i) + dumpSectionMemory(Sections[i], "after relocations"); + ); + } void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress) { MutexGuard locked(lock); for (unsigned i = 0, e = Sections.size(); i != e; ++i) { - if (Sections[i].Address == LocalAddress) { + if (Sections[i].getAddress() == LocalAddress) { reassignSectionAddress(i, TargetAddress); return; } @@ -122,14 +134,10 @@ static std::error_code getOffset(const SymbolRef &Sym, SectionRef Sec, return std::error_code(); } -std::pair<unsigned, unsigned> +RuntimeDyldImpl::ObjSectionToIDMap RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { MutexGuard locked(lock); - // Grab the first Section ID. We'll use this later to construct the underlying - // range for the returned LoadedObjectInfo. 
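
The hunk that begins here swaps loadObjectImpl's old return value, a (begin, end) pair of section indices, for the ObjSectionToIDMap it already builds, so a LoadedObjectInfo can resolve an object::SectionRef directly to its SectionEntry instead of scanning an index range by name. A rough sketch of the resulting lookup shape (the key type is simplified; only the map-then-index pattern matches the code below):

#include <cstdint>
#include <map>
#include <vector>

struct SectionEntry { uint64_t LoadAddress = 0; };

// Simplified stand-ins: RuntimeDyld keys the real map on object::SectionRef.
using SectionKey = uintptr_t;
using ObjSectionToIDMap = std::map<SectionKey, unsigned>;

uint64_t getSectionLoadAddress(const ObjSectionToIDMap &SecToID,
                               const std::vector<SectionEntry> &Sections,
                               SectionKey Sec) {
  auto I = SecToID.find(Sec);
  return I == SecToID.end() ? 0 : Sections[I->second].LoadAddress;
}

This mirrors the new LoadedObjectInfo::getSectionLoadAddress further down, which likewise returns 0 for sections that were never loaded.
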
- unsigned SectionsAddedBeginIdx = Sections.size(); - // Save information about our target Arch = (Triple::ArchType)Obj.getArch(); IsTargetLittleEndian = Obj.isLittleEndian(); @@ -138,9 +146,12 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { // Compute the memory size required to load all sections to be loaded // and pass this information to the memory manager if (MemMgr.needsToReserveAllocationSpace()) { - uint64_t CodeSize = 0, DataSizeRO = 0, DataSizeRW = 0; - computeTotalAllocSize(Obj, CodeSize, DataSizeRO, DataSizeRW); - MemMgr.reserveAllocationSpace(CodeSize, DataSizeRO, DataSizeRW); + uint64_t CodeSize = 0, RODataSize = 0, RWDataSize = 0; + uint32_t CodeAlign = 1, RODataAlign = 1, RWDataAlign = 1; + computeTotalAllocSize(Obj, CodeSize, CodeAlign, RODataSize, RODataAlign, + RWDataSize, RWDataAlign); + MemMgr.reserveAllocationSpace(CodeSize, CodeAlign, RODataSize, RODataAlign, + RWDataSize, RWDataAlign); } // Used sections from the object file @@ -155,39 +166,56 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { ++I) { uint32_t Flags = I->getFlags(); - bool IsCommon = Flags & SymbolRef::SF_Common; - if (IsCommon) + if (Flags & SymbolRef::SF_Common) CommonSymbols.push_back(*I); else { object::SymbolRef::Type SymType = I->getType(); - if (SymType == object::SymbolRef::ST_Function || - SymType == object::SymbolRef::ST_Data || - SymType == object::SymbolRef::ST_Unknown) { - - ErrorOr<StringRef> NameOrErr = I->getName(); - Check(NameOrErr.getError()); - StringRef Name = *NameOrErr; - section_iterator SI = Obj.section_end(); - Check(I->getSection(SI)); + // Get symbol name. + ErrorOr<StringRef> NameOrErr = I->getName(); + Check(NameOrErr.getError()); + StringRef Name = *NameOrErr; + + // Compute JIT symbol flags. + JITSymbolFlags RTDyldSymFlags = JITSymbolFlags::None; + if (Flags & SymbolRef::SF_Weak) + RTDyldSymFlags |= JITSymbolFlags::Weak; + if (Flags & SymbolRef::SF_Exported) + RTDyldSymFlags |= JITSymbolFlags::Exported; + + if (Flags & SymbolRef::SF_Absolute && + SymType != object::SymbolRef::ST_File) { + auto Addr = I->getAddress(); + Check(Addr.getError()); + uint64_t SectOffset = *Addr; + unsigned SectionID = AbsoluteSymbolSection; + + DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name + << " SID: " << SectionID << " Offset: " + << format("%p", (uintptr_t)SectOffset) + << " flags: " << Flags << "\n"); + GlobalSymbolTable[Name] = + SymbolTableEntry(SectionID, SectOffset, RTDyldSymFlags); + } else if (SymType == object::SymbolRef::ST_Function || + SymType == object::SymbolRef::ST_Data || + SymType == object::SymbolRef::ST_Unknown || + SymType == object::SymbolRef::ST_Other) { + + ErrorOr<section_iterator> SIOrErr = I->getSection(); + Check(SIOrErr.getError()); + section_iterator SI = *SIOrErr; if (SI == Obj.section_end()) continue; + // Get symbol offset. 
uint64_t SectOffset; Check(getOffset(*I, *SI, SectOffset)); - StringRef SectionData; - Check(SI->getContents(SectionData)); bool IsCode = SI->isText(); - unsigned SectionID = - findOrEmitSection(Obj, *SI, IsCode, LocalSections); + unsigned SectionID = findOrEmitSection(Obj, *SI, IsCode, LocalSections); + DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << " SID: " << SectionID << " Offset: " << format("%p", (uintptr_t)SectOffset) << " flags: " << Flags << "\n"); - JITSymbolFlags RTDyldSymFlags = JITSymbolFlags::None; - if (Flags & SymbolRef::SF_Weak) - RTDyldSymFlags |= JITSymbolFlags::Weak; - if (Flags & SymbolRef::SF_Exported) - RTDyldSymFlags |= JITSymbolFlags::Exported; GlobalSymbolTable[Name] = SymbolTableEntry(SectionID, SectOffset, RTDyldSymFlags); } @@ -231,9 +259,10 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { // Give the subclasses a chance to tie-up any loose ends. finalizeLoad(Obj, LocalSections); - unsigned SectionsAddedEndIdx = Sections.size(); +// for (auto E : LocalSections) +// llvm::dbgs() << "Added: " << E.first.getRawDataRefImpl() << " -> " << E.second << "\n"; - return std::make_pair(SectionsAddedBeginIdx, SectionsAddedEndIdx); + return LocalSections; } // A helper method for computeTotalAllocSize. @@ -309,13 +338,15 @@ static bool isZeroInit(const SectionRef Section) { // sections void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, uint64_t &CodeSize, - uint64_t &DataSizeRO, - uint64_t &DataSizeRW) { + uint32_t &CodeAlign, + uint64_t &RODataSize, + uint32_t &RODataAlign, + uint64_t &RWDataSize, + uint32_t &RWDataAlign) { // Compute the size of all sections required for execution std::vector<uint64_t> CodeSectionSizes; std::vector<uint64_t> ROSectionSizes; std::vector<uint64_t> RWSectionSizes; - uint64_t MaxAlignment = sizeof(void *); // Collect sizes of all sections to be loaded; // also determine the max alignment of all sections @@ -350,17 +381,15 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, SectionSize = 1; if (IsCode) { + CodeAlign = std::max(CodeAlign, Alignment); CodeSectionSizes.push_back(SectionSize); } else if (IsReadOnly) { + RODataAlign = std::max(RODataAlign, Alignment); ROSectionSizes.push_back(SectionSize); } else { + RWDataAlign = std::max(RWDataAlign, Alignment); RWSectionSizes.push_back(SectionSize); } - - // update the max alignment - if (Alignment > MaxAlignment) { - MaxAlignment = Alignment; - } } } @@ -384,9 +413,9 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, // allocated with the max alignment. Note that we cannot compute with the // individual alignments of the sections, because then the required size // depends on the order, in which the sections are allocated. 
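
The replacement lines below drop the shared MaxAlignment in favour of per-kind alignments (CodeAlign, RODataAlign, RWDataAlign), so each of the three reservations is padded only to the strictest alignment that actually occurs in that kind of section. Padding every section to the kind's alignment yields a bound that holds regardless of allocation order, which is the property the comment above asks for. A small sketch of that arithmetic (the helper name is illustrative, not the LLVM function):

#include <cstdint>
#include <vector>

// Worst-case reservation: pad every section to Alignment (assumed a
// power of two), so the total suffices no matter how the memory
// manager orders the sections.
static uint64_t allocSizeForSections(const std::vector<uint64_t> &Sizes,
                                     uint64_t Alignment) {
  uint64_t Total = 0;
  for (uint64_t Size : Sizes)
    Total += (Size + Alignment - 1) & ~(Alignment - 1); // align up
  return Total;
}

// Example: sizes {10, 100} at alignment 16 reserve 16 + 112 = 128 bytes;
// at the old global MaxAlignment of, say, 64 they would reserve 64 + 128.
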
- CodeSize = computeAllocationSizeForSections(CodeSectionSizes, MaxAlignment); - DataSizeRO = computeAllocationSizeForSections(ROSectionSizes, MaxAlignment); - DataSizeRW = computeAllocationSizeForSections(RWSectionSizes, MaxAlignment); + CodeSize = computeAllocationSizeForSections(CodeSectionSizes, CodeAlign); + RODataSize = computeAllocationSizeForSections(ROSectionSizes, RODataAlign); + RWDataSize = computeAllocationSizeForSections(RWSectionSizes, RWDataAlign); } // compute stub buffer size for the given section @@ -406,10 +435,9 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, if (!(RelSecI == Section)) continue; - for (const RelocationRef &Reloc : SI->relocations()) { - (void)Reloc; - StubBufSize += StubSize; - } + for (const RelocationRef &Reloc : SI->relocations()) + if (relocationNeedsStub(Reloc)) + StubBufSize += StubSize; } // Get section data size and alignment @@ -492,7 +520,8 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, if (!Addr) report_fatal_error("Unable to allocate memory for common symbols!"); uint64_t Offset = 0; - Sections.push_back(SectionEntry("<common symbols>", Addr, CommonSize, 0)); + Sections.push_back( + SectionEntry("<common symbols>", Addr, CommonSize, CommonSize, 0)); memset(Addr, 0, CommonSize); DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID << " new addr: " @@ -524,6 +553,9 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, Offset += Size; Addr += Size; } + + if (Checker) + Checker->registerSection(Obj.getFileName(), SectionID); } unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, @@ -556,12 +588,20 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, uint8_t *Addr; const char *pData = nullptr; - // In either case, set the location of the unrelocated section in memory, - // since we still process relocations for it even if we're not applying them. - Check(Section.getContents(data)); - // Virtual sections have no data in the object image, so leave pData = 0 - if (!IsVirtual) + // If this section contains any bits (i.e. isn't a virtual or bss section), + // grab a reference to them. + if (!IsVirtual && !IsZeroInit) { + // In either case, set the location of the unrelocated section in memory, + // since we still process relocations for it even if we're not applying them. + Check(Section.getContents(data)); pData = data.data(); + } + + // Code section alignment needs to be at least as high as stub alignment or + // padding calculations may be incorrect when the section is remapped to a + // higher alignment. + if (IsCode) + Alignment = std::max(Alignment, getStubAlignment()); // Some sections, such as debug info, don't need to be loaded for execution. // Leave those where they are. @@ -606,7 +646,8 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, << " Allocate: " << Allocate << "\n"); } - Sections.push_back(SectionEntry(Name, Addr, DataSize, (uintptr_t)pData)); + Sections.push_back( + SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData)); if (Checker) Checker->registerSection(Obj.getFileName(), SectionID); @@ -742,11 +783,11 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID, // Addr is a uint64_t because we can't assume the pointer width // of the target is the same as that of the host. Just use a generic // "big enough" type. 
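
Throughout the hunks that follow, the raw SectionEntry fields (Address, LoadAddress, Size) give way to accessors, and the pair getAddressWithOffset / getLoadAddressWithOffset makes the two address spaces explicit: the host buffer the JIT writes into versus the target address the code will run at, which reassignSectionAddress may move before relocations are resolved. An illustrative reduction of that pairing (this struct is a sketch, not the real SectionEntry):

#include <cstdint>

struct Section {
  uint8_t *Address;     // host-local buffer holding the section contents
  uint64_t LoadAddress; // address the target will execute them at

  uint8_t *getAddressWithOffset(uint64_t Offset) const {
    return Address + Offset;
  }
  uint64_t getLoadAddressWithOffset(uint64_t Offset) const {
    return LoadAddress + Offset;
  }
};

// A PC-relative fixup computes its delta in the target's address space,
//   Delta = Value + Addend - S.getLoadAddressWithOffset(Offset),
// but performs the actual write through S.getAddressWithOffset(Offset).
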
- DEBUG(dbgs() << "Reassigning address for section " - << SectionID << " (" << Sections[SectionID].Name << "): " - << format("0x%016" PRIx64, Sections[SectionID].LoadAddress) << " -> " - << format("0x%016" PRIx64, Addr) << "\n"); - Sections[SectionID].LoadAddress = Addr; + DEBUG(dbgs() << "Reassigning address for section " << SectionID << " (" + << Sections[SectionID].getName() << "): " + << format("0x%016" PRIx64, Sections[SectionID].getLoadAddress()) + << " -> " << format("0x%016" PRIx64, Addr) << "\n"); + Sections[SectionID].setLoadAddress(Addr); } void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, @@ -754,7 +795,7 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { const RelocationEntry &RE = Relocs[i]; // Ignore relocations for sections that were not loaded - if (Sections[RE.SectionID].Address == nullptr) + if (Sections[RE.SectionID].getAddress() == nullptr) continue; resolveRelocation(RE, Value); } @@ -818,10 +859,11 @@ void RuntimeDyldImpl::resolveExternalSymbols() { // RuntimeDyld class implementation uint64_t RuntimeDyld::LoadedObjectInfo::getSectionLoadAddress( - StringRef SectionName) const { - for (unsigned I = BeginIdx; I != EndIdx; ++I) - if (RTDyld.Sections[I].Name == SectionName) - return RTDyld.Sections[I].LoadAddress; + const object::SectionRef &Sec) const { + + auto I = ObjSecToIDMap.find(Sec); + if (I != ObjSecToIDMap.end()) + return RTDyld.Sections[I->second].getLoadAddress(); return 0; } @@ -898,7 +940,9 @@ RuntimeDyld::loadObject(const ObjectFile &Obj) { if (!Dyld->isCompatibleFile(Obj)) report_fatal_error("Incompatible object format!"); - return Dyld->loadObject(Obj); + auto LoadedObjInfo = Dyld->loadObject(Obj); + MemMgr.notifyObjectLoaded(*this, Obj); + return LoadedObjInfo; } void *RuntimeDyld::getSymbolLocalAddress(StringRef Name) const { @@ -928,6 +972,17 @@ bool RuntimeDyld::hasError() { return Dyld->hasError(); } StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); } +void RuntimeDyld::finalizeWithMemoryManagerLocking() { + bool MemoryFinalizationLocked = MemMgr.FinalizationLocked; + MemMgr.FinalizationLocked = true; + resolveRelocations(); + registerEHFrames(); + if (!MemoryFinalizationLocked) { + MemMgr.finalizeMemory(); + MemMgr.FinalizationLocked = false; + } +} + void RuntimeDyld::registerEHFrames() { if (Dyld) Dyld->registerEHFrames(); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index 1dacc13..e5fab92 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "RuntimeDyldCOFF.h" +#include "Targets/RuntimeDyldCOFFI386.h" #include "Targets/RuntimeDyldCOFFX86_64.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" @@ -24,12 +25,11 @@ using namespace llvm::object; namespace { -class LoadedCOFFObjectInfo +class LoadedCOFFObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedCOFFObjectInfo> { public: - LoadedCOFFObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedCOFFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> 
getObjectForDebug(const ObjectFile &Obj) const override { @@ -48,6 +48,8 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch, default: llvm_unreachable("Unsupported target for RuntimeDyldCOFF."); break; + case Triple::x86: + return make_unique<RuntimeDyldCOFFI386>(MemMgr, Resolver); case Triple::x86_64: return make_unique<RuntimeDyldCOFFX86_64>(MemMgr, Resolver); } @@ -55,10 +57,7 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch, std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedCOFFObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedCOFFObjectInfo>(*this, loadObjectImpl(O)); } uint64_t RuntimeDyldCOFF::getSymbolOffset(const SymbolRef &Sym) { diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index ae199b7..58ce88a 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -727,7 +727,7 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix, } bool RuntimeDyldCheckerImpl::isSymbolValid(StringRef Symbol) const { - if (getRTDyld().getSymbolLocalAddress(Symbol)) + if (getRTDyld().getSymbol(Symbol)) return true; return !!getRTDyld().Resolver.findSymbol(Symbol); } @@ -799,11 +799,10 @@ std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getSectionAddr( unsigned SectionID = SectionInfo->SectionID; uint64_t Addr; if (IsInsideLoad) - Addr = - static_cast<uint64_t>( - reinterpret_cast<uintptr_t>(getRTDyld().Sections[SectionID].Address)); + Addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>( + getRTDyld().Sections[SectionID].getAddress())); else - Addr = getRTDyld().Sections[SectionID].LoadAddress; + Addr = getRTDyld().Sections[SectionID].getLoadAddress(); return std::make_pair(Addr, std::string("")); } @@ -835,11 +834,11 @@ std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getStubAddrFor( uint64_t Addr; if (IsInsideLoad) { - uintptr_t SectionBase = - reinterpret_cast<uintptr_t>(getRTDyld().Sections[SectionID].Address); + uintptr_t SectionBase = reinterpret_cast<uintptr_t>( + getRTDyld().Sections[SectionID].getAddress()); Addr = static_cast<uint64_t>(SectionBase) + StubOffset; } else { - uint64_t SectionBase = getRTDyld().Sections[SectionID].LoadAddress; + uint64_t SectionBase = getRTDyld().Sections[SectionID].getLoadAddress(); Addr = SectionBase + StubOffset; } @@ -855,16 +854,16 @@ RuntimeDyldCheckerImpl::getSubsectionStartingAt(StringRef Name) const { const auto &SymInfo = pos->second; uint8_t *SectionAddr = getRTDyld().getSectionAddress(SymInfo.getSectionID()); return StringRef(reinterpret_cast<const char *>(SectionAddr) + - SymInfo.getOffset(), - getRTDyld().Sections[SymInfo.getSectionID()].Size - - SymInfo.getOffset()); + SymInfo.getOffset(), + getRTDyld().Sections[SymInfo.getSectionID()].getSize() - + SymInfo.getOffset()); } void RuntimeDyldCheckerImpl::registerSection( StringRef FilePath, unsigned SectionID) { StringRef FileName = sys::path::filename(FilePath); const SectionEntry &Section = getRTDyld().Sections[SectionID]; - StringRef SectionName = Section.Name; + StringRef SectionName = Section.getName(); Stubs[FileName][SectionName].SectionID = SectionID; } @@ -874,7 +873,7 @@ void RuntimeDyldCheckerImpl::registerStubMap( 
const RuntimeDyldImpl::StubMap &RTDyldStubs) { StringRef FileName = sys::path::filename(FilePath); const SectionEntry &Section = getRTDyld().Sections[SectionID]; - StringRef SectionName = Section.Name; + StringRef SectionName = Section.getName(); Stubs[FileName][SectionName].SectionID = SectionID; diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 3787950..e09b71a 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -66,7 +66,6 @@ public: static inline bool classof(const ELFObjectFile<ELFT> *v) { return v->isDyldType(); } - }; @@ -104,12 +103,11 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef, sym->st_value = static_cast<addr_type>(Addr); } -class LoadedELFObjectInfo +class LoadedELFObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedELFObjectInfo> { public: - LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> getObjectForDebug(const ObjectFile &Obj) const override; @@ -118,6 +116,7 @@ public: template <typename ELFT> std::unique_ptr<DyldELFObject<ELFT>> createRTDyldELFObject(MemoryBufferRef Buffer, + const ObjectFile &SourceObject, const LoadedELFObjectInfo &L, std::error_code &ec) { typedef typename ELFFile<ELFT>::Elf_Shdr Elf_Shdr; @@ -127,6 +126,7 @@ createRTDyldELFObject(MemoryBufferRef Buffer, llvm::make_unique<DyldELFObject<ELFT>>(Buffer, ec); // Iterate over all sections in the object. + auto SI = SourceObject.section_begin(); for (const auto &Sec : Obj->sections()) { StringRef SectionName; Sec.getName(SectionName); @@ -135,12 +135,13 @@ createRTDyldELFObject(MemoryBufferRef Buffer, Elf_Shdr *shdr = const_cast<Elf_Shdr *>( reinterpret_cast<const Elf_Shdr *>(ShdrRef.p)); - if (uint64_t SecLoadAddr = L.getSectionLoadAddress(SectionName)) { + if (uint64_t SecLoadAddr = L.getSectionLoadAddress(*SI)) { // This assumes that the address passed in matches the target address // bitness. The template-based type cast handles everything else. 
shdr->sh_addr = static_cast<addr_type>(SecLoadAddr); } } + ++SI; } return Obj; @@ -158,16 +159,20 @@ OwningBinary<ObjectFile> createELFDebugObject(const ObjectFile &Obj, std::unique_ptr<ObjectFile> DebugObj; if (Obj.getBytesInAddress() == 4 && Obj.isLittleEndian()) { typedef ELFType<support::little, false> ELF32LE; - DebugObj = createRTDyldELFObject<ELF32LE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF32LE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 4 && !Obj.isLittleEndian()) { typedef ELFType<support::big, false> ELF32BE; - DebugObj = createRTDyldELFObject<ELF32BE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF32BE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 8 && !Obj.isLittleEndian()) { typedef ELFType<support::big, true> ELF64BE; - DebugObj = createRTDyldELFObject<ELF64BE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF64BE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 8 && Obj.isLittleEndian()) { typedef ELFType<support::little, true> ELF64LE; - DebugObj = createRTDyldELFObject<ELF64LE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF64LE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else llvm_unreachable("Unexpected ELF format"); @@ -181,7 +186,7 @@ LoadedELFObjectInfo::getObjectForDebug(const ObjectFile &Obj) const { return createELFDebugObject(Obj, *this); } -} // namespace +} // anonymous namespace namespace llvm { @@ -193,9 +198,9 @@ RuntimeDyldELF::~RuntimeDyldELF() {} void RuntimeDyldELF::registerEHFrames() { for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) { SID EHFrameSID = UnregisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); RegisteredEHFrameSections.push_back(EHFrameSID); } @@ -205,9 +210,9 @@ void RuntimeDyldELF::registerEHFrames() { void RuntimeDyldELF::deregisterEHFrames() { for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) { SID EHFrameSID = RegisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); } RegisteredEHFrameSections.clear(); @@ -215,10 +220,7 @@ void RuntimeDyldELF::deregisterEHFrames() { std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldELF::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedELFObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedELFObjectInfo>(*this, loadObjectImpl(O)); } void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, @@ -230,9 +232,10 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, 
llvm_unreachable("Relocation type not implemented yet!"); break; case ELF::R_X86_64_64: { - support::ulittle64_t::ref(Section.Address + Offset) = Value + Addend; + support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = + Value + Addend; DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at " - << format("%p\n", Section.Address + Offset)); + << format("%p\n", Section.getAddressWithOffset(Offset))); break; } case ELF::R_X86_64_32: @@ -242,23 +245,34 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, (Type == ELF::R_X86_64_32S && ((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN))); uint32_t TruncatedAddr = (Value & 0xFFFFFFFF); - support::ulittle32_t::ref(Section.Address + Offset) = TruncatedAddr; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + TruncatedAddr; DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at " - << format("%p\n", Section.Address + Offset)); + << format("%p\n", Section.getAddressWithOffset(Offset))); + break; + } + case ELF::R_X86_64_PC8: { + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + int64_t RealOffset = Value + Addend - FinalAddress; + assert(isInt<8>(RealOffset)); + int8_t TruncOffset = (RealOffset & 0xFF); + Section.getAddress()[Offset] = TruncOffset; break; } case ELF::R_X86_64_PC32: { - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int64_t RealOffset = Value + Addend - FinalAddress; assert(isInt<32>(RealOffset)); int32_t TruncOffset = (RealOffset & 0xFFFFFFFF); - support::ulittle32_t::ref(Section.Address + Offset) = TruncOffset; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + TruncOffset; break; } case ELF::R_X86_64_PC64: { - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int64_t RealOffset = Value + Addend - FinalAddress; - support::ulittle64_t::ref(Section.Address + Offset) = RealOffset; + support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = + RealOffset; break; } } @@ -269,13 +283,16 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, uint32_t Type, int32_t Addend) { switch (Type) { case ELF::R_386_32: { - support::ulittle32_t::ref(Section.Address + Offset) = Value + Addend; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + Value + Addend; break; } case ELF::R_386_PC32: { - uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF); + uint32_t FinalAddress = + Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF; uint32_t RealOffset = Value + Addend - FinalAddress; - support::ulittle32_t::ref(Section.Address + Offset) = RealOffset; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + RealOffset; break; } default: @@ -289,11 +306,12 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint32_t *TargetPtr = reinterpret_cast<uint32_t *>(Section.Address + Offset); - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint32_t *TargetPtr = + reinterpret_cast<uint32_t *>(Section.getAddressWithOffset(Offset)); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); DEBUG(dbgs() << "resolveAArch64Relocation, LocalAddress: 0x" - << format("%llx", Section.Address + Offset) + << format("%llx", 
Section.getAddressWithOffset(Offset)) << " FinalAddress: 0x" << format("%llx", FinalAddress) << " Value: 0x" << format("%llx", Value) << " Type: 0x" << format("%x", Type) << " Addend: 0x" << format("%llx", Addend) @@ -305,7 +323,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, break; case ELF::R_AARCH64_ABS64: { uint64_t *TargetPtr = - reinterpret_cast<uint64_t *>(Section.Address + Offset); + reinterpret_cast<uint64_t *>(Section.getAddressWithOffset(Offset)); *TargetPtr = Value + Addend; break; } @@ -428,12 +446,13 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend) { // TODO: Add Thumb relocations. - uint32_t *TargetPtr = (uint32_t *)(Section.Address + Offset); - uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF); + uint32_t *TargetPtr = + reinterpret_cast<uint32_t *>(Section.getAddressWithOffset(Offset)); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF; Value += Addend; DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: " - << Section.Address + Offset + << Section.getAddressWithOffset(Offset) << " FinalAddress: " << format("%p", FinalAddress) << " Value: " << format("%x", Value) << " Type: " << format("%x", Type) << " Addend: " << format("%x", Addend) << "\n"); @@ -477,13 +496,14 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section, void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend) { - uint8_t *TargetPtr = Section.Address + Offset; + uint8_t *TargetPtr = Section.getAddressWithOffset(Offset); Value += Addend; DEBUG(dbgs() << "resolveMIPSRelocation, LocalAddress: " - << Section.Address + Offset << " FinalAddress: " - << format("%p", Section.LoadAddress + Offset) << " Value: " - << format("%x", Value) << " Type: " << format("%x", Type) + << Section.getAddressWithOffset(Offset) << " FinalAddress: " + << format("%p", Section.getLoadAddressWithOffset(Offset)) + << " Value: " << format("%x", Value) + << " Type: " << format("%x", Type) << " Addend: " << format("%x", Addend) << "\n"); uint32_t Insn = readBytesUnaligned(TargetPtr, 4); @@ -512,47 +532,47 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section, writeBytesUnaligned(Insn, TargetPtr, 4); break; case ELF::R_MIPS_PC32: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); writeBytesUnaligned(Value - FinalAddress, (uint8_t *)TargetPtr, 4); break; } case ELF::R_MIPS_PC16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= ((Value - FinalAddress) >> 2) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC19_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xfff80000; Insn |= ((Value - (FinalAddress & ~0x3)) >> 2) & 0x7ffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC21_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffe00000; Insn |= ((Value - FinalAddress) >> 2) & 0x1fffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC26_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = 
Section.getLoadAddressWithOffset(Offset); Insn &= 0xfc000000; Insn |= ((Value - FinalAddress) >> 2) & 0x3ffffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PCHI16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= ((Value - FinalAddress + 0x8000) >> 16) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PCLO16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= (Value - FinalAddress) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); @@ -603,7 +623,8 @@ void RuntimeDyldELF::resolveMIPS64Relocation(const SectionEntry &Section, CalculatedValue, SymOffset, SectionID); } - applyMIPS64Relocation(Section.Address + Offset, CalculatedValue, RelType); + applyMIPS64Relocation(Section.getAddressWithOffset(Offset), CalculatedValue, + RelType); } int64_t @@ -613,13 +634,12 @@ RuntimeDyldELF::evaluateMIPS64Relocation(const SectionEntry &Section, uint64_t SymOffset, SID SectionID) { DEBUG(dbgs() << "evaluateMIPS64Relocation, LocalAddress: 0x" - << format("%llx", Section.Address + Offset) + << format("%llx", Section.getAddressWithOffset(Offset)) << " FinalAddress: 0x" - << format("%llx", Section.LoadAddress + Offset) + << format("%llx", Section.getLoadAddressWithOffset(Offset)) << " Value: 0x" << format("%llx", Value) << " Type: 0x" << format("%x", Type) << " Addend: 0x" << format("%llx", Addend) - << " SymOffset: " << format("%x", SymOffset) - << "\n"); + << " SymOffset: " << format("%x", SymOffset) << "\n"); switch (Type) { default: @@ -672,35 +692,35 @@ RuntimeDyldELF::evaluateMIPS64Relocation(const SectionEntry &Section, return Value + Addend - (GOTAddr + 0x7ff0); } case ELF::R_MIPS_PC16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0xffff; } case ELF::R_MIPS_PC32: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return Value + Addend - FinalAddress; } case ELF::R_MIPS_PC18_S3: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); - return ((Value + Addend - ((FinalAddress | 7) ^ 7)) >> 3) & 0x3ffff; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + return ((Value + Addend - (FinalAddress & ~0x7)) >> 3) & 0x3ffff; } case ELF::R_MIPS_PC19_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); - return ((Value + Addend - FinalAddress) >> 2) & 0x7ffff; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + return ((Value + Addend - (FinalAddress & ~0x3)) >> 2) & 0x7ffff; } case ELF::R_MIPS_PC21_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0x1fffff; } case ELF::R_MIPS_PC26_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0x3ffffff; } case ELF::R_MIPS_PCHI16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress + 0x8000) >> 16) & 0xffff; } case ELF::R_MIPS_PCLO16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); 
+ uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return (Value + Addend - FinalAddress) & 0xffff; } } @@ -769,7 +789,7 @@ void RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj, // relocation) without a .toc directive. In this case just use the // first section (which is usually the .odp) since the code won't // reference the .toc base directly. - Rel.SymbolName = NULL; + Rel.SymbolName = nullptr; Rel.SectionID = 0; // The TOC consists of sections .got, .toc, .tocbss, .plt in that @@ -842,8 +862,9 @@ void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj, if (Rel.Addend != (int64_t)TargetSymbolOffset) continue; - section_iterator tsi(Obj.section_end()); - check(TargetSymbol->getSection(tsi)); + ErrorOr<section_iterator> TSIOrErr = TargetSymbol->getSection(); + check(TSIOrErr.getError()); + section_iterator tsi = *TSIOrErr; bool IsCode = tsi->isText(); Rel.SectionID = findOrEmitSection(Obj, (*tsi), IsCode, LocalSections); Rel.Addend = (intptr_t)Addend; @@ -884,10 +905,30 @@ static inline uint16_t applyPPChighesta (uint64_t value) { return ((value + 0x8000) >> 48) & 0xffff; } +void RuntimeDyldELF::resolvePPC32Relocation(const SectionEntry &Section, + uint64_t Offset, uint64_t Value, + uint32_t Type, int64_t Addend) { + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); + switch (Type) { + default: + llvm_unreachable("Relocation type not implemented yet!"); + break; + case ELF::R_PPC_ADDR16_LO: + writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); + break; + case ELF::R_PPC_ADDR16_HI: + writeInt16BE(LocalAddress, applyPPChi(Value + Addend)); + break; + case ELF::R_PPC_ADDR16_HA: + writeInt16BE(LocalAddress, applyPPCha(Value + Addend)); + break; + } +} + void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: llvm_unreachable("Relocation type not implemented yet!"); @@ -929,17 +970,17 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc)); } break; case ELF::R_PPC64_REL16_LO: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPClo(Delta)); } break; case ELF::R_PPC64_REL16_HI: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPChi(Delta)); } break; case ELF::R_PPC64_REL16_HA: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPCha(Delta)); } break; @@ -950,22 +991,22 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt32BE(LocalAddress, Result); } break; case ELF::R_PPC64_REL24: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); - if (SignExtend32<24>(delta) != delta) + if (SignExtend32<26>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL24 overflow"); // Generates a 'bl 
<address>' instruction writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); } break; case ELF::R_PPC64_REL32: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); if (SignExtend32<32>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL32 overflow"); writeInt32BE(LocalAddress, delta); } break; case ELF::R_PPC64_REL64: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt64BE(LocalAddress, Delta); } break; @@ -978,27 +1019,27 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: llvm_unreachable("Relocation type not implemented yet!"); break; case ELF::R_390_PC16DBL: case ELF::R_390_PLT16DBL: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int16_t(Delta / 2) * 2 == Delta && "R_390_PC16DBL overflow"); writeInt16BE(LocalAddress, Delta / 2); break; } case ELF::R_390_PC32DBL: case ELF::R_390_PLT32DBL: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta / 2) * 2 == Delta && "R_390_PC32DBL overflow"); writeInt32BE(LocalAddress, Delta / 2); break; } case ELF::R_390_PC32: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta) == Delta && "R_390_PC32 overflow"); writeInt32BE(LocalAddress, Delta); break; @@ -1072,6 +1113,9 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section, else llvm_unreachable("Mips ABI not handled"); break; + case Triple::ppc: + resolvePPC32Relocation(Section, Offset, Value, Type, Addend); + break; case Triple::ppc64: // Fall through. 
case Triple::ppc64le: resolvePPC64Relocation(Section, Offset, Value, Type, Addend); @@ -1085,7 +1129,7 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section, } void *RuntimeDyldELF::computePlaceholderAddress(unsigned SectionID, uint64_t Offset) const { - return (void*)(Sections[SectionID].ObjAddress + Offset); + return (void *)(Sections[SectionID].getObjAddress() + Offset); } void RuntimeDyldELF::processSimpleRelocation(unsigned SectionID, uint64_t Offset, unsigned RelType, RelocationValueRef Value) { @@ -1096,6 +1140,29 @@ void RuntimeDyldELF::processSimpleRelocation(unsigned SectionID, uint64_t Offset addRelocationForSection(RE, Value.SectionID); } +uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType, + bool IsLocal) const { + switch (RelType) { + case ELF::R_MICROMIPS_GOT16: + if (IsLocal) + return ELF::R_MICROMIPS_LO16; + break; + case ELF::R_MICROMIPS_HI16: + return ELF::R_MICROMIPS_LO16; + case ELF::R_MIPS_GOT16: + if (IsLocal) + return ELF::R_MIPS_LO16; + break; + case ELF::R_MIPS_HI16: + return ELF::R_MIPS_LO16; + case ELF::R_MIPS_PCHI16: + return ELF::R_MIPS_PCLO16; + default: + break; + } + return ELF::R_MIPS_NONE; +} + relocation_iterator RuntimeDyldELF::processRelocationRef( unsigned SectionID, relocation_iterator RelI, const ObjectFile &O, ObjSectionToIDMap &ObjSectionToID, StubMap &Stubs) { @@ -1136,8 +1203,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously // and can be changed by another developers. Maybe best way is add // a new symbol type ST_Section to SymbolRef and use it. - section_iterator si(Obj.section_end()); - Symbol->getSection(si); + section_iterator si = *Symbol->getSection(); if (si == Obj.section_end()) llvm_unreachable("Symbol section not found, bad object file format!"); DEBUG(dbgs() << "\t\tThis is section symbol\n"); @@ -1178,24 +1244,28 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation(Section, Offset, (uint64_t)Section.Address + i->second, + resolveRelocation(Section, Offset, + (uint64_t)Section.getAddressWithOffset(i->second), RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. 
DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); - RelocationEntry REmovz_g3(SectionID, StubTargetAddr - Section.Address, + RelocationEntry REmovz_g3(SectionID, + StubTargetAddr - Section.getAddress(), ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend); - RelocationEntry REmovk_g2(SectionID, StubTargetAddr - Section.Address + 4, + RelocationEntry REmovk_g2(SectionID, StubTargetAddr - + Section.getAddress() + 4, ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend); - RelocationEntry REmovk_g1(SectionID, StubTargetAddr - Section.Address + 8, + RelocationEntry REmovk_g1(SectionID, StubTargetAddr - + Section.getAddress() + 8, ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend); - RelocationEntry REmovk_g0(SectionID, - StubTargetAddr - Section.Address + 12, + RelocationEntry REmovk_g0(SectionID, StubTargetAddr - + Section.getAddress() + 12, ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend); if (Value.SymbolName) { @@ -1210,9 +1280,10 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(REmovk_g0, Value.SectionID); } resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, RelType, - 0); - Section.StubOffset += getMaxStubSize(); + reinterpret_cast<uint64_t>(Section.getAddressWithOffset( + Section.getStubOffset())), + RelType, 0); + Section.advanceStubOffset(getMaxStubSize()); } } else if (Arch == Triple::arm) { if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL || @@ -1224,26 +1295,29 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation(Section, Offset, (uint64_t)Section.Address + i->second, - RelType, 0); + resolveRelocation( + Section, Offset, + reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)), + RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, - ELF::R_ARM_ABS32, Value.Addend); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); + RelocationEntry RE(SectionID, StubTargetAddr - Section.getAddress(), + ELF::R_ARM_ABS32, Value.Addend); if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); else addRelocationForSection(RE, Value.SectionID); - resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, RelType, - 0); - Section.StubOffset += getMaxStubSize(); + resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( + Section.getAddressWithOffset( + Section.getStubOffset())), + RelType, 0); + Section.advanceStubOffset(getMaxStubSize()); } } else { uint32_t *Placeholder = @@ -1282,15 +1356,16 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( } else { // Create a new stub function. 
DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); // Creating Hi and Lo relocations for the filled stub instructions. - RelocationEntry REHi(SectionID, StubTargetAddr - Section.Address, - ELF::R_MIPS_HI16, Value.Addend); - RelocationEntry RELo(SectionID, StubTargetAddr - Section.Address + 4, - ELF::R_MIPS_LO16, Value.Addend); + RelocationEntry REHi(SectionID, StubTargetAddr - Section.getAddress(), + ELF::R_MIPS_HI16, Value.Addend); + RelocationEntry RELo(SectionID, + StubTargetAddr - Section.getAddress() + 4, + ELF::R_MIPS_LO16, Value.Addend); if (Value.SymbolName) { addRelocationForSymbol(REHi, Value.SymbolName); @@ -1301,21 +1376,39 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(RELo, Value.SectionID); } - RelocationEntry RE(SectionID, Offset, RelType, Section.StubOffset); + RelocationEntry RE(SectionID, Offset, RelType, Section.getStubOffset()); addRelocationForSection(RE, SectionID); - Section.StubOffset += getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } + } else if (RelType == ELF::R_MIPS_HI16 || RelType == ELF::R_MIPS_PCHI16) { + int64_t Addend = (Opcode & 0x0000ffff) << 16; + RelocationEntry RE(SectionID, Offset, RelType, Addend); + PendingRelocs.push_back(std::make_pair(Value, RE)); + } else if (RelType == ELF::R_MIPS_LO16 || RelType == ELF::R_MIPS_PCLO16) { + int64_t Addend = Value.Addend + SignExtend32<16>(Opcode & 0x0000ffff); + for (auto I = PendingRelocs.begin(); I != PendingRelocs.end();) { + const RelocationValueRef &MatchingValue = I->first; + RelocationEntry &Reloc = I->second; + if (MatchingValue == Value && + RelType == getMatchingLoRelocation(Reloc.RelType) && + SectionID == Reloc.SectionID) { + Reloc.Addend += Addend; + if (Value.SymbolName) + addRelocationForSymbol(Reloc, Value.SymbolName); + else + addRelocationForSection(Reloc, Value.SectionID); + I = PendingRelocs.erase(I); + } else + ++I; + } + RelocationEntry RE(SectionID, Offset, RelType, Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); } else { - // FIXME: Calculate correct addends for R_MIPS_HI16, R_MIPS_LO16, - // R_MIPS_PCHI16 and R_MIPS_PCLO16 relocations. - if (RelType == ELF::R_MIPS_HI16 || RelType == ELF::R_MIPS_PCHI16) - Value.Addend += (Opcode & 0x0000ffff) << 16; - else if (RelType == ELF::R_MIPS_LO16) - Value.Addend += (Opcode & 0x0000ffff); - else if (RelType == ELF::R_MIPS_32) + if (RelType == ELF::R_MIPS_32) Value.Addend += Opcode; - else if (RelType == ELF::R_MIPS_PCLO16) - Value.Addend += SignExtend32<16>((Opcode & 0x0000ffff)); else if (RelType == ELF::R_MIPS_PC16) Value.Addend += SignExtend32<18>((Opcode & 0x0000ffff) << 2); else if (RelType == ELF::R_MIPS_PC19_S2) @@ -1353,7 +1446,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // an external symbol (Symbol::ST_Unknown) or if the target address // is not within the signed 24-bits branch address. 
SectionEntry &Section = Sections[SectionID]; - uint8_t *Target = Section.Address + Offset; + uint8_t *Target = Section.getAddressWithOffset(Offset); bool RangeOverflow = false; if (SymType != SymbolRef::ST_Unknown) { if (AbiVariant != 2) { @@ -1367,10 +1460,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( uint8_t SymOther = Symbol->getOther(); Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther); } - uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend; + uint8_t *RelocTarget = + Sections[Value.SectionID].getAddressWithOffset(Value.Addend); int32_t delta = static_cast<int32_t>(Target - RelocTarget); - // If it is within 24-bits branch range, just set the branch target - if (SignExtend32<24>(delta) == delta) { + // If it is within 26-bits branch range, just set the branch target + if (SignExtend32<26>(delta) == delta) { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); @@ -1387,23 +1481,25 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it resolveRelocation(Section, Offset, - (uint64_t)Section.Address + i->second, RelType, 0); + reinterpret_cast<uint64_t>( + Section.getAddressWithOffset(i->second)), + RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset, - AbiVariant); - RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset()), + AbiVariant); + RelocationEntry RE(SectionID, StubTargetAddr - Section.getAddress(), ELF::R_PPC64_ADDR64, Value.Addend); // Generates the 64-bits address loads as exemplified in section // 4.5.1 in PPC64 ELF ABI. Note that the relocations need to // apply to the low part of the instructions, so we have to update // the offset according to the target endianness. - uint64_t StubRelocOffset = StubTargetAddr - Section.Address; + uint64_t StubRelocOffset = StubTargetAddr - Section.getAddress(); if (!IsTargetLittleEndian) StubRelocOffset += 2; @@ -1428,10 +1524,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(REl, Value.SectionID); } - resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, + resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( + Section.getAddressWithOffset( + Section.getStubOffset())), RelType, 0); - Section.StubOffset += getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } if (SymType == SymbolRef::ST_Unknown) { // Restore the TOC for external calls @@ -1450,11 +1547,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // These relocations are supposed to subtract the TOC address from // the final value. This does not fit cleanly into the RuntimeDyld // scheme, since there may be *two* sections involved in determining - // the relocation value (the section of the symbol refered to by the + // the relocation value (the section of the symbol referred to by the // relocation, and the TOC section associated with the current module). 
// // Fortunately, these relocations are currently only ever generated - // refering to symbols that themselves reside in the TOC, which means + // referring to symbols that themselves reside in the TOC, which means // that the two sections are actually the same. Thus they cancel out // and we can immediately resolve the relocation right now. switch (RelType) { @@ -1511,16 +1608,17 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( StubMap::const_iterator i = Stubs.find(Value); uintptr_t StubAddress; if (i != Stubs.end()) { - StubAddress = uintptr_t(Section.Address) + i->second; + StubAddress = uintptr_t(Section.getAddressWithOffset(i->second)); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - uintptr_t BaseAddress = uintptr_t(Section.Address); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); uintptr_t StubAlignment = getStubAlignment(); - StubAddress = (BaseAddress + Section.StubOffset + StubAlignment - 1) & - -StubAlignment; + StubAddress = + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & + -StubAlignment; unsigned StubOffset = StubAddress - BaseAddress; Stubs[Value] = StubOffset; @@ -1531,7 +1629,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSymbol(RE, Value.SymbolName); else addRelocationForSection(RE, Value.SectionID); - Section.StubOffset = StubOffset + getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } if (RelType == ELF::R_390_GOTENT) @@ -1564,37 +1662,39 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( StubMap::const_iterator i = Stubs.find(Value); uintptr_t StubAddress; if (i != Stubs.end()) { - StubAddress = uintptr_t(Section.Address) + i->second; - DEBUG(dbgs() << " Stub function found\n"); + StubAddress = uintptr_t(Section.getAddress()) + i->second; + DEBUG(dbgs() << " Stub function found\n"); } else { - // Create a new stub function (equivalent to a PLT entry). - DEBUG(dbgs() << " Create a new stub function\n"); + // Create a new stub function (equivalent to a PLT entry). 
+ DEBUG(dbgs() << " Create a new stub function\n"); - uintptr_t BaseAddress = uintptr_t(Section.Address); - uintptr_t StubAlignment = getStubAlignment(); - StubAddress = (BaseAddress + Section.StubOffset + StubAlignment - 1) & - -StubAlignment; - unsigned StubOffset = StubAddress - BaseAddress; - Stubs[Value] = StubOffset; - createStubFunction((uint8_t *)StubAddress); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); + uintptr_t StubAlignment = getStubAlignment(); + StubAddress = + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & + -StubAlignment; + unsigned StubOffset = StubAddress - BaseAddress; + Stubs[Value] = StubOffset; + createStubFunction((uint8_t *)StubAddress); - // Bump our stub offset counter - Section.StubOffset = StubOffset + getMaxStubSize(); + // Bump our stub offset counter + Section.advanceStubOffset(getMaxStubSize()); - // Allocate a GOT Entry - uint64_t GOTOffset = allocateGOTEntries(SectionID, 1); + // Allocate a GOT Entry + uint64_t GOTOffset = allocateGOTEntries(SectionID, 1); - // The load of the GOT address has an addend of -4 - resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4); + // The load of the GOT address has an addend of -4 + resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4); - // Fill in the value of the symbol we're targeting into the GOT - addRelocationForSymbol(computeGOTOffsetRE(SectionID,GOTOffset,0,ELF::R_X86_64_64), - Value.SymbolName); + // Fill in the value of the symbol we're targeting into the GOT + addRelocationForSymbol( + computeGOTOffsetRE(SectionID, GOTOffset, 0, ELF::R_X86_64_64), + Value.SymbolName); } // Make the target call a call into the stub table. resolveRelocation(Section, Offset, StubAddress, ELF::R_X86_64_PC32, - Addend); + Addend); } else { RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend, Value.Offset); @@ -1670,7 +1770,7 @@ uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no) GOTSectionID = Sections.size(); // Reserve a section id. 
We'll allocate the section later // once we know the total size - Sections.push_back(SectionEntry(".got", 0, 0, 0)); + Sections.push_back(SectionEntry(".got", nullptr, 0, 0, 0)); } uint64_t StartOffset = CurrentGOTIndex * getGOTEntrySize(); CurrentGOTIndex += no; return StartOffset; } @@ -1693,6 +1793,10 @@ RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) { + if (IsMipsO32ABI) + if (!PendingRelocs.empty()) + report_fatal_error("Can't find matching LO16 reloc"); + // If necessary, allocate the global offset table if (GOTSectionID != 0) { // Allocate memory for the section @@ -1702,7 +1806,8 @@ void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, if (!Addr) report_fatal_error("Unable to allocate memory for GOT!"); - Sections[GOTSectionID] = SectionEntry(".got", Addr, TotalSize, 0); + Sections[GOTSectionID] = + SectionEntry(".got", Addr, TotalSize, TotalSize, 0); if (Checker) Checker->registerSection(Obj.getFileName(), GOTSectionID); @@ -1746,4 +1851,23 @@ bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const { return Obj.isELF(); } +bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const { + if (Arch != Triple::x86_64) + return true; // Conservative answer + + switch (R.getType()) { + default: + return true; // Conservative answer + + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: + case ELF::R_X86_64_64: + // We know that these relocation types won't need a stub function. This list + // can be extended as needed. + return false; + } +} + } // namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 1a2552d..041811d 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -43,6 +43,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl { void resolveMIPSRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend); + void resolvePPC32Relocation(const SectionEntry &Section, uint64_t Offset, + uint64_t Value, uint32_t Type, int64_t Addend); + void resolvePPC64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend); @@ -120,6 +123,10 @@ class RuntimeDyldELF : public RuntimeDyldImpl { // no particular advanced processing. void processSimpleRelocation(unsigned SectionID, uint64_t Offset, unsigned RelType, RelocationValueRef Value); + // Return matching *LO16 relocation (Mips specific) + uint32_t getMatchingLoRelocation(uint32_t RelType, + bool IsLocal = false) const; + // The tentative ID for the GOT section unsigned GOTSectionID; @@ -135,12 +142,18 @@ class RuntimeDyldELF : public RuntimeDyldImpl { // A map to avoid duplicate got entries (Mips64 specific) StringMap<uint64_t> GOTSymbolOffsets; + // *HI16 relocations will be added for resolving when we find the matching + // *LO16 part. (Mips specific) + SmallVector<std::pair<RelocationValueRef, RelocationEntry>, 8> PendingRelocs; + // When a module is loaded we save the SectionID of the EH frame section // in a table until we receive a request to register all unregistered // EH frame sections with the memory manager.
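Stepping back to the x86-64 stub created in the RuntimeDyldELF.cpp hunk above: the stub is a single RIP-relative indirect jump through the freshly allocated GOT slot. Its 4-byte displacement field starts 2 bytes into the instruction (hence StubOffset + 2) and is measured from the instruction's end, which is what the GOTOffset - 4 addend compensates for. A rough sketch, illustrative only (emitX86_64GOTStub is hypothetical, not part of the patch):

#include <cstdint>

static void emitX86_64GOTStub(uint8_t *Addr) {
  Addr[0] = 0xff; // jmpq *disp32(%rip)
  Addr[1] = 0x25;
  // Bytes 2..5 hold disp32, counted from the end of this 6-byte
  // instruction; a GOT slot D bytes past the field is stored as D - 4.
  Addr[2] = Addr[3] = Addr[4] = Addr[5] = 0;
}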
SmallVector<SID, 2> UnregisteredEHFrameSections; SmallVector<SID, 2> RegisteredEHFrameSections; + bool relocationNeedsStub(const RelocationRef &R) const override; + public: RuntimeDyldELF(RuntimeDyld::MemoryManager &MemMgr, RuntimeDyld::SymbolResolver &Resolver); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index e085a92..ab732c6 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -30,6 +30,7 @@ #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include <map> +#include <unordered_map> #include <system_error> using namespace llvm; @@ -50,7 +51,6 @@ class Twine; /// SectionEntry - represents a section emitted into memory by the dynamic /// linker. class SectionEntry { -public: /// Name - section name. std::string Name; @@ -70,15 +70,54 @@ public: /// relocations (like ARM). uintptr_t StubOffset; + /// The total amount of space allocated for this section. This includes the + /// section size and the maximum amount of space that the stubs can occupy. + size_t AllocationSize; + /// ObjAddress - address of the section in the in-memory object file. Used /// for calculating relocations in some object formats (like MachO). uintptr_t ObjAddress; +public: SectionEntry(StringRef name, uint8_t *address, size_t size, - uintptr_t objAddress) + size_t allocationSize, uintptr_t objAddress) : Name(name), Address(address), Size(size), LoadAddress(reinterpret_cast<uintptr_t>(address)), StubOffset(size), - ObjAddress(objAddress) {} + AllocationSize(allocationSize), ObjAddress(objAddress) { + // AllocationSize is used only in asserts, prevent an "unused private field" + // warning: + (void)AllocationSize; + } + + StringRef getName() const { return Name; } + + uint8_t *getAddress() const { return Address; } + + /// \brief Return the address of this section with an offset. + uint8_t *getAddressWithOffset(unsigned OffsetBytes) const { + assert(OffsetBytes <= AllocationSize && "Offset out of bounds!"); + return Address + OffsetBytes; + } + + size_t getSize() const { return Size; } + + uint64_t getLoadAddress() const { return LoadAddress; } + void setLoadAddress(uint64_t LA) { LoadAddress = LA; } + + /// \brief Return the load address of this section with an offset. + uint64_t getLoadAddressWithOffset(unsigned OffsetBytes) const { + assert(OffsetBytes <= AllocationSize && "Offset out of bounds!"); + return LoadAddress + OffsetBytes; + } + + uintptr_t getStubOffset() const { return StubOffset; } + + void advanceStubOffset(unsigned StubSize) { + StubOffset += StubSize; + assert(StubOffset <= AllocationSize && "Not enough space allocated!"); + } + + uintptr_t getObjAddress() const { return ObjAddress; } }; /// RelocationEntry - used to represent relocations internally in the dynamic @@ -188,6 +227,8 @@ class RuntimeDyldImpl { friend class RuntimeDyld::LoadedObjectInfo; friend class RuntimeDyldCheckerImpl; protected: + static const unsigned AbsoluteSymbolSection = ~0U; + // The MemoryManager to load objects into. RuntimeDyld::MemoryManager &MemMgr; @@ -224,7 +265,7 @@ protected: // Relocations to sections already loaded. Indexed by SectionID which is the // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. 
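The SectionEntry rework above funnels every address computation through accessors so that the new AllocationSize field can bounds-check offsets that previously went unchecked. The pattern, compressed into a self-contained sketch (names loosely mirror the class; simplified, not the real declaration):

#include <cassert>
#include <cstddef>
#include <cstdint>

struct SectionSketch {
  uint8_t *Address;
  size_t AllocationSize; // section size plus worst-case stub space
  uintptr_t StubOffset;

  uint8_t *addressWithOffset(unsigned Off) const {
    assert(Off <= AllocationSize && "Offset out of bounds!");
    return Address + Off;
  }
  void advanceStub(unsigned StubSize) {
    StubOffset += StubSize;
    assert(StubOffset <= AllocationSize && "Not enough space allocated!");
  }
};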
- DenseMap<unsigned, RelocationList> Relocations; + std::unordered_map<unsigned, RelocationList> Relocations; // Relocations to external symbols that are not yet resolved. Symbols are // external when they aren't found in the global symbol table of all loaded @@ -269,11 +310,11 @@ protected: } uint64_t getSectionLoadAddress(unsigned SectionID) const { - return Sections[SectionID].LoadAddress; + return Sections[SectionID].getLoadAddress(); } uint8_t *getSectionAddress(unsigned SectionID) const { - return (uint8_t *)Sections[SectionID].Address; + return Sections[SectionID].getAddress(); } void writeInt16BE(uint8_t *Addr, uint16_t Value) { @@ -370,15 +411,22 @@ protected: // \brief Compute an upper bound of the memory that is required to load all // sections - void computeTotalAllocSize(const ObjectFile &Obj, uint64_t &CodeSize, - uint64_t &DataSizeRO, uint64_t &DataSizeRW); + void computeTotalAllocSize(const ObjectFile &Obj, + uint64_t &CodeSize, uint32_t &CodeAlign, + uint64_t &RODataSize, uint32_t &RODataAlign, + uint64_t &RWDataSize, uint32_t &RWDataAlign); // \brief Compute the stub buffer size required for a section unsigned computeSectionStubBufSize(const ObjectFile &Obj, const SectionRef &Section); // \brief Implementation of the generic part of the loadObject algorithm. - std::pair<unsigned, unsigned> loadObjectImpl(const object::ObjectFile &Obj); + ObjSectionToIDMap loadObjectImpl(const object::ObjectFile &Obj); + + // \brief Return true if the relocation R may require allocating a stub. + virtual bool relocationNeedsStub(const RelocationRef &R) const { + return true; // Conservative answer + } public: RuntimeDyldImpl(RuntimeDyld::MemoryManager &MemMgr, @@ -407,6 +455,9 @@ public: if (pos == GlobalSymbolTable.end()) return nullptr; const auto &SymInfo = pos->second; + // Absolute symbols do not have a local address. 
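The rule being introduced in this hunk: absolute symbols are tagged with the ~0U sentinel section ID, contribute no section base, and resolve to their offset alone. As a sketch (hypothetical helper, not part of the patch):

#include <cstdint>

static const unsigned AbsoluteSymbolSentinel = ~0U; // mirrors AbsoluteSymbolSection

static uint64_t symbolTargetAddress(unsigned SectionID, uint64_t Offset,
                                    uint64_t SectionLoadAddress) {
  // Absolute symbols have no owning section, so no base is added.
  return (SectionID == AbsoluteSymbolSentinel ? 0 : SectionLoadAddress) + Offset;
}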
+ if (SymInfo.getSectionID() == AbsoluteSymbolSection) + return nullptr; return getSectionAddress(SymInfo.getSectionID()) + SymInfo.getOffset(); } @@ -417,8 +468,10 @@ public: if (pos == GlobalSymbolTable.end()) return nullptr; const auto &SymEntry = pos->second; - uint64_t TargetAddr = - getSectionLoadAddress(SymEntry.getSectionID()) + SymEntry.getOffset(); + uint64_t SectionAddr = 0; + if (SymEntry.getSectionID() != AbsoluteSymbolSection) + SectionAddr = getSectionLoadAddress(SymEntry.getSectionID()); + uint64_t TargetAddr = SectionAddr + SymEntry.getOffset(); return RuntimeDyld::SymbolInfo(TargetAddr, SymEntry.getFlags()); } diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index c074114..739e8d6 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -26,12 +26,12 @@ using namespace llvm::object; namespace { -class LoadedMachOObjectInfo +class LoadedMachOObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedMachOObjectInfo> { public: - LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld, + ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> getObjectForDebug(const ObjectFile &Obj) const override { @@ -45,11 +45,47 @@ namespace llvm { int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const { unsigned NumBytes = 1 << RE.Size; - uint8_t *Src = Sections[RE.SectionID].Address + RE.Offset; + uint8_t *Src = Sections[RE.SectionID].getAddress() + RE.Offset; return static_cast<int64_t>(readBytesUnaligned(Src, NumBytes)); } +relocation_iterator RuntimeDyldMachO::processScatteredVANILLA( + unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile&>(BaseObjT); + MachO::any_relocation_info RE = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = Obj.getAnyRelocationType(RE); + bool IsPCRel = Obj.getAnyRelocationPCRel(RE); + unsigned Size = Obj.getAnyRelocationLength(RE); + uint64_t Offset = RelI->getOffset(); + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); + unsigned NumBytes = 1 << Size; + int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); + + unsigned SymbolBaseAddr = Obj.getScatteredRelocationValue(RE); + section_iterator TargetSI = getSectionByAddress(Obj, SymbolBaseAddr); + assert(TargetSI != Obj.section_end() && "Can't find section for symbol"); + uint64_t SectionBaseAddr = TargetSI->getAddress(); + SectionRef TargetSection = *TargetSI; + bool IsCode = TargetSection.isText(); + uint32_t TargetSectionID = + findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID); + + Addend -= SectionBaseAddr; + RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size); + + addRelocationForSection(R, TargetSectionID); + + return ++RelI; +} + + RelocationValueRef RuntimeDyldMachO::getRelocationValueRef( const ObjectFile &BaseTObj, const relocation_iterator &RI, const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID) { @@ -99,8 +135,8 @@ void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value, void 
RuntimeDyldMachO::dumpRelocationToResolve(const RelocationEntry &RE, uint64_t Value) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint8_t *LocalAddress = Section.getAddress() + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddress() + RE.Offset; dbgs() << "resolveRelocation Section: " << RE.SectionID << " LocalAddress: " << format("%p", LocalAddress) @@ -147,10 +183,9 @@ void RuntimeDyldMachO::populateIndirectSymbolPointersSection( "Pointers section does not contain a whole number of stubs?"); DEBUG(dbgs() << "Populating pointer table section " - << Sections[PTSectionID].Name - << ", Section ID " << PTSectionID << ", " - << NumPTEntries << " entries, " << PTEntrySize - << " bytes each:\n"); + << Sections[PTSectionID].getName() << ", Section ID " + << PTSectionID << ", " << NumPTEntries << " entries, " + << PTEntrySize << " bytes each:\n"); for (unsigned i = 0; i < NumPTEntries; ++i) { unsigned SymbolIndex = @@ -204,7 +239,7 @@ void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj, } template <typename Impl> -unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, +unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(uint8_t *P, int64_t DeltaForText, int64_t DeltaForEH) { typedef typename Impl::TargetPtrT TargetPtrT; @@ -213,7 +248,7 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, << ", Delta for EH: " << DeltaForEH << "\n"); uint32_t Length = readBytesUnaligned(P, 4); P += 4; - unsigned char *Ret = P + Length; + uint8_t *Ret = P + Length; uint32_t Offset = readBytesUnaligned(P, 4); if (Offset == 0) // is a CIE return Ret; @@ -240,9 +275,9 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, } static int64_t computeDelta(SectionEntry *A, SectionEntry *B) { - int64_t ObjDistance = - static_cast<int64_t>(A->ObjAddress) - static_cast<int64_t>(B->ObjAddress); - int64_t MemDistance = A->LoadAddress - B->LoadAddress; + int64_t ObjDistance = static_cast<int64_t>(A->getObjAddress()) - + static_cast<int64_t>(B->getObjAddress()); + int64_t MemDistance = A->getLoadAddress() - B->getLoadAddress(); return ObjDistance - MemDistance; } @@ -265,14 +300,14 @@ void RuntimeDyldMachOCRTPBase<Impl>::registerEHFrames() { if (ExceptTab) DeltaForEH = computeDelta(ExceptTab, EHFrame); - unsigned char *P = EHFrame->Address; - unsigned char *End = P + EHFrame->Size; + uint8_t *P = EHFrame->getAddress(); + uint8_t *End = P + EHFrame->getSize(); do { P = processFDE(P, DeltaForText, DeltaForEH); } while (P != End); - MemMgr.registerEHFrames(EHFrame->Address, EHFrame->LoadAddress, - EHFrame->Size); + MemMgr.registerEHFrames(EHFrame->getAddress(), EHFrame->getLoadAddress(), + EHFrame->getSize()); } UnregisteredEHFrameSections.clear(); } @@ -298,10 +333,7 @@ RuntimeDyldMachO::create(Triple::ArchType Arch, std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldMachO::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedMachOObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedMachOObjectInfo>(*this, loadObjectImpl(O)); } } // end namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 0d7364f..c8ae47b 100644 --- 
a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -79,6 +79,12 @@ protected: return RelocationEntry(SectionID, Offset, RelType, 0, IsPCRel, Size); } + /// Process a scattered vanilla relocation. + relocation_iterator processScatteredVANILLA( + unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID); + /// Construct a RelocationValueRef representing the relocation target. /// For Symbols in known sections, this will return a RelocationValueRef /// representing a (SectionID, Offset) pair. @@ -140,7 +146,7 @@ private: Impl &impl() { return static_cast<Impl &>(*this); } const Impl &impl() const { return static_cast<const Impl &>(*this); } - unsigned char *processFDE(unsigned char *P, int64_t DeltaForText, + unsigned char *processFDE(uint8_t *P, int64_t DeltaForText, int64_t DeltaForEH); public: diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h new file mode 100644 index 0000000..fbfbb32 --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h @@ -0,0 +1,201 @@ +//===--- RuntimeDyldCOFFI386.h --- COFF/i386 specific code -----*- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// COFF x86 support for MC-JIT runtime dynamic linker. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H +#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H + +#include "llvm/Object/COFF.h" +#include "llvm/Support/COFF.h" +#include "../RuntimeDyldCOFF.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldCOFFI386 : public RuntimeDyldCOFF { +public: + RuntimeDyldCOFFI386(RuntimeDyld::MemoryManager &MM, + RuntimeDyld::SymbolResolver &Resolver) + : RuntimeDyldCOFF(MM, Resolver) {} + + unsigned getMaxStubSize() override { + return 8; // 2-byte jmp instruction + 32-bit relative address + 2-byte pad + } + + unsigned getStubAlignment() override { return 1; } + + relocation_iterator processRelocationRef(unsigned SectionID, + relocation_iterator RelI, + const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + StubMap &Stubs) override { + auto Symbol = RelI->getSymbol(); + if (Symbol == Obj.symbol_end()) + report_fatal_error("Unknown symbol in relocation"); + + ErrorOr<StringRef> TargetNameOrErr = Symbol->getName(); + if (auto EC = TargetNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef TargetName = *TargetNameOrErr; + + auto Section = *Symbol->getSection(); + + uint64_t RelType = RelI->getType(); + uint64_t Offset = RelI->getOffset(); + +#if !defined(NDEBUG) + SmallString<32> RelTypeName; + RelI->getTypeName(RelTypeName); +#endif + DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset + << " RelType: " << RelTypeName << " TargetName: " << TargetName + << "\n"); + + unsigned TargetSectionID = -1; + if (Section == Obj.section_end()) { + RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0); + addRelocationForSymbol(RE, TargetName); + } else { + TargetSectionID = + findOrEmitSection(Obj, *Section,
Section->isText(), ObjSectionToID); + + switch (RelType) { + case COFF::IMAGE_REL_I386_ABSOLUTE: + // This relocation is ignored. + break; + case COFF::IMAGE_REL_I386_DIR32: + case COFF::IMAGE_REL_I386_DIR32NB: + case COFF::IMAGE_REL_I386_REL32: { + RelocationEntry RE = + RelocationEntry(SectionID, Offset, RelType, 0, TargetSectionID, + getSymbolOffset(*Symbol), 0, 0, false, 0); + addRelocationForSection(RE, TargetSectionID); + break; + } + case COFF::IMAGE_REL_I386_SECTION: { + RelocationEntry RE = + RelocationEntry(TargetSectionID, Offset, RelType, 0); + addRelocationForSection(RE, TargetSectionID); + break; + } + case COFF::IMAGE_REL_I386_SECREL: { + RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType, + getSymbolOffset(*Symbol)); + addRelocationForSection(RE, TargetSectionID); + break; + } + default: + llvm_unreachable("unsupported relocation type"); + } + + } + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { + const auto Section = Sections[RE.SectionID]; + uint8_t *Target = Section.getAddressWithOffset(RE.Offset); + + switch (RE.RelType) { + case COFF::IMAGE_REL_I386_ABSOLUTE: + // This relocation is ignored. + break; + case COFF::IMAGE_REL_I386_DIR32: { + // The target's 32-bit VA. + uint64_t Result = + RE.Sections.SectionA == static_cast<uint32_t>(-1) + ? Value + : Sections[RE.Sections.SectionA].getLoadAddressWithOffset( + RE.Addend); + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_DIR32" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_DIR32NB: { + // The target's 32-bit RVA. + // NOTE: use Section[0].getLoadAddress() as an approximation of ImageBase + uint64_t Result = + Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend) - + Sections[0].getLoadAddress(); + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_DIR32NB" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_REL32: { + // 32-bit relative displacement to the target. + uint64_t Result = Sections[RE.Sections.SectionA].getLoadAddress() - + Section.getLoadAddress() + RE.Addend - 4 - RE.Offset; + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_REL32" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_SECTION: + // 16-bit section index of the section that contains the target. 
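(Backing up to the DIR32NB case above: an image-relative RVA needs the image base, which RuntimeDyld does not model, so Sections[0]'s load address stands in for it, as the NOTE says. The arithmetic as a sketch, reflecting that approximation -- coffDir32NB is hypothetical, not from the patch:)

#include <cstdint>

static uint32_t coffDir32NB(uint64_t TargetVA, uint64_t PseudoImageBase) {
  // PseudoImageBase is the first section's load address, an approximation
  // of the real image base per the NOTE in the case above.
  return static_cast<uint32_t>(TargetVA - PseudoImageBase);
}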
+ assert(static_cast<int32_t>(RE.SectionID) <= INT16_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(RE.SectionID) >= INT16_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_SECTION Value: " << RE.SectionID + << '\n'); + writeBytesUnaligned(RE.SectionID, Target, 2); + break; + case COFF::IMAGE_REL_I386_SECREL: + // 32-bit offset of the target from the beginning of its section. + assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_SECREL Value: " << RE.Addend + << '\n'); + writeBytesUnaligned(RE.Addend, Target, 4); + break; + default: + llvm_unreachable("unsupported relocation type"); + } + } + + void registerEHFrames() override {} + void deregisterEHFrames() override {} + + void finalizeLoad(const ObjectFile &Obj, + ObjSectionToIDMap &SectionMap) override {} +}; + +} + +#endif + diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index 408227e..25f538d 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -62,7 +62,7 @@ public: // symbol in the target address space. void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *Target = Section.Address + RE.Offset; + uint8_t *Target = Section.getAddressWithOffset(RE.Offset); switch (RE.RelType) { @@ -72,8 +72,7 @@ public: case COFF::IMAGE_REL_AMD64_REL32_3: case COFF::IMAGE_REL_AMD64_REL32_4: case COFF::IMAGE_REL_AMD64_REL32_5: { - uint32_t *TargetAddress = (uint32_t *)Target; - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); // Delta is the distance from the start of the reloc to the end of the // instruction with the reloc. uint64_t Delta = 4 + (RE.RelType - COFF::IMAGE_REL_AMD64_REL32); @@ -81,7 +80,7 @@ public: uint64_t Result = Value + RE.Addend; assert(((int64_t)Result <= INT32_MAX) && "Relocation overflow"); assert(((int64_t)Result >= INT32_MIN) && "Relocation underflow"); - *TargetAddress = Result; + writeBytesUnaligned(Result, Target, 4); break; } @@ -92,14 +91,12 @@ public: // within a 32-bit offset from the base. // // For now we just set these to zero. - uint32_t *TargetAddress = (uint32_t *)Target; - *TargetAddress = 0; + writeBytesUnaligned(0, Target, 4); break; } case COFF::IMAGE_REL_AMD64_ADDR64: { - uint64_t *TargetAddress = (uint64_t *)Target; - *TargetAddress = Value + RE.Addend; + writeBytesUnaligned(Value + RE.Addend, Target, 8); break; } @@ -119,8 +116,7 @@ public: symbol_iterator Symbol = RelI->getSymbol(); if (Symbol == Obj.symbol_end()) report_fatal_error("Unknown symbol in relocation"); - section_iterator SecI(Obj.section_end()); - Symbol->getSection(SecI); + section_iterator SecI = *Symbol->getSection(); // If there is no section, this must be an external reference.
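All six IMAGE_REL_AMD64_REL32_N cases above share one formula: the stored displacement is measured from the end of the instruction, and the _N suffix gives how many instruction bytes trail the 4-byte field (the Delta = 4 + N above). Isolated as a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static uint32_t amd64Rel32(uint64_t Target, uint64_t FieldAddr, unsigned N) {
  // N = RelType - IMAGE_REL_AMD64_REL32, i.e. trailing instruction bytes.
  return static_cast<uint32_t>(Target - (FieldAddr + 4 + N));
}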
const bool IsExtern = SecI == Obj.section_end(); @@ -129,7 +125,7 @@ public: uint64_t Offset = RelI->getOffset(); uint64_t Addend = 0; SectionEntry &Section = Sections[SectionID]; - uintptr_t ObjTarget = Section.ObjAddress + Offset; + uintptr_t ObjTarget = Section.getObjAddress() + Offset; switch (RelType) { @@ -140,14 +136,14 @@ public: case COFF::IMAGE_REL_AMD64_REL32_4: case COFF::IMAGE_REL_AMD64_REL32_5: case COFF::IMAGE_REL_AMD64_ADDR32NB: { - uint32_t *Displacement = (uint32_t *)ObjTarget; - Addend = *Displacement; + uint8_t *Displacement = (uint8_t *)ObjTarget; + Addend = readBytesUnaligned(Displacement, 4); break; } case COFF::IMAGE_REL_AMD64_ADDR64: { - uint64_t *Displacement = (uint64_t *)ObjTarget; - Addend = *Displacement; + uint8_t *Displacement = (uint8_t *)ObjTarget; + Addend = readBytesUnaligned(Displacement, 8); break; } @@ -182,9 +178,9 @@ public: unsigned getStubAlignment() override { return 1; } void registerEHFrames() override { for (auto const &EHFrameSID : UnregisteredEHFrameSections) { - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); RegisteredEHFrameSections.push_back(EHFrameSID); } diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h index 7bf7641..dbca377 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h @@ -34,7 +34,7 @@ public: /// Extract the addend encoded in the instruction / memory location. int64_t decodeAddend(const RelocationEntry &RE) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); unsigned NumBytes = 1 << RE.Size; int64_t Addend = 0; // Verify that the relocation has the correct size and alignment. @@ -272,15 +272,14 @@ public: RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI)); RE.Addend = decodeAddend(RE); - RelocationValueRef Value( - getRelocationValueRef(Obj, RelI, RE, ObjSectionToID)); assert((ExplicitAddend == 0 || RE.Addend == 0) && "Relocation has "\ "ARM64_RELOC_ADDEND and embedded addend in the instruction."); - if (ExplicitAddend) { + if (ExplicitAddend) RE.Addend = ExplicitAddend; - Value.Offset = ExplicitAddend; - } + + RelocationValueRef Value( + getRelocationValueRef(Obj, RelI, RE, ObjSectionToID)); bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); if (!IsExtern && RE.IsPCRel) @@ -305,7 +304,7 @@ public: DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); MachO::RelocationInfoType RelType = static_cast<MachO::RelocationInfoType>(RE.RelType); @@ -325,7 +324,7 @@ public: case MachO::ARM64_RELOC_BRANCH26: { assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_BRANCH26 not supported"); // Check if branch is in range. 
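The two PC-relative AArch64 cases resolved just below follow fixed hardware rules: a B/BL displacement is a signed 26-bit count of 4-byte words (so +/-128MiB reach), and adrp materializes a 4KiB page, so both operands are rounded down to a page boundary before differencing. Both rules as sketches (hypothetical helpers, not from the patch):

#include <cstdint>

static bool branch26InRange(int64_t ByteDelta) {
  return (ByteDelta & 3) == 0 && ByteDelta >= -(int64_t(1) << 27) &&
         ByteDelta < (int64_t(1) << 27);
}

static int64_t page21Delta(uint64_t Target, int64_t Addend, uint64_t PC) {
  const uint64_t PageMask = ~uint64_t(0xfff); // 4KiB pages
  return int64_t(((Target + Addend) & PageMask) - (PC & PageMask));
}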
- uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); int64_t PCRelVal = Value - FinalAddress + RE.Addend; encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal); break; @@ -334,7 +333,7 @@ case MachO::ARM64_RELOC_PAGE21: { assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_PAGE21 not supported"); // Adjust for PC-relative relocation and offset. - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); int64_t PCRelVal = ((Value + RE.Addend) & (-4096)) - (FinalAddress & (-4096)); encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal); @@ -376,10 +375,10 @@ private: else { // FIXME: There must be a better way to do this than to check and fix the // alignment every time!!! - uintptr_t BaseAddress = uintptr_t(Section.Address); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); uintptr_t StubAlignment = getStubAlignment(); uintptr_t StubAddress = - (BaseAddress + Section.StubOffset + StubAlignment - 1) & + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & -StubAlignment; unsigned StubOffset = StubAddress - BaseAddress; Stubs[Value] = StubOffset; @@ -392,7 +391,7 @@ addRelocationForSymbol(GOTRE, Value.SymbolName); else addRelocationForSection(GOTRE, Value.SectionID); - Section.StubOffset = StubOffset + getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); Offset = static_cast<int64_t>(StubOffset); } RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, Offset, diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index 0a24bb2..7731df0 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -35,7 +35,7 @@ public: int64_t decodeAddend(const RelocationEntry &RE) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); switch (RE.RelType) { default: @@ -64,8 +64,10 @@ public: if (RelType == MachO::ARM_RELOC_HALF_SECTDIFF) return processHALFSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); + else if (RelType == MachO::GENERIC_RELOC_VANILLA) + return processScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); else - return ++++RelI; + return ++RelI; } RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI)); @@ -92,12 +94,12 @@ public: void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress; // ARM PCRel relocations have an effective-PC offset of two instructions // (four bytes in Thumb mode, 8 bytes in ARM mode).
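On the effective-PC comment that closes the hunk above: ARM reads PC as two instructions ahead of the executing instruction, so the bias subtracted from a PC-relative operand is 4 bytes in Thumb state and 8 in ARM state. As a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static int64_t armPCRelOperand(uint64_t Target, uint64_t FixupAddr,
                               bool IsThumb) {
  // The hardware PC is two instructions ahead of the fixup address.
  return static_cast<int64_t>(Target - (FixupAddr + (IsThumb ? 4 : 8)));
}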
@@ -130,8 +132,8 @@ public: break; } case MachO::ARM_RELOC_HALF_SECTDIFF: { - uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; - uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); assert((Value == SectionABase || Value == SectionBBase) && "Unexpected HALFSECTDIFF relocation value."); Value = SectionABase - SectionBBase + RE.Addend; @@ -178,21 +180,21 @@ private: RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); uint8_t *Addr; if (i != Stubs.end()) { - Addr = Section.Address + i->second; + Addr = Section.getAddressWithOffset(i->second); } else { // Create a new stub function. - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry StubRE(RE.SectionID, StubTargetAddr - Section.Address, - MachO::GENERIC_RELOC_VANILLA, Value.Offset, false, - 2); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); + RelocationEntry StubRE( + RE.SectionID, StubTargetAddr - Section.getAddress(), + MachO::GENERIC_RELOC_VANILLA, Value.Offset, false, 2); if (Value.SymbolName) addRelocationForSymbol(StubRE, Value.SymbolName); else addRelocationForSection(StubRE, Value.SectionID); - Addr = Section.Address + Section.StubOffset; - Section.StubOffset += getMaxStubSize(); + Addr = Section.getAddressWithOffset(Section.getStubOffset()); + Section.advanceStubOffset(getMaxStubSize()); } RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, 0, RE.IsPCRel, RE.Size); @@ -221,7 +223,7 @@ private: uint32_t RelocType = MachO.getAnyRelocationType(RE); bool IsPCRel = MachO.getAnyRelocationPCRel(RE); uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); int64_t Immediate = readBytesUnaligned(LocalAddress, 4); // Copy the whole instruction out. Immediate = ((Immediate >> 4) & 0xf000) | (Immediate & 0xfff); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h index 569a078..85059d7 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h @@ -47,8 +47,7 @@ public: return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); else if (RelType == MachO::GENERIC_RELOC_VANILLA) - return processI386ScatteredVANILLA(SectionID, RelI, Obj, - ObjSectionToID); + return processScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); llvm_unreachable("Unhandled scattered relocation."); } @@ -84,10 +83,10 @@ public: DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. 
} @@ -99,8 +98,8 @@ public: break; case MachO::GENERIC_RELOC_SECTDIFF: case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { - uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; - uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); assert((Value == SectionABase || Value == SectionBBase) && "Unexpected SECTDIFF relocation value."); Value = SectionABase - SectionBBase + RE.Addend; @@ -139,7 +138,7 @@ private: bool IsPCRel = Obj.getAnyRelocationPCRel(RE); unsigned Size = Obj.getAnyRelocationLength(RE); uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); unsigned NumBytes = 1 << Size; uint64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); @@ -183,41 +182,6 @@ private: return ++RelI; } - relocation_iterator processI386ScatteredVANILLA( - unsigned SectionID, relocation_iterator RelI, - const ObjectFile &BaseObjT, - RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile &Obj = - static_cast<const MachOObjectFile&>(BaseObjT); - MachO::any_relocation_info RE = - Obj.getRelocation(RelI->getRawDataRefImpl()); - - SectionEntry &Section = Sections[SectionID]; - uint32_t RelocType = Obj.getAnyRelocationType(RE); - bool IsPCRel = Obj.getAnyRelocationPCRel(RE); - unsigned Size = Obj.getAnyRelocationLength(RE); - uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; - unsigned NumBytes = 1 << Size; - int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); - - unsigned SymbolBaseAddr = Obj.getScatteredRelocationValue(RE); - section_iterator TargetSI = getSectionByAddress(Obj, SymbolBaseAddr); - assert(TargetSI != Obj.section_end() && "Can't find section for symbol"); - uint64_t SectionBaseAddr = TargetSI->getAddress(); - SectionRef TargetSection = *TargetSI; - bool IsCode = TargetSection.isText(); - uint32_t TargetSectionID = - findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID); - - Addend -= SectionBaseAddr; - RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size); - - addRelocationForSection(R, TargetSectionID); - - return ++RelI; - } - // Populate stubs in __jump_table section. 
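The GENERIC_RELOC_SECTDIFF/LOCAL_SECTDIFF cases above (and their ARM HALF_SECTDIFF and x86-64 SUBTRACTOR cousins elsewhere in this patch) all reduce to the same arithmetic once both section bases are known; the incoming Value only identifies which of the two bases the callback delivered. As a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static uint64_t sectDiff(uint64_t SectionABase, uint64_t SectionBBase,
                         int64_t Addend) {
  // The object file encoded A - B + addend at link time.
  return SectionABase - SectionBBase + static_cast<uint64_t>(Addend);
}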
void populateJumpTable(const MachOObjectFile &Obj, const SectionRef &JTSection, unsigned JTSectionID) { diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h index dd56e72..2242295 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -39,6 +39,10 @@ public: static_cast<const MachOObjectFile &>(BaseObjT); MachO::any_relocation_info RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl()); + uint32_t RelType = Obj.getAnyRelocationType(RelInfo); + + if (RelType == MachO::X86_64_RELOC_SUBTRACTOR) + return processSubtractRelocation(SectionID, RelI, Obj, ObjSectionToID); assert(!Obj.isRelocationScattered(RelInfo) && "Scattered relocations not supported on X86_64"); @@ -69,14 +73,14 @@ public: void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (RE.IsPCRel) { // FIXME: It seems this value needs to be adjusted by 4 for an effective // PC address. Is that expected? Only for branches, perhaps? - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress + 4; } @@ -91,9 +95,17 @@ public: case MachO::X86_64_RELOC_BRANCH: writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size); break; + case MachO::X86_64_RELOC_SUBTRACTOR: { + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); + assert((Value == SectionABase || Value == SectionBBase) && + "Unexpected SUBTRACTOR relocation value."); + Value = SectionABase - SectionBBase + RE.Addend; + writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size); + break; + } case MachO::X86_64_RELOC_GOT_LOAD: case MachO::X86_64_RELOC_GOT: - case MachO::X86_64_RELOC_SUBTRACTOR: case MachO::X86_64_RELOC_TLV: Error("Relocation type not implemented yet!"); } @@ -112,24 +124,65 @@ private: RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); uint8_t *Addr; if (i != Stubs.end()) { - Addr = Section.Address + i->second; + Addr = Section.getAddressWithOffset(i->second); } else { - Stubs[Value] = Section.StubOffset; - uint8_t *GOTEntry = Section.Address + Section.StubOffset; - RelocationEntry GOTRE(RE.SectionID, Section.StubOffset, + Stubs[Value] = Section.getStubOffset(); + uint8_t *GOTEntry = Section.getAddressWithOffset(Section.getStubOffset()); + RelocationEntry GOTRE(RE.SectionID, Section.getStubOffset(), MachO::X86_64_RELOC_UNSIGNED, Value.Offset, false, 3); if (Value.SymbolName) addRelocationForSymbol(GOTRE, Value.SymbolName); else addRelocationForSection(GOTRE, Value.SectionID); - Section.StubOffset += 8; + Section.advanceStubOffset(8); Addr = GOTEntry; } RelocationEntry TargetRE(RE.SectionID, RE.Offset, MachO::X86_64_RELOC_UNSIGNED, RE.Addend, true, 2); resolveRelocation(TargetRE, (uint64_t)Addr); } + + relocation_iterator + processSubtractRelocation(unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile &Obj = + 
static_cast<const MachOObjectFile&>(BaseObjT); + MachO::any_relocation_info RE = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + unsigned Size = Obj.getAnyRelocationLength(RE); + uint64_t Offset = RelI->getOffset(); + uint8_t *LocalAddress = Sections[SectionID].getAddressWithOffset(Offset); + unsigned NumBytes = 1 << Size; + + ErrorOr<StringRef> SubtrahendNameOrErr = RelI->getSymbol()->getName(); + if (auto EC = SubtrahendNameOrErr.getError()) + report_fatal_error(EC.message()); + auto SubtrahendI = GlobalSymbolTable.find(*SubtrahendNameOrErr); + unsigned SectionBID = SubtrahendI->second.getSectionID(); + uint64_t SectionBOffset = SubtrahendI->second.getOffset(); + int64_t Addend = + SignExtend64(readBytesUnaligned(LocalAddress, NumBytes), NumBytes * 8); + + ++RelI; + ErrorOr<StringRef> MinuendNameOrErr = RelI->getSymbol()->getName(); + if (auto EC = MinuendNameOrErr.getError()) + report_fatal_error(EC.message()); + auto MinuendI = GlobalSymbolTable.find(*MinuendNameOrErr); + unsigned SectionAID = MinuendI->second.getSectionID(); + uint64_t SectionAOffset = MinuendI->second.getOffset(); + + RelocationEntry R(SectionID, Offset, MachO::X86_64_RELOC_SUBTRACTOR, (uint64_t)Addend, + SectionAID, SectionAOffset, SectionBID, SectionBOffset, + false, Size); + + addRelocationForSection(R, SectionAID); + + return ++RelI; + } + }; } diff --git a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp index 5986084..1ad5f17 100644 --- a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp +++ b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp @@ -15,6 +15,7 @@ #include "llvm/Config/config.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/Process.h" namespace llvm { @@ -48,16 +49,27 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Look in the list of free memory regions and use a block there if one // is available. - for (int i = 0, e = MemGroup.FreeMem.size(); i != e; ++i) { - sys::MemoryBlock &MB = MemGroup.FreeMem[i]; - if (MB.size() >= RequiredSize) { - Addr = (uintptr_t)MB.base(); - uintptr_t EndOfBlock = Addr + MB.size(); + for (FreeMemBlock &FreeMB : MemGroup.FreeMem) { + if (FreeMB.Free.size() >= RequiredSize) { + Addr = (uintptr_t)FreeMB.Free.base(); + uintptr_t EndOfBlock = Addr + FreeMB.Free.size(); // Align the address. Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1); - // Store cutted free memory block. 
- MemGroup.FreeMem[i] = sys::MemoryBlock((void*)(Addr + Size), - EndOfBlock - Addr - Size); + + if (FreeMB.PendingPrefixIndex == (unsigned)-1) { + // The part of the block we're giving out to the user is now pending + MemGroup.PendingMem.push_back(sys::MemoryBlock((void *)Addr, Size)); + + // Remember this pending block, such that future allocations can just + // modify it rather than creating a new one + FreeMB.PendingPrefixIndex = MemGroup.PendingMem.size() - 1; + } else { + sys::MemoryBlock &PendingMB = MemGroup.PendingMem[FreeMB.PendingPrefixIndex]; + PendingMB = sys::MemoryBlock(PendingMB.base(), Addr + Size - (uintptr_t)PendingMB.base()); + } + + // Remember how much free space is now left in this block + FreeMB.Free = sys::MemoryBlock((void *)(Addr + Size), EndOfBlock - Addr - Size); return (uint8_t*)Addr; } } @@ -85,6 +97,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Save this address as the basis for our next request MemGroup.Near = MB; + // Remember that we allocated this memory MemGroup.AllocatedMem.push_back(MB); Addr = (uintptr_t)MB.base(); uintptr_t EndOfBlock = Addr + MB.size(); @@ -92,11 +105,18 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Align the address. Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1); + // The part of the block we're giving out to the user is now pending + MemGroup.PendingMem.push_back(sys::MemoryBlock((void *)Addr, Size)); + // The allocateMappedMemory may allocate much more memory than we need. In // this case, we store the unused memory as a free memory block. unsigned FreeSize = EndOfBlock-Addr-Size; - if (FreeSize > 16) - MemGroup.FreeMem.push_back(sys::MemoryBlock((void*)(Addr + Size), FreeSize)); + if (FreeSize > 16) { + FreeMemBlock FreeMB; + FreeMB.Free = sys::MemoryBlock((void*)(Addr + Size), FreeSize); + FreeMB.PendingPrefixIndex = (unsigned)-1; + MemGroup.FreeMem.push_back(FreeMB); + } // Return aligned address return (uint8_t*)Addr; @@ -107,9 +127,6 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) // FIXME: Should in-progress permissions be reverted if an error occurs? std::error_code ec; - // Don't allow free memory blocks to be used after setting protection flags. - CodeMem.FreeMem.clear(); - // Make code memory executable. ec = applyMemoryGroupPermissions(CodeMem, sys::Memory::MF_READ | sys::Memory::MF_EXEC); @@ -120,9 +137,6 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) return true; } - // Don't allow free memory blocks to be used after setting protection flags. - RODataMem.FreeMem.clear(); - // Make read-only data memory read-only. 
ec = applyMemoryGroupPermissions(RODataMem, sys::Memory::MF_READ | sys::Memory::MF_EXEC); @@ -143,36 +157,62 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) return false; } +static sys::MemoryBlock trimBlockToPageSize(sys::MemoryBlock M) { + static const size_t PageSize = sys::Process::getPageSize(); + + size_t StartOverlap = + (PageSize - ((uintptr_t)M.base() % PageSize)) % PageSize; + + size_t TrimmedSize = M.size(); + TrimmedSize -= StartOverlap; + TrimmedSize -= TrimmedSize % PageSize; + + sys::MemoryBlock Trimmed((void *)((uintptr_t)M.base() + StartOverlap), TrimmedSize); + + assert(((uintptr_t)Trimmed.base() % PageSize) == 0); + assert((Trimmed.size() % PageSize) == 0); + assert(M.base() <= Trimmed.base() && Trimmed.size() <= M.size()); + + return Trimmed; +} + + std::error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup, unsigned Permissions) { - - for (int i = 0, e = MemGroup.AllocatedMem.size(); i != e; ++i) { - std::error_code ec; - ec = - sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i], Permissions); - if (ec) { - return ec; - } + for (sys::MemoryBlock &MB : MemGroup.PendingMem) + if (std::error_code EC = sys::Memory::protectMappedMemory(MB, Permissions)) + return EC; + + MemGroup.PendingMem.clear(); + + // Now go through free blocks and trim any of them that don't span the entire + // page because one of the pending blocks may have overlapped it. + for (FreeMemBlock &FreeMB : MemGroup.FreeMem) { + FreeMB.Free = trimBlockToPageSize(FreeMB.Free); + // We cleared the PendingMem list, so all these pointers are now invalid + FreeMB.PendingPrefixIndex = (unsigned)-1; } + // Remove all blocks which are now empty + MemGroup.FreeMem.erase( + std::remove_if(MemGroup.FreeMem.begin(), MemGroup.FreeMem.end(), + [](FreeMemBlock &FreeMB) { return FreeMB.Free.size() == 0; }), + MemGroup.FreeMem.end()); + return std::error_code(); } void SectionMemoryManager::invalidateInstructionCache() { - for (int i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::InvalidateInstructionCache(CodeMem.AllocatedMem[i].base(), - CodeMem.AllocatedMem[i].size()); + for (sys::MemoryBlock &Block : CodeMem.PendingMem) + sys::Memory::InvalidateInstructionCache(Block.base(), Block.size()); } SectionMemoryManager::~SectionMemoryManager() { - for (unsigned i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(CodeMem.AllocatedMem[i]); - for (unsigned i = 0, e = RWDataMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(RWDataMem.AllocatedMem[i]); - for (unsigned i = 0, e = RODataMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(RODataMem.AllocatedMem[i]); + for (MemoryGroup *Group : {&CodeMem, &RWDataMem, &RODataMem}) { + for (sys::MemoryBlock &Block : Group->AllocatedMem) + sys::Memory::releaseMappedMemory(Block); + } } } // namespace llvm - diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp index b553f11..0ce44e1 100644 --- a/contrib/llvm/lib/IR/AsmWriter.cpp +++ b/contrib/llvm/lib/IR/AsmWriter.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -102,17 +103,9 @@ static OrderMap orderModule(const Module *M) { orderValue(&A, OM); } for (const Function &F : *M) { - if (F.hasPrefixData()) - if 
diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp
index b553f11..0ce44e1 100644
--- a/contrib/llvm/lib/IR/AsmWriter.cpp
+++ b/contrib/llvm/lib/IR/AsmWriter.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -102,17 +103,9 @@ static OrderMap orderModule(const Module *M) {
       orderValue(&A, OM);
   }
   for (const Function &F : *M) {
-    if (F.hasPrefixData())
-      if (!isa<GlobalValue>(F.getPrefixData()))
-        orderValue(F.getPrefixData(), OM);
-
-    if (F.hasPrologueData())
-      if (!isa<GlobalValue>(F.getPrologueData()))
-        orderValue(F.getPrologueData(), OM);
-
-    if (F.hasPersonalityFn())
-      if (!isa<GlobalValue>(F.getPersonalityFn()))
-        orderValue(F.getPersonalityFn(), OM);
+    for (const Use &U : F.operands())
+      if (!isa<GlobalValue>(U.get()))
+        orderValue(U.get(), OM);
 
     orderValue(&F, OM);
 
@@ -232,8 +225,7 @@ static UseListOrderStack predictUseListOrder(const Module *M) {
   // We want to visit the functions backward now so we can list function-local
  // constants in the last Function they're used in.  Module-level constants
   // have already been visited above.
-  for (auto I = M->rbegin(), E = M->rend(); I != E; ++I) {
-    const Function &F = *I;
+  for (const Function &F : make_range(M->rbegin(), M->rend())) {
     if (F.isDeclaration())
       continue;
     for (const BasicBlock &BB : F)
@@ -263,8 +255,8 @@ static UseListOrderStack predictUseListOrder(const Module *M) {
   for (const GlobalAlias &A : M->aliases())
     predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
   for (const Function &F : *M)
-    if (F.hasPrefixData())
-      predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack);
+    for (const Use &U : F.operands())
+      predictValueUseListOrder(U.get(), nullptr, OM, Stack);
 
   return Stack;
 }
@@ -304,6 +296,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AnyReg:        Out << "anyregcc"; break;
   case CallingConv::PreserveMost:  Out << "preserve_mostcc"; break;
   case CallingConv::PreserveAll:   Out << "preserve_allcc"; break;
+  case CallingConv::CXX_FAST_TLS:  Out << "cxx_fast_tlscc"; break;
   case CallingConv::GHC:           Out << "ghccc"; break;
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
@@ -320,6 +313,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::X86_64_Win64:  Out << "x86_64_win64cc"; break;
   case CallingConv::SPIR_FUNC:     Out << "spir_func"; break;
   case CallingConv::SPIR_KERNEL:   Out << "spir_kernel"; break;
+  case CallingConv::X86_INTR:      Out << "x86_intrcc"; break;
+  case CallingConv::HHVM:          Out << "hhvmcc"; break;
+  case CallingConv::HHVM_C:        Out << "hhvm_ccc"; break;
   }
 }
 
@@ -343,18 +339,8 @@ enum PrefixType {
   NoPrefix
 };
 
-/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
-/// prefixed with % (if the string only contains simple characters) or is
-/// surrounded with ""'s (if it has special chars in it). Print it out.
-static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
+void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) {
   assert(!Name.empty() && "Cannot get empty name!");
-  switch (Prefix) {
-  case NoPrefix: break;
-  case GlobalPrefix: OS << '@'; break;
-  case ComdatPrefix: OS << '$'; break;
-  case LabelPrefix: break;
-  case LocalPrefix: OS << '%'; break;
-  }
 
   // Scan the name to see if it needs quotes first.
   bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0]));
@@ -386,9 +372,31 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
   OS << '"';
 }
 
-/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
-/// prefixed with % (if the string only contains simple characters) or is
-/// surrounded with ""'s (if it has special chars in it). Print it out.
+/// Turn the specified name into an 'LLVM name', which is either prefixed with %
+/// (if the string only contains simple characters) or is surrounded with ""'s
+/// (if it has special chars in it). Print it out.
+static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
+  switch (Prefix) {
+  case NoPrefix:
+    break;
+  case GlobalPrefix:
+    OS << '@';
+    break;
+  case ComdatPrefix:
+    OS << '$';
+    break;
+  case LabelPrefix:
+    break;
+  case LocalPrefix:
+    OS << '%';
+    break;
+  }
+  printLLVMNameWithoutPrefix(OS, Name);
+}
+
+/// Turn the specified name into an 'LLVM name', which is either prefixed with %
+/// (if the string only contains simple characters) or is surrounded with ""'s
+/// (if it has special chars in it). Print it out.
 static void PrintLLVMName(raw_ostream &OS, const Value *V) {
   PrintLLVMName(OS, V->getName(),
                 isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
@@ -456,6 +464,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
   case Type::LabelTyID:     OS << "label"; return;
   case Type::MetadataTyID:  OS << "metadata"; return;
   case Type::X86_MMXTyID:   OS << "x86_mmx"; return;
+  case Type::TokenTyID:     OS << "token"; return;
   case Type::IntegerTyID:
     OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
     return;
@@ -691,8 +700,9 @@ void ModuleSlotTracker::incorporateFunction(const Function &F) {
   this->F = &F;
 }
 
-static SlotTracker *createSlotTracker(const Module *M) {
-  return new SlotTracker(M);
+int ModuleSlotTracker::getLocalSlot(const Value *V) {
+  assert(F && "No function incorporated");
+  return Machine->getLocalSlot(V);
 }
 
 static SlotTracker *createSlotTracker(const Value *V) {
@@ -802,7 +812,7 @@ void SlotTracker::processFunction() {
   for(Function::const_arg_iterator AI = TheFunction->arg_begin(),
       AE = TheFunction->arg_end(); AI != AE; ++AI)
     if (!AI->hasName())
-      CreateFunctionSlot(AI);
+      CreateFunctionSlot(&*AI);
 
   ST_DEBUG("Inserting Instructions:\n");
 
@@ -1093,11 +1103,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     // the value back and get the same value.
     //
     bool ignored;
-    bool isHalf = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEhalf;
     bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble;
     bool isInf = CFP->getValueAPF().isInfinity();
     bool isNaN = CFP->getValueAPF().isNaN();
-    if (!isHalf && !isInf && !isNaN) {
+    if (!isInf && !isNaN) {
       double Val = isDouble ? CFP->getValueAPF().convertToDouble() :
                               CFP->getValueAPF().convertToFloat();
       SmallString<128> StrVal;
@@ -1123,15 +1132,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     // x86, so we must not use these types.
     static_assert(sizeof(double) == sizeof(uint64_t),
                   "assuming that double is 64 bits!");
-    char Buffer[40];
     APFloat apf = CFP->getValueAPF();
-    // Halves and floats are represented in ASCII IR as double, convert.
+    // Floats are represented in ASCII IR as double, convert.
     if (!isDouble)
      apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
                   &ignored);
-    Out << "0x" <<
-            utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()),
-                          Buffer+40);
+    Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true);
     return;
   }
 
@@ -1139,60 +1145,32 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
   // These appear as a magic letter identifying the type, then a
   // fixed number of hex digits.
   Out << "0x";
-  // Bit position, in the current word, of the next nibble to print.
-  int shiftcount;
-
+  APInt API = CFP->getValueAPF().bitcastToAPInt();
   if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) {
     Out << 'K';
-    // api needed to prevent premature destruction
-    APInt api = CFP->getValueAPF().bitcastToAPInt();
-    const uint64_t* p = api.getRawData();
-    uint64_t word = p[1];
-    shiftcount = 12;
-    int width = api.getBitWidth();
-    for (int j=0; j<width; j+=4, shiftcount-=4) {
-      unsigned int nibble = (word>>shiftcount) & 15;
-      if (nibble < 10)
-        Out << (unsigned char)(nibble + '0');
-      else
-        Out << (unsigned char)(nibble - 10 + 'A');
-      if (shiftcount == 0 && j+4 < width) {
-        word = *p;
-        shiftcount = 64;
-        if (width-j-4 < 64)
-          shiftcount = width-j-4;
-      }
-    }
+    Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
     return;
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) {
-    shiftcount = 60;
     Out << 'L';
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) {
-    shiftcount = 60;
     Out << 'M';
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) {
-    shiftcount = 12;
     Out << 'H';
+    Out << format_hex_no_prefix(API.getZExtValue(), 4,
+                                /*Upper=*/true);
   } else
     llvm_unreachable("Unsupported floating point type");
-  // api needed to prevent premature destruction
-  APInt api = CFP->getValueAPF().bitcastToAPInt();
-  const uint64_t* p = api.getRawData();
-  uint64_t word = *p;
-  int width = api.getBitWidth();
-  for (int j=0; j<width; j+=4, shiftcount-=4) {
-    unsigned int nibble = (word>>shiftcount) & 15;
-    if (nibble < 10)
-      Out << (unsigned char)(nibble + '0');
-    else
-      Out << (unsigned char)(nibble - 10 + 'A');
-    if (shiftcount == 0 && j+4 < width) {
-      word = *(++p);
-      shiftcount = 64;
-      if (width-j-4 < 64)
-        shiftcount = width-j-4;
-    }
-  }
   return;
 }
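The rewritten printer above splits the raw APInt into fixed-width words and lets format_hex_no_prefix do the padding, instead of walking nibbles by hand. A standalone sketch of the resulting layout for the x87 80-bit case, with plain printf standing in for format_hex_no_prefix and a made-up sample value:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Print an x87 extended value the way the IR writer does: a leading 'K',
// the high 16 bits as 4 hex digits, then the low 64 bits as 16 digits.
static void printX87Hex(uint16_t HiBits, uint64_t LoBits) {
  std::printf("0xK%04" PRIX16 "%016" PRIX64 "\n", HiBits, LoBits);
}

int main() {
  // 1.0 as x87 extended: biased exponent 0x3FFF, explicit integer bit set.
  printX87Hex(0x3FFF, 0x8000000000000000ULL);
  // prints: 0xK3FFF8000000000000000
  return 0;
}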
@@ -1313,6 +1291,11 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     return;
   }
 
+  if (isa<ConstantTokenNone>(CV)) {
+    Out << "none";
+    return;
+  }
+
   if (isa<UndefValue>(CV)) {
     Out << "undef";
     return;
@@ -1326,10 +1309,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     Out << " (";
 
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
-      TypePrinter.print(
-          cast<PointerType>(GEP->getPointerOperandType()->getScalarType())
-              ->getElementType(),
-          Out);
+      TypePrinter.print(GEP->getSourceElementType(), Out);
       Out << ", ";
     }
 
@@ -1409,6 +1389,7 @@ struct MDFieldPrinter {
       : Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) {
   }
   void printTag(const DINode *N);
+  void printMacinfoType(const DIMacroNode *N);
   void printString(StringRef Name, StringRef Value,
                    bool ShouldSkipEmpty = true);
   void printMetadata(StringRef Name, const Metadata *MD,
@@ -1431,6 +1412,14 @@ void MDFieldPrinter::printTag(const DINode *N) {
     Out << N->getTag();
 }
 
+void MDFieldPrinter::printMacinfoType(const DIMacroNode *N) {
+  Out << FS << "type: ";
+  if (const char *Type = dwarf::MacinfoString(N->getMacinfoType()))
+    Out << Type;
+  else
+    Out << N->getMacinfoType();
+}
+
 void MDFieldPrinter::printString(StringRef Name, StringRef Value,
                                  bool ShouldSkipEmpty) {
   if (ShouldSkipEmpty && Value.empty())
@@ -1656,6 +1645,7 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
   Printer.printMetadata("subprograms", N->getRawSubprograms());
   Printer.printMetadata("globals", N->getRawGlobalVariables());
   Printer.printMetadata("imports", N->getRawImportedEntities());
+  Printer.printMetadata("macros", N->getRawMacros());
   Printer.printInt("dwoId", N->getDWOId());
   Out << ")";
 }
@@ -1680,7 +1670,6 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
   Printer.printInt("virtualIndex", N->getVirtualIndex());
   Printer.printDIFlags("flags", N->getFlags());
   Printer.printBool("isOptimized", N->isOptimized());
-  Printer.printMetadata("function", N->getRawFunction());
   Printer.printMetadata("templateParams", N->getRawTemplateParams());
   Printer.printMetadata("declaration", N->getRawDeclaration());
   Printer.printMetadata("variables", N->getRawVariables());
@@ -1725,6 +1714,29 @@ static void writeDINamespace(raw_ostream &Out, const DINamespace *N,
   Out << ")";
 }
 
+static void writeDIMacro(raw_ostream &Out, const DIMacro *N,
+                         TypePrinting *TypePrinter, SlotTracker *Machine,
+                         const Module *Context) {
+  Out << "!DIMacro(";
+  MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+  Printer.printMacinfoType(N);
+  Printer.printInt("line", N->getLine());
+  Printer.printString("name", N->getName());
+  Printer.printString("value", N->getValue());
+  Out << ")";
+}
+
+static void writeDIMacroFile(raw_ostream &Out, const DIMacroFile *N,
+                             TypePrinting *TypePrinter, SlotTracker *Machine,
+                             const Module *Context) {
+  Out << "!DIMacroFile(";
+  MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+  Printer.printInt("line", N->getLine());
+  Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
+  Printer.printMetadata("nodes", N->getRawElements());
+  Out << ")";
+}
+
 static void writeDIModule(raw_ostream &Out, const DIModule *N,
                           TypePrinting *TypePrinter, SlotTracker *Machine,
                           const Module *Context) {
@@ -1789,11 +1801,8 @@ static void writeDILocalVariable(raw_ostream &Out, const DILocalVariable *N,
                                  SlotTracker *Machine, const Module *Context) {
   Out << "!DILocalVariable(";
   MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
-  Printer.printTag(N);
   Printer.printString("name", N->getName());
-  Printer.printInt("arg", N->getArg(),
-                   /* ShouldSkipZero */
-                   N->getTag() == dwarf::DW_TAG_auto_variable);
+  Printer.printInt("arg", N->getArg());
   Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
   Printer.printMetadata("file", N->getRawFile());
   Printer.printInt("line", N->getLine());
@@ -1998,6 +2007,7 @@ class AssemblyWriter {
   TypePrinting TypePrinter;
   AssemblyAnnotationWriter *AnnotationWriter;
   SetVector<const Comdat *> Comdats;
+  bool IsForDebug;
   bool ShouldPreserveUseListOrder;
   UseListOrderStack UseListOrders;
   SmallVector<StringRef, 8> MDNames;
@@ -2005,12 +2015,7 @@ class AssemblyWriter {
 public:
   /// Construct an AssemblyWriter with an external SlotTracker
   AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac, const Module *M,
-                 AssemblyAnnotationWriter *AAW,
-                 bool ShouldPreserveUseListOrder = false);
-
-  /// Construct an AssemblyWriter with an internally allocated SlotTracker
-  AssemblyWriter(formatted_raw_ostream &o, const Module *M,
-                 AssemblyAnnotationWriter *AAW,
+                 AssemblyAnnotationWriter *AAW, bool IsForDebug,
                  bool ShouldPreserveUseListOrder = false);
 
   void printMDNodeBody(const MDNode *MD);
@@ -2020,6 +2025,7 @@ public:
 
   void writeOperand(const Value *Op, bool PrintType);
   void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+  void writeOperandBundles(ImmutableCallSite CS);
   void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
   void writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
                           AtomicOrdering FailureOrdering,
@@ -2043,8 +2049,6 @@ public:
   void printUseLists(const Function *F);
 
 private:
-  void init();
-
   /// \brief Print out metadata attachments.
   void printMetadataAttachments(
      const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs,
@@ -2056,11 +2060,16 @@ private:
 
   // printGCRelocateComment - print comment after call to the gc.relocate
   // intrinsic indicating base and derived pointer names.
-  void printGCRelocateComment(const Value &V);
+  void printGCRelocateComment(const GCRelocateInst &Relocate);
 };
 } // namespace
 
-void AssemblyWriter::init() {
+AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+                               const Module *M, AssemblyAnnotationWriter *AAW,
+                               bool IsForDebug, bool ShouldPreserveUseListOrder)
+    : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW),
+      IsForDebug(IsForDebug),
+      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
   if (!TheModule)
     return;
   TypePrinter.incorporateTypes(*TheModule);
@@ -2072,23 +2081,6 @@ void AssemblyWriter::init() {
     Comdats.insert(C);
 }
 
-AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
-                               const Module *M, AssemblyAnnotationWriter *AAW,
-                               bool ShouldPreserveUseListOrder)
-    : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW),
-      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
-  init();
-}
-
-AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, const Module *M,
-                               AssemblyAnnotationWriter *AAW,
-                               bool ShouldPreserveUseListOrder)
-    : Out(o), TheModule(M), SlotTrackerStorage(createSlotTracker(M)),
-      Machine(*SlotTrackerStorage), AnnotationWriter(AAW),
-      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
-  init();
-}
-
 void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
   if (!Operand) {
     Out << "<null operand!>";
@@ -2170,6 +2162,43 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
   WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
 }
 
+void AssemblyWriter::writeOperandBundles(ImmutableCallSite CS) {
+  if (!CS.hasOperandBundles())
+    return;
+
+  Out << " [ ";
+
+  bool FirstBundle = true;
+  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
+    OperandBundleUse BU = CS.getOperandBundleAt(i);
+
+    if (!FirstBundle)
+      Out << ", ";
+    FirstBundle = false;
+
+    Out << '"';
+    PrintEscapedString(BU.getTagName(), Out);
+    Out << '"';
+
+    Out << '(';
+
+    bool FirstInput = true;
+    for (const auto &Input : BU.Inputs) {
+      if (!FirstInput)
+        Out << ", ";
+      FirstInput = false;
+
+      TypePrinter.print(Input->getType(), Out);
+      Out << " ";
+      WriteAsOperandInternal(Out, Input, &TypePrinter, &Machine, TheModule);
+    }
+
+    Out << ')';
+  }
+
+  Out << " ]";
+}
+
 void AssemblyWriter::printModule(const Module *M) {
   Machine.initialize();
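The new writeOperandBundles hunk renders each bundle as a quoted tag followed by a parenthesized, comma-separated input list appended after the call, e.g. call void @f() [ "deopt"(i32 0, i32 1) ]. A minimal standalone sketch of the same separator discipline, with a hypothetical Bundle type and plain strings standing in for typed IR operands:

#include <iostream>
#include <string>
#include <vector>

struct Bundle {
  std::string Tag;
  std::vector<std::string> Inputs; // stands in for typed IR operands
};

// Mirror writeOperandBundles: print nothing when there are no bundles,
// otherwise append ' [ "tag"(in, in), ... ]' after the call text.
static void printBundles(std::ostream &Out, const std::vector<Bundle> &Bundles) {
  if (Bundles.empty())
    return;
  Out << " [ ";
  bool FirstBundle = true;
  for (const Bundle &B : Bundles) {
    if (!FirstBundle)
      Out << ", ";
    FirstBundle = false;
    Out << '"' << B.Tag << "\"(";
    bool FirstInput = true;
    for (const std::string &In : B.Inputs) {
      if (!FirstInput)
        Out << ", ";
      FirstInput = false;
      Out << In;
    }
    Out << ')';
  }
  Out << " ]";
}

int main() {
  std::cout << "call void @f()";
  printBundles(std::cout, {{"deopt", {"i32 0", "i32 1"}}});
  std::cout << '\n'; // call void @f() [ "deopt"(i32 0, i32 1) ]
  return 0;
}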
@@ -2422,6 +2451,10 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
 
   Out << "alias ";
 
+  TypePrinter.print(GA->getValueType(), Out);
+
+  Out << ", ";
+
   const Constant *Aliasee = GA->getAliasee();
 
   if (!Aliasee) {
@@ -2536,28 +2569,26 @@ void AssemblyWriter::printFunction(const Function *F) {
   Machine.incorporateFunction(F);
 
   // Loop over the arguments, printing them...
-
-  unsigned Idx = 1;
-  if (!F->isDeclaration()) {
-    // If this isn't a declaration, print the argument names as well.
-    for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
-         I != E; ++I) {
+  if (F->isDeclaration() && !IsForDebug) {
+    // We're only interested in the type here - don't print argument names.
+    for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) {
       // Insert commas as we go... the first arg doesn't get a comma
-      if (I != F->arg_begin()) Out << ", ";
-      printArgument(I, Attrs, Idx);
-      Idx++;
+      if (I)
+        Out << ", ";
+      // Output type...
+      TypePrinter.print(FT->getParamType(I), Out);
+
+      if (Attrs.hasAttributes(I + 1))
+        Out << ' ' << Attrs.getAsString(I + 1);
     }
   } else {
-    // Otherwise, print the types from the function type.
-    for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+    // The arguments are meaningful here, print them in detail.
+    unsigned Idx = 1;
+    for (const Argument &Arg : F->args()) {
       // Insert commas as we go... the first arg doesn't get a comma
-      if (i) Out << ", ";
-
-      // Output type...
-      TypePrinter.print(FT->getParamType(i), Out);
-
-      if (Attrs.hasAttributes(i+1))
-        Out << ' ' << Attrs.getAsString(i+1);
+      if (Idx != 1)
+        Out << ", ";
+      printArgument(&Arg, Attrs, Idx++);
     }
   }
 
@@ -2604,7 +2635,7 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out << " {";
     // Output all of the function's basic blocks.
     for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
-      printBasicBlock(I);
+      printBasicBlock(&*I);
 
     // Output the function's use-lists.
     printUseLists(F);
@@ -2691,14 +2722,11 @@ void AssemblyWriter::printInstructionLine(const Instruction &I) {
 
 /// printGCRelocateComment - print comment after call to the gc.relocate
 /// intrinsic indicating base and derived pointer names.
-void AssemblyWriter::printGCRelocateComment(const Value &V) {
-  assert(isGCRelocate(&V));
-  GCRelocateOperands GCOps(cast<Instruction>(&V));
-
+void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
   Out << " ; (";
-  writeOperand(GCOps.getBasePtr(), false);
+  writeOperand(Relocate.getBasePtr(), false);
   Out << ", ";
-  writeOperand(GCOps.getDerivedPtr(), false);
+  writeOperand(Relocate.getDerivedPtr(), false);
   Out << ")";
 }
 
@@ -2706,8 +2734,8 @@ void AssemblyWriter::printGCRelocateComment(const Value &V) {
 /// which slot it occupies.
 ///
 void AssemblyWriter::printInfoComment(const Value &V) {
-  if (isGCRelocate(&V))
-    printGCRelocateComment(V);
+  if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
    printGCRelocateComment(*Relocate);
 
   if (AnnotationWriter)
     AnnotationWriter->printInfoComment(V, Out);
@@ -2738,6 +2766,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       Out << "musttail ";
     else if (CI->isTailCall())
      Out << "tail ";
+    else if (CI->isNoTailCall())
+      Out << "notail ";
  }
 
   // Print out the opcode...
@@ -2850,8 +2880,50 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       writeOperand(LPI->getClause(i), true);
     }
+  } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(&I)) {
+    Out << " within ";
+    writeOperand(CatchSwitch->getParentPad(), /*PrintType=*/false);
+    Out << " [";
+    unsigned Op = 0;
+    for (const BasicBlock *PadBB : CatchSwitch->handlers()) {
+      if (Op > 0)
+        Out << ", ";
+      writeOperand(PadBB, /*PrintType=*/true);
+      ++Op;
+    }
+    Out << "] unwind ";
+    if (const BasicBlock *UnwindDest = CatchSwitch->getUnwindDest())
+      writeOperand(UnwindDest, /*PrintType=*/true);
+    else
+      Out << "to caller";
+  } else if (const auto *FPI = dyn_cast<FuncletPadInst>(&I)) {
+    Out << " within ";
+    writeOperand(FPI->getParentPad(), /*PrintType=*/false);
+    Out << " [";
+    for (unsigned Op = 0, NumOps = FPI->getNumArgOperands(); Op < NumOps;
+         ++Op) {
+      if (Op > 0)
+        Out << ", ";
+      writeOperand(FPI->getArgOperand(Op), /*PrintType=*/true);
+    }
+    Out << ']';
   } else if (isa<ReturnInst>(I) && !Operand) {
     Out << " void";
+  } else if (const auto *CRI = dyn_cast<CatchReturnInst>(&I)) {
+    Out << " from ";
+    writeOperand(CRI->getOperand(0), /*PrintType=*/false);
+
+    Out << " to ";
+    writeOperand(CRI->getOperand(1), /*PrintType=*/true);
+  } else if (const auto *CRI = dyn_cast<CleanupReturnInst>(&I)) {
+    Out << " from ";
+    writeOperand(CRI->getOperand(0), /*PrintType=*/false);
+
+    Out << " unwind ";
+    if (CRI->hasUnwindDest())
+      writeOperand(CRI->getOperand(1), /*PrintType=*/true);
+    else
+      Out << "to caller";
   } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Print the calling convention being used.
     if (CI->getCallingConv() != CallingConv::C) {
@@ -2892,6 +2964,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       Out << ')';
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+
+    writeOperandBundles(CI);
+
   } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
     Operand = II->getCalledValue();
     FunctionType *FTy = cast<FunctionType>(II->getFunctionType());
@@ -2926,6 +3001,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
 
+    writeOperandBundles(II);
+
     Out << "\n        to ";
     writeOperand(II->getNormalDest(), true);
     Out << " unwind ";
@@ -3044,7 +3121,7 @@ void AssemblyWriter::printMetadataAttachments(
     return;
 
   if (MDNames.empty())
-    TheModule->getMDKindNames(MDNames);
+    MDs[0].second->getContext().getMDKindNames(MDNames);
 
   for (const auto &I : MDs) {
     unsigned Kind = I.first;
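The new exception-handling cases above follow a common textual convention: a parent pad after "within", a bracketed operand list, and either an unwind label or the literal "to caller" when there is no unwind destination. A standalone sketch of that shape with a toy struct (the real CatchSwitchInst carries basic blocks, not strings):

#include <iostream>
#include <string>
#include <vector>

struct ToyCatchSwitch {
  std::string ParentPad;
  std::vector<std::string> Handlers;
  const std::string *UnwindDest; // null means "unwind to caller"
};

static void printCatchSwitch(std::ostream &Out, const ToyCatchSwitch &CS) {
  Out << "catchswitch within " << CS.ParentPad << " [";
  unsigned Op = 0;
  for (const std::string &H : CS.Handlers) {
    if (Op > 0)
      Out << ", ";
    Out << "label " << H;
    ++Op;
  }
  Out << "] unwind ";
  if (CS.UnwindDest)
    Out << "label " << *CS.UnwindDest;
  else
    Out << "to caller";
  Out << '\n';
}

int main() {
  ToyCatchSwitch CS{"%ps", {"%handler1", "%handler2"}, nullptr};
  printCatchSwitch(std::cout, CS);
  // catchswitch within %ps [label %handler1, label %handler2] unwind to caller
  return 0;
}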
@@ -3138,29 +3215,23 @@ void AssemblyWriter::printUseLists(const Function *F) {
 // External Interface declarations
 //===----------------------------------------------------------------------===//
 
-void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
-  SlotTracker SlotTable(this->getParent());
-  formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, this->getParent(), AAW);
-  W.printFunction(this);
-}
-
 void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
-                   bool ShouldPreserveUseListOrder) const {
+                   bool ShouldPreserveUseListOrder, bool IsForDebug) const {
   SlotTracker SlotTable(this);
   formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, this, AAW, ShouldPreserveUseListOrder);
+  AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug,
+                   ShouldPreserveUseListOrder);
   W.printModule(this);
 }
 
-void NamedMDNode::print(raw_ostream &ROS) const {
+void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const {
   SlotTracker SlotTable(getParent());
   formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, getParent(), nullptr);
+  AssemblyWriter W(OS, SlotTable, getParent(), nullptr, IsForDebug);
   W.printNamedMDNode(this);
 }
 
-void Comdat::print(raw_ostream &ROS) const {
+void Comdat::print(raw_ostream &ROS, bool /*IsForDebug*/) const {
   PrintLLVMName(ROS, getName(), ComdatPrefix);
   ROS << " = comdat ";
 
@@ -3185,7 +3256,7 @@ void Comdat::print(raw_ostream &ROS) const {
   ROS << '\n';
 }
 
-void Type::print(raw_ostream &OS) const {
+void Type::print(raw_ostream &OS, bool /*IsForDebug*/) const {
   TypePrinting TP;
   TP.print(const_cast<Type*>(this), OS);
 
@@ -3208,7 +3279,7 @@ static bool isReferencingMDNode(const Instruction &I) {
   return false;
 }
 
-void Value::print(raw_ostream &ROS) const {
+void Value::print(raw_ostream &ROS, bool IsForDebug) const {
   bool ShouldInitializeAllMetadata = false;
   if (auto *I = dyn_cast<Instruction>(this))
     ShouldInitializeAllMetadata = isReferencingMDNode(*I);
@@ -3216,10 +3287,11 @@ void Value::print(raw_ostream &ROS) const {
     ShouldInitializeAllMetadata = true;
 
   ModuleSlotTracker MST(getModuleFromVal(this), ShouldInitializeAllMetadata);
-  print(ROS, MST);
+  print(ROS, MST, IsForDebug);
 }
 
-void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
+void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+                  bool IsForDebug) const {
   formatted_raw_ostream OS(ROS);
   SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
   SlotTracker &SlotTable =
@@ -3231,14 +3303,14 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
   if (const Instruction *I = dyn_cast<Instruction>(this)) {
     incorporateFunction(I->getParent() ? I->getParent()->getParent() : nullptr);
-    AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr);
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr, IsForDebug);
     W.printInstruction(*I);
   } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(this)) {
     incorporateFunction(BB->getParent());
-    AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr);
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr, IsForDebug);
     W.printBasicBlock(BB);
   } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) {
-    AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr);
+    AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr, IsForDebug);
     if (const GlobalVariable *V = dyn_cast<GlobalVariable>(GV))
       W.printGlobal(V);
     else if (const Function *F = dyn_cast<Function>(GV))
@@ -3261,7 +3333,7 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
 
 /// Print without a type, skipping the TypePrinting object.
 ///
-/// \return \c true iff printing was succesful.
+/// \return \c true iff printing was successful.
 static bool printWithoutType(const Value &V, raw_ostream &O,
                              SlotTracker *Machine, const Module *M) {
   if (V.hasName() || isa<GlobalValue>(V) ||
@@ -3340,41 +3412,45 @@ void Metadata::printAsOperand(raw_ostream &OS, ModuleSlotTracker &MST,
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ true);
 }
 
-void Metadata::print(raw_ostream &OS, const Module *M) const {
+void Metadata::print(raw_ostream &OS, const Module *M,
+                     bool /*IsForDebug*/) const {
   ModuleSlotTracker MST(M, isa<MDNode>(this));
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
 }
 
 void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
-                     const Module *M) const {
+                     const Module *M, bool /*IsForDebug*/) const {
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
 }
 
 // Value::dump - allow easy printing of Values from the debugger.
 LLVM_DUMP_METHOD
-void Value::dump() const { print(dbgs()); dbgs() << '\n'; }
+void Value::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
 
 // Type::dump - allow easy printing of Types from the debugger.
 LLVM_DUMP_METHOD
-void Type::dump() const { print(dbgs()); dbgs() << '\n'; }
+void Type::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
 
 // Module::dump() - Allow printing of Modules from the debugger.
 LLVM_DUMP_METHOD
-void Module::dump() const { print(dbgs(), nullptr); }
+void Module::dump() const {
+  print(dbgs(), nullptr,
+        /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
+}
 
 // \brief Allow printing of Comdats from the debugger.
 LLVM_DUMP_METHOD
-void Comdat::dump() const { print(dbgs()); }
+void Comdat::dump() const { print(dbgs(), /*IsForDebug=*/true); }
 
 // NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger.
 LLVM_DUMP_METHOD
-void NamedMDNode::dump() const { print(dbgs()); }
+void NamedMDNode::dump() const { print(dbgs(), /*IsForDebug=*/true); }
 
 LLVM_DUMP_METHOD
 void Metadata::dump() const { dump(nullptr); }
 
 LLVM_DUMP_METHOD
 void Metadata::dump(const Module *M) const {
-  print(dbgs(), M);
+  print(dbgs(), M, /*IsForDebug=*/true);
   dbgs() << '\n';
 }
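The dump() changes above all thread one new flag through the print entry points. A minimal sketch of the shape of that pattern, with a hypothetical Node class (defaulted flag for normal printing, dump() forcing the debug form onto stderr):

#include <iostream>

class Node {
public:
  void print(std::ostream &OS, bool IsForDebug = false) const {
    OS << "node";
    if (IsForDebug)
      OS << " <debug view>"; // stand-in for extra detail shown when dumping
    OS << '\n';
  }
  // dump() is just print() with the flag set, as in the diff above.
  void dump() const { print(std::cerr, /*IsForDebug=*/true); }
};

int main() {
  Node N;
  N.print(std::cout); // plain form
  N.dump();           // debugger-friendly form on stderr
  return 0;
}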
diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h
index 6f338ae..659f956 100644
--- a/contrib/llvm/lib/IR/AttributeImpl.h
+++ b/contrib/llvm/lib/IR/AttributeImpl.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/Support/TrailingObjects.h"
 #include <string>
 
 namespace llvm {
@@ -141,13 +142,16 @@ public:
 /// \class
 /// \brief This class represents a group of attributes that apply to one
 /// element: function, return type, or parameter.
-class AttributeSetNode : public FoldingSetNode {
+class AttributeSetNode final
+    : public FoldingSetNode,
+      private TrailingObjects<AttributeSetNode, Attribute> {
+  friend TrailingObjects;
+
   unsigned NumAttrs; ///< Number of attributes in this node.
 
   AttributeSetNode(ArrayRef<Attribute> Attrs) : NumAttrs(Attrs.size()) {
     // There's memory after the node where we can store the entries in.
-    std::copy(Attrs.begin(), Attrs.end(),
-              reinterpret_cast<Attribute *>(this + 1));
+    std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
   }
 
   // AttributesSetNode is uniqued, these should not be publicly available.
@@ -170,7 +174,7 @@ public:
   std::string getAsString(bool InAttrGrp) const;
 
   typedef const Attribute *iterator;
-  iterator begin() const { return reinterpret_cast<iterator>(this + 1); }
+  iterator begin() const { return getTrailingObjects<Attribute>(); }
   iterator end() const { return begin() + NumAttrs; }
 
   void Profile(FoldingSetNodeID &ID) const {
@@ -181,27 +185,29 @@ public:
       AttrList[I].Profile(ID);
   }
 };
-static_assert(
-    AlignOf<AttributeSetNode>::Alignment >= AlignOf<Attribute>::Alignment,
-    "Alignment is insufficient for objects appended to AttributeSetNode");
+
+typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
 
 //===----------------------------------------------------------------------===//
 /// \class
 /// \brief This class represents a set of attributes that apply to the function,
 /// return type, and parameters.
-class AttributeSetImpl : public FoldingSetNode {
+class AttributeSetImpl final
+    : public FoldingSetNode,
+      private TrailingObjects<AttributeSetImpl, IndexAttrPair> {
   friend class AttributeSet;
-
-public:
-  typedef std::pair<unsigned, AttributeSetNode*> IndexAttrPair;
+  friend TrailingObjects;
 
 private:
   LLVMContext &Context;
   unsigned NumAttrs; ///< Number of entries in this set.
 
+  // Helper fn for TrailingObjects class.
+  size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumAttrs; }
+
   /// \brief Return a pointer to the IndexAttrPair for the specified slot.
   const IndexAttrPair *getNode(unsigned Slot) const {
-    return reinterpret_cast<const IndexAttrPair *>(this + 1) + Slot;
+    return getTrailingObjects<IndexAttrPair>() + Slot;
   }
 
   // AttributesSet is uniqued, these should not be publicly available.
@@ -222,8 +228,7 @@ public:
     }
#endif
     // There's memory after the node where we can store the entries in.
-    std::copy(Attrs.begin(), Attrs.end(),
-              reinterpret_cast<IndexAttrPair *>(this + 1));
+    std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<IndexAttrPair>());
   }
 
   /// \brief Get the context that created this AttributeSetImpl.
@@ -273,10 +278,6 @@ public:
   void dump() const;
 };
-static_assert(
-    AlignOf<AttributeSetImpl>::Alignment >=
-        AlignOf<AttributeSetImpl::IndexAttrPair>::Alignment,
-    "Alignment is insufficient for objects appended to AttributeSetImpl");
 
 } // end llvm namespace
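The migration above replaces hand-rolled `this + 1` pointer arithmetic with llvm::TrailingObjects, which computes offsets and alignment for co-allocated arrays. A minimal sketch of the same pattern with a hypothetical IntListNode that tail-allocates N ints (assumes LLVM's Support/TrailingObjects.h; with a single trailing type, numTrailingObjects is not required):

#include "llvm/Support/TrailingObjects.h"
#include <new>

class IntListNode final : private llvm::TrailingObjects<IntListNode, int> {
  friend TrailingObjects;

  unsigned NumInts;
  IntListNode(unsigned N) : NumInts(N) {
    for (unsigned I = 0; I != N; ++I)
      getTrailingObjects<int>()[I] = 0; // storage lives right after *this
  }

public:
  static IntListNode *create(unsigned N) {
    // One allocation sized for the node plus its trailing ints, mirroring
    // the totalSizeToAlloc<Attribute>(...) call in AttributeSetNode::get.
    void *Mem = ::operator new(totalSizeToAlloc<int>(N));
    return new (Mem) IntListNode(N);
  }
  int *begin() { return getTrailingObjects<int>(); }
  int *end() { return begin() + NumInts; }
};

Compared with reinterpret_cast<int *>(this + 1), this keeps the layout assumptions in one audited helper, which is why the old static_asserts about alignment could be deleted.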
diff --git a/contrib/llvm/lib/IR/Attributes.cpp b/contrib/llvm/lib/IR/Attributes.cpp
index 546a986..6c01bb6 100644
--- a/contrib/llvm/lib/IR/Attributes.cpp
+++ b/contrib/llvm/lib/IR/Attributes.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
 #include "AttributeImpl.h"
 #include "LLVMContextImpl.h"
 #include "llvm/ADT/STLExtras.h"
@@ -120,28 +121,28 @@ Attribute::AttrKind Attribute::getKindAsEnum() const {
   if (!pImpl) return None;
   assert((isEnumAttribute() || isIntAttribute()) &&
          "Invalid attribute type to get the kind as an enum!");
-  return pImpl ? pImpl->getKindAsEnum() : None;
+  return pImpl->getKindAsEnum();
 }
 
 uint64_t Attribute::getValueAsInt() const {
   if (!pImpl) return 0;
   assert(isIntAttribute() &&
          "Expected the attribute to be an integer attribute!");
-  return pImpl ? pImpl->getValueAsInt() : 0;
+  return pImpl->getValueAsInt();
 }
 
 StringRef Attribute::getKindAsString() const {
   if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the kind as a string!");
-  return pImpl ? pImpl->getKindAsString() : StringRef();
+  return pImpl->getKindAsString();
 }
 
 StringRef Attribute::getValueAsString() const {
   if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the value as a string!");
-  return pImpl ? pImpl->getValueAsString() : StringRef();
+  return pImpl->getValueAsString();
 }
 
 bool Attribute::hasAttribute(AttrKind Kind) const {
@@ -198,6 +199,10 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "byval";
   if (hasAttribute(Attribute::Convergent))
     return "convergent";
+  if (hasAttribute(Attribute::InaccessibleMemOnly))
+    return "inaccessiblememonly";
+  if (hasAttribute(Attribute::InaccessibleMemOrArgMemOnly))
+    return "inaccessiblemem_or_argmemonly";
   if (hasAttribute(Attribute::InAlloca))
     return "inalloca";
   if (hasAttribute(Attribute::InlineHint))
@@ -232,6 +237,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "noredzone";
   if (hasAttribute(Attribute::NoReturn))
     return "noreturn";
+  if (hasAttribute(Attribute::NoRecurse))
+    return "norecurse";
   if (hasAttribute(Attribute::NoUnwind))
     return "nounwind";
   if (hasAttribute(Attribute::OptimizeNone))
@@ -442,6 +449,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
   case Attribute::JumpTable:       return 1ULL << 45;
   case Attribute::Convergent:      return 1ULL << 46;
   case Attribute::SafeStack:       return 1ULL << 47;
+  case Attribute::NoRecurse:       return 1ULL << 48;
+  case Attribute::InaccessibleMemOnly:         return 1ULL << 49;
+  case Attribute::InaccessibleMemOrArgMemOnly: return 1ULL << 50;
   case Attribute::Dereferenceable:
     llvm_unreachable("dereferenceable attribute not supported in raw format");
     break;
@@ -472,9 +482,8 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
   SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
   array_pod_sort(SortedAttrs.begin(), SortedAttrs.end());
 
-  for (SmallVectorImpl<Attribute>::iterator I = SortedAttrs.begin(),
-         E = SortedAttrs.end(); I != E; ++I)
-    I->Profile(ID);
+  for (Attribute Attr : SortedAttrs)
+    Attr.Profile(ID);
 
   void *InsertPoint;
   AttributeSetNode *PA =
@@ -484,8 +493,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
   // new one and insert it.
   if (!PA) {
     // Coallocate entries after the AttributeSetNode itself.
-    void *Mem = ::operator new(sizeof(AttributeSetNode) +
-                               sizeof(Attribute) * SortedAttrs.size());
+    void *Mem = ::operator new(totalSizeToAlloc<Attribute>(SortedAttrs.size()));
     PA = new (Mem) AttributeSetNode(SortedAttrs);
     pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
   }
@@ -617,9 +625,8 @@ AttributeSet::getImpl(LLVMContext &C,
   // create a new one and insert it.
   if (!PA) {
     // Coallocate entries after the AttributeSetImpl itself.
-    void *Mem = ::operator new(sizeof(AttributeSetImpl) +
-                               sizeof(std::pair<unsigned, AttributeSetNode *>) *
-                                   Attrs.size());
+    void *Mem = ::operator new(
+        AttributeSetImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
     PA = new (Mem) AttributeSetImpl(C, Attrs);
     pImpl->AttrsLists.InsertNode(PA, InsertPoint);
   }
@@ -634,14 +641,15 @@ AttributeSet AttributeSet::get(LLVMContext &C,
   if (Attrs.empty())
     return AttributeSet();
 
-#ifndef NDEBUG
-  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
-    assert((!i || Attrs[i-1].first <= Attrs[i].first) &&
-           "Misordered Attributes list!");
-    assert(!Attrs[i].second.hasAttribute(Attribute::None) &&
-           "Pointless attribute!");
-  }
-#endif
+  assert(std::is_sorted(Attrs.begin(), Attrs.end(),
+                        [](const std::pair<unsigned, Attribute> &LHS,
+                           const std::pair<unsigned, Attribute> &RHS) {
+                          return LHS.first < RHS.first;
+                        }) && "Misordered Attributes list!");
+  assert(std::none_of(Attrs.begin(), Attrs.end(),
+                      [](const std::pair<unsigned, Attribute> &Pair) {
+                        return Pair.second.hasAttribute(Attribute::None);
+                      }) && "Pointless attribute!");
 
   // Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
   // list.
@@ -684,22 +692,26 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
     if (!B.contains(Kind))
       continue;
 
-    if (Kind == Attribute::Alignment)
-      Attrs.push_back(std::make_pair(Index, Attribute::
-                                     getWithAlignment(C, B.getAlignment())));
-    else if (Kind == Attribute::StackAlignment)
-      Attrs.push_back(std::make_pair(Index, Attribute::
-                              getWithStackAlignment(C, B.getStackAlignment())));
-    else if (Kind == Attribute::Dereferenceable)
-      Attrs.push_back(std::make_pair(Index,
-                                     Attribute::getWithDereferenceableBytes(C,
-                                       B.getDereferenceableBytes())));
-    else if (Kind == Attribute::DereferenceableOrNull)
-      Attrs.push_back(
-          std::make_pair(Index, Attribute::getWithDereferenceableOrNullBytes(
-                                    C, B.getDereferenceableOrNullBytes())));
-    else
-      Attrs.push_back(std::make_pair(Index, Attribute::get(C, Kind)));
+    Attribute Attr;
+    switch (Kind) {
+    case Attribute::Alignment:
+      Attr = Attribute::getWithAlignment(C, B.getAlignment());
+      break;
+    case Attribute::StackAlignment:
+      Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
+      break;
+    case Attribute::Dereferenceable:
+      Attr = Attribute::getWithDereferenceableBytes(
+          C, B.getDereferenceableBytes());
+      break;
+    case Attribute::DereferenceableOrNull:
+      Attr = Attribute::getWithDereferenceableOrNullBytes(
+          C, B.getDereferenceableOrNullBytes());
+      break;
+    default:
+      Attr = Attribute::get(C, Kind);
+    }
+    Attrs.push_back(std::make_pair(Index, Attr));
   }
 
   // Add target-dependent (string) attributes.
@@ -713,9 +725,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
 AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
                                ArrayRef<Attribute::AttrKind> Kind) {
   SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
-  for (ArrayRef<Attribute::AttrKind>::iterator I = Kind.begin(),
-         E = Kind.end(); I != E; ++I)
-    Attrs.push_back(std::make_pair(Index, Attribute::get(C, *I)));
+  for (Attribute::AttrKind K : Kind)
+    Attrs.push_back(std::make_pair(Index, Attribute::get(C, K)));
   return get(C, Attrs);
 }
 
@@ -736,9 +747,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
     if (!AS) continue;
     SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
       ANVI = AttrNodeVec.begin(), ANVE;
-    for (const AttributeSetImpl::IndexAttrPair
-             *AI = AS->getNode(0),
-             *AE = AS->getNode(AS->getNumAttributes());
+    for (const IndexAttrPair *AI = AS->getNode(0),
+                             *AE = AS->getNode(AS->getNumAttributes());
         AI != AE; ++AI) {
       ANVE = AttrNodeVec.end();
       while (ANVI != ANVE && ANVI->first <= AI->first)
@@ -770,6 +780,36 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
   return addAttributes(C, Index, AttributeSet::get(C, Index, B));
 }
 
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+                                        ArrayRef<unsigned> Indices,
+                                        Attribute A) const {
+  unsigned I = 0, E = pImpl ? pImpl->getNumAttributes() : 0;
+  auto IdxI = Indices.begin(), IdxE = Indices.end();
+  SmallVector<AttributeSet, 4> AttrSet;
+
+  while (I != E && IdxI != IdxE) {
+    if (getSlotIndex(I) < *IdxI)
+      AttrSet.emplace_back(getSlotAttributes(I++));
+    else if (getSlotIndex(I) > *IdxI)
+      AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+    else {
+      AttrBuilder B(getSlotAttributes(I), *IdxI);
+      B.addAttribute(A);
+      AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
+      ++I;
+      ++IdxI;
+    }
+  }
+
+  while (I != E)
+    AttrSet.emplace_back(getSlotAttributes(I++));
+
+  while (IdxI != IdxE)
+    AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+
+  return get(C, AttrSet);
+}
+
 AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
                                          AttributeSet Attrs) const {
   if (!pImpl) return Attrs;
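The addAttribute overload just added walks the existing slots and the sorted Indices list in lockstep, a classic two-pointer merge over sorted sequences. A generic standalone sketch of that control flow (hypothetical int values standing in for Attribute):

#include <cassert>
#include <utility>
#include <vector>

// Merge sorted (index, value) slots with a sorted index list: untouched slots
// are copied, new indices get value V, and a collision folds V into the slot.
static std::vector<std::pair<unsigned, int>>
mergeSlots(const std::vector<std::pair<unsigned, int>> &Slots,
           const std::vector<unsigned> &Indices, int V) {
  std::vector<std::pair<unsigned, int>> Out;
  size_t I = 0, J = 0;
  while (I != Slots.size() && J != Indices.size()) {
    if (Slots[I].first < Indices[J])
      Out.push_back(Slots[I++]);
    else if (Slots[I].first > Indices[J])
      Out.push_back({Indices[J++], V});
    else { // same index: update the existing slot in place
      Out.push_back({Slots[I].first, V});
      ++I;
      ++J;
    }
  }
  while (I != Slots.size())
    Out.push_back(Slots[I++]);
  while (J != Indices.size())
    Out.push_back({Indices[J++], V});
  return Out;
}

int main() {
  auto R = mergeSlots({{1, 10}, {3, 30}}, {2, 3}, 99);
  assert((R == std::vector<std::pair<unsigned, int>>{{1, 10}, {2, 99}, {3, 99}}));
  return 0;
}

A single pass works only because both inputs are kept sorted by index, which the std::is_sorted assertion earlier in the file now enforces explicitly.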
@@ -955,17 +995,17 @@ AttributeSet AttributeSet::getFnAttributes() const {
 bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttribute(Kind) : false;
+  return ASN && ASN->hasAttribute(Kind);
 }
 
 bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttribute(Kind) : false;
+  return ASN && ASN->hasAttribute(Kind);
 }
 
 bool AttributeSet::hasAttributes(unsigned Index) const {
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttributes() : false;
+  return ASN && ASN->hasAttributes();
 }
 
 /// \brief Return true if the specified attribute is set for at least one
@@ -1111,6 +1151,7 @@ AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
 
 void AttrBuilder::clear() {
   Attrs.reset();
+  TargetDepAttrs.clear();
   Alignment = StackAlignment = DerefBytes = DerefOrNullBytes = 0;
 }
 
@@ -1177,23 +1218,10 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
   for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
-      Attribute::AttrKind Kind = I->getKindAsEnum();
-      Attrs[Kind] = false;
-
-      if (Kind == Attribute::Alignment)
-        Alignment = 0;
-      else if (Kind == Attribute::StackAlignment)
-        StackAlignment = 0;
-      else if (Kind == Attribute::Dereferenceable)
-        DerefBytes = 0;
-      else if (Kind == Attribute::DereferenceableOrNull)
-        DerefOrNullBytes = 0;
+      removeAttribute(Attr.getKindAsEnum());
     } else {
       assert(Attr.isStringAttribute() && "Invalid attribute type!");
-      std::map<std::string, std::string>::iterator
-        Iter = TargetDepAttrs.find(Attr.getKindAsString());
-      if (Iter != TargetDepAttrs.end())
-        TargetDepAttrs.erase(Iter);
+      removeAttribute(Attr.getKindAsString());
     }
   }
 
@@ -1322,8 +1350,7 @@ bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
 
   assert(Slot != ~0U && "Couldn't find the index!");
 
-  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot);
-       I != E; ++I) {
+  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       if (Attrs[I->getKindAsEnum()])
@@ -1382,7 +1409,7 @@ AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
 //===----------------------------------------------------------------------===//
 
 /// \brief Which attributes cannot be applied to a type.
-AttrBuilder AttributeFuncs::typeIncompatible(const Type *Ty) {
+AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
   AttrBuilder Incompatible;
 
   if (!Ty->isIntegerTy())
@@ -1406,3 +1433,80 @@ AttrBuilder AttributeFuncs::typeIncompatible(const Type *Ty) {
 
   return Incompatible;
 }
+
+template<typename AttrClass>
+static bool isEqual(const Function &Caller, const Function &Callee) {
+  return Caller.getFnAttribute(AttrClass::getKind()) ==
+         Callee.getFnAttribute(AttrClass::getKind());
+}
+
+/// \brief Compute the logical AND of the attributes of the caller and the
+/// callee.
+///
+/// This function sets the caller's attribute to false if the callee's attribute
+/// is false.
+template<typename AttrClass>
+static void setAND(Function &Caller, const Function &Callee) {
+  if (AttrClass::isSet(Caller, AttrClass::getKind()) &&
+      !AttrClass::isSet(Callee, AttrClass::getKind()))
+    AttrClass::set(Caller, AttrClass::getKind(), false);
+}
+
+/// \brief Compute the logical OR of the attributes of the caller and the
+/// callee.
+///
+/// This function sets the caller's attribute to true if the callee's attribute
+/// is true.
+template<typename AttrClass>
+static void setOR(Function &Caller, const Function &Callee) {
+  if (!AttrClass::isSet(Caller, AttrClass::getKind()) &&
+      AttrClass::isSet(Callee, AttrClass::getKind()))
+    AttrClass::set(Caller, AttrClass::getKind(), true);
+}
+
+/// \brief If the inlined function had a higher stack protection level than the
+/// calling function, then bump up the caller's stack protection level.
+static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
+  // If upgrading the SSP attribute, clear out the old SSP Attributes first.
+  // Having multiple SSP attributes doesn't actually hurt, but it adds useless
+  // clutter to the IR.
+  AttrBuilder B;
+  B.addAttribute(Attribute::StackProtect)
+      .addAttribute(Attribute::StackProtectStrong)
+      .addAttribute(Attribute::StackProtectReq);
+  AttributeSet OldSSPAttr = AttributeSet::get(Caller.getContext(),
+                                              AttributeSet::FunctionIndex,
+                                              B);
+
+  if (Callee.hasFnAttribute(Attribute::SafeStack)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::SafeStack);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtectReq) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::StackProtectReq);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectReq)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::StackProtectStrong);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectReq) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectStrong))
+    Caller.addFnAttr(Attribute::StackProtect);
+}
+
+#define GET_ATTR_COMPAT_FUNC
+#include "AttributesCompatFunc.inc"
+
+bool AttributeFuncs::areInlineCompatible(const Function &Caller,
+                                         const Function &Callee) {
+  return hasCompatibleFnAttrs(Caller, Callee);
+}
+
+
+void AttributeFuncs::mergeAttributesForInlining(Function &Caller,
+                                                const Function &Callee) {
+  mergeFnAttrs(Caller, Callee);
+}
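The setAND/setOR templates above encode the two merge directions the generated AttributesCompatFunc.inc applies when a callee is inlined: an AND-style property survives only if both functions have it, while an OR-style property is sticky. A toy sketch with plain booleans (the specific attribute names here are illustrative, not taken from the generated table):

#include <cassert>

struct Fn {
  bool NoNansFpMath;    // example of an AND-merged property
  bool NoImplicitFloat; // example of an OR-merged property
};

static void mergeForInlining(Fn &Caller, const Fn &Callee) {
  Caller.NoNansFpMath &= Callee.NoNansFpMath;       // setAND-style
  Caller.NoImplicitFloat |= Callee.NoImplicitFloat; // setOR-style
}

int main() {
  Fn Caller{true, false}, Callee{false, true};
  mergeForInlining(Caller, Callee);
  assert(!Caller.NoNansFpMath && Caller.NoImplicitFloat);
  return 0;
}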
+ Name + ".p0i8", F->getParent()); + return true; + } + Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$"); + if (vstRegex.match(Name)) { + static const Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1, + Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + + static const Intrinsic::ID StoreLaneInts[] = { + Intrinsic::arm_neon_vst2lane, Intrinsic::arm_neon_vst3lane, + Intrinsic::arm_neon_vst4lane + }; + + auto fArgs = F->getFunctionType()->params(); + Type *Tys[] = {fArgs[0], fArgs[1]}; + if (Name.find("lane") == StringRef::npos) + NewFn = Intrinsic::getDeclaration(F->getParent(), + StoreInts[fArgs.size() - 3], Tys); + else + NewFn = Intrinsic::getDeclaration(F->getParent(), + StoreLaneInts[fArgs.size() - 5], Tys); + return true; + } break; } + case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { F->setName(Name + ".old"); @@ -129,7 +164,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || + Name.startswith("x86.avx2.vbroadcast") || + Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || + Name.startswith("x86.sse41.pmovsx") || Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || Name == "x86.avx.vinsertf128.si.256" || @@ -162,6 +200,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "x86.avx2.pblendd.128" || Name == "x86.avx2.pblendd.256" || Name == "x86.avx2.vbroadcasti128" || + Name == "x86.xop.vpcmov" || (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = nullptr; return true; @@ -325,7 +364,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); LLVMContext &C = CI->getContext(); IRBuilder<> Builder(C); - Builder.SetInsertPoint(CI->getParent(), CI); + Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); assert(F && "Intrinsic call is not direct?"); @@ -351,7 +390,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name == "llvm.x86.avx.movnt.ps.256" || Name == "llvm.x86.avx.movnt.pd.256") { IRBuilder<> Builder(C); - Builder.SetInsertPoint(CI->getParent(), CI); + Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); Module *M = F->getParent(); SmallVector<Metadata *, 1> Elts; @@ -368,7 +407,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { "cast"); StoreInst *SI = Builder.CreateStore(Arg1, BC); SI->setMetadata(M->getMDKindID("nontemporal"), Node); - SI->setAlignment(16); + SI->setAlignment(32); // Remove intrinsic. 
CI->eraseFromParent(); @@ -419,6 +458,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1), Builder.getInt8(Imm)}); + } else if (Name == "llvm.x86.xop.vpcmov") { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Sel = CI->getArgOperand(2); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1)); + Value *NotSel = Builder.CreateXor(Sel, MinusOne); + Value *Sel0 = Builder.CreateAnd(Arg0, Sel); + Value *Sel1 = Builder.CreateAnd(Arg1, NotSel); + Rep = Builder.CreateOr(Sel0, Sel1); } else if (Name == "llvm.x86.sse42.crc32.64.8") { Function *CRC32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::x86_sse42_crc32_32_8); @@ -438,6 +487,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); + } else if (Name.startswith("llvm.x86.sse41.pmovsx")) { + VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(CI->getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector( + CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask); + Rep = Builder.CreateSExt(SV, DstTy); } else if (Name == "llvm.x86.avx2.vbroadcasti128") { // Replace vbroadcasts with a vector shuffle. Type *VT = VectorType::get(Type::getInt64Ty(C), 2); @@ -447,6 +509,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { const int Idxs[4] = { 0, 1, 0, 1 }; Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), Idxs); + } else if (Name.startswith("llvm.x86.avx2.pbroadcast") || + Name.startswith("llvm.x86.avx2.vbroadcast")) { + // Replace vp?broadcasts with a vector shuffle. + Value *Op = CI->getArgOperand(0); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts); + Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()), + Constant::getNullValue(MaskTy)); } else if (Name == "llvm.x86.sse2.psll.dq") { // 128-bit shift left specified in bits. unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); @@ -517,10 +587,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. Imm = Imm & 1; - + // Extend the second operand into a vector that is twice as big. Value *UndefV = UndefValue::get(Op1->getType()); SmallVector<Constant*, 8> Idxs; @@ -562,7 +632,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. 
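The vpcmov upgrade above lowers the XOP conditional move to three plain bitwise ops via the select identity (a & sel) | (b & ~sel): each result bit comes from the first operand where the selector bit is set and from the second where it is clear. A scalar standalone sketch of the same identity:

#include <cassert>
#include <cstdint>

// The bitwise-select identity the upgrader emits in place of vpcmov.
static uint64_t bitwiseSelect(uint64_t A, uint64_t B, uint64_t Sel) {
  return (A & Sel) | (B & ~Sel);
}

int main() {
  uint64_t A = 0xAAAAAAAAAAAAAAAAULL, B = 0x5555555555555555ULL;
  assert(bitwiseSelect(A, B, ~0ULL) == A); // all-ones selector picks A
  assert(bitwiseSelect(A, B, 0) == B);     // all-zeros selector picks B
  assert(bitwiseSelect(A, B, 0xFF) == ((A & 0xFF) | (B & ~0xFFULL)));
  return 0;
}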
Imm = Imm & 1; @@ -627,6 +697,27 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { default: llvm_unreachable("Unknown function for CallInst upgrade."); + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + SmallVector<Value *, 4> Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args)); + CI->eraseFromParent(); + return; + } + case Intrinsic::ctlz: case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && diff --git a/contrib/llvm/lib/IR/BasicBlock.cpp b/contrib/llvm/lib/IR/BasicBlock.cpp index 0a04494..f61276f 100644 --- a/contrib/llvm/lib/IR/BasicBlock.cpp +++ b/contrib/llvm/lib/IR/BasicBlock.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include <algorithm> + using namespace llvm; ValueSymbolTable *BasicBlock::getValueSymbolTable() { @@ -35,8 +36,7 @@ LLVMContext &BasicBlock::getContext() const { // Explicit instantiation of SymbolTableListTraits since some of the methods // are not in the public header file... -template class llvm::SymbolTableListTraits<Instruction, BasicBlock>; - +template class llvm::SymbolTableListTraits<Instruction>; BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent, BasicBlock *InsertBefore) @@ -56,7 +56,7 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) { assert(!Parent && "Already has a parent"); if (InsertBefore) - NewParent->getBasicBlockList().insert(InsertBefore, this); + NewParent->getBasicBlockList().insert(InsertBefore->getIterator(), this); else NewParent->getBasicBlockList().push_back(this); } @@ -91,26 +91,26 @@ void BasicBlock::setParent(Function *parent) { } void BasicBlock::removeFromParent() { - getParent()->getBasicBlockList().remove(this); + getParent()->getBasicBlockList().remove(getIterator()); } iplist<BasicBlock>::iterator BasicBlock::eraseFromParent() { - return getParent()->getBasicBlockList().erase(this); + return getParent()->getBasicBlockList().erase(getIterator()); } /// Unlink this basic block from its current function and /// insert it into the function that MovePos lives in, right before MovePos. void BasicBlock::moveBefore(BasicBlock *MovePos) { - MovePos->getParent()->getBasicBlockList().splice(MovePos, - getParent()->getBasicBlockList(), this); + MovePos->getParent()->getBasicBlockList().splice( + MovePos->getIterator(), getParent()->getBasicBlockList(), getIterator()); } /// Unlink this basic block from its current function and /// insert it into the function that MovePos lives in, right after MovePos. 
void BasicBlock::moveAfter(BasicBlock *MovePos) { - Function::iterator I = MovePos; - MovePos->getParent()->getBasicBlockList().splice(++I, - getParent()->getBasicBlockList(), this); + MovePos->getParent()->getBasicBlockList().splice( + ++MovePos->getIterator(), getParent()->getBasicBlockList(), + getIterator()); } const Module *BasicBlock::getModule() const { @@ -196,8 +196,8 @@ BasicBlock::iterator BasicBlock::getFirstInsertionPt() { if (!FirstNonPHI) return end(); - iterator InsertPt = FirstNonPHI; - if (isa<LandingPadInst>(InsertPt)) ++InsertPt; + iterator InsertPt = FirstNonPHI->getIterator(); + if (InsertPt->isEHPad()) ++InsertPt; return InsertPt; } @@ -245,12 +245,12 @@ BasicBlock *BasicBlock::getSingleSuccessor() { BasicBlock *BasicBlock::getUniqueSuccessor() { succ_iterator SI = succ_begin(this), E = succ_end(this); - if (SI == E) return NULL; // No successors + if (SI == E) return nullptr; // No successors BasicBlock *SuccBB = *SI; ++SI; for (;SI != E; ++SI) { if (*SI != SuccBB) - return NULL; + return nullptr; // The same successor appears multiple times in the successor list. // This is OK. } @@ -333,6 +333,17 @@ void BasicBlock::removePredecessor(BasicBlock *Pred, } } +bool BasicBlock::canSplitPredecessors() const { + const Instruction *FirstNonPHI = getFirstNonPHI(); + if (isa<LandingPadInst>(FirstNonPHI)) + return true; + // This is perhaps a little conservative because constructs like + // CleanupBlockInst are pretty easy to split. However, SplitBlockPredecessors + // cannot handle such things just yet. + if (FirstNonPHI->isEHPad()) + return false; + return true; +} /// This splits a basic block into two at the specified /// instruction. Note that all instructions BEFORE the specified iterator stay @@ -393,8 +404,7 @@ void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { // Cope with being called on a BasicBlock that doesn't have a terminator // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this. return; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { // N.B. Succ might not be a complete BasicBlock, so don't assume // that it ends with a non-phi instruction. 
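The BasicBlock.cpp hunks above track the ilist transition in this import: node pointers no longer convert implicitly to iterators, so positions are passed explicitly via getIterator(). A sketch of the resulting pattern, mirroring the rewritten moveBefore() (assumes LLVM-3.8-era headers; not part of this patch):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Unlink BB from its current function and splice it in directly
    // before Before, obtaining both positions with getIterator().
    static void moveBlockBefore(BasicBlock *BB, BasicBlock *Before) {
      Before->getParent()->getBasicBlockList().splice(
          Before->getIterator(), BB->getParent()->getBasicBlockList(),
          BB->getIterator());
    }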
for (iterator II = Succ->begin(), IE = Succ->end(); II != IE; ++II) { diff --git a/contrib/llvm/lib/IR/ConstantFold.cpp b/contrib/llvm/lib/IR/ConstantFold.cpp index 46bb20e..ce3fe03 100644 --- a/contrib/llvm/lib/IR/ConstantFold.cpp +++ b/contrib/llvm/lib/IR/ConstantFold.cpp @@ -83,7 +83,7 @@ foldConstantCastPair( assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type"); assert(CastInst::isCast(opc) && "Invalid cast opcode"); - // The the types and opcodes for the two Cast constant expressions + // The types and opcodes for the two Cast constant expressions Type *SrcTy = Op->getOperand(0)->getType(); Type *MidTy = Op->getType(); Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode()); @@ -109,7 +109,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { if (PointerType *PTy = dyn_cast<PointerType>(V->getType())) if (PointerType *DPTy = dyn_cast<PointerType>(DestTy)) if (PTy->getAddressSpace() == DPTy->getAddressSpace() - && DPTy->getElementType()->isSized()) { + && PTy->getElementType()->isSized()) { SmallVector<Value*, 8> IdxList; Value *Zero = Constant::getNullValue(Type::getInt32Ty(DPTy->getContext())); @@ -1187,7 +1187,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, (void)C3V.divide(C2V, APFloat::rmNearestTiesToEven); return ConstantFP::get(C1->getContext(), C3V); case Instruction::FRem: - (void)C3V.mod(C2V, APFloat::rmNearestTiesToEven); + (void)C3V.mod(C2V); return ConstantFP::get(C1->getContext(), C3V); } } @@ -1277,9 +1277,9 @@ static bool isMaybeZeroSizedType(Type *Ty) { } /// IdxCompare - Compare the two constants as though they were getelementptr -/// indices. This allows coersion of the types to be the same thing. +/// indices. This allows coercion of the types to be the same thing. /// -/// If the two constants are the "same" (after coersion), return 0. If the +/// If the two constants are the "same" (after coercion), return 0. If the /// first is less than the second, return -1, if the second is less than the /// first, return 1. If the constants are not integral, return -2. /// @@ -1685,7 +1685,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // Otherwise, for integer compare, pick the same value as the non-undef // operand, and fold it to true or false. if (isIntegerPredicate) - return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(pred)); + return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(Predicate)); // Choosing NaN for the undef will always make unordered comparison succeed // and ordered comparison fails. @@ -1869,7 +1869,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, } else { // Evaluate the relation between the two constants, per the predicate. int Result = -1; // -1 = unknown, 0 = known false, 1 = known true. - switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(pred))) { + switch (evaluateICmpRelation(C1, C2, + CmpInst::isSigned((CmpInst::Predicate)pred))) { default: llvm_unreachable("Unknown relational!"); case ICmpInst::BAD_ICMP_PREDICATE: break; // Couldn't determine anything about these constants. @@ -1950,8 +1951,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // If the left hand side is an extension, try eliminating it. 
 if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
-    if ((CE1->getOpcode() == Instruction::SExt && ICmpInst::isSigned(pred)) ||
-        (CE1->getOpcode() == Instruction::ZExt && !ICmpInst::isSigned(pred))){
+    if ((CE1->getOpcode() == Instruction::SExt &&
+         ICmpInst::isSigned((ICmpInst::Predicate)pred)) ||
+        (CE1->getOpcode() == Instruction::ZExt &&
+         !ICmpInst::isSigned((ICmpInst::Predicate)pred))){
       Constant *CE1Op0 = CE1->getOperand(0);
       Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType());
       if (CE1Inverse == CE1Op0) {
@@ -1997,17 +2000,17 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
 }

 /// \brief Test whether a given ConstantInt is in-range for a SequentialType.
-static bool isIndexInRangeOfSequentialType(const SequentialType *STy,
+static bool isIndexInRangeOfSequentialType(SequentialType *STy,
                                            const ConstantInt *CI) {
-  if (const PointerType *PTy = dyn_cast<PointerType>(STy))
-    // Only handle pointers to sized types, not pointers to functions.
-    return PTy->getElementType()->isSized();
+  // And indices are valid when indexing along a pointer
+  if (isa<PointerType>(STy))
+    return true;

   uint64_t NumElements = 0;
   // Determine the number of elements in our sequential type.
-  if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+  if (auto *ATy = dyn_cast<ArrayType>(STy))
     NumElements = ATy->getNumElements();
-  else if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+  else if (auto *VTy = dyn_cast<VectorType>(STy))
     NumElements = VTy->getNumElements();

   assert((isa<ArrayType>(STy) || NumElements > 0) &&
@@ -2178,7 +2181,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
           // dimension.
           NewIdxs.resize(Idxs.size());
           uint64_t NumElements = 0;
-          if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+          if (auto *ATy = dyn_cast<ArrayType>(Ty))
             NumElements = ATy->getNumElements();
           else
             NumElements = cast<VectorType>(Ty)->getNumElements();
diff --git a/contrib/llvm/lib/IR/ConstantRange.cpp b/contrib/llvm/lib/IR/ConstantRange.cpp
index 91095cf..48f9b27 100644
--- a/contrib/llvm/lib/IR/ConstantRange.cpp
+++ b/contrib/llvm/lib/IR/ConstantRange.cpp
@@ -21,7 +21,9 @@
 //
 //===----------------------------------------------------------------------===//

+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -125,6 +127,57 @@ ConstantRange ConstantRange::makeSatisfyingICmpRegion(CmpInst::Predicate Pred,
       .inverse();
 }

+ConstantRange ConstantRange::makeNoWrapRegion(Instruction::BinaryOps BinOp,
+                                              const APInt &C,
+                                              unsigned NoWrapKind) {
+  typedef OverflowingBinaryOperator OBO;
+
+  // Computes the intersection of CR0 and CR1. It is different from
+  // intersectWith in that the ConstantRange returned will only contain elements
+  // in both CR0 and CR1 (i.e. SubsetIntersect(X, Y) is a *subset*, proper or
+  // not, of both X and Y).
+  auto SubsetIntersect =
+      [](const ConstantRange &CR0, const ConstantRange &CR1) {
+    return CR0.inverse().unionWith(CR1.inverse()).inverse();
+  };
+
+  assert(BinOp >= Instruction::BinaryOpsBegin &&
+         BinOp < Instruction::BinaryOpsEnd && "Binary operators only!");
+
+  assert((NoWrapKind == OBO::NoSignedWrap ||
+          NoWrapKind == OBO::NoUnsignedWrap ||
+          NoWrapKind == (OBO::NoUnsignedWrap | OBO::NoSignedWrap)) &&
+         "NoWrapKind invalid!");
+
+  unsigned BitWidth = C.getBitWidth();
+  if (BinOp != Instruction::Add)
+    // Conservative answer: empty set
+    return ConstantRange(BitWidth, false);
+
+  if (C.isMinValue())
+    // Full set: nothing signed / unsigned wraps when added to 0.
+    return ConstantRange(BitWidth);
+
+  ConstantRange Result(BitWidth);
+
+  if (NoWrapKind & OBO::NoUnsignedWrap)
+    Result = SubsetIntersect(Result,
+                             ConstantRange(APInt::getNullValue(BitWidth), -C));
+
+  if (NoWrapKind & OBO::NoSignedWrap) {
+    if (C.isStrictlyPositive())
+      Result = SubsetIntersect(
+          Result, ConstantRange(APInt::getSignedMinValue(BitWidth),
+                                APInt::getSignedMinValue(BitWidth) - C));
+    else
+      Result = SubsetIntersect(
+          Result, ConstantRange(APInt::getSignedMinValue(BitWidth) - C,
+                                APInt::getSignedMinValue(BitWidth)));
+  }
+
+  return Result;
+}
+
 /// isFullSet - Return true if this set contains all of the elements possible
 /// for this data-type
 bool ConstantRange::isFullSet() const {
diff --git a/contrib/llvm/lib/IR/Constants.cpp b/contrib/llvm/lib/IR/Constants.cpp
index 308e6bd..0898bf6 100644
--- a/contrib/llvm/lib/IR/Constants.cpp
+++ b/contrib/llvm/lib/IR/Constants.cpp
@@ -53,6 +53,11 @@ bool Constant::isNegativeZeroValue() const {
     if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
       return true;

+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
+        return true;
+
   // We've already handled true FP case; any other FP vectors can't represent -0.0.
   if (getType()->isFPOrFPVectorTy())
     return false;
@@ -68,6 +73,17 @@ bool Constant::isZeroValue() const {
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
     return CFP->isZero();

+  // Equivalent for a vector of -0.0's.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero())
+        return true;
+
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero())
+        return true;
+
   // Otherwise, just use +0.0.
   return isNullValue();
 }
@@ -81,8 +97,10 @@
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
     return CFP->isZero() && !CFP->isNegative();

-  // constant zero is zero for aggregates and cpnull is null for pointers.
-  return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this);
+  // constant zero is zero for aggregates, cpnull is null for pointers, none for
+  // tokens.
+  return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this) ||
+         isa<ConstantTokenNone>(this);
 }

 bool Constant::isAllOnesValue() const {
@@ -204,6 +222,8 @@ Constant *Constant::getNullValue(Type *Ty) {
   case Type::ArrayTyID:
   case Type::VectorTyID:
     return ConstantAggregateZero::get(Ty);
+  case Type::TokenTyID:
+    return ConstantTokenNone::get(Ty->getContext());
   default:
     // Function, Label, or Opaque type?
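For Add, the no-wrap region computed above has a simple closed form: with a nonzero constant C, "add nuw x, C" stays in range exactly for x in [0, -C), and the signed cases shift that window around the signed minimum. A scalar check of the unsigned case at 8 bits (standalone C++; a sketch, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // "add x, C" does not wrap unsigned iff the truncated sum is >= x,
    // which for nonzero C is exactly x < 2^8 - C, i.e. x in [0, -C).
    static bool addNoUnsignedWrap(uint8_t X, uint8_t C) {
      return static_cast<uint8_t>(X + C) >= X;
    }

    int main() {
      const uint8_t C = 10;
      for (unsigned X = 0; X != 256; ++X) {
        bool InRegion = X < static_cast<uint8_t>(-C); // [0, 256 - C)
        assert(addNoUnsignedWrap(static_cast<uint8_t>(X), C) == InRegion);
      }
    }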
llvm_unreachable("Cannot create a null constant of that type!"); @@ -410,32 +430,13 @@ bool Constant::isConstantUsed() const { return false; } +bool Constant::needsRelocation() const { + if (isa<GlobalValue>(this)) + return true; // Global reference. - -/// getRelocationInfo - This method classifies the entry according to -/// whether or not it may generate a relocation entry. This must be -/// conservative, so if it might codegen to a relocatable entry, it should say -/// so. The return values are: -/// -/// NoRelocation: This constant pool entry is guaranteed to never have a -/// relocation applied to it (because it holds a simple constant like -/// '4'). -/// LocalRelocation: This entry has relocations, but the entries are -/// guaranteed to be resolvable by the static linker, so the dynamic -/// linker will never see them. -/// GlobalRelocations: This entry may have arbitrary relocations. -/// -/// FIXME: This really should not be in IR. -Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { - if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) { - if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) - return LocalRelocation; // Local to this file/library. - return GlobalRelocations; // Global reference. - } - if (const BlockAddress *BA = dyn_cast<BlockAddress>(this)) - return BA->getFunction()->getRelocationInfo(); - + return BA->getFunction()->needsRelocation(); + // While raw uses of blockaddress need to be relocated, differences between // two of them don't when they are for labels in the same function. This is a // common idiom when creating a table for the indirect goto extension, so we @@ -444,20 +445,18 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { if (CE->getOpcode() == Instruction::Sub) { ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0)); ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1)); - if (LHS && RHS && - LHS->getOpcode() == Instruction::PtrToInt && + if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt && RHS->getOpcode() == Instruction::PtrToInt && isa<BlockAddress>(LHS->getOperand(0)) && isa<BlockAddress>(RHS->getOperand(0)) && cast<BlockAddress>(LHS->getOperand(0))->getFunction() == - cast<BlockAddress>(RHS->getOperand(0))->getFunction()) - return NoRelocation; + cast<BlockAddress>(RHS->getOperand(0))->getFunction()) + return false; } - PossibleRelocationsTy Result = NoRelocation; + bool Result = false; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - Result = std::max(Result, - cast<Constant>(getOperand(i))->getRelocationInfo()); + Result |= cast<Constant>(getOperand(i))->needsRelocation(); return Result; } @@ -797,10 +796,10 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { } unsigned ConstantAggregateZero::getNumElements() const { - const Type *Ty = getType(); - if (const auto *AT = dyn_cast<ArrayType>(Ty)) + Type *Ty = getType(); + if (auto *AT = dyn_cast<ArrayType>(Ty)) return AT->getNumElements(); - if (const auto *VT = dyn_cast<VectorType>(Ty)) + if (auto *VT = dyn_cast<VectorType>(Ty)) return VT->getNumElements(); return Ty->getStructNumElements(); } @@ -838,10 +837,10 @@ UndefValue *UndefValue::getElementValue(unsigned Idx) const { } unsigned UndefValue::getNumElements() const { - const Type *Ty = getType(); - if (const auto *AT = dyn_cast<ArrayType>(Ty)) + Type *Ty = getType(); + if (auto *AT = dyn_cast<ArrayType>(Ty)) return AT->getNumElements(); - if (const auto *VT = dyn_cast<VectorType>(Ty)) + if (auto *VT = dyn_cast<VectorType>(Ty)) return 
VT->getNumElements(); return Ty->getStructNumElements(); } @@ -858,6 +857,59 @@ static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) { return true; } +template <typename SequentialTy, typename ElementTy> +static Constant *getIntSequenceIfElementsMatch(ArrayRef<Constant *> V) { + assert(!V.empty() && "Cannot get empty int sequence."); + + SmallVector<ElementTy, 16> Elts; + for (Constant *C : V) + if (auto *CI = dyn_cast<ConstantInt>(C)) + Elts.push_back(CI->getZExtValue()); + else + return nullptr; + return SequentialTy::get(V[0]->getContext(), Elts); +} + +template <typename SequentialTy, typename ElementTy> +static Constant *getFPSequenceIfElementsMatch(ArrayRef<Constant *> V) { + assert(!V.empty() && "Cannot get empty FP sequence."); + + SmallVector<ElementTy, 16> Elts; + for (Constant *C : V) + if (auto *CFP = dyn_cast<ConstantFP>(C)) + Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + else + return nullptr; + return SequentialTy::getFP(V[0]->getContext(), Elts); +} + +template <typename SequenceTy> +static Constant *getSequenceIfElementsMatch(Constant *C, + ArrayRef<Constant *> V) { + // We speculatively build the elements here even if it turns out that there is + // a constantexpr or something else weird, since it is so uncommon for that to + // happen. + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { + if (CI->getType()->isIntegerTy(8)) + return getIntSequenceIfElementsMatch<SequenceTy, uint8_t>(V); + else if (CI->getType()->isIntegerTy(16)) + return getIntSequenceIfElementsMatch<SequenceTy, uint16_t>(V); + else if (CI->getType()->isIntegerTy(32)) + return getIntSequenceIfElementsMatch<SequenceTy, uint32_t>(V); + else if (CI->getType()->isIntegerTy(64)) + return getIntSequenceIfElementsMatch<SequenceTy, uint64_t>(V); + } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { + if (CFP->getType()->isHalfTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint16_t>(V); + else if (CFP->getType()->isFloatTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint32_t>(V); + else if (CFP->getType()->isDoubleTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint64_t>(V); + } + + return nullptr; +} + ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V) : Constant(T, ConstantArrayVal, OperandTraits<ConstantArray>::op_end(this) - V.size(), @@ -875,6 +927,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { return C; return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V); } + Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { // Empty arrays are canonicalized to ConstantAggregateZero. if (V.empty()) @@ -897,74 +950,8 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
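The needsRelocation() rewrite above keeps the special case for a difference of two blockaddresses within one function: such a difference is a link-time constant even in position-independent code, unlike a raw blockaddress. That is the idiom the GNU labels-as-values extension produces for compact jump tables; a hedged sketch (GCC/Clang extension, not standard C++, not part of this patch):

    #include <cstdint>

    // A table of label differences (the "&&L1 - &&L0" pattern matched in
    // the hunk above) needs no load-time relocations; a table of raw
    // label addresses would.
    int dispatch(unsigned Op) {
      static const intptr_t Offsets[] = {&&Add - &&Add, &&Sub - &&Add};
      goto *(&&Add + Offsets[Op & 1]);
    Add:
      return 1;
    Sub:
      return -1;
    }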
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector<uint8_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector<uint16_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch<ConstantDataArray>(C, V); // Otherwise, we really do want to create a ConstantArray. return nullptr; @@ -1060,6 +1047,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { VectorType *Ty = VectorType::get(V.front()->getType(), V.size()); return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V); } + Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { assert(!V.empty() && "Vectors can't be empty"); VectorType *T = VectorType::get(V.front()->getType(), V.size()); @@ -1085,74 +1073,8 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
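The block removed above (and its twin for vectors in the next hunk) is what the new getSequenceIfElementsMatch() helpers replace: eight near-identical element-collection loops collapse into two templates. The observable behaviour is unchanged, so a homogeneous list of small integer constants still packs into a ConstantDataArray. A hedged usage sketch against LLVM-3.8-era headers (hypothetical helper name; not part of this patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    static Constant *makeByteArray(LLVMContext &Ctx) {
      Type *I8 = Type::getInt8Ty(Ctx);
      Constant *Elts[] = {ConstantInt::get(I8, 1), ConstantInt::get(I8, 2),
                          ConstantInt::get(I8, 3)};
      // All elements are i8 ConstantInts, so getImpl() takes the
      // getIntSequenceIfElementsMatch<ConstantDataArray, uint8_t> path
      // and returns a densely packed ConstantDataArray.
      return ConstantArray::get(ArrayType::get(I8, 3), Elts);
    }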
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector<uint8_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector<uint16_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch<ConstantDataVector>(C, V); // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. @@ -1170,6 +1092,17 @@ Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) { return get(Elts); } +ConstantTokenNone *ConstantTokenNone::get(LLVMContext &Context) { + LLVMContextImpl *pImpl = Context.pImpl; + if (!pImpl->TheNoneToken) + pImpl->TheNoneToken.reset(new ConstantTokenNone(Context)); + return pImpl->TheNoneToken.get(); +} + +/// Remove the constant from the constant table. +void ConstantTokenNone::destroyConstantImpl() { + llvm_unreachable("You can't ConstantTokenNone->destroyConstantImpl()!"); +} // Utility function for determining if a ConstantExpr is a CastOp or not. 
This // can't be inline because we don't want to #include Instruction.h into @@ -1221,8 +1154,7 @@ ArrayRef<unsigned> ConstantExpr::getIndices() const { } unsigned ConstantExpr::getPredicate() const { - assert(isCompare()); - return ((const CompareConstantExpr*)this)->predicate; + return cast<CompareConstantExpr>(this)->predicate; } /// getWithOperandReplaced - Return a constant expression identical to this @@ -1245,7 +1177,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { /// operands replaced with the specified values. The specified array must /// have the same number of operands as our current one. Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, - bool OnlyIfReduced) const { + bool OnlyIfReduced, Type *SrcTy) const { assert(Ops.size() == getNumOperands() && "Operand count mismatch!"); // If no operands changed return self. @@ -1283,10 +1215,13 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::ShuffleVector: return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy); - case Instruction::GetElementPtr: - return ConstantExpr::getGetElementPtr(nullptr, Ops[0], Ops.slice(1), - cast<GEPOperator>(this)->isInBounds(), - OnlyIfReducedTy); + case Instruction::GetElementPtr: { + auto *GEPO = cast<GEPOperator>(this); + assert(SrcTy || (Ops[0]->getType() == getOperand(0)->getType())); + return ConstantExpr::getGetElementPtr( + SrcTy ? SrcTy : GEPO->getSourceElementType(), Ops[0], Ops.slice(1), + GEPO->isInBounds(), OnlyIfReducedTy); + } case Instruction::ICmp: case Instruction::FCmp: return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1], @@ -2430,9 +2365,9 @@ StringRef ConstantDataSequential::getRawDataValues() const { /// formed with a vector or array of the specified element type. /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. -bool ConstantDataSequential::isElementTypeCompatible(const Type *Ty) { - if (Ty->isFloatTy() || Ty->isDoubleTy()) return true; - if (const IntegerType *IT = dyn_cast<IntegerType>(Ty)) { +bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) { + if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; + if (auto *IT = dyn_cast<IntegerType>(Ty)) { switch (IT->getBitWidth()) { case 8: case 16: @@ -2587,7 +2522,7 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { /// object. 
Constant *ConstantDataArray::getFP(LLVMContext &Context, ArrayRef<uint16_t> Elts) { - Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size()); + Type *Ty = ArrayType::get(Type::getHalfTy(Context), Elts.size()); const char *Data = reinterpret_cast<const char *>(Elts.data()); return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty); } @@ -2703,6 +2638,11 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { } if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { + if (CFP->getType()->isHalfTy()) { + SmallVector<uint16_t, 16> Elts( + NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + return getFP(V->getContext(), Elts); + } if (CFP->getType()->isFloatTy()) { SmallVector<uint32_t, 16> Elts( NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); @@ -2748,6 +2688,10 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); + case Type::HalfTyID: { + auto EltVal = *reinterpret_cast<const uint16_t *>(EltPtr); + return APFloat(APFloat::IEEEhalf, APInt(16, EltVal)); + } case Type::FloatTyID: { auto EltVal = *reinterpret_cast<const uint32_t *>(EltPtr); return APFloat(APFloat::IEEEsingle, APInt(32, EltVal)); @@ -2782,7 +2726,8 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { - if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) + if (getElementType()->isHalfTy() || getElementType()->isFloatTy() || + getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); @@ -2872,6 +2817,11 @@ Value *ConstantFP::handleOperandChangeImpl(Value *From, Value *To, Use *U) { llvm_unreachable("Unsupported class for handleOperandChange()!"); } +Value *ConstantTokenNone::handleOperandChangeImpl(Value *From, Value *To, + Use *U) { + llvm_unreachable("Unsupported class for handleOperandChange()!"); +} + Value *UndefValue::handleOperandChangeImpl(Value *From, Value *To, Use *U) { llvm_unreachable("Unsupported class for handleOperandChange()!"); } @@ -3070,7 +3020,7 @@ Instruction *ConstantExpr::getAsInstruction() { case Instruction::ICmp: case Instruction::FCmp: return CmpInst::Create((Instruction::OtherOps)getOpcode(), - getPredicate(), Ops[0], Ops[1]); + (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1]); default: assert(getNumOperands() == 2 && "Must be binary operator?"); diff --git a/contrib/llvm/lib/IR/ConstantsContext.h b/contrib/llvm/lib/IR/ConstantsContext.h index f3ddcd7..13fcbd2 100644 --- a/contrib/llvm/lib/IR/ConstantsContext.h +++ b/contrib/llvm/lib/IR/ConstantsContext.h @@ -179,6 +179,13 @@ public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::ExtractValue; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; /// InsertValueConstantExpr - This class is private to @@ -205,6 +212,13 @@ public: /// Transparently provide more efficient getOperand methods. 
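The getFP() change above is a genuine bug fix folded into the half-float support: the 16-bit overload built a VectorType even though this is ConstantDataArray, so half arrays came back mistyped as <N x half>. With the fix, the usual bit-pattern interface works for half too. A hedged sketch (LLVM-3.8-era headers assumed; not part of this patch):

    #include <cstdint>
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    static Constant *makeHalfArray(LLVMContext &Ctx) {
      // IEEE-754 binary16 bit patterns for 1.0 and 2.0.
      uint16_t Bits[] = {0x3C00, 0x4000};
      // Now yields a [2 x half] ConstantDataArray; before the fix the
      // result was mistyped as <2 x half>.
      return ConstantDataArray::getFP(Ctx, Bits);
    }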
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::InsertValue; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; /// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is @@ -235,6 +249,13 @@ public: Type *getSourceElementType() const; /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::GetElementPtr; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; // CompareConstantExpr - This class is private to Constants.cpp, and is used @@ -257,6 +278,14 @@ public: } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::ICmp || + CE->getOpcode() == Instruction::FCmp; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; template <> @@ -373,41 +402,45 @@ template <class ConstantClass> struct ConstantAggrKeyType { struct InlineAsmKeyType { StringRef AsmString; StringRef Constraints; + FunctionType *FTy; bool HasSideEffects; bool IsAlignStack; InlineAsm::AsmDialect AsmDialect; InlineAsmKeyType(StringRef AsmString, StringRef Constraints, - bool HasSideEffects, bool IsAlignStack, + FunctionType *FTy, bool HasSideEffects, bool IsAlignStack, InlineAsm::AsmDialect AsmDialect) - : AsmString(AsmString), Constraints(Constraints), + : AsmString(AsmString), Constraints(Constraints), FTy(FTy), HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack), AsmDialect(AsmDialect) {} InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &) : AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()), - HasSideEffects(Asm->hasSideEffects()), + FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()), IsAlignStack(Asm->isAlignStack()), AsmDialect(Asm->getDialect()) {} bool operator==(const InlineAsmKeyType &X) const { return HasSideEffects == X.HasSideEffects && IsAlignStack == X.IsAlignStack && AsmDialect == X.AsmDialect && - AsmString == X.AsmString && Constraints == X.Constraints; + AsmString == X.AsmString && Constraints == X.Constraints && + FTy == X.FTy; } bool operator==(const InlineAsm *Asm) const { return HasSideEffects == Asm->hasSideEffects() && IsAlignStack == Asm->isAlignStack() && AsmDialect == Asm->getDialect() && AsmString == Asm->getAsmString() && - Constraints == Asm->getConstraintString(); + Constraints == Asm->getConstraintString() && + FTy == Asm->getFunctionType(); } unsigned getHash() const { return hash_combine(AsmString, Constraints, HasSideEffects, IsAlignStack, - AsmDialect); + AsmDialect, FTy); } typedef ConstantInfo<InlineAsm>::TypeClass TypeClass; InlineAsm *create(TypeClass *Ty) const { - return new InlineAsm(Ty, AsmString, Constraints, HasSideEffects, + assert(PointerType::getUnqual(FTy) == Ty); + return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects, IsAlignStack, AsmDialect); } }; diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp index 0eb88a9..591dafa 100644 --- a/contrib/llvm/lib/IR/Core.cpp +++ b/contrib/llvm/lib/IR/Core.cpp @@ -262,6 +262,8 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMVectorTypeKind; case 
Type::X86_MMXTyID: return LLVMX86_MMXTypeKind; + case Type::TokenTyID: + return LLVMTokenTypeKind; } llvm_unreachable("Unhandled TypeID."); } @@ -366,6 +368,9 @@ LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); } +LLVMTypeRef LLVMTokenTypeInContext(LLVMContextRef C) { + return (LLVMTypeRef) Type::getTokenTy(*unwrap(C)); +} LLVMTypeRef LLVMHalfType(void) { return LLVMHalfTypeInContext(LLVMGetGlobalContext()); @@ -1528,7 +1533,7 @@ LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) { Module::global_iterator I = Mod->global_begin(); if (I == Mod->global_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { @@ -1536,23 +1541,23 @@ LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { Module::global_iterator I = Mod->global_end(); if (I == Mod->global_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar); - Module::global_iterator I = GV; + Module::global_iterator I(GV); if (++I == GV->getParent()->global_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar); - Module::global_iterator I = GV; + Module::global_iterator I(GV); if (I == GV->getParent()->global_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMDeleteGlobal(LLVMValueRef GlobalVar) { @@ -1639,7 +1644,8 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) { LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name) { auto *PTy = cast<PointerType>(unwrap(Ty)); - return wrap(GlobalAlias::create(PTy, GlobalValue::ExternalLinkage, Name, + return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + GlobalValue::ExternalLinkage, Name, unwrap<Constant>(Aliasee), unwrap(M))); } @@ -1660,7 +1666,7 @@ LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) { Module::iterator I = Mod->begin(); if (I == Mod->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { @@ -1668,23 +1674,23 @@ LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { Module::iterator I = Mod->end(); if (I == Mod->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) { Function *Func = unwrap<Function>(Fn); - Module::iterator I = Func; + Module::iterator I(Func); if (++I == Func->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) { Function *Func = unwrap<Function>(Fn); - Module::iterator I = Func; + Module::iterator I(Func); if (I == Func->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMDeleteFunction(LLVMValueRef Fn) { @@ -1716,7 +1722,7 @@ void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) { const char *LLVMGetGC(LLVMValueRef Fn) { Function *F = unwrap<Function>(Fn); - return F->hasGC()? F->getGC() : nullptr; + return F->hasGC()? 
F->getGC().c_str() : nullptr; } void LLVMSetGC(LLVMValueRef Fn, const char *GC) { @@ -1779,14 +1785,14 @@ void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) { Function *Fn = unwrap<Function>(FnRef); for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; I++) - *ParamRefs++ = wrap(I); + *ParamRefs++ = wrap(&*I); } LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) { Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin(); while (index --> 0) AI++; - return wrap(AI); + return wrap(&*AI); } LLVMValueRef LLVMGetParamParent(LLVMValueRef V) { @@ -1798,7 +1804,7 @@ LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) { Function::arg_iterator I = Func->arg_begin(); if (I == Func->arg_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { @@ -1806,23 +1812,23 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { Function::arg_iterator I = Func->arg_end(); if (I == Func->arg_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) { Argument *A = unwrap<Argument>(Arg); - Function::arg_iterator I = A; + Function::arg_iterator I(A); if (++I == A->getParent()->arg_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { Argument *A = unwrap<Argument>(Arg); - Function::arg_iterator I = A; + Function::arg_iterator I(A); if (I == A->getParent()->arg_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) { @@ -1880,7 +1886,7 @@ unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) { void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){ Function *Fn = unwrap<Function>(FnRef); for (Function::iterator I = Fn->begin(), E = Fn->end(); I != E; I++) - *BasicBlocksRefs++ = wrap(I); + *BasicBlocksRefs++ = wrap(&*I); } LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) { @@ -1892,7 +1898,7 @@ LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) { Function::iterator I = Func->begin(); if (I == Func->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { @@ -1900,23 +1906,23 @@ LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { Function::iterator I = Func->end(); if (I == Func->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); - Function::iterator I = Block; + Function::iterator I(Block); if (++I == Block->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); - Function::iterator I = Block; + Function::iterator I(Block); if (I == Block->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMBasicBlockRef LLVMAppendBasicBlockInContext(LLVMContextRef C, @@ -1968,7 +1974,7 @@ LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) { BasicBlock::iterator I = Block->begin(); if (I == Block->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { @@ -1976,23 +1982,23 @@ LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { BasicBlock::iterator I = Block->end(); if (I == Block->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef 
LLVMGetNextInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); - BasicBlock::iterator I = Instr; + BasicBlock::iterator I(Instr); if (++I == Instr->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); - BasicBlock::iterator I = Instr; + BasicBlock::iterator I(Instr); if (I == Instr->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMInstructionEraseFromParent(LLVMValueRef Inst) { @@ -2160,12 +2166,12 @@ void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block, LLVMValueRef Instr) { BasicBlock *BB = unwrap(Block); Instruction *I = Instr? unwrap<Instruction>(Instr) : (Instruction*) BB->end(); - unwrap(Builder)->SetInsertPoint(BB, I); + unwrap(Builder)->SetInsertPoint(BB, I->getIterator()); } void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) { Instruction *I = unwrap<Instruction>(Instr); - unwrap(Builder)->SetInsertPoint(I->getParent(), I); + unwrap(Builder)->SetInsertPoint(I->getParent(), I->getIterator()); } void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) { @@ -2489,7 +2495,6 @@ LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) { CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock()))); } - LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal, const char *Name) { return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name)); @@ -2515,6 +2520,21 @@ static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) { llvm_unreachable("Invalid LLVMAtomicOrdering value!"); } +static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) { + switch (Ordering) { + case NotAtomic: return LLVMAtomicOrderingNotAtomic; + case Unordered: return LLVMAtomicOrderingUnordered; + case Monotonic: return LLVMAtomicOrderingMonotonic; + case Acquire: return LLVMAtomicOrderingAcquire; + case Release: return LLVMAtomicOrderingRelease; + case AcquireRelease: return LLVMAtomicOrderingAcquireRelease; + case SequentiallyConsistent: + return LLVMAtomicOrderingSequentiallyConsistent; + } + + llvm_unreachable("Invalid AtomicOrdering value!"); +} + LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, LLVMBool isSingleThread, const char *Name) { return wrap( @@ -2567,6 +2587,25 @@ void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { return cast<StoreInst>(P)->setVolatile(isVolatile); } +LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) { + Value *P = unwrap<Value>(MemAccessInst); + AtomicOrdering O; + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + O = LI->getOrdering(); + else + O = cast<StoreInst>(P)->getOrdering(); + return mapToLLVMOrdering(O); +} + +void LLVMSetOrdering(LLVMValueRef MemAccessInst, LLVMAtomicOrdering Ordering) { + Value *P = unwrap<Value>(MemAccessInst); + AtomicOrdering O = mapFromLLVMOrdering(Ordering); + + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + return LI->setOrdering(O); + return cast<StoreInst>(P)->setOrdering(O); +} + /*--.. 
Casts ...............................................................--*/ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val, diff --git a/contrib/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm/lib/IR/DIBuilder.cpp index 2a90e70..b7841fe 100644 --- a/contrib/llvm/lib/IR/DIBuilder.cpp +++ b/contrib/llvm/lib/IR/DIBuilder.cpp @@ -148,7 +148,7 @@ DICompileUnit *DIBuilder::createCompileUnit( CUNode = DICompileUnit::getDistinct( VMContext, Lang, DIFile::get(VMContext, Filename, Directory), Producer, isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr, - nullptr, nullptr, nullptr, nullptr, DWOId); + nullptr, nullptr, nullptr, nullptr, nullptr, DWOId); // Create a named metadata so that it is easier to find cu in a module. // Note that we only generate this when the caller wants to actually @@ -255,10 +255,12 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy, DITypeRef::get(Base)); } -DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy) { +DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy, + uint64_t SizeInBits, + uint64_t AlignInBits) { assert(RTy && "Unable to create reference type"); return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, - DITypeRef::get(RTy), 0, 0, 0, 0); + DITypeRef::get(RTy), SizeInBits, AlignInBits, 0, 0); } DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name, @@ -429,12 +431,23 @@ DICompositeType *DIBuilder::createUnionType( return R; } -DISubroutineType *DIBuilder::createSubroutineType(DIFile *File, - DITypeRefArray ParameterTypes, +DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes, unsigned Flags) { return DISubroutineType::get(VMContext, Flags, ParameterTypes); } +DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File, + StringRef UniqueIdentifier) { + assert(!UniqueIdentifier.empty() && "external type ref without uid"); + auto *CTy = + DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr, 0, + 0, 0, DINode::FlagExternalTypeRef, nullptr, 0, + nullptr, nullptr, UniqueIdentifier); + // Types with unique IDs need to be in the type map. + retainType(CTy); + return CTy; +} + DICompositeType *DIBuilder::createEnumerationType( DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, uint64_t SizeInBits, uint64_t AlignInBits, DINodeArray Elements, @@ -590,18 +603,20 @@ DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl( .release(); } -DILocalVariable *DIBuilder::createLocalVariable( - unsigned Tag, DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo, - DIType *Ty, bool AlwaysPreserve, unsigned Flags, unsigned ArgNo) { +static DILocalVariable *createLocalVariable( + LLVMContext &VMContext, + DenseMap<MDNode *, std::vector<TrackingMDNodeRef>> &PreservedVariables, + DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File, + unsigned LineNo, DIType *Ty, bool AlwaysPreserve, unsigned Flags) { // FIXME: Why getNonCompileUnitScope()? // FIXME: Why is "!Context" okay here? // FIXME: Why doesn't this check for a subprogram or lexical block (AFAICT // the only valid scopes)? DIScope *Context = getNonCompileUnitScope(Scope); - auto *Node = DILocalVariable::get( - VMContext, Tag, cast_or_null<DILocalScope>(Context), Name, File, LineNo, - DITypeRef::get(Ty), ArgNo, Flags); + auto *Node = + DILocalVariable::get(VMContext, cast_or_null<DILocalScope>(Context), Name, + File, LineNo, DITypeRef::get(Ty), ArgNo, Flags); if (AlwaysPreserve) { // The optimizer may remove local variables. 
If there is an interest // to preserve variable info in such situation then stash it in a @@ -613,6 +628,23 @@ DILocalVariable *DIBuilder::createLocalVariable( return Node; } +DILocalVariable *DIBuilder::createAutoVariable(DIScope *Scope, StringRef Name, + DIFile *File, unsigned LineNo, + DIType *Ty, bool AlwaysPreserve, + unsigned Flags) { + return createLocalVariable(VMContext, PreservedVariables, Scope, Name, + /* ArgNo */ 0, File, LineNo, Ty, AlwaysPreserve, + Flags); +} + +DILocalVariable *DIBuilder::createParameterVariable( + DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File, + unsigned LineNo, DIType *Ty, bool AlwaysPreserve, unsigned Flags) { + assert(ArgNo && "Expected non-zero argument number for parameter"); + return createLocalVariable(VMContext, PreservedVariables, Scope, Name, ArgNo, + File, LineNo, Ty, AlwaysPreserve, Flags); +} + DIExpression *DIBuilder::createExpression(ArrayRef<uint64_t> Addr) { return DIExpression::get(VMContext, Addr); } @@ -629,36 +661,37 @@ DIExpression *DIBuilder::createBitPieceExpression(unsigned OffsetInBytes, return DIExpression::get(VMContext, Addr); } -DISubprogram *DIBuilder::createFunction(DIScopeRef Context, StringRef Name, - StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, - bool isLocalToUnit, bool isDefinition, - unsigned ScopeLine, unsigned Flags, - bool isOptimized, Function *Fn, - MDNode *TParams, MDNode *Decl) { +DISubprogram *DIBuilder::createFunction( + DIScopeRef Context, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, + bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, + DITemplateParameterArray TParams, DISubprogram *Decl) { // dragonegg does not generate identifier for types, so using an empty map // to resolve the context should be fine. DITypeIdentifierMap EmptyMap; return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, - Flags, isOptimized, Fn, TParams, Decl); -} - -DISubprogram *DIBuilder::createFunction(DIScope *Context, StringRef Name, - StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, - bool isLocalToUnit, bool isDefinition, - unsigned ScopeLine, unsigned Flags, - bool isOptimized, Function *Fn, - MDNode *TParams, MDNode *Decl) { - assert(Ty->getTag() == dwarf::DW_TAG_subroutine_type && - "function types should be subroutines"); - auto *Node = DISubprogram::get( - VMContext, DIScopeRef::get(getNonCompileUnitScope(Context)), Name, - LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, - nullptr, 0, 0, Flags, isOptimized, Fn, cast_or_null<MDTuple>(TParams), - cast_or_null<DISubprogram>(Decl), - MDTuple::getTemporary(VMContext, None).release()); + Flags, isOptimized, TParams, Decl); +} + +template <class... Ts> +static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... 
Args) { + if (IsDistinct) + return DISubprogram::getDistinct(std::forward<Ts>(Args)...); + return DISubprogram::get(std::forward<Ts>(Args)...); +} + +DISubprogram *DIBuilder::createFunction( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, + bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, + DITemplateParameterArray TParams, DISubprogram *Decl) { + auto *Node = + getSubprogram(/* IsDistinct = */ isDefinition, VMContext, + DIScopeRef::get(getNonCompileUnitScope(Context)), Name, + LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, + ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl, + MDTuple::getTemporary(VMContext, None).release()); if (isDefinition) AllSubprograms.push_back(Node); @@ -670,12 +703,11 @@ DISubprogram *DIBuilder::createTempFunctionFwdDecl( DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, - Function *Fn, MDNode *TParams, MDNode *Decl) { + DITemplateParameterArray TParams, DISubprogram *Decl) { return DISubprogram::getTemporary( VMContext, DIScopeRef::get(getNonCompileUnitScope(Context)), Name, LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, - ScopeLine, nullptr, 0, 0, Flags, isOptimized, Fn, - cast_or_null<MDTuple>(TParams), cast_or_null<DISubprogram>(Decl), + ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl, nullptr) .release(); } @@ -685,18 +717,16 @@ DIBuilder::createMethod(DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, bool isDefinition, unsigned VK, unsigned VIndex, DIType *VTableHolder, unsigned Flags, - bool isOptimized, Function *Fn, MDNode *TParam) { - assert(Ty->getTag() == dwarf::DW_TAG_subroutine_type && - "function types should be subroutines"); + bool isOptimized, DITemplateParameterArray TParams) { assert(getNonCompileUnitScope(Context) && "Methods should have both a Context and a context that isn't " "the compile unit."); // FIXME: Do we want to use different scope/lines? - auto *SP = DISubprogram::get( - VMContext, DIScopeRef::get(cast<DIScope>(Context)), Name, LinkageName, F, - LineNo, Ty, isLocalToUnit, isDefinition, LineNo, - DITypeRef::get(VTableHolder), VK, VIndex, Flags, isOptimized, Fn, - cast_or_null<MDTuple>(TParam), nullptr, nullptr); + auto *SP = getSubprogram( + /* IsDistinct = */ isDefinition, VMContext, + DIScopeRef::get(cast<DIScope>(Context)), Name, LinkageName, F, LineNo, Ty, + isLocalToUnit, isDefinition, LineNo, DITypeRef::get(VTableHolder), VK, + VIndex, Flags, isOptimized, TParams, nullptr, nullptr); if (isDefinition) AllSubprograms.push_back(SP); diff --git a/contrib/llvm/lib/IR/DataLayout.cpp b/contrib/llvm/lib/IR/DataLayout.cpp index 4d867ef..5468f47 100644 --- a/contrib/llvm/lib/IR/DataLayout.cpp +++ b/contrib/llvm/lib/IR/DataLayout.cpp @@ -41,6 +41,7 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { assert(!ST->isOpaque() && "Cannot get layout of opaque structs"); StructAlignment = 0; StructSize = 0; + IsPadded = false; NumElements = ST->getNumElements(); // Loop over each of the elements, placing them in memory. @@ -49,8 +50,10 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { unsigned TyAlign = ST->isPacked() ? 1 : DL.getABITypeAlignment(Ty); // Add padding if necessary to align the data element properly. 
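The DataLayout.cpp hunk continuing just below adds an IsPadded flag to StructLayout, set whenever the running offset must be rounded up for a member or for the struct tail. A scalar model of that computation for a hypothetical { i8, i32 } struct (standalone C++; a sketch, not part of this patch; see the hunk below for the real code):

    #include <cassert>
    #include <cstdint>

    static uint64_t roundUp(uint64_t V, uint64_t Align) {
      return (V + Align - 1) & ~(Align - 1);
    }

    int main() {
      // Model { i8, i32 } with natural alignment.
      const uint64_t Sizes[] = {1, 4}, Aligns[] = {1, 4};
      uint64_t StructSize = 0;
      bool IsPadded = false;
      for (int I = 0; I != 2; ++I) {
        if (StructSize & (Aligns[I] - 1)) { // member padding
          IsPadded = true;
          StructSize = roundUp(StructSize, Aligns[I]);
        }
        StructSize += Sizes[I];
      }
      if (StructSize & (4 - 1)) {           // tail padding
        IsPadded = true;
        StructSize = roundUp(StructSize, 4);
      }
      assert(StructSize == 8 && IsPadded);  // 3 bytes of padding
    }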
- if ((StructSize & (TyAlign-1)) != 0) + if ((StructSize & (TyAlign-1)) != 0) { + IsPadded = true; StructSize = RoundUpToAlignment(StructSize, TyAlign); + } // Keep track of maximum alignment constraint. StructAlignment = std::max(TyAlign, StructAlignment); @@ -64,8 +67,10 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { // Add padding to the end of the struct so that it could be put in an array // and all array elements would be aligned correctly. - if ((StructSize & (StructAlignment-1)) != 0) + if ((StructSize & (StructAlignment-1)) != 0) { + IsPadded = true; StructSize = RoundUpToAlignment(StructSize, StructAlignment); + } } @@ -461,8 +466,8 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign; // The best match so far depends on what we're looking for. - if (AlignType == INTEGER_ALIGN && - Alignments[i].AlignType == INTEGER_ALIGN) { + if (AlignType == INTEGER_ALIGN && + Alignments[i].AlignType == INTEGER_ALIGN) { // The "best match" for integers is the smallest size that is larger than // the BitWidth requested. if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || diff --git a/contrib/llvm/lib/IR/DebugInfo.cpp b/contrib/llvm/lib/IR/DebugInfo.cpp index 9646d1a..a2443be 100644 --- a/contrib/llvm/lib/IR/DebugInfo.cpp +++ b/contrib/llvm/lib/IR/DebugInfo.cpp @@ -56,21 +56,6 @@ DISubprogram *llvm::getDISubprogram(const Function *F) { return nullptr; } -DICompositeTypeBase *llvm::getDICompositeType(DIType *T) { - if (auto *C = dyn_cast_or_null<DICompositeTypeBase>(T)) - return C; - - if (auto *D = dyn_cast_or_null<DIDerivedTypeBase>(T)) { - // This function is currently used by dragonegg and dragonegg does - // not generate identifier for types, so using an empty map to resolve - // DerivedFrom should be fine. 
- DITypeIdentifierMap EmptyMap; - return getDICompositeType(D->getBaseType().resolve(EmptyMap)); - } - - return nullptr; -} - DITypeIdentifierMap llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) { DITypeIdentifierMap Map; @@ -164,20 +149,22 @@ void DebugInfoFinder::processType(DIType *DT) { if (!addType(DT)) return; processScope(DT->getScope().resolve(TypeIdentifierMap)); - if (auto *DCT = dyn_cast<DICompositeTypeBase>(DT)) { + if (auto *ST = dyn_cast<DISubroutineType>(DT)) { + for (DITypeRef Ref : ST->getTypeArray()) + processType(Ref.resolve(TypeIdentifierMap)); + return; + } + if (auto *DCT = dyn_cast<DICompositeType>(DT)) { processType(DCT->getBaseType().resolve(TypeIdentifierMap)); - if (auto *ST = dyn_cast<DISubroutineType>(DCT)) { - for (DITypeRef Ref : ST->getTypeArray()) - processType(Ref.resolve(TypeIdentifierMap)); - return; - } for (Metadata *D : DCT->getElements()) { if (auto *T = dyn_cast<DIType>(D)) processType(T); else if (auto *SP = dyn_cast<DISubprogram>(D)) processSubprogram(SP); } - } else if (auto *DDT = dyn_cast<DIDerivedTypeBase>(DT)) { + return; + } + if (auto *DDT = dyn_cast<DIDerivedType>(DT)) { processType(DDT->getBaseType().resolve(TypeIdentifierMap)); } } @@ -313,6 +300,10 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { bool llvm::stripDebugInfo(Function &F) { bool Changed = false; + if (F.getSubprogram()) { + Changed = true; + F.setSubprogram(nullptr); + } for (BasicBlock &BB : F) { for (Instruction &I : BB) { if (I.getDebugLoc()) { @@ -349,7 +340,7 @@ bool llvm::StripDebugInfo(Module &M) { for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), NME = M.named_metadata_end(); NMI != NME;) { - NamedMDNode *NMD = NMI; + NamedMDNode *NMD = &*NMI; ++NMI; if (NMD->getName().startswith("llvm.dbg.")) { NMD->eraseFromParent(); @@ -372,21 +363,3 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) { return Val->getZExtValue(); return 0; } - -DenseMap<const llvm::Function *, DISubprogram *> -llvm::makeSubprogramMap(const Module &M) { - DenseMap<const Function *, DISubprogram *> R; - - NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu"); - if (!CU_Nodes) - return R; - - for (MDNode *N : CU_Nodes->operands()) { - auto *CUNode = cast<DICompileUnit>(N); - for (auto *SP : CUNode->getSubprograms()) { - if (Function *F = SP->getFunction()) - R.insert(std::make_pair(F, SP)); - } - } - return R; -} diff --git a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp index 5e01748..58e0abd 100644 --- a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp @@ -295,8 +295,7 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, StorageType Storage, bool ShouldCreate) { DEFINE_GETIMPL_LOOKUP(DISubroutineType, (Flags, TypeArray)); - Metadata *Ops[] = {nullptr, nullptr, nullptr, nullptr, - TypeArray, nullptr, nullptr, nullptr}; + Metadata *Ops[] = {nullptr, nullptr, nullptr, TypeArray}; DEFINE_GETIMPL_STORE(DISubroutineType, (Flags), Ops); } @@ -316,22 +315,20 @@ DICompileUnit *DICompileUnit::getImpl( unsigned RuntimeVersion, MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId, + Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, StorageType Storage, bool ShouldCreate) { + assert(Storage != Uniqued && "Cannot unique DICompileUnit"); assert(isCanonical(Producer) && "Expected canonical MDString"); 
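The DebugInfo.cpp hunks above remove makeSubprogramMap() and teach stripDebugInfo() to clear the new function-attached subprogram: with this import, the Function-to-DISubprogram link lives on the function itself rather than in a side table. A hedged sketch of the replacement pattern (LLVM-3.8-era headers assumed; hypothetical helper name; not part of this patch):

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Former users of makeSubprogramMap() can query the attachment
    // directly instead of building a map over the whole module.
    static DISubprogram *subprogramFor(const Function &F) {
      return F.getSubprogram();
    }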
assert(isCanonical(Flags) && "Expected canonical MDString"); assert(isCanonical(SplitDebugFilename) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP( - DICompileUnit, - (SourceLanguage, File, getString(Producer), IsOptimized, getString(Flags), - RuntimeVersion, getString(SplitDebugFilename), EmissionKind, EnumTypes, - RetainedTypes, Subprograms, GlobalVariables, ImportedEntities, DWOId)); + Metadata *Ops[] = {File, Producer, Flags, SplitDebugFilename, EnumTypes, RetainedTypes, Subprograms, GlobalVariables, - ImportedEntities}; - DEFINE_GETIMPL_STORE( - DICompileUnit, - (SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind, DWOId), Ops); + ImportedEntities, Macros}; + return storeImpl(new (ArrayRef<Metadata *>(Ops).size()) DICompileUnit( + Context, Storage, SourceLanguage, IsOptimized, + RuntimeVersion, EmissionKind, DWOId, Ops), + Storage); } DISubprogram *DILocalScope::getSubprogram() const { @@ -345,34 +342,28 @@ DISubprogram *DISubprogram::getImpl( MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, - unsigned Flags, bool IsOptimized, Metadata *Function, - Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables, - StorageType Storage, bool ShouldCreate) { + unsigned Flags, bool IsOptimized, Metadata *TemplateParams, + Metadata *Declaration, Metadata *Variables, StorageType Storage, + bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); assert(isCanonical(LinkageName) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DISubprogram, (Scope, getString(Name), getString(LinkageName), File, Line, Type, IsLocalToUnit, IsDefinition, ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags, - IsOptimized, Function, TemplateParams, Declaration, - Variables)); - Metadata *Ops[] = {File, Scope, Name, Name, - LinkageName, Type, ContainingType, Function, - TemplateParams, Declaration, Variables}; + IsOptimized, TemplateParams, Declaration, Variables)); + Metadata *Ops[] = {File, Scope, Name, Name, + LinkageName, Type, ContainingType, TemplateParams, + Declaration, Variables}; DEFINE_GETIMPL_STORE(DISubprogram, (Line, ScopeLine, Virtuality, VirtualIndex, Flags, IsLocalToUnit, IsDefinition, IsOptimized), Ops); } -Function *DISubprogram::getFunction() const { - // FIXME: Should this be looking through bitcasts? - return dyn_cast_or_null<Function>(getFunctionConstant()); -} - bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); - if (F == getFunction()) + if (F->getSubprogram() == this) return true; StringRef Name = getLinkageName(); if (Name.empty()) @@ -380,15 +371,13 @@ bool DISubprogram::describes(const Function *F) const { return F->getName() == Name; } -void DISubprogram::replaceFunction(Function *F) { - replaceFunction(F ? ConstantAsMetadata::get(F) - : static_cast<ConstantAsMetadata *>(nullptr)); -} - DILexicalBlock *DILexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope, Metadata *File, unsigned Line, unsigned Column, StorageType Storage, bool ShouldCreate) { + // Fixup column. 
+ adjustColumn(Column); + assert(Scope && "Expected scope"); DEFINE_GETIMPL_LOOKUP(DILexicalBlock, (Scope, File, Line, Column)); Metadata *Ops[] = {File, Scope}; @@ -467,21 +456,21 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, Ops); } -DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, unsigned Tag, - Metadata *Scope, MDString *Name, - Metadata *File, unsigned Line, - Metadata *Type, unsigned Arg, - unsigned Flags, StorageType Storage, +DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope, + MDString *Name, Metadata *File, + unsigned Line, Metadata *Type, + unsigned Arg, unsigned Flags, + StorageType Storage, bool ShouldCreate) { // 64K ought to be enough for any frontend. assert(Arg <= UINT16_MAX && "Expected argument number to fit in 16-bits"); assert(Scope && "Expected scope"); assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DILocalVariable, (Tag, Scope, getString(Name), File, - Line, Type, Arg, Flags)); + DEFINE_GETIMPL_LOOKUP(DILocalVariable, + (Scope, getString(Name), File, Line, Type, Arg, Flags)); Metadata *Ops[] = {Scope, Name, File, Type}; - DEFINE_GETIMPL_STORE(DILocalVariable, (Tag, Line, Arg, Flags), Ops); + DEFINE_GETIMPL_STORE(DILocalVariable, (Line, Arg, Flags), Ops); } DIExpression *DIExpression::getImpl(LLVMContext &Context, @@ -496,6 +485,7 @@ unsigned DIExpression::ExprOperand::getSize() const { case dwarf::DW_OP_bit_piece: return 3; case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: return 2; default: return 1; @@ -516,6 +506,7 @@ bool DIExpression::isValid() const { // Piece expressions must be at the end. return I->get() + I->getSize() == E->get(); case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: case dwarf::DW_OP_deref: break; } @@ -566,3 +557,24 @@ DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Ops[] = {Scope, Entity, Name}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } + +DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, MDString *Name, MDString *Value, + StorageType Storage, bool ShouldCreate) { + assert(isCanonical(Name) && "Expected canonical MDString"); + DEFINE_GETIMPL_LOOKUP(DIMacro, + (MIType, Line, getString(Name), getString(Value))); + Metadata *Ops[] = { Name, Value }; + DEFINE_GETIMPL_STORE(DIMacro, (MIType, Line), Ops); +} + +DIMacroFile *DIMacroFile::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, Metadata *File, + Metadata *Elements, StorageType Storage, + bool ShouldCreate) { + DEFINE_GETIMPL_LOOKUP(DIMacroFile, + (MIType, Line, File, Elements)); + Metadata *Ops[] = { File, Elements }; + DEFINE_GETIMPL_STORE(DIMacroFile, (MIType, Line), Ops); +} + diff --git a/contrib/llvm/lib/IR/DiagnosticInfo.cpp b/contrib/llvm/lib/IR/DiagnosticInfo.cpp index b8f77ed..6426f76 100644 --- a/contrib/llvm/lib/IR/DiagnosticInfo.cpp +++ b/contrib/llvm/lib/IR/DiagnosticInfo.cpp @@ -49,7 +49,7 @@ struct PassRemarksOpt { "' in -pass-remarks: " + RegexError, false); } - }; + } }; static PassRemarksOpt PassRemarksOptLoc; @@ -91,6 +91,8 @@ int llvm::getNextAvailablePluginDiagnosticKind() { return ++PluginKindID; } +const char *DiagnosticInfo::AlwaysPrint = ""; + DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I, const Twine &MsgStr, DiagnosticSeverity Severity) @@ -121,9 +123,17 @@ void DiagnosticInfoDebugMetadataVersion::print(DiagnosticPrinter &DP) const { } void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { - if 
(getFileName() && getLineNum() > 0) - DP << getFileName() << ":" << getLineNum() << ": "; - else if (getFileName()) + if (!FileName.empty()) { + DP << getFileName(); + if (LineNum > 0) + DP << ":" << getLineNum(); + DP << ": "; + } + DP << getMsg(); +} + +void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const { + if (getFileName()) DP << getFileName() << ": "; DP << getMsg(); } @@ -166,8 +176,9 @@ bool DiagnosticInfoOptimizationRemarkMissed::isEnabled() const { } bool DiagnosticInfoOptimizationRemarkAnalysis::isEnabled() const { - return PassRemarksAnalysisOptLoc.Pattern && - PassRemarksAnalysisOptLoc.Pattern->match(getPassName()); + return getPassName() == DiagnosticInfo::AlwaysPrint || + (PassRemarksAnalysisOptLoc.Pattern && + PassRemarksAnalysisOptLoc.Pattern->match(getPassName())); } void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const { @@ -196,6 +207,24 @@ void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx, DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg)); } +void llvm::emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkAnalysisFPCommute(PassName, Fn, + DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkAnalysisAliasing(PassName, Fn, + DLoc, Msg)); +} + bool DiagnosticInfoOptimizationFailure::isEnabled() const { // Only print warnings. return getSeverity() == DS_Warning; diff --git a/contrib/llvm/lib/IR/Dominators.cpp b/contrib/llvm/lib/IR/Dominators.cpp index b6a8bbc..b9d4fb7 100644 --- a/contrib/llvm/lib/IR/Dominators.cpp +++ b/contrib/llvm/lib/IR/Dominators.cpp @@ -91,10 +91,10 @@ bool DominatorTree::dominates(const Instruction *Def, if (Def == User) return false; - // The value defined by an invoke dominates an instruction only if - // it dominates every instruction in UseBB. - // A PHI is dominated only if the instruction dominates every possible use - // in the UseBB. + // The value defined by an invoke dominates an instruction only if it + // dominates every instruction in UseBB. + // A PHI is dominated only if the instruction dominates every possible use in + // the UseBB. if (isa<InvokeInst>(Def) || isa<PHINode>(User)) return dominates(Def, UseBB); @@ -126,15 +126,15 @@ bool DominatorTree::dominates(const Instruction *Def, if (DefBB == UseBB) return false; - const InvokeInst *II = dyn_cast<InvokeInst>(Def); - if (!II) - return dominates(DefBB, UseBB); - // Invoke results are only usable in the normal destination, not in the // exceptional destination. - BasicBlock *NormalDest = II->getNormalDest(); - BasicBlockEdge E(DefBB, NormalDest); - return dominates(E, UseBB); + if (const auto *II = dyn_cast<InvokeInst>(Def)) { + BasicBlock *NormalDest = II->getNormalDest(); + BasicBlockEdge E(DefBB, NormalDest); + return dominates(E, UseBB); + } + + return dominates(DefBB, UseBB); } bool DominatorTree::dominates(const BasicBlockEdge &BBE, @@ -142,7 +142,8 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE, // Assert that we have a single edge. We could handle them by simply // returning false, but since isSingleEdge is linear on the number of // edges, the callers can normally handle them more efficiently. 
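The invoke case rewritten above has a consequence worth spelling out: an invoke's result is only available along the edge to its normal destination. A small sketch against the DominatorTree API used in the hunk (invokeResultUsableAt is an illustrative helper):

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if the value computed by II may be used inside I's block. Dominance
// is tested on the normal-destination edge, never on the invoke's own
// block, matching DominatorTree::dominates above.
static bool invokeResultUsableAt(DominatorTree &DT, const InvokeInst *II,
                                 const Instruction *I) {
  BasicBlockEdge E(II->getParent(), II->getNormalDest());
  return DT.dominates(E, I->getParent());
}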
- assert(BBE.isSingleEdge()); + assert(BBE.isSingleEdge() && + "This function is not efficient in handling multiple edges"); // If the BB the edge ends in doesn't dominate the use BB, then the // edge also doesn't. @@ -192,7 +193,8 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE, const Use &U) const { // Assert that we have a single edge. We could handle them by simply // returning false, but since isSingleEdge is linear on the number of // edges, the callers can normally handle them more efficiently. - assert(BBE.isSingleEdge()); + assert(BBE.isSingleEdge() && + "This function is not efficient in handling multiple edges"); Instruction *UserInst = cast<Instruction>(U.getUser()); // A PHI in the end of the edge is dominated by it. @@ -232,8 +234,8 @@ bool DominatorTree::dominates(const Instruction *Def, const Use &U) const { if (!isReachableFromEntry(DefBB)) return false; - // Invoke instructions define their return values on the edges - // to their normal successors, so we have to handle them specially. + // Invoke instructions define their return values on the edges to their normal + // successors, so we have to handle them specially. // Among other things, this means they don't dominate anything in // their own block, except possibly a phi, so we don't need to // walk the block in any case. diff --git a/contrib/llvm/lib/IR/Function.cpp b/contrib/llvm/lib/IR/Function.cpp index b50ad12..cfdfc40 100644 --- a/contrib/llvm/lib/IR/Function.cpp +++ b/contrib/llvm/lib/IR/Function.cpp @@ -35,8 +35,8 @@ using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file... -template class llvm::SymbolTableListTraits<Argument, Function>; -template class llvm::SymbolTableListTraits<BasicBlock, Function>; +template class llvm::SymbolTableListTraits<Argument>; +template class llvm::SymbolTableListTraits<BasicBlock>; //===----------------------------------------------------------------------===// // Argument Implementation @@ -235,11 +235,11 @@ Type *Function::getReturnType() const { } void Function::removeFromParent() { - getParent()->getFunctionList().remove(this); + getParent()->getFunctionList().remove(getIterator()); } void Function::eraseFromParent() { - getParent()->getFunctionList().erase(this); + getParent()->getFunctionList().erase(getIterator()); } //===----------------------------------------------------------------------===// @@ -248,7 +248,7 @@ void Function::eraseFromParent() { Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) - : GlobalObject(PointerType::getUnqual(Ty), Value::FunctionVal, + : GlobalObject(Ty, Value::FunctionVal, OperandTraits<Function>::op_begin(this), 0, Linkage, name), Ty(Ty) { assert(FunctionType::isValidReturnType(getReturnType()) && @@ -279,9 +279,6 @@ Function::~Function() { // Remove the function from the on-the-side GC table. clearGC(); - - // FIXME: needed by operator delete - setFunctionNumOperands(1); } void Function::BuildLazyArguments() const { @@ -328,14 +325,15 @@ void Function::dropAllReferences() { while (!BasicBlocks.empty()) BasicBlocks.begin()->eraseFromParent(); - // Prefix and prologue data are stored in a side table. - setPrefixData(nullptr); - setPrologueData(nullptr); + // Drop uses of any optional data (real or placeholder). + if (getNumOperands()) { + User::dropAllReferences(); + setNumHungOffUseOperands(0); + setValueSubclassData(getSubclassDataFromValue() & ~0xe); + } // Metadata is stored in a side-table. 
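The `& ~0xe` in dropAllReferences() above clears exactly the three "has optional data" flags that this file's rewritten accessors set one at a time (bit 1: prefix data, bit 2: prologue data, bit 3: personality function; bit 14 marks GC). A sketch with illustrative names to make the mask readable:

// Illustrative constants only; the real code writes the 16-bit value
// subclass-data word directly through setValueSubclassDataBit().
enum : unsigned {
  HasPrefixDataBit    = 1u << 1, // maintained by setPrefixData()
  HasPrologueDataBit  = 1u << 2, // maintained by setPrologueData()
  HasPersonalityBit   = 1u << 3, // maintained by setPersonalityFn()
};
static_assert((HasPrefixDataBit | HasPrologueDataBit | HasPersonalityBit) ==
                  0xe,
              "dropAllReferences() clears exactly these bits with ~0xe");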
clearMetadata(); - - setPersonalityFn(nullptr); } void Function::addAttribute(unsigned i, Attribute::AttrKind attr) { @@ -368,73 +366,43 @@ void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { setAttributes(PAL); } -// Maintain the GC name for each function in an on-the-side table. This saves -// allocating an additional word in Function for programs which do not use GC -// (i.e., most programs) at the cost of increased overhead for clients which do -// use GC. -static DenseMap<const Function*,PooledStringPtr> *GCNames; -static StringPool *GCNamePool; -static ManagedStatic<sys::SmartRWMutex<true> > GCLock; - -bool Function::hasGC() const { - sys::SmartScopedReader<true> Reader(*GCLock); - return GCNames && GCNames->count(this); -} - -const char *Function::getGC() const { +const std::string &Function::getGC() const { assert(hasGC() && "Function has no collector"); - sys::SmartScopedReader<true> Reader(*GCLock); - return *(*GCNames)[this]; + return getContext().getGC(*this); } -void Function::setGC(const char *Str) { - sys::SmartScopedWriter<true> Writer(*GCLock); - if (!GCNamePool) - GCNamePool = new StringPool(); - if (!GCNames) - GCNames = new DenseMap<const Function*,PooledStringPtr>(); - (*GCNames)[this] = GCNamePool->intern(Str); +void Function::setGC(const std::string Str) { + setValueSubclassDataBit(14, !Str.empty()); + getContext().setGC(*this, std::move(Str)); } void Function::clearGC() { - sys::SmartScopedWriter<true> Writer(*GCLock); - if (GCNames) { - GCNames->erase(this); - if (GCNames->empty()) { - delete GCNames; - GCNames = nullptr; - if (GCNamePool->empty()) { - delete GCNamePool; - GCNamePool = nullptr; - } - } - } + if (!hasGC()) + return; + getContext().deleteGC(*this); + setValueSubclassDataBit(14, false); } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a Function) from the Function Src to this one. +/// Copy all additional attributes (those not needed to create a Function) from +/// the Function Src to this one. void Function::copyAttributesFrom(const GlobalValue *Src) { - assert(isa<Function>(Src) && "Expected a Function!"); GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = cast<Function>(Src); + const Function *SrcF = dyn_cast<Function>(Src); + if (!SrcF) + return; + setCallingConv(SrcF->getCallingConv()); setAttributes(SrcF->getAttributes()); if (SrcF->hasGC()) setGC(SrcF->getGC()); else clearGC(); + if (SrcF->hasPersonalityFn()) + setPersonalityFn(SrcF->getPersonalityFn()); if (SrcF->hasPrefixData()) setPrefixData(SrcF->getPrefixData()); - else - setPrefixData(nullptr); if (SrcF->hasPrologueData()) setPrologueData(SrcF->getPrologueData()); - else - setPrologueData(nullptr); - if (SrcF->hasPersonalityFn()) - setPersonalityFn(SrcF->getPersonalityFn()); - else - setPersonalityFn(nullptr); } /// \brief This does the actual lookup of an intrinsic ID which @@ -492,7 +460,10 @@ static std::string getMangledTypeStr(Type* Ty) { Result += "vararg"; // Ensure nested function types are distinguishable. Result += "f"; - } else if (Ty) + } else if (isa<VectorType>(Ty)) + Result += "v" + utostr(Ty->getVectorNumElements()) + + getMangledTypeStr(Ty->getVectorElementType()); + else if (Ty) Result += EVT::getEVT(Ty).getEVTString(); return Result; } @@ -541,22 +512,25 @@ enum IIT_Info { // Values from 16+ are only encodable with the inefficient encoding. 
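The vector branch added to getMangledTypeStr() above is what yields suffixes such as "v4i32" in overloaded intrinsic names. A sketch of just that branch (mangleVector is an illustrative wrapper; utostr is from llvm/ADT/StringExtras.h):

#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// EltMangling stands in for the recursive getMangledTypeStr() call on the
// element type; scalars fall back to EVT::getEVT(Ty).getEVTString().
static std::string mangleVector(VectorType *VTy, std::string EltMangling) {
  return "v" + utostr(VTy->getNumElements()) + EltMangling;
}
// e.g. for <4 x i32>: "v" + "4" + "i32" == "v4i32"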
IIT_V64 = 16, IIT_MMX = 17, - IIT_METADATA = 18, - IIT_EMPTYSTRUCT = 19, - IIT_STRUCT2 = 20, - IIT_STRUCT3 = 21, - IIT_STRUCT4 = 22, - IIT_STRUCT5 = 23, - IIT_EXTEND_ARG = 24, - IIT_TRUNC_ARG = 25, - IIT_ANYPTR = 26, - IIT_V1 = 27, - IIT_VARARG = 28, - IIT_HALF_VEC_ARG = 29, - IIT_SAME_VEC_WIDTH_ARG = 30, - IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32, - IIT_I128 = 33 + IIT_TOKEN = 18, + IIT_METADATA = 19, + IIT_EMPTYSTRUCT = 20, + IIT_STRUCT2 = 21, + IIT_STRUCT3 = 22, + IIT_STRUCT4 = 23, + IIT_STRUCT5 = 24, + IIT_EXTEND_ARG = 25, + IIT_TRUNC_ARG = 26, + IIT_ANYPTR = 27, + IIT_V1 = 28, + IIT_VARARG = 29, + IIT_HALF_VEC_ARG = 30, + IIT_SAME_VEC_WIDTH_ARG = 31, + IIT_PTR_TO_ARG = 32, + IIT_VEC_OF_PTRS_TO_ELT = 33, + IIT_I128 = 34, + IIT_V512 = 35, + IIT_V1024 = 36 }; @@ -576,6 +550,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, case IIT_MMX: OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0)); return; + case IIT_TOKEN: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Token, 0)); + return; case IIT_METADATA: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0)); return; @@ -634,6 +611,14 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 64)); DecodeIITType(NextElt, Infos, OutputTable); return; + case IIT_V512: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 512)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V1024: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1024)); + DecodeIITType(NextElt, Infos, OutputTable); + return; case IIT_PTR: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); DecodeIITType(NextElt, Infos, OutputTable); @@ -751,6 +736,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, case IITDescriptor::Void: return Type::getVoidTy(Context); case IITDescriptor::VarArg: return Type::getVoidTy(Context); case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); + case IITDescriptor::Token: return Type::getTokenTy(Context); case IITDescriptor::Metadata: return Type::getMetadataTy(Context); case IITDescriptor::Half: return Type::getHalfTy(Context); case IITDescriptor::Float: return Type::getFloatTy(Context); @@ -924,62 +910,68 @@ bool Function::callsFunctionThatReturnsTwice() const { return false; } +Constant *Function::getPersonalityFn() const { + assert(hasPersonalityFn() && getNumOperands()); + return cast<Constant>(Op<0>()); +} + +void Function::setPersonalityFn(Constant *Fn) { + setHungoffOperand<0>(Fn); + setValueSubclassDataBit(3, Fn != nullptr); +} + Constant *Function::getPrefixData() const { - assert(hasPrefixData()); - const LLVMContextImpl::PrefixDataMapTy &PDMap = - getContext().pImpl->PrefixDataMap; - assert(PDMap.find(this) != PDMap.end()); - return cast<Constant>(PDMap.find(this)->second->getReturnValue()); + assert(hasPrefixData() && getNumOperands()); + return cast<Constant>(Op<1>()); } void Function::setPrefixData(Constant *PrefixData) { - if (!PrefixData && !hasPrefixData()) - return; - - unsigned SCData = getSubclassDataFromValue(); - LLVMContextImpl::PrefixDataMapTy &PDMap = getContext().pImpl->PrefixDataMap; - ReturnInst *&PDHolder = PDMap[this]; - if (PrefixData) { - if (PDHolder) - PDHolder->setOperand(0, PrefixData); - else - PDHolder = ReturnInst::Create(getContext(), PrefixData); - SCData |= (1<<1); - } else { - delete PDHolder; - PDMap.erase(this); - SCData &= ~(1<<1); - } - 
setValueSubclassData(SCData); + setHungoffOperand<1>(PrefixData); + setValueSubclassDataBit(1, PrefixData != nullptr); } Constant *Function::getPrologueData() const { - assert(hasPrologueData()); - const LLVMContextImpl::PrologueDataMapTy &SOMap = - getContext().pImpl->PrologueDataMap; - assert(SOMap.find(this) != SOMap.end()); - return cast<Constant>(SOMap.find(this)->second->getReturnValue()); + assert(hasPrologueData() && getNumOperands()); + return cast<Constant>(Op<2>()); } void Function::setPrologueData(Constant *PrologueData) { - if (!PrologueData && !hasPrologueData()) - return; - - unsigned PDData = getSubclassDataFromValue(); - LLVMContextImpl::PrologueDataMapTy &PDMap = getContext().pImpl->PrologueDataMap; - ReturnInst *&PDHolder = PDMap[this]; - if (PrologueData) { - if (PDHolder) - PDHolder->setOperand(0, PrologueData); - else - PDHolder = ReturnInst::Create(getContext(), PrologueData); - PDData |= (1<<2); - } else { - delete PDHolder; - PDMap.erase(this); - PDData &= ~(1<<2); + setHungoffOperand<2>(PrologueData); + setValueSubclassDataBit(2, PrologueData != nullptr); +} + +void Function::allocHungoffUselist() { + // If we've already allocated a uselist, stop here. + if (getNumOperands()) + return; + + allocHungoffUses(3, /*IsPhi=*/ false); + setNumHungOffUseOperands(3); + + // Initialize the uselist with placeholder operands to allow traversal. + auto *CPN = ConstantPointerNull::get(Type::getInt1PtrTy(getContext(), 0)); + Op<0>().set(CPN); + Op<1>().set(CPN); + Op<2>().set(CPN); +} + +template <int Idx> +void Function::setHungoffOperand(Constant *C) { + if (C) { + allocHungoffUselist(); + Op<Idx>().set(C); + } else if (getNumOperands()) { + Op<Idx>().set( + ConstantPointerNull::get(Type::getInt1PtrTy(getContext(), 0))); } - setValueSubclassData(PDData); +} + +void Function::setValueSubclassDataBit(unsigned Bit, bool On) { + assert(Bit < 16 && "SubclassData contains only 16 bits"); + if (On) + setValueSubclassData(getSubclassDataFromValue() | (1 << Bit)); + else + setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit)); } void Function::setEntryCount(uint64_t Count) { @@ -997,22 +989,3 @@ Optional<uint64_t> Function::getEntryCount() const { } return None; } - -void Function::setPersonalityFn(Constant *C) { - if (!C) { - if (hasPersonalityFn()) { - // Note, the num operands is used to compute the offset of the operand, so - // the order here matters. Clearing the operand then clearing the num - // operands ensures we have the correct offset to the operand. - Op<0>().set(nullptr); - setFunctionNumOperands(0); - } - } else { - // Note, the num operands is used to compute the offset of the operand, so - // the order here matters. We need to set num operands to 1 first so that - // we get the correct offset to the first operand when we set it. - if (!hasPersonalityFn()) - setFunctionNumOperands(1); - Op<0>().set(C); - } -} diff --git a/contrib/llvm/lib/IR/FunctionInfo.cpp b/contrib/llvm/lib/IR/FunctionInfo.cpp new file mode 100644 index 0000000..17a67bc --- /dev/null +++ b/contrib/llvm/lib/IR/FunctionInfo.cpp @@ -0,0 +1,67 @@ +//===-- FunctionInfo.cpp - Function Info Index ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the function info index and summary classes for the +// IR library. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/FunctionInfo.h" +#include "llvm/ADT/StringMap.h" +using namespace llvm; + +// Create the combined function index/summary from multiple +// per-module instances. +void FunctionInfoIndex::mergeFrom(std::unique_ptr<FunctionInfoIndex> Other, + uint64_t NextModuleId) { + + StringRef ModPath; + for (auto &OtherFuncInfoLists : *Other) { + std::string FuncName = OtherFuncInfoLists.getKey(); + FunctionInfoList &List = OtherFuncInfoLists.second; + + // Assert that the func info list only has one entry, since we shouldn't + // have duplicate names within a single per-module index. + assert(List.size() == 1); + std::unique_ptr<FunctionInfo> Info = std::move(List.front()); + + // Skip if there was no function summary section. + if (!Info->functionSummary()) + continue; + + // Add the module path string ref for this module if we haven't already + // saved a reference to it. + if (ModPath.empty()) + ModPath = + addModulePath(Info->functionSummary()->modulePath(), NextModuleId); + else + assert(ModPath == Info->functionSummary()->modulePath() && + "Each module in the combined map should have a unique ID"); + + // Note the module path string ref was copied above and is still owned by + // the original per-module index. Reset it to the new module path + // string reference owned by the combined index. + Info->functionSummary()->setModulePath(ModPath); + + // If it is a local function, rename it. + if (Info->functionSummary()->isLocalFunction()) { + // Any local functions are virtually renamed when being added to the + // combined index map, to disambiguate from other functions with + // the same name. The symbol table created for the combined index + // file should contain the renamed symbols. + FuncName = + FunctionInfoIndex::getGlobalNameForLocal(FuncName, NextModuleId); + } + + // Add new function info to existing list. There may be duplicates when + // combining FunctionMap entries, due to COMDAT functions. Any local + // functions were virtually renamed above. 
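// For reference, this merge loop is driven roughly as follows when linking
// several per-module indexes (the container name and ID sequence are
// illustrative; the renaming itself stays behind getGlobalNameForLocal()):
//
//   FunctionInfoIndex Combined;
//   uint64_t NextModuleId = 0;
//   for (std::unique_ptr<FunctionInfoIndex> &Index : PerModuleIndexes)
//     Combined.mergeFrom(std::move(Index), NextModuleId++);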
+ addFunctionInfo(FuncName, std::move(Info)); + } +} diff --git a/contrib/llvm/lib/IR/GCOV.cpp b/contrib/llvm/lib/IR/GCOV.cpp index 6ed58913..35b8157 100644 --- a/contrib/llvm/lib/IR/GCOV.cpp +++ b/contrib/llvm/lib/IR/GCOV.cpp @@ -448,7 +448,7 @@ static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) { namespace { struct formatBranchInfo { - formatBranchInfo(const GCOVOptions &Options, uint64_t Count, uint64_t Total) + formatBranchInfo(const GCOV::Options &Options, uint64_t Count, uint64_t Total) : Options(Options), Count(Count), Total(Total) {} void print(raw_ostream &OS) const { @@ -460,7 +460,7 @@ struct formatBranchInfo { OS << "taken " << branchDiv(Count, Total) << "%"; } - const GCOVOptions &Options; + const GCOV::Options &Options; uint64_t Count; uint64_t Total; }; diff --git a/contrib/llvm/lib/IR/Globals.cpp b/contrib/llvm/lib/IR/Globals.cpp index 1d02826..6159f93 100644 --- a/contrib/llvm/lib/IR/Globals.cpp +++ b/contrib/llvm/lib/IR/Globals.cpp @@ -32,15 +32,9 @@ bool GlobalValue::isMaterializable() const { return F->isMaterializable(); return false; } -bool GlobalValue::isDematerializable() const { - return getParent() && getParent()->isDematerializable(this); -} std::error_code GlobalValue::materialize() { return getParent()->materialize(this); } -void GlobalValue::dematerialize() { - getParent()->dematerialize(this); -} /// Override destroyConstantImpl to make sure it doesn't get called on /// GlobalValue's because they shouldn't be treated like other constants. @@ -97,10 +91,11 @@ void GlobalObject::setGlobalObjectSubClassData(unsigned Val) { } void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { - const auto *GV = cast<GlobalObject>(Src); - GlobalValue::copyAttributesFrom(GV); - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); + GlobalValue::copyAttributesFrom(Src); + if (const auto *GV = dyn_cast<GlobalObject>(Src)) { + setAlignment(GV->getAlignment()); + setSection(GV->getSection()); + } } const char *GlobalValue::getSection() const { @@ -147,9 +142,9 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + : GlobalObject(Ty, Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), - InitVal != nullptr, Link, Name), + InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { setThreadLocalMode(TLMode); @@ -165,9 +160,9 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, const Twine &Name, GlobalVariable *Before, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + : GlobalObject(Ty, Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), - InitVal != nullptr, Link, Name), + InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { setThreadLocalMode(TLMode); @@ -178,7 +173,7 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, } if (Before) - Before->getParent()->getGlobalList().insert(Before, this); + Before->getParent()->getGlobalList().insert(Before->getIterator(), this); else M.getGlobalList().push_back(this); } @@ -188,11 +183,11 @@ void GlobalVariable::setParent(Module *parent) { } void 
GlobalVariable::removeFromParent() { - getParent()->getGlobalList().remove(this); + getParent()->getGlobalList().remove(getIterator()); } void GlobalVariable::eraseFromParent() { - getParent()->getGlobalList().erase(this); + getParent()->getGlobalList().erase(getIterator()); } void GlobalVariable::setInitializer(Constant *InitVal) { @@ -216,14 +211,14 @@ void GlobalVariable::setInitializer(Constant *InitVal) { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a GlobalVariable) from the GlobalVariable Src to this one. +/// Copy all additional attributes (those not needed to create a GlobalVariable) +/// from the GlobalVariable Src to this one. void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { - assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!"); GlobalObject::copyAttributesFrom(Src); - const GlobalVariable *SrcVar = cast<GlobalVariable>(Src); - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); + if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) { + setThreadLocalMode(SrcVar->getThreadLocalMode()); + setExternallyInitialized(SrcVar->isExternallyInitialized()); + } } @@ -231,35 +226,40 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { // GlobalAlias Implementation //===----------------------------------------------------------------------===// -GlobalAlias::GlobalAlias(PointerType *Ty, LinkageTypes Link, const Twine &Name, - Constant *Aliasee, Module *ParentModule) - : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) { +GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, + const Twine &Name, Constant *Aliasee, + Module *ParentModule) + : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name, + AddressSpace) { Op<0>() = Aliasee; if (ParentModule) ParentModule->getAliasList().push_back(this); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Link, - const Twine &Name, Constant *Aliasee, - Module *ParentModule) { - return new GlobalAlias(Ty, Link, Name, Aliasee, ParentModule); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Link, const Twine &Name, + Constant *Aliasee, Module *ParentModule) { + return new GlobalAlias(Ty, AddressSpace, Link, Name, Aliasee, ParentModule); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Linkage, - const Twine &Name, Module *Parent) { - return create(Ty, Linkage, Name, nullptr, Parent); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + Module *Parent) { + return create(Ty, AddressSpace, Linkage, Name, nullptr, Parent); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Linkage, - const Twine &Name, GlobalValue *Aliasee) { - return create(Ty, Linkage, Name, Aliasee, Aliasee->getParent()); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + GlobalValue *Aliasee) { + return create(Ty, AddressSpace, Linkage, Name, Aliasee, Aliasee->getParent()); } GlobalAlias *GlobalAlias::create(LinkageTypes Link, const Twine &Name, GlobalValue *Aliasee) { PointerType *PTy = Aliasee->getType(); - return create(PTy, Link, Name, Aliasee); + return create(PTy->getElementType(), PTy->getAddressSpace(), Link, Name, + Aliasee); } GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalValue *Aliasee) { @@ -271,11 +271,11 @@ void GlobalAlias::setParent(Module *parent) { } 
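With the signature change above, creating an alias now passes the value type and address space instead of a pointer type; a minimal sketch (the alias name is illustrative):

#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Alias a function under a second name; the old two-argument convenience
// overload now derives exactly these pieces from Aliasee's pointer type.
static GlobalAlias *makeAlias(Function &F) {
  return GlobalAlias::create(F.getFunctionType(), /*AddressSpace=*/0,
                             GlobalValue::ExternalLinkage, "my_alias", &F,
                             F.getParent());
}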
void GlobalAlias::removeFromParent() { - getParent()->getAliasList().remove(this); + getParent()->getAliasList().remove(getIterator()); } void GlobalAlias::eraseFromParent() { - getParent()->getAliasList().erase(this); + getParent()->getAliasList().erase(getIterator()); } void GlobalAlias::setAliasee(Constant *Aliasee) { diff --git a/contrib/llvm/lib/IR/IRBuilder.cpp b/contrib/llvm/lib/IR/IRBuilder.cpp index bddb278..4474129 100644 --- a/contrib/llvm/lib/IR/IRBuilder.cpp +++ b/contrib/llvm/lib/IR/IRBuilder.cpp @@ -247,18 +247,21 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, return createCallHelper(TheFn, Ops, this, Name); } +template <typename T0, typename T1, typename T2, typename T3> static std::vector<Value *> getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, ArrayRef<Value *> CallArgs, - ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs) { + Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs, + ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, + ArrayRef<T3> GCArgs) { std::vector<Value *> Args; Args.push_back(B.getInt64(ID)); Args.push_back(B.getInt32(NumPatchBytes)); Args.push_back(ActualCallee); Args.push_back(B.getInt32(CallArgs.size())); - Args.push_back(B.getInt32((unsigned)StatepointFlags::None)); + Args.push_back(B.getInt32(Flags)); Args.insert(Args.end(), CallArgs.begin(), CallArgs.end()); - Args.push_back(B.getInt32(0 /* no transition args */)); + Args.push_back(B.getInt32(TransitionArgs.size())); + Args.insert(Args.end(), TransitionArgs.begin(), TransitionArgs.end()); Args.push_back(B.getInt32(DeoptArgs.size())); Args.insert(Args.end(), DeoptArgs.begin(), DeoptArgs.end()); Args.insert(Args.end(), GCArgs.begin(), GCArgs.end()); @@ -266,69 +269,109 @@ getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, return Args; } -CallInst *IRBuilderBase::CreateGCStatepointCall( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, - ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs, - ArrayRef<Value *> GCArgs, const Twine &Name) { +template <typename T0, typename T1, typename T2, typename T3> +static CallInst *CreateGCStatepointCallCommon( + IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, + Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs, + ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, + const Twine &Name) { // Extract out the type of the callee. 
PointerType *FuncPtrType = cast<PointerType>(ActualCallee->getType()); assert(isa<FunctionType>(FuncPtrType->getElementType()) && "actual callee must be a callable value"); - Module *M = BB->getParent()->getParent(); + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) Type *ArgTypes[] = { FuncPtrType }; Function *FnStatepoint = Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, ArgTypes); - std::vector<llvm::Value *> Args = getStatepointArgs( - *this, ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs); - return createCallHelper(FnStatepoint, Args, this, Name); + std::vector<llvm::Value *> Args = + getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags, + CallArgs, TransitionArgs, DeoptArgs, GCArgs); + return createCallHelper(FnStatepoint, Args, Builder, Name); } CallInst *IRBuilderBase::CreateGCStatepointCall( uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, - ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs, + ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { - std::vector<Value *> VCallArgs; - for (auto &U : CallArgs) - VCallArgs.push_back(U.get()); - return CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, VCallArgs, - DeoptArgs, GCArgs, Name); + return CreateGCStatepointCallCommon<Value *, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None), + CallArgs, None /* No Transition Args */, DeoptArgs, GCArgs, Name); } -InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, - BasicBlock *NormalDest, BasicBlock *UnwindDest, - ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs, +CallInst *IRBuilderBase::CreateGCStatepointCall( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags, + ArrayRef<Use> CallArgs, ArrayRef<Use> TransitionArgs, + ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointCallCommon<Use, Use, Use, Value *>( + this, ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs, + DeoptArgs, GCArgs, Name); +} + +CallInst *IRBuilderBase::CreateGCStatepointCall( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, + ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointCallCommon<Use, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None), + CallArgs, None, DeoptArgs, GCArgs, Name); +} + +template <typename T0, typename T1, typename T2, typename T3> +static InvokeInst *CreateGCStatepointInvokeCommon( + IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, + Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, + uint32_t Flags, ArrayRef<T0> InvokeArgs, ArrayRef<T1> TransitionArgs, + ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, const Twine &Name) { // Extract out the type of the callee. 
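// For reference, both Common helpers funnel into getStatepointArgs(), whose
// operand layout (reconstructed from the code earlier in this hunk) is:
//   i64 ID
//   i32 NumPatchBytes
//       ActualCallee / ActualInvokee
//   i32 NumCallArgs
//   i32 Flags              // StatepointFlags::None for the old overloads
//   ... CallArgs
//   i32 NumTransitionArgs  // previously hard-coded to 0
//   ... TransitionArgs
//   i32 NumDeoptArgs
//   ... DeoptArgs
//   ... GCArgs             // appended with no leading count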
PointerType *FuncPtrType = cast<PointerType>(ActualInvokee->getType()); assert(isa<FunctionType>(FuncPtrType->getElementType()) && "actual callee must be a callable value"); - Module *M = BB->getParent()->getParent(); + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) Function *FnStatepoint = Intrinsic::getDeclaration( M, Intrinsic::experimental_gc_statepoint, {FuncPtrType}); - std::vector<llvm::Value *> Args = getStatepointArgs( - *this, ID, NumPatchBytes, ActualInvokee, InvokeArgs, DeoptArgs, GCArgs); - return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, this, + std::vector<llvm::Value *> Args = + getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags, + InvokeArgs, TransitionArgs, DeoptArgs, GCArgs); + return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, Builder, Name); } InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + BasicBlock *NormalDest, BasicBlock *UnwindDest, + ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs, + ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointInvokeCommon<Value *, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, + uint32_t(StatepointFlags::None), InvokeArgs, None /* No Transition Args*/, + DeoptArgs, GCArgs, Name); +} + +InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, + ArrayRef<Use> InvokeArgs, ArrayRef<Use> TransitionArgs, + ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointInvokeCommon<Use, Use, Use, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags, + InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Use> InvokeArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { - std::vector<Value *> VCallArgs; - for (auto &U : InvokeArgs) - VCallArgs.push_back(U.get()); - return CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, - UnwindDest, VCallArgs, DeoptArgs, GCArgs, - Name); + return CreateGCStatepointInvokeCommon<Use, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, + uint32_t(StatepointFlags::None), InvokeArgs, None, DeoptArgs, GCArgs, + Name); } CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, diff --git a/contrib/llvm/lib/IR/IRPrintingPasses.cpp b/contrib/llvm/lib/IR/IRPrintingPasses.cpp index c1ac336..822dbeb 100644 --- a/contrib/llvm/lib/IR/IRPrintingPasses.cpp +++ b/contrib/llvm/lib/IR/IRPrintingPasses.cpp @@ -28,7 +28,13 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, PreservedAnalyses PrintModulePass::run(Module &M) { OS << Banner; - M.print(OS, nullptr, ShouldPreserveUseListOrder); + if (llvm::isFunctionInPrintList("*")) + M.print(OS, nullptr, ShouldPreserveUseListOrder); + else { + for(const auto &F : M.functions()) + if (llvm::isFunctionInPrintList(F.getName())) + F.print(OS); + } return PreservedAnalyses::all(); } @@ -37,7 +43,8 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) : OS(OS), Banner(Banner) {} 
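A sketch of using the filtered printer above in a new-pass-manager pipeline (the banner string is illustrative; "*" in the print-function list means the whole module is printed):

#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void addIRDump(ModulePassManager &MPM) {
  // Only functions named in the print list are emitted, per run() above.
  MPM.addPass(PrintModulePass(errs(), "; IR dump\n",
                              /*ShouldPreserveUseListOrder=*/false));
}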
PreservedAnalyses PrintFunctionPass::run(Function &F) { - OS << Banner << static_cast<Value &>(F); + if (isFunctionInPrintList(F.getName())) + OS << Banner << static_cast<Value &>(F); return PreservedAnalyses::all(); } diff --git a/contrib/llvm/lib/IR/InlineAsm.cpp b/contrib/llvm/lib/IR/InlineAsm.cpp index aa9e027..15d3b83 100644 --- a/contrib/llvm/lib/IR/InlineAsm.cpp +++ b/contrib/llvm/lib/IR/InlineAsm.cpp @@ -24,23 +24,22 @@ using namespace llvm; InlineAsm::~InlineAsm() { } - -InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString, +InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString, StringRef Constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect) { - InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack, - asmDialect); - LLVMContextImpl *pImpl = Ty->getContext().pImpl; - return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key); + InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects, + isAlignStack, asmDialect); + LLVMContextImpl *pImpl = FTy->getContext().pImpl; + return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key); } -InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString, +InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString, const std::string &constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect) - : Value(Ty, Value::InlineAsmVal), - AsmString(asmString), Constraints(constraints), - HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), - Dialect(asmDialect) { + : Value(PointerType::getUnqual(FTy), Value::InlineAsmVal), + AsmString(asmString), Constraints(constraints), FTy(FTy), + HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), + Dialect(asmDialect) { // Do various checks on the constraint string and type. assert(Verify(getFunctionType(), constraints) && @@ -53,7 +52,7 @@ void InlineAsm::destroyConstant() { } FunctionType *InlineAsm::getFunctionType() const { - return cast<FunctionType>(getType()->getElementType()); + return FTy; } ///Default constructor. @@ -160,6 +159,9 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str, // If Operand N already has a matching input, reject this. An output // can't be constrained to the same value as multiple inputs. 
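Callers are unaffected by the FTy plumbing in the InlineAsm hunk above; for reference, a minimal use (asm and constraint strings are illustrative):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
using namespace llvm;

static InlineAsm *makeNop(LLVMContext &Ctx) {
  FunctionType *FTy =
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  // getFunctionType() on the result now returns FTy directly rather than
  // digging it out of the value's pointer type.
  return InlineAsm::get(FTy, "nop", "", /*hasSideEffects=*/true);
}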
if (isMultipleAlternative) { + if (multipleAlternativeIndex >= + ConstraintsSoFar[N].multipleAlternatives.size()) + return true; InlineAsm::SubConstraintInfo &scInfo = ConstraintsSoFar[N].multipleAlternatives[multipleAlternativeIndex]; if (scInfo.MatchingInput != -1) @@ -291,4 +293,3 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { if (Ty->getNumParams() != NumInputs) return false; return true; } - diff --git a/contrib/llvm/lib/IR/Instruction.cpp b/contrib/llvm/lib/IR/Instruction.cpp index c57ba16..4b33d2e 100644 --- a/contrib/llvm/lib/IR/Instruction.cpp +++ b/contrib/llvm/lib/IR/Instruction.cpp @@ -28,7 +28,7 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, if (InsertBefore) { BasicBlock *BB = InsertBefore->getParent(); assert(BB && "Instruction to insert before is not in a basic block!"); - BB->getInstList().insert(InsertBefore, this); + BB->getInstList().insert(InsertBefore->getIterator(), this); } } @@ -62,33 +62,38 @@ Module *Instruction::getModule() { return getParent()->getModule(); } +Function *Instruction::getFunction() { return getParent()->getParent(); } + +const Function *Instruction::getFunction() const { + return getParent()->getParent(); +} void Instruction::removeFromParent() { - getParent()->getInstList().remove(this); + getParent()->getInstList().remove(getIterator()); } iplist<Instruction>::iterator Instruction::eraseFromParent() { - return getParent()->getInstList().erase(this); + return getParent()->getInstList().erase(getIterator()); } -/// insertBefore - Insert an unlinked instructions into a basic block -/// immediately before the specified instruction. +/// Insert an unlinked instruction into a basic block immediately before the +/// specified instruction. void Instruction::insertBefore(Instruction *InsertPos) { - InsertPos->getParent()->getInstList().insert(InsertPos, this); + InsertPos->getParent()->getInstList().insert(InsertPos->getIterator(), this); } -/// insertAfter - Insert an unlinked instructions into a basic block -/// immediately after the specified instruction. +/// Insert an unlinked instruction into a basic block immediately after the +/// specified instruction. void Instruction::insertAfter(Instruction *InsertPos) { - InsertPos->getParent()->getInstList().insertAfter(InsertPos, this); + InsertPos->getParent()->getInstList().insertAfter(InsertPos->getIterator(), + this); } -/// moveBefore - Unlink this instruction from its current basic block and -/// insert it into the basic block that MovePos lives in, right before -/// MovePos. +/// Unlink this instruction from its current basic block and insert it into the +/// basic block that MovePos lives in, right before MovePos. void Instruction::moveBefore(Instruction *MovePos) { - MovePos->getParent()->getInstList().splice(MovePos,getParent()->getInstList(), - this); + MovePos->getParent()->getInstList().splice( + MovePos->getIterator(), getParent()->getInstList(), getIterator()); } /// Set or clear the unsafe-algebra flag on this instruction, which must be an @@ -196,6 +201,10 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case Invoke: return "invoke"; case Resume: return "resume"; case Unreachable: return "unreachable"; + case CleanupRet: return "cleanupret"; + case CatchRet: return "catchret"; + case CatchPad: return "catchpad"; + case CatchSwitch: return "catchswitch"; // Standard binary operators... 
case Add: return "add"; @@ -256,6 +265,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case ExtractValue: return "extractvalue"; case InsertValue: return "insertvalue"; case LandingPad: return "landingpad"; + case CleanupPad: return "cleanuppad"; default: return "<Invalid operator> "; } @@ -285,11 +295,12 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, if (const CallInst *CI = dyn_cast<CallInst>(I1)) return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() && CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() && - CI->getAttributes() == cast<CallInst>(I2)->getAttributes(); + CI->getAttributes() == cast<CallInst>(I2)->getAttributes() && + CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2)); if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1)) return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() && - CI->getAttributes() == - cast<InvokeInst>(I2)->getAttributes(); + CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes() && + CI->hasIdenticalOperandBundleSchema(*cast<InvokeInst>(I2)); if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices(); if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) @@ -407,6 +418,8 @@ bool Instruction::mayReadFromMemory() const { case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: + case Instruction::CatchPad: + case Instruction::CatchRet: return true; case Instruction::Call: return !cast<CallInst>(this)->doesNotAccessMemory(); @@ -427,6 +440,8 @@ bool Instruction::mayWriteToMemory() const { case Instruction::VAArg: case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: + case Instruction::CatchPad: + case Instruction::CatchRet: return true; case Instruction::Call: return !cast<CallInst>(this)->onlyReadsMemory(); @@ -455,6 +470,10 @@ bool Instruction::isAtomic() const { bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast<CallInst>(this)) return !CI->doesNotThrow(); + if (const auto *CRI = dyn_cast<CleanupReturnInst>(this)) + return CRI->unwindsToCaller(); + if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(this)) + return CatchSwitch->unwindsToCaller(); return isa<ResumeInst>(this); } diff --git a/contrib/llvm/lib/IR/Instructions.cpp b/contrib/llvm/lib/IR/Instructions.cpp index 86c921a..7c64ca7 100644 --- a/contrib/llvm/lib/IR/Instructions.cpp +++ b/contrib/llvm/lib/IR/Instructions.cpp @@ -62,7 +62,10 @@ UnaryInstruction::~UnaryInstruction() { const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { if (Op1->getType() != Op2->getType()) return "both values to select must have same type"; - + + if (Op1->getType()->isTokenTy()) + return "select values cannot have token type"; + if (VectorType *VT = dyn_cast<VectorType>(Op0->getType())) { // Vector select. 
if (VT->getElementType() != Type::getInt1Ty(Op0->getContext())) @@ -84,6 +87,8 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { // PHINode Class //===----------------------------------------------------------------------===// +void PHINode::anchor() {} + PHINode::PHINode(const PHINode &PN) : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()), ReservedSpace(PN.getNumOperands()) { @@ -223,9 +228,10 @@ CallInst::~CallInst() { } void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args, - const Twine &NameStr) { + ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) { this->FTy = FTy; - assert(getNumOperands() == Args.size() + 1 && "NumOperands not set up?"); + assert(getNumOperands() == Args.size() + CountBundleInputs(Bundles) + 1 && + "NumOperands not set up?"); Op<-1>() = Func; #ifndef NDEBUG @@ -240,6 +246,11 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args, #endif std::copy(Args.begin(), Args.end(), op_begin()); + + auto It = populateBundleOperandInfos(Bundles, Args.size()); + (void)It; + assert(It + 1 == op_end() && "Should add up!"); + setName(NameStr); } @@ -281,11 +292,26 @@ CallInst::CallInst(const CallInst &CI) AttributeList(CI.AttributeList), FTy(CI.FTy) { setTailCallKind(CI.getTailCallKind()); setCallingConv(CI.getCallingConv()); - + std::copy(CI.op_begin(), CI.op_end(), op_begin()); + std::copy(CI.bundle_op_info_begin(), CI.bundle_op_info_end(), + bundle_op_info_begin()); SubclassOptionalData = CI.SubclassOptionalData; } +CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB, + Instruction *InsertPt) { + std::vector<Value *> Args(CI->arg_begin(), CI->arg_end()); + + auto *NewCI = CallInst::Create(CI->getCalledValue(), Args, OpB, CI->getName(), + InsertPt); + NewCI->setTailCallKind(CI->getTailCallKind()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->SubclassOptionalData = CI->SubclassOptionalData; + NewCI->setAttributes(CI->getAttributes()); + return NewCI; +} + void CallInst::addAttribute(unsigned i, Attribute::AttrKind attr) { AttributeSet PAL = getAttributes(); PAL = PAL.addAttribute(getContext(), i, attr); @@ -320,6 +346,8 @@ void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { } bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { + assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!"); + if (AttributeList.hasAttribute(i, A)) return true; if (const Function *F = getCalledFunction()) @@ -327,6 +355,25 @@ bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { return false; } +bool CallInst::dataOperandHasImpliedAttr(unsigned i, + Attribute::AttrKind A) const { + + // There are getNumOperands() - 1 data operands. The last operand is the + // callee. + assert(i < getNumOperands() && "Data operand index out of bounds!"); + + // The attribute A can either be directly specified, if the operand in + // question is a call argument; or be indirectly implied by the kind of its + // containing operand bundle, if the operand is a bundle operand. 
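// For reference, the data-operand layout this indexing assumes, with i
// being the 1-based attribute index the caller passes in:
//   operands: [ arg_1 ... arg_N | bundle operands ... | callee ]
//   i <= N  -> plain call argument: defer to paramHasAttr(i).
//   i >  N  -> bundle operand: defer to bundleOperandHasAttr(i - 1).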
+ + if (i < (getNumArgOperands() + 1)) + return paramHasAttr(i, A); + + assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && + "Must be either a call argument or an operand bundle!"); + return bundleOperandHasAttr(i - 1, A); +} + /// IsConstantOne - Return true only if val is constant int 1 static bool IsConstantOne(Value *val) { assert(val && "IsConstantOne does not work with nullptr val"); @@ -496,10 +543,12 @@ Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) { void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef<Value *> Args, + ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) { this->FTy = FTy; - assert(getNumOperands() == 3 + Args.size() && "NumOperands not set up?"); + assert(getNumOperands() == 3 + Args.size() + CountBundleInputs(Bundles) && + "NumOperands not set up?"); Op<-3>() = Fn; Op<-2>() = IfNormal; Op<-1>() = IfException; @@ -516,6 +565,11 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, #endif std::copy(Args.begin(), Args.end(), op_begin()); + + auto It = populateBundleOperandInfos(Bundles, Args.size()); + (void)It; + assert(It + 3 == op_end() && "Should add up!"); + setName(NameStr); } @@ -527,9 +581,24 @@ InvokeInst::InvokeInst(const InvokeInst &II) AttributeList(II.AttributeList), FTy(II.FTy) { setCallingConv(II.getCallingConv()); std::copy(II.op_begin(), II.op_end(), op_begin()); + std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(), + bundle_op_info_begin()); SubclassOptionalData = II.SubclassOptionalData; } +InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB, + Instruction *InsertPt) { + std::vector<Value *> Args(II->arg_begin(), II->arg_end()); + + auto *NewII = InvokeInst::Create(II->getCalledValue(), II->getNormalDest(), + II->getUnwindDest(), Args, OpB, + II->getName(), InsertPt); + NewII->setCallingConv(II->getCallingConv()); + NewII->SubclassOptionalData = II->SubclassOptionalData; + NewII->setAttributes(II->getAttributes()); + return NewII; +} + BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } @@ -540,15 +609,9 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) { return setSuccessor(idx, B); } -bool InvokeInst::hasFnAttrImpl(Attribute::AttrKind A) const { - if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A)) - return true; - if (const Function *F = getCalledFunction()) - return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A); - return false; -} - bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { + assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!"); + if (AttributeList.hasAttribute(i, A)) return true; if (const Function *F = getCalledFunction()) @@ -556,6 +619,24 @@ bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { return false; } +bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, + Attribute::AttrKind A) const { + // There are getNumOperands() - 3 data operands. The last three operands are + // the callee and the two successor basic blocks. + assert(i < (getNumOperands() - 2) && "Data operand index out of bounds!"); + + // The attribute A can either be directly specified, if the operand in + // question is an invoke argument; or be indirectly implied by the kind of its + // containing operand bundle, if the operand is a bundle operand. 
+ + if (i < (getNumArgOperands() + 1)) + return paramHasAttr(i, A); + + assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && + "Must be either an invoke argument or an operand bundle!"); + return bundleOperandHasAttr(i - 1, A); +} + void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) { AttributeSet PAL = getAttributes(); PAL = PAL.addAttribute(getContext(), i, attr); @@ -671,6 +752,234 @@ BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const { } //===----------------------------------------------------------------------===// +// CleanupReturnInst Implementation +//===----------------------------------------------------------------------===// + +CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI) + : TerminatorInst(CRI.getType(), Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - + CRI.getNumOperands(), + CRI.getNumOperands()) { + setInstructionSubclassData(CRI.getSubclassDataFromInstruction()); + Op<0>() = CRI.Op<0>(); + if (CRI.hasUnwindDest()) + Op<1>() = CRI.Op<1>(); +} + +void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) { + if (UnwindBB) + setInstructionSubclassData(getSubclassDataFromInstruction() | 1); + + Op<0>() = CleanupPad; + if (UnwindBB) + Op<1>() = UnwindBB; +} + +CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + unsigned Values, Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()), + Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - Values, + Values, InsertBefore) { + init(CleanupPad, UnwindBB); +} + +CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + unsigned Values, BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()), + Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - Values, + Values, InsertAtEnd) { + init(CleanupPad, UnwindBB); +} + +BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const { + assert(Idx == 0); + return getUnwindDest(); +} +unsigned CleanupReturnInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { + assert(Idx == 0); + setUnwindDest(B); +} + +//===----------------------------------------------------------------------===// +// CatchReturnInst Implementation +//===----------------------------------------------------------------------===// +void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) { + Op<0>() = CatchPad; + Op<1>() = BB; +} + +CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI) + : TerminatorInst(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2) { + Op<0>() = CRI.Op<0>(); + Op<1>() = CRI.Op<1>(); +} + +CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2, + InsertBefore) { + init(CatchPad, BB); +} + +CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2, + InsertAtEnd) { + init(CatchPad, BB); +} + +BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const { + assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); + return getSuccessor(); +} 
+unsigned CatchReturnInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { + assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); + setSuccessor(B); +} + +//===----------------------------------------------------------------------===// +// CatchSwitchInst Implementation +//===----------------------------------------------------------------------===// + +CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues, + const Twine &NameStr, + Instruction *InsertBefore) + : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0, + InsertBefore) { + if (UnwindDest) + ++NumReservedValues; + init(ParentPad, UnwindDest, NumReservedValues + 1); + setName(NameStr); +} + +CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues, + const Twine &NameStr, BasicBlock *InsertAtEnd) + : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0, + InsertAtEnd) { + if (UnwindDest) + ++NumReservedValues; + init(ParentPad, UnwindDest, NumReservedValues + 1); + setName(NameStr); +} + +CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI) + : TerminatorInst(CSI.getType(), Instruction::CatchSwitch, nullptr, + CSI.getNumOperands()) { + init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands()); + setNumHungOffUseOperands(ReservedSpace); + Use *OL = getOperandList(); + const Use *InOL = CSI.getOperandList(); + for (unsigned I = 1, E = ReservedSpace; I != E; ++I) + OL[I] = InOL[I]; +} + +void CatchSwitchInst::init(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues) { + assert(ParentPad && NumReservedValues); + + ReservedSpace = NumReservedValues; + setNumHungOffUseOperands(UnwindDest ? 2 : 1); + allocHungoffUses(ReservedSpace); + + Op<0>() = ParentPad; + if (UnwindDest) { + setInstructionSubclassData(getSubclassDataFromInstruction() | 1); + setUnwindDest(UnwindDest); + } +} + +/// growOperands - grow operands - This grows the operand list in response to a +/// push_back style of operation. This grows the number of ops by 2 times. +void CatchSwitchInst::growOperands(unsigned Size) { + unsigned NumOperands = getNumOperands(); + assert(NumOperands >= 1); + if (ReservedSpace >= NumOperands + Size) + return; + ReservedSpace = (NumOperands + Size / 2) * 2; + growHungoffUses(ReservedSpace); +} + +void CatchSwitchInst::addHandler(BasicBlock *Handler) { + unsigned OpNo = getNumOperands(); + growOperands(1); + assert(OpNo < ReservedSpace && "Growing didn't work!"); + setNumHungOffUseOperands(getNumOperands() + 1); + getOperandList()[OpNo] = Handler; +} + +void CatchSwitchInst::removeHandler(handler_iterator HI) { + // Move all subsequent handlers up one. + Use *EndDst = op_end() - 1; + for (Use *CurDst = HI.getCurrent(); CurDst != EndDst; ++CurDst) + *CurDst = *(CurDst + 1); + // Null out the last handler use. 
+ *EndDst = nullptr; + + setNumHungOffUseOperands(getNumOperands() - 1); +} + +BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned CatchSwitchInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// FuncletPadInst Implementation +//===----------------------------------------------------------------------===// +void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args, + const Twine &NameStr) { + assert(getNumOperands() == 1 + Args.size() && "NumOperands not set up?"); + std::copy(Args.begin(), Args.end(), op_begin()); + setParentPad(ParentPad); + setName(NameStr); +} + +FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI) + : Instruction(FPI.getType(), FPI.getOpcode(), + OperandTraits<FuncletPadInst>::op_end(this) - + FPI.getNumOperands(), + FPI.getNumOperands()) { + std::copy(FPI.op_begin(), FPI.op_end(), op_begin()); + setParentPad(FPI.getParentPad()); +} + +FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef<Value *> Args, unsigned Values, + const Twine &NameStr, Instruction *InsertBefore) + : Instruction(ParentPad->getType(), Op, + OperandTraits<FuncletPadInst>::op_end(this) - Values, Values, + InsertBefore) { + init(ParentPad, Args, NameStr); +} + +FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef<Value *> Args, unsigned Values, + const Twine &NameStr, BasicBlock *InsertAtEnd) + : Instruction(ParentPad->getType(), Op, + OperandTraits<FuncletPadInst>::op_end(this) - Values, Values, + InsertAtEnd) { + init(ParentPad, Args, NameStr); +} + +//===----------------------------------------------------------------------===// // UnreachableInst Implementation //===----------------------------------------------------------------------===// @@ -1193,6 +1502,8 @@ FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, // GetElementPtrInst Implementation //===----------------------------------------------------------------------===// +void GetElementPtrInst::anchor() {} + void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name) { assert(getNumOperands() == 1 + IdxList.size() && @@ -2029,7 +2340,7 @@ bool CastInst::isNoopCast(const DataLayout &DL) const { /// * %S = secondOpcode MidTy %F to DstTy /// The function returns a resultOpcode so these two casts can be replaced with: /// * %Replacement = resultOpcode %SrcTy %x to DstTy -/// If no such cast is permited, the function returns 0. +/// If no such cast is permitted, the function returns 0. unsigned CastInst::isEliminableCastPair( Instruction::CastOps firstOp, Instruction::CastOps secondOp, Type *SrcTy, Type *MidTy, Type *DstTy, Type *SrcIntPtrTy, Type *MidIntPtrTy, @@ -2037,7 +2348,7 @@ unsigned CastInst::isEliminableCastPair( // Define the 144 possibilities for these two cast instructions. The values // in this matrix determine what to do in a given situation and select the // case in the switch below. The rows correspond to firstOp, the columns - // correspond to secondOp. In looking at the table below, keep in mind + // correspond to secondOp. 
In looking at the table below, keep in mind // the following cast properties: // // Size Compare Source Destination @@ -2087,17 +2398,19 @@ unsigned CastInst::isEliminableCastPair( { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // TODO: This logic could be encoded into the table above and handled in the + // switch below. // If either of the casts are a bitcast from scalar to vector, disallow the - // merging. However, bitcast of A->B->A are allowed. - bool isFirstBitcast = (firstOp == Instruction::BitCast); - bool isSecondBitcast = (secondOp == Instruction::BitCast); - bool chainedBitcast = (SrcTy == DstTy && isFirstBitcast && isSecondBitcast); - - // Check if any of the bitcasts convert scalars<->vectors. - if ((isFirstBitcast && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) || - (isSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy))) - // Unless we are bitcasing to the original type, disallow optimizations. - if (!chainedBitcast) return 0; + // merging. However, any pair of bitcasts are allowed. + bool IsFirstBitcast = (firstOp == Instruction::BitCast); + bool IsSecondBitcast = (secondOp == Instruction::BitCast); + bool AreBothBitcasts = IsFirstBitcast && IsSecondBitcast; + + // Check if any of the casts convert scalars <-> vectors. + if ((IsFirstBitcast && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) || + (IsSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy))) + if (!AreBothBitcasts) + return 0; int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin] [secondOp-Instruction::CastOpsBegin]; @@ -2966,9 +3279,8 @@ AddrSpaceCastInst::AddrSpaceCastInst( void CmpInst::anchor() {} -CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, - Value *LHS, Value *RHS, const Twine &Name, - Instruction *InsertBefore) +CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, + Value *RHS, const Twine &Name, Instruction *InsertBefore) : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this), OperandTraits<CmpInst>::operands(this), @@ -2979,9 +3291,8 @@ CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, setName(Name); } -CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, - Value *LHS, Value *RHS, const Twine &Name, - BasicBlock *InsertAtEnd) +CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, + Value *RHS, const Twine &Name, BasicBlock *InsertAtEnd) : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this), OperandTraits<CmpInst>::operands(this), @@ -2993,8 +3304,7 @@ CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, } CmpInst * -CmpInst::Create(OtherOps Op, unsigned short predicate, - Value *S1, Value *S2, +CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore) { if (Op == Instruction::ICmp) { if (InsertBefore) @@ -3014,7 +3324,7 @@ CmpInst::Create(OtherOps Op, unsigned short predicate, } CmpInst * -CmpInst::Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2, +CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name, BasicBlock *InsertAtEnd) { if (Op == Instruction::ICmp) { return new ICmpInst(*InsertAtEnd, CmpInst::Predicate(predicate), @@ -3077,6 +3387,8 @@ CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) { } } +void ICmpInst::anchor() {} + ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) { switch (pred) { default: llvm_unreachable("Unknown icmp predicate!"); @@ -3196,7 +3508,24 @@ 
CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) { } } -bool CmpInst::isUnsigned(unsigned short predicate) { +CmpInst::Predicate CmpInst::getSignedPredicate(Predicate pred) { + assert(CmpInst::isUnsigned(pred) && "Call only with signed predicates!"); + + switch (pred) { + default: + llvm_unreachable("Unknown predicate!"); + case CmpInst::ICMP_ULT: + return CmpInst::ICMP_SLT; + case CmpInst::ICMP_ULE: + return CmpInst::ICMP_SLE; + case CmpInst::ICMP_UGT: + return CmpInst::ICMP_SGT; + case CmpInst::ICMP_UGE: + return CmpInst::ICMP_SGE; + } +} + +bool CmpInst::isUnsigned(Predicate predicate) { switch (predicate) { default: return false; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT: @@ -3204,7 +3533,7 @@ bool CmpInst::isUnsigned(unsigned short predicate) { } } -bool CmpInst::isSigned(unsigned short predicate) { +bool CmpInst::isSigned(Predicate predicate) { switch (predicate) { default: return false; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT: @@ -3212,7 +3541,7 @@ bool CmpInst::isSigned(unsigned short predicate) { } } -bool CmpInst::isOrdered(unsigned short predicate) { +bool CmpInst::isOrdered(Predicate predicate) { switch (predicate) { default: return false; case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT: @@ -3221,7 +3550,7 @@ bool CmpInst::isOrdered(unsigned short predicate) { } } -bool CmpInst::isUnordered(unsigned short predicate) { +bool CmpInst::isUnordered(Predicate predicate) { switch (predicate) { default: return false; case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT: @@ -3230,7 +3559,7 @@ bool CmpInst::isUnordered(unsigned short predicate) { } } -bool CmpInst::isTrueWhenEqual(unsigned short predicate) { +bool CmpInst::isTrueWhenEqual(Predicate predicate) { switch(predicate) { default: return false; case ICMP_EQ: case ICMP_UGE: case ICMP_ULE: case ICMP_SGE: case ICMP_SLE: @@ -3238,7 +3567,7 @@ bool CmpInst::isTrueWhenEqual(unsigned short predicate) { } } -bool CmpInst::isFalseWhenEqual(unsigned short predicate) { +bool CmpInst::isFalseWhenEqual(Predicate predicate) { switch(predicate) { case ICMP_NE: case ICMP_UGT: case ICMP_ULT: case ICMP_SGT: case ICMP_SLT: case FCMP_FALSE: case FCMP_ONE: case FCMP_OGT: case FCMP_OLT: return true; @@ -3569,6 +3898,10 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const { } CallInst *CallInst::cloneImpl() const { + if (hasOperandBundles()) { + unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); + return new(getNumOperands(), DescriptorBytes) CallInst(*this); + } return new(getNumOperands()) CallInst(*this); } @@ -3613,11 +3946,31 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const { } InvokeInst *InvokeInst::cloneImpl() const { + if (hasOperandBundles()) { + unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); + return new(getNumOperands(), DescriptorBytes) InvokeInst(*this); + } return new(getNumOperands()) InvokeInst(*this); } ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); } +CleanupReturnInst *CleanupReturnInst::cloneImpl() const { + return new (getNumOperands()) CleanupReturnInst(*this); +} + +CatchReturnInst *CatchReturnInst::cloneImpl() const { + return new (getNumOperands()) CatchReturnInst(*this); +} + +CatchSwitchInst *CatchSwitchInst::cloneImpl() const { + return new CatchSwitchInst(*this); +} + +FuncletPadInst *FuncletPadInst::cloneImpl() const { + return new (getNumOperands()) FuncletPadInst(*this); +} + UnreachableInst 
*UnreachableInst::cloneImpl() const { LLVMContext &Context = getContext(); return new UnreachableInst(Context); diff --git a/contrib/llvm/lib/IR/LLVMContext.cpp b/contrib/llvm/lib/IR/LLVMContext.cpp index 6d799e4..48b53b0 100644 --- a/contrib/llvm/lib/IR/LLVMContext.cpp +++ b/contrib/llvm/lib/IR/LLVMContext.cpp @@ -104,6 +104,39 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { assert(DereferenceableOrNullID == MD_dereferenceable_or_null && "dereferenceable_or_null kind id drifted"); (void)DereferenceableOrNullID; + + // Create the 'make.implicit' metadata kind. + unsigned MakeImplicitID = getMDKindID("make.implicit"); + assert(MakeImplicitID == MD_make_implicit && + "make.implicit kind id drifted"); + (void)MakeImplicitID; + + // Create the 'unpredictable' metadata kind. + unsigned UnpredictableID = getMDKindID("unpredictable"); + assert(UnpredictableID == MD_unpredictable && + "unpredictable kind id drifted"); + (void)UnpredictableID; + + // Create the 'invariant.group' metadata kind. + unsigned InvariantGroupId = getMDKindID("invariant.group"); + assert(InvariantGroupId == MD_invariant_group && + "invariant.group kind id drifted"); + (void)InvariantGroupId; + + // Create the 'align' metadata kind. + unsigned AlignID = getMDKindID("align"); + assert(AlignID == MD_align && "align kind id drifted"); + (void)AlignID; + + auto *DeoptEntry = pImpl->getOrInsertBundleTag("deopt"); + assert(DeoptEntry->second == LLVMContext::OB_deopt && + "deopt operand bundle id drifted!"); + (void)DeoptEntry; + + auto *FuncletEntry = pImpl->getOrInsertBundleTag("funclet"); + assert(FuncletEntry->second == LLVMContext::OB_funclet && + "funclet operand bundle id drifted!"); + (void)FuncletEntry; } LLVMContext::~LLVMContext() { delete pImpl; } @@ -193,6 +226,11 @@ static bool isDiagnosticEnabled(const DiagnosticInfo &DI) { if (!cast<DiagnosticInfoOptimizationRemarkAnalysis>(DI).isEnabled()) return false; break; + case llvm::DK_OptimizationRemarkAnalysisFPCommute: + if (!cast<DiagnosticInfoOptimizationRemarkAnalysisFPCommute>(DI) + .isEnabled()) + return false; + break; default: break; } @@ -250,7 +288,7 @@ unsigned LLVMContext::getMDKindID(StringRef Name) const { .first->second; } -/// getHandlerNames - Populate client supplied smallvector using custome +/// getHandlerNames - Populate client-supplied smallvector using custom /// metadata name and ID. 
void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const { Names.resize(pImpl->CustomMDKindNames.size()); @@ -258,3 +296,27 @@ void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const { E = pImpl->CustomMDKindNames.end(); I != E; ++I) Names[I->second] = I->first(); } + +void LLVMContext::getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const { + pImpl->getOperandBundleTags(Tags); +} + +uint32_t LLVMContext::getOperandBundleTagID(StringRef Tag) const { + return pImpl->getOperandBundleTagID(Tag); +} + +void LLVMContext::setGC(const Function &Fn, std::string GCName) { + auto It = pImpl->GCNames.find(&Fn); + + if (It == pImpl->GCNames.end()) { + pImpl->GCNames.insert(std::make_pair(&Fn, std::move(GCName))); + return; + } + It->second = std::move(GCName); +} +const std::string &LLVMContext::getGC(const Function &Fn) { + return pImpl->GCNames[&Fn]; +} +void LLVMContext::deleteGC(const Function &Fn) { + pImpl->GCNames.erase(&Fn); +} diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.cpp b/contrib/llvm/lib/IR/LLVMContextImpl.cpp index 1e20807..5239b4f 100644 --- a/contrib/llvm/lib/IR/LLVMContextImpl.cpp +++ b/contrib/llvm/lib/IR/LLVMContextImpl.cpp @@ -27,6 +27,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID), MetadataTy(C, Type::MetadataTyID), + TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), PPC_FP128Ty(C, Type::PPC_FP128TyID), @@ -78,7 +79,7 @@ LLVMContextImpl::~LLVMContextImpl() { // unnecessary RAUW when nodes are still unresolved. for (auto *I : DistinctMDNodes) I->dropAllReferences(); -#define HANDLE_MDNODE_LEAF(CLASS) \ +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ for (auto *I : CLASS##s) \ I->dropAllReferences(); #include "llvm/IR/Metadata.def" @@ -92,8 +93,8 @@ LLVMContextImpl::~LLVMContextImpl() { // Destroy MDNodes. 
for (MDNode *I : DistinctMDNodes) I->deleteAsSubclass(); -#define HANDLE_MDNODE_LEAF(CLASS) \ - for (CLASS *I : CLASS##s) \ +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + for (CLASS * I : CLASS##s) \ delete I; #include "llvm/IR/Metadata.def" @@ -218,6 +219,23 @@ unsigned MDNodeOpsKey::calculateHash(ArrayRef<Metadata *> Ops) { return hash_combine_range(Ops.begin(), Ops.end()); } +StringMapEntry<uint32_t> *LLVMContextImpl::getOrInsertBundleTag(StringRef Tag) { + uint32_t NewIdx = BundleTagCache.size(); + return &*(BundleTagCache.insert(std::make_pair(Tag, NewIdx)).first); +} + +void LLVMContextImpl::getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const { + Tags.resize(BundleTagCache.size()); + for (const auto &T : BundleTagCache) + Tags[T.second] = T.first(); +} + +uint32_t LLVMContextImpl::getOperandBundleTagID(StringRef Tag) const { + auto I = BundleTagCache.find(Tag); + assert(I != BundleTagCache.end() && "Unknown tag!"); + return I->second; +} + // ConstantsContext anchors void UnaryConstantExpr::anchor() { } diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.h b/contrib/llvm/lib/IR/LLVMContextImpl.h index cbbf11e..d42047d 100644 --- a/contrib/llvm/lib/IR/LLVMContextImpl.h +++ b/contrib/llvm/lib/IR/LLVMContextImpl.h @@ -458,67 +458,6 @@ template <> struct MDNodeKeyImpl<DIFile> { unsigned getHashValue() const { return hash_combine(Filename, Directory); } }; -template <> struct MDNodeKeyImpl<DICompileUnit> { - unsigned SourceLanguage; - Metadata *File; - StringRef Producer; - bool IsOptimized; - StringRef Flags; - unsigned RuntimeVersion; - StringRef SplitDebugFilename; - unsigned EmissionKind; - Metadata *EnumTypes; - Metadata *RetainedTypes; - Metadata *Subprograms; - Metadata *GlobalVariables; - Metadata *ImportedEntities; - uint64_t DWOId; - - MDNodeKeyImpl(unsigned SourceLanguage, Metadata *File, StringRef Producer, - bool IsOptimized, StringRef Flags, unsigned RuntimeVersion, - StringRef SplitDebugFilename, unsigned EmissionKind, - Metadata *EnumTypes, Metadata *RetainedTypes, - Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId) - : SourceLanguage(SourceLanguage), File(File), Producer(Producer), - IsOptimized(IsOptimized), Flags(Flags), RuntimeVersion(RuntimeVersion), - SplitDebugFilename(SplitDebugFilename), EmissionKind(EmissionKind), - EnumTypes(EnumTypes), RetainedTypes(RetainedTypes), - Subprograms(Subprograms), GlobalVariables(GlobalVariables), - ImportedEntities(ImportedEntities), DWOId(DWOId) {} - MDNodeKeyImpl(const DICompileUnit *N) - : SourceLanguage(N->getSourceLanguage()), File(N->getRawFile()), - Producer(N->getProducer()), IsOptimized(N->isOptimized()), - Flags(N->getFlags()), RuntimeVersion(N->getRuntimeVersion()), - SplitDebugFilename(N->getSplitDebugFilename()), - EmissionKind(N->getEmissionKind()), EnumTypes(N->getRawEnumTypes()), - RetainedTypes(N->getRawRetainedTypes()), - Subprograms(N->getRawSubprograms()), - GlobalVariables(N->getRawGlobalVariables()), - ImportedEntities(N->getRawImportedEntities()), DWOId(N->getDWOId()) {} - - bool isKeyOf(const DICompileUnit *RHS) const { - return SourceLanguage == RHS->getSourceLanguage() && - File == RHS->getRawFile() && Producer == RHS->getProducer() && - IsOptimized == RHS->isOptimized() && Flags == RHS->getFlags() && - RuntimeVersion == RHS->getRuntimeVersion() && - SplitDebugFilename == RHS->getSplitDebugFilename() && - EmissionKind == RHS->getEmissionKind() && - EnumTypes == RHS->getRawEnumTypes() && - RetainedTypes == RHS->getRawRetainedTypes() && - Subprograms == 
RHS->getRawSubprograms() && - GlobalVariables == RHS->getRawGlobalVariables() && - ImportedEntities == RHS->getRawImportedEntities() && - DWOId == RHS->getDWOId(); - } - unsigned getHashValue() const { - return hash_combine(SourceLanguage, File, Producer, IsOptimized, Flags, - RuntimeVersion, SplitDebugFilename, EmissionKind, - EnumTypes, RetainedTypes, Subprograms, GlobalVariables, - ImportedEntities, DWOId); - } -}; - template <> struct MDNodeKeyImpl<DISubprogram> { Metadata *Scope; StringRef Name; @@ -534,7 +473,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { unsigned VirtualIndex; unsigned Flags; bool IsOptimized; - Metadata *Function; Metadata *TemplateParams; Metadata *Declaration; Metadata *Variables; @@ -544,15 +482,15 @@ template <> struct MDNodeKeyImpl<DISubprogram> { bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, unsigned Flags, bool IsOptimized, - Metadata *Function, Metadata *TemplateParams, - Metadata *Declaration, Metadata *Variables) + Metadata *TemplateParams, Metadata *Declaration, + Metadata *Variables) : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), ScopeLine(ScopeLine), ContainingType(ContainingType), Virtuality(Virtuality), VirtualIndex(VirtualIndex), Flags(Flags), IsOptimized(IsOptimized), - Function(Function), TemplateParams(TemplateParams), - Declaration(Declaration), Variables(Variables) {} + TemplateParams(TemplateParams), Declaration(Declaration), + Variables(Variables) {} MDNodeKeyImpl(const DISubprogram *N) : Scope(N->getRawScope()), Name(N->getName()), LinkageName(N->getLinkageName()), File(N->getRawFile()), @@ -561,7 +499,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { ScopeLine(N->getScopeLine()), ContainingType(N->getRawContainingType()), Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()), Flags(N->getFlags()), IsOptimized(N->isOptimized()), - Function(N->getRawFunction()), TemplateParams(N->getRawTemplateParams()), Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()) {} @@ -576,7 +513,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { Virtuality == RHS->getVirtuality() && VirtualIndex == RHS->getVirtualIndex() && Flags == RHS->getFlags() && IsOptimized == RHS->isOptimized() && - Function == RHS->getRawFunction() && TemplateParams == RHS->getRawTemplateParams() && Declaration == RHS->getRawDeclaration() && Variables == RHS->getRawVariables(); @@ -584,7 +520,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> { unsigned getHashValue() const { return hash_combine(Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, ScopeLine, ContainingType, - Virtuality, VirtualIndex, Flags, IsOptimized, Function, + Virtuality, VirtualIndex, Flags, IsOptimized, TemplateParams, Declaration, Variables); } }; @@ -759,7 +695,6 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> { }; template <> struct MDNodeKeyImpl<DILocalVariable> { - unsigned Tag; Metadata *Scope; StringRef Name; Metadata *File; @@ -768,23 +703,23 @@ template <> struct MDNodeKeyImpl<DILocalVariable> { unsigned Arg; unsigned Flags; - MDNodeKeyImpl(unsigned Tag, Metadata *Scope, StringRef Name, Metadata *File, - unsigned Line, Metadata *Type, unsigned Arg, unsigned Flags) - : Tag(Tag), Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), - Arg(Arg), Flags(Flags) {} + MDNodeKeyImpl(Metadata *Scope, StringRef Name, Metadata *File, unsigned Line, + Metadata 
*Type, unsigned Arg, unsigned Flags) + : Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), Arg(Arg), + Flags(Flags) {} MDNodeKeyImpl(const DILocalVariable *N) - : Tag(N->getTag()), Scope(N->getRawScope()), Name(N->getName()), - File(N->getRawFile()), Line(N->getLine()), Type(N->getRawType()), - Arg(N->getArg()), Flags(N->getFlags()) {} + : Scope(N->getRawScope()), Name(N->getName()), File(N->getRawFile()), + Line(N->getLine()), Type(N->getRawType()), Arg(N->getArg()), + Flags(N->getFlags()) {} bool isKeyOf(const DILocalVariable *RHS) const { - return Tag == RHS->getTag() && Scope == RHS->getRawScope() && - Name == RHS->getName() && File == RHS->getRawFile() && - Line == RHS->getLine() && Type == RHS->getRawType() && - Arg == RHS->getArg() && Flags == RHS->getFlags(); + return Scope == RHS->getRawScope() && Name == RHS->getName() && + File == RHS->getRawFile() && Line == RHS->getLine() && + Type == RHS->getRawType() && Arg == RHS->getArg() && + Flags == RHS->getFlags(); } unsigned getHashValue() const { - return hash_combine(Tag, Scope, Name, File, Line, Type, Arg, Flags); + return hash_combine(Scope, Name, File, Line, Type, Arg, Flags); } }; @@ -857,6 +792,49 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> { } }; +template <> struct MDNodeKeyImpl<DIMacro> { + unsigned MIType; + unsigned Line; + StringRef Name; + StringRef Value; + + MDNodeKeyImpl(unsigned MIType, unsigned Line, StringRef Name, StringRef Value) + : MIType(MIType), Line(Line), Name(Name), Value(Value) {} + MDNodeKeyImpl(const DIMacro *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), Name(N->getName()), + Value(N->getValue()) {} + + bool isKeyOf(const DIMacro *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + Name == RHS->getName() && Value == RHS->getValue(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, Name, Value); + } +}; + +template <> struct MDNodeKeyImpl<DIMacroFile> { + unsigned MIType; + unsigned Line; + Metadata *File; + Metadata *Elements; + + MDNodeKeyImpl(unsigned MIType, unsigned Line, Metadata *File, + Metadata *Elements) + : MIType(MIType), Line(Line), File(File), Elements(Elements) {} + MDNodeKeyImpl(const DIMacroFile *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), File(N->getRawFile()), + Elements(N->getRawElements()) {} + + bool isKeyOf(const DIMacroFile *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + File == RHS->getRawFile() && File == RHS->getRawElements(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, File, Elements); + } +}; + /// \brief DenseMapInfo for MDNode subclasses. template <class NodeTy> struct MDNodeInfo { typedef MDNodeKeyImpl<NodeTy> KeyTy; @@ -953,7 +931,8 @@ public: DenseMap<const Value*, ValueName*> ValueNames; -#define HANDLE_MDNODE_LEAF(CLASS) DenseSet<CLASS *, CLASS##Info> CLASS##s; +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + DenseSet<CLASS *, CLASS##Info> CLASS##s; #include "llvm/IR/Metadata.def" // MDNodes may be uniqued or not uniqued. When they're not uniqued, they @@ -988,8 +967,10 @@ public: ConstantInt *TheTrueVal; ConstantInt *TheFalseVal; + std::unique_ptr<ConstantTokenNone> TheNoneToken; + // Basic type instances. 
- Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy; + Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy, TokenTy; Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; @@ -1033,20 +1014,26 @@ public: /// instructions in different blocks at the same location. DenseMap<std::pair<const char *, unsigned>, unsigned> DiscriminatorTable; - /// \brief Mapping from a function to its prefix data, which is stored as the - /// operand of an unparented ReturnInst so that the prefix data has a Use. - typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy; - PrefixDataMapTy PrefixDataMap; - - /// \brief Mapping from a function to its prologue data, which is stored as - /// the operand of an unparented ReturnInst so that the prologue data has a - /// Use. - typedef DenseMap<const Function *, ReturnInst *> PrologueDataMapTy; - PrologueDataMapTy PrologueDataMap; - int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx); int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx); + /// \brief A set of interned tags for operand bundles. The StringMap maps + /// bundle tags to their IDs. + /// + /// \see LLVMContext::getOperandBundleTagID + StringMap<uint32_t> BundleTagCache; + + StringMapEntry<uint32_t> *getOrInsertBundleTag(StringRef Tag); + void getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const; + uint32_t getOperandBundleTagID(StringRef Tag) const; + + /// Maintain the GC name for each function. + /// + /// This saves allocating an additional word in Function for programs which + /// do not use GC (i.e., most programs) at the cost of increased overhead for + /// clients which do use GC. + DenseMap<const Function*, std::string> GCNames; + LLVMContextImpl(LLVMContext &C); ~LLVMContextImpl(); diff --git a/contrib/llvm/lib/IR/LegacyPassManager.cpp b/contrib/llvm/lib/IR/LegacyPassManager.cpp index 27d98a2..63d89f2 100644 --- a/contrib/llvm/lib/IR/LegacyPassManager.cpp +++ b/contrib/llvm/lib/IR/LegacyPassManager.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <map> +#include <unordered_set> using namespace llvm; using namespace llvm::legacy; @@ -83,6 +84,13 @@ PrintAfterAll("print-after-all", llvm::cl::desc("Print IR after each pass"), cl::init(false)); +static cl::list<std::string> + PrintFuncsList("filter-print-funcs", cl::value_desc("function names"), + cl::desc("Only print IR for functions whose name " + "match this for all print-[before|after][-all] " + "options"), + cl::CommaSeparated); + /// This is a helper to determine whether to print IR before or /// after a pass. @@ -109,6 +117,11 @@ static bool ShouldPrintAfterPass(const PassInfo *PI) { return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter); } +bool llvm::isFunctionInPrintList(StringRef FunctionName) { + static std::unordered_set<std::string> PrintFuncNames(PrintFuncsList.begin(), + PrintFuncsList.end()); + return PrintFuncNames.empty() || PrintFuncNames.count(FunctionName); +} /// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions /// or higher is specified. 
bool PMDataManager::isPassDebuggingExecutionsOrMore() const { @@ -569,13 +582,33 @@ void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses, AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { AnalysisUsage *AnUsage = nullptr; - DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P); + auto DMI = AnUsageMap.find(P); if (DMI != AnUsageMap.end()) AnUsage = DMI->second; else { - AnUsage = new AnalysisUsage(); - P->getAnalysisUsage(*AnUsage); - AnUsageMap[P] = AnUsage; + // Look up the analysis usage from the pass instance (different instances + // of the same pass can produce different results), but unique the + // resulting object to reduce memory usage. This helps to greatly reduce + // memory usage when we have many instances of only a few pass types + // (e.g. instcombine, simplifycfg, etc...) which tend to share a fixed set + // of dependencies. + AnalysisUsage AU; + P->getAnalysisUsage(AU); + + AUFoldingSetNode* Node = nullptr; + FoldingSetNodeID ID; + AUFoldingSetNode::Profile(ID, AU); + void *IP = nullptr; + if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) + Node = N; + else { + Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); + UniqueAnalysisUsages.InsertNode(Node, IP); + } + assert(Node && "cached analysis usage must be non null"); + + AnUsageMap[P] = &Node->AU; + AnUsage = &Node->AU;; } return AnUsage; } @@ -686,6 +719,10 @@ void PMTopLevelManager::schedulePass(Pass *P) { /// passes and all pass managers. If desired pass is not found /// then return NULL. Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { + // For immutable passes we have a direct mapping from ID to pass, so check + // that first. + if (Pass *P = ImmutablePassMap.lookup(AID)) + return P; // Check pass managers for (PMDataManager *PassManager : PassManagers) @@ -697,24 +734,6 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { if (Pass *P = IndirectPassManager->findAnalysisPass(AID, false)) return P; - // Check the immutable passes. Iterate in reverse order so that we find - // the most recently registered passes first. - for (auto I = ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; - ++I) { - AnalysisID PI = (*I)->getPassID(); - if (PI == AID) - return *I; - - // If Pass not found then check the interfaces implemented by Immutable Pass - const PassInfo *PassInf = findAnalysisPassInfo(PI); - assert(PassInf && "Expected all immutable passes to be initialized"); - const std::vector<const PassInfo*> &ImmPI = - PassInf->getInterfacesImplemented(); - for (const PassInfo *PI : ImmPI) - if (PI->getTypeInfo() == AID) - return *I; - } - return nullptr; } @@ -729,6 +748,24 @@ const PassInfo *PMTopLevelManager::findAnalysisPassInfo(AnalysisID AID) const { return PI; } +void PMTopLevelManager::addImmutablePass(ImmutablePass *P) { + P->initializePass(); + ImmutablePasses.push_back(P); + + // Add this pass to the map from its analysis ID. We clobber any prior runs + // of the pass in the map so that the last one added is the one found when + // doing lookups. + AnalysisID AID = P->getPassID(); + ImmutablePassMap[AID] = P; + + // Also add any interfaces implemented by the immutable pass to the map for + // fast lookup. + const PassInfo *PassInf = findAnalysisPassInfo(AID); + assert(PassInf && "Expected all immutable passes to be initialized"); + for (const PassInfo *ImmPI : PassInf->getInterfacesImplemented()) + ImmutablePassMap[ImmPI->getTypeInfo()] = P; +} + // Print passes managed by this top level manager. 
void PMTopLevelManager::dumpPasses() const { @@ -780,15 +817,8 @@ void PMTopLevelManager::initializeAllAnalysisInfo() { for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(), DME = LastUser.end(); DMI != DME; ++DMI) { - DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI = - InversedLastUser.find(DMI->second); - if (InvDMI != InversedLastUser.end()) { - SmallPtrSet<Pass *, 8> &L = InvDMI->second; - L.insert(DMI->first); - } else { - SmallPtrSet<Pass *, 8> L; L.insert(DMI->first); - InversedLastUser[DMI->second] = L; - } + SmallPtrSet<Pass *, 8> &L = InversedLastUser[DMI->second]; + L.insert(DMI->first); } } @@ -801,10 +831,6 @@ PMTopLevelManager::~PMTopLevelManager() { for (SmallVectorImpl<ImmutablePass *>::iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) delete *I; - - for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(), - DME = AnUsageMap.end(); DMI != DME; ++DMI) - delete DMI->second; } //===----------------------------------------------------------------------===// @@ -989,31 +1015,28 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { // At the moment, this pass is the last user of all required passes. SmallVector<Pass *, 12> LastUses; - SmallVector<Pass *, 8> RequiredPasses; + SmallVector<Pass *, 8> UsedPasses; SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable; unsigned PDepth = this->getDepth(); - collectRequiredAnalysis(RequiredPasses, - ReqAnalysisNotAvailable, P); - for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(), - E = RequiredPasses.end(); I != E; ++I) { - Pass *PRequired = *I; + collectRequiredAndUsedAnalyses(UsedPasses, ReqAnalysisNotAvailable, P); + for (Pass *PUsed : UsedPasses) { unsigned RDepth = 0; - assert(PRequired->getResolver() && "Analysis Resolver is not set"); - PMDataManager &DM = PRequired->getResolver()->getPMDataManager(); + assert(PUsed->getResolver() && "Analysis Resolver is not set"); + PMDataManager &DM = PUsed->getResolver()->getPMDataManager(); RDepth = DM.getDepth(); if (PDepth == RDepth) - LastUses.push_back(PRequired); + LastUses.push_back(PUsed); else if (PDepth > RDepth) { // Let the parent claim responsibility of last use - TransferLastUses.push_back(PRequired); + TransferLastUses.push_back(PUsed); // Keep track of higher level analysis used by this manager. - HigherLevelAnalysis.push_back(PRequired); + HigherLevelAnalysis.push_back(PUsed); } else - llvm_unreachable("Unable to accommodate Required Pass"); + llvm_unreachable("Unable to accommodate Used Pass"); } // Set P as P's last user until someone starts using P. @@ -1030,10 +1053,8 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { } // Now, take care of required analyses that are not available. - for (SmallVectorImpl<AnalysisID>::iterator - I = ReqAnalysisNotAvailable.begin(), - E = ReqAnalysisNotAvailable.end() ;I != E; ++I) { - const PassInfo *PI = TPM->findAnalysisPassInfo(*I); + for (AnalysisID ID : ReqAnalysisNotAvailable) { + const PassInfo *PI = TPM->findAnalysisPassInfo(ID); Pass *AnalysisPass = PI->createPass(); this->addLowerLevelRequiredPass(P, AnalysisPass); } @@ -1048,30 +1069,29 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { } -/// Populate RP with analysis pass that are required by +/// Populate UP with analysis pass that are used or required by /// pass P and are available. Populate RP_NotAvail with analysis /// pass that are required by pass P but are not available. 
-void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP, - SmallVectorImpl<AnalysisID> &RP_NotAvail, - Pass *P) { +void PMDataManager::collectRequiredAndUsedAnalyses( + SmallVectorImpl<Pass *> &UP, SmallVectorImpl<AnalysisID> &RP_NotAvail, + Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); - const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet(); - for (AnalysisUsage::VectorType::const_iterator - I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) { - if (Pass *AnalysisPass = findAnalysisPass(*I, true)) - RP.push_back(AnalysisPass); + + for (const auto &UsedID : AnUsage->getUsedSet()) + if (Pass *AnalysisPass = findAnalysisPass(UsedID, true)) + UP.push_back(AnalysisPass); + + for (const auto &RequiredID : AnUsage->getRequiredSet()) + if (Pass *AnalysisPass = findAnalysisPass(RequiredID, true)) + UP.push_back(AnalysisPass); else - RP_NotAvail.push_back(*I); - } + RP_NotAvail.push_back(RequiredID); - const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet(); - for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(), - E = IDs.end(); I != E; ++I) { - if (Pass *AnalysisPass = findAnalysisPass(*I, true)) - RP.push_back(AnalysisPass); + for (const auto &RequiredID : AnUsage->getRequiredTransitiveSet()) + if (Pass *AnalysisPass = findAnalysisPass(RequiredID, true)) + UP.push_back(AnalysisPass); else - RP_NotAvail.push_back(*I); - } + RP_NotAvail.push_back(RequiredID); } // All Required analyses should be available to the pass as it runs! Here @@ -1206,6 +1226,15 @@ void PMDataManager::dumpPreservedSet(const Pass *P) const { dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet()); } +void PMDataManager::dumpUsedSet(const Pass *P) const { + if (PassDebugging < Details) + return; + + AnalysisUsage analysisUsage; + P->getAnalysisUsage(analysisUsage); + dumpAnalysisUsage("Used", P, analysisUsage.getUsedSet()); +} + void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P, const AnalysisUsage::VectorType &Set) const { assert(PassDebugging >= Details); @@ -1310,6 +1339,7 @@ bool BBPassManager::runOnFunction(Function &F) { dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG, I->getName()); dumpPreservedSet(BP); + dumpUsedSet(BP); verifyPreservedAnalysis(BP); removeNotPreservedAnalysis(BP); @@ -1524,6 +1554,7 @@ bool FPPassManager::runOnFunction(Function &F) { if (LocalChanged) dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName()); dumpPreservedSet(FP); + dumpUsedSet(FP); verifyPreservedAnalysis(FP); removeNotPreservedAnalysis(FP); @@ -1601,6 +1632,7 @@ MPPassManager::runOnModule(Module &M) { dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG, M.getModuleIdentifier()); dumpPreservedSet(MP); + dumpUsedSet(MP); verifyPreservedAnalysis(MP); removeNotPreservedAnalysis(MP); diff --git a/contrib/llvm/lib/IR/MDBuilder.cpp b/contrib/llvm/lib/IR/MDBuilder.cpp index b4c5ca7..4ce3ea2 100644 --- a/contrib/llvm/lib/IR/MDBuilder.cpp +++ b/contrib/llvm/lib/IR/MDBuilder.cpp @@ -36,8 +36,7 @@ MDNode *MDBuilder::createFPMath(float Accuracy) { MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) { - uint32_t Weights[] = {TrueWeight, FalseWeight}; - return createBranchWeights(Weights); + return createBranchWeights({TrueWeight, FalseWeight}); } MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) { @@ -53,14 +52,15 @@ MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) { return MDNode::get(Context, Vals); } -MDNode 
*MDBuilder::createFunctionEntryCount(uint64_t Count) { - SmallVector<Metadata *, 2> Vals(2); - Vals[0] = createString("function_entry_count"); +MDNode *MDBuilder::createUnpredictable() { + return MDNode::get(Context, None); +} +MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) { Type *Int64Ty = Type::getInt64Ty(Context); - Vals[1] = createConstant(ConstantInt::get(Int64Ty, Count)); - - return MDNode::get(Context, Vals); + return MDNode::get(Context, + {createString("function_entry_count"), + createConstant(ConstantInt::get(Int64Ty, Count))}); } MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) { @@ -76,8 +76,7 @@ MDNode *MDBuilder::createRange(Constant *Lo, Constant *Hi) { return nullptr; // Return the range [Lo, Hi). - Metadata *Range[2] = {createConstant(Lo), createConstant(Hi)}; - return MDNode::get(Context, Range); + return MDNode::get(Context, {createConstant(Lo), createConstant(Hi)}); } MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) { @@ -112,12 +111,10 @@ MDNode *MDBuilder::createTBAANode(StringRef Name, MDNode *Parent, bool isConstant) { if (isConstant) { Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1); - Metadata *Ops[3] = {createString(Name), Parent, createConstant(Flags)}; - return MDNode::get(Context, Ops); - } else { - Metadata *Ops[2] = {createString(Name), Parent}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, + {createString(Name), Parent, createConstant(Flags)}); } + return MDNode::get(Context, {createString(Name), Parent}); } MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) { @@ -125,8 +122,7 @@ MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) { } MDNode *MDBuilder::createAliasScope(StringRef Name, MDNode *Domain) { - Metadata *Ops[2] = {createString(Name), Domain}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, {createString(Name), Domain}); } /// \brief Return metadata for a tbaa.struct node with the given @@ -161,23 +157,19 @@ MDNode *MDBuilder::createTBAAStructTypeNode( MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent, uint64_t Offset) { ConstantInt *Off = ConstantInt::get(Type::getInt64Ty(Context), Offset); - Metadata *Ops[3] = {createString(Name), Parent, createConstant(Off)}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, + {createString(Name), Parent, createConstant(Off)}); } /// \brief Return metadata for a TBAA tag node with the given /// base type, access type and offset relative to the base type. 
MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, uint64_t Offset, bool IsConstant) { - Type *Int64 = Type::getInt64Ty(Context); + IntegerType *Int64 = Type::getInt64Ty(Context); + ConstantInt *Off = ConstantInt::get(Int64, Offset); if (IsConstant) { - Metadata *Ops[4] = {BaseType, AccessType, - createConstant(ConstantInt::get(Int64, Offset)), - createConstant(ConstantInt::get(Int64, 1))}; - return MDNode::get(Context, Ops); - } else { - Metadata *Ops[3] = {BaseType, AccessType, - createConstant(ConstantInt::get(Int64, Offset))}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, {BaseType, AccessType, createConstant(Off), + createConstant(ConstantInt::get(Int64, 1))}); } + return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)}); } diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp index 1abcf0d..9a9a501 100644 --- a/contrib/llvm/lib/IR/Metadata.cpp +++ b/contrib/llvm/lib/IR/Metadata.cpp @@ -120,6 +120,38 @@ void MetadataAsValue::untrack() { MetadataTracking::untrack(MD); } +bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) { + assert(Ref && "Expected live reference"); + assert((Owner || *static_cast<Metadata **>(Ref) == &MD) && + "Reference without owner must be direct"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) { + R->addRef(Ref, Owner); + return true; + } + return false; +} + +void MetadataTracking::untrack(void *Ref, Metadata &MD) { + assert(Ref && "Expected live reference"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) + R->dropRef(Ref); +} + +bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) { + assert(Ref && "Expected live reference"); + assert(New && "Expected live reference"); + assert(Ref != New && "Expected change"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) { + R->moveRef(Ref, New, MD); + return true; + } + return false; +} + +bool MetadataTracking::isReplaceable(const Metadata &MD) { + return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD)); +} + void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) { bool WasInserted = UseMap.insert(std::make_pair(Ref, std::make_pair(Owner, NextIndex))) @@ -158,6 +190,8 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New, void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) && "Expected non-temp node"); + assert(CanReplace && + "Attempted to replace Metadata marked for no replacement"); if (UseMap.empty()) return; @@ -239,6 +273,12 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) { } } +ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) { + if (auto *N = dyn_cast<MDNode>(&MD)) + return N->Context.getReplaceableUses(); + return dyn_cast<ValueAsMetadata>(&MD); +} + static Function *getLocalFunction(Value *V) { assert(V && "Expected value"); if (auto *A = dyn_cast<Argument>(V)) @@ -517,7 +557,7 @@ void MDNode::decrementUnresolvedOperandCount() { resolve(); } -void MDNode::resolveCycles() { +void MDNode::resolveRecursivelyImpl(bool AllowTemps) { if (isResolved()) return; @@ -530,6 +570,8 @@ void MDNode::resolveCycles() { if (!N) continue; + if (N->isTemporary() && AllowTemps) + continue; assert(!N->isTemporary() && "Expected all forward declarations to be resolved"); if (!N->isResolved()) @@ -545,6 +587,18 @@ static bool hasSelfReference(MDNode *N) { } MDNode *MDNode::replaceWithPermanentImpl() { + switch (getMetadataID()) { + default: + // If this type isn't 
uniquable, replace with a distinct node. + return replaceWithDistinctImpl(); + +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + case CLASS##Kind: \ + break; +#include "llvm/IR/Metadata.def" + } + + // Even if this type is uniquable, self-references have to be distinct. if (hasSelfReference(this)) return replaceWithDistinctImpl(); return replaceWithUniquedImpl(); @@ -671,8 +725,8 @@ MDNode *MDNode::uniquify() { // Try to insert into uniquing store. switch (getMetadataID()) { default: - llvm_unreachable("Invalid subclass of MDNode"); -#define HANDLE_MDNODE_LEAF(CLASS) \ + llvm_unreachable("Invalid or non-uniquable subclass of MDNode"); +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ case CLASS##Kind: { \ CLASS *SubclassThis = cast<CLASS>(this); \ std::integral_constant<bool, HasCachedHash<CLASS>::value> \ @@ -687,8 +741,8 @@ MDNode *MDNode::uniquify() { void MDNode::eraseFromStore() { switch (getMetadataID()) { default: - llvm_unreachable("Invalid subclass of MDNode"); -#define HANDLE_MDNODE_LEAF(CLASS) \ + llvm_unreachable("Invalid or non-uniquable subclass of MDNode"); +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ case CLASS##Kind: \ getContext().pImpl->CLASS##s.erase(cast<CLASS>(this)); \ break; @@ -941,6 +995,17 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) { return MDNode::get(A->getContext(), MDs); } +MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { + if (!A || !B) + return nullptr; + + ConstantInt *AVal = mdconst::extract<ConstantInt>(A->getOperand(0)); + ConstantInt *BVal = mdconst::extract<ConstantInt>(B->getOperand(0)); + if (AVal->getZExtValue() < BVal->getZExtValue()) + return A; + return B; +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // @@ -1045,14 +1110,10 @@ MDNode *Instruction::getMetadataImpl(StringRef Kind) const { return getMetadataImpl(getContext().getMDKindID(Kind)); } -void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) { +void Instruction::dropUnknownNonDebugMetadata(ArrayRef<unsigned> KnownIDs) { SmallSet<unsigned, 5> KnownSet; KnownSet.insert(KnownIDs.begin(), KnownIDs.end()); - // Drop debug if needed - if (KnownSet.erase(LLVMContext::MD_dbg)) - DbgLoc = DebugLoc(); - if (!hasMetadataHashEntry()) return; // Nothing to remove! @@ -1077,7 +1138,7 @@ void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) { } } -/// setMetadata - Set the metadata of of the specified kind to the specified +/// setMetadata - Set the metadata of the specified kind to the specified /// node. This updates/replaces metadata if already present, or removes it if /// Node is null. void Instruction::setMetadata(unsigned KindID, MDNode *Node) { @@ -1251,3 +1312,11 @@ void Function::clearMetadata() { getContext().pImpl->FunctionMetadata.erase(this); setHasMetadataHashEntry(false); } + +void Function::setSubprogram(DISubprogram *SP) { + setMetadata(LLVMContext::MD_dbg, SP); +} + +DISubprogram *Function::getSubprogram() const { + return cast_or_null<DISubprogram>(getMetadata(LLVMContext::MD_dbg)); +} diff --git a/contrib/llvm/lib/IR/MetadataImpl.h b/contrib/llvm/lib/IR/MetadataImpl.h index 662a50e..b913746 100644 --- a/contrib/llvm/lib/IR/MetadataImpl.h +++ b/contrib/llvm/lib/IR/MetadataImpl.h @@ -26,6 +26,19 @@ static T *getUniqued(DenseSet<T *, InfoT> &Store, return I == Store.end() ? 
nullptr : *I; } +template <class T> T *MDNode::storeImpl(T *N, StorageType Storage) { + switch (Storage) { + case Uniqued: + llvm_unreachable("Cannot unique without a uniquing-store"); + case Distinct: + N->storeDistinctInContext(); + break; + case Temporary: + break; + } + return N; +} + template <class T, class StoreT> T *MDNode::storeImpl(T *N, StorageType Storage, StoreT &Store) { switch (Storage) { diff --git a/contrib/llvm/lib/IR/MetadataTracking.cpp b/contrib/llvm/lib/IR/MetadataTracking.cpp deleted file mode 100644 index 47f0b93..0000000 --- a/contrib/llvm/lib/IR/MetadataTracking.cpp +++ /dev/null @@ -1,55 +0,0 @@ -//===- MetadataTracking.cpp - Implement metadata tracking -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements Metadata tracking. -// -//===----------------------------------------------------------------------===// - -#include "llvm/IR/MetadataTracking.h" -#include "llvm/IR/Metadata.h" - -using namespace llvm; - -ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) { - if (auto *N = dyn_cast<MDNode>(&MD)) - return N->Context.getReplaceableUses(); - return dyn_cast<ValueAsMetadata>(&MD); -} - -bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) { - assert(Ref && "Expected live reference"); - assert((Owner || *static_cast<Metadata **>(Ref) == &MD) && - "Reference without owner must be direct"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) { - R->addRef(Ref, Owner); - return true; - } - return false; -} - -void MetadataTracking::untrack(void *Ref, Metadata &MD) { - assert(Ref && "Expected live reference"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) - R->dropRef(Ref); -} - -bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) { - assert(Ref && "Expected live reference"); - assert(New && "Expected live reference"); - assert(Ref != New && "Expected change"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) { - R->moveRef(Ref, New, MD); - return true; - } - return false; -} - -bool MetadataTracking::isReplaceable(const Metadata &MD) { - return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD)); -} diff --git a/contrib/llvm/lib/IR/Module.cpp b/contrib/llvm/lib/IR/Module.cpp index 043f74e..ac578d6 100644 --- a/contrib/llvm/lib/IR/Module.cpp +++ b/contrib/llvm/lib/IR/Module.cpp @@ -29,6 +29,7 @@ #include <algorithm> #include <cstdarg> #include <cstdlib> + using namespace llvm; //===----------------------------------------------------------------------===// @@ -37,9 +38,9 @@ using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file. -template class llvm::SymbolTableListTraits<Function, Module>; -template class llvm::SymbolTableListTraits<GlobalVariable, Module>; -template class llvm::SymbolTableListTraits<GlobalAlias, Module>; +template class llvm::SymbolTableListTraits<Function>; +template class llvm::SymbolTableListTraits<GlobalVariable>; +template class llvm::SymbolTableListTraits<GlobalAlias>; //===----------------------------------------------------------------------===// // Primitive Module methods. 
@@ -81,7 +82,6 @@ RandomNumberGenerator *Module::createRNG(const Pass* P) const { return new RandomNumberGenerator(Salt); } - /// getNamedValue - Return the first global value in the module with /// the specified name, of arbitrary type. This method returns null /// if a global with the specified name is not found. @@ -102,6 +102,9 @@ void Module::getMDKindNames(SmallVectorImpl<StringRef> &Result) const { return Context.getMDKindNames(Result); } +void Module::getOperandBundleTags(SmallVectorImpl<StringRef> &Result) const { + return Context.getOperandBundleTags(Result); +} //===----------------------------------------------------------------------===// // Methods for easy access to the functions in the module. @@ -274,7 +277,7 @@ NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) { /// delete it. void Module::eraseNamedMetadata(NamedMDNode *NMD) { static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab)->erase(NMD->getName()); - NamedMDList.erase(NMD); + NamedMDList.erase(NMD->getIterator()); } bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) { @@ -376,17 +379,11 @@ const DataLayout &Module::getDataLayout() const { return DL; } // void Module::setMaterializer(GVMaterializer *GVM) { assert(!Materializer && - "Module already has a GVMaterializer. Call MaterializeAllPermanently" + "Module already has a GVMaterializer. Call materializeAll" " to clear it out before setting another one."); Materializer.reset(GVM); } -bool Module::isDematerializable(const GlobalValue *GV) const { - if (Materializer) - return Materializer->isDematerializable(GV); - return false; -} - std::error_code Module::materialize(GlobalValue *GV) { if (!Materializer) return std::error_code(); @@ -394,23 +391,11 @@ std::error_code Module::materialize(GlobalValue *GV) { return Materializer->materialize(GV); } -void Module::dematerialize(GlobalValue *GV) { - if (Materializer) - return Materializer->dematerialize(GV); -} - std::error_code Module::materializeAll() { if (!Materializer) return std::error_code(); - return Materializer->materializeModule(this); -} - -std::error_code Module::materializeAllPermanently() { - if (std::error_code EC = materializeAll()) - return EC; - - Materializer.reset(); - return std::error_code(); + std::unique_ptr<GVMaterializer> M = std::move(Materializer); + return M->materializeModule(); } std::error_code Module::materializeMetadata() { @@ -458,7 +443,14 @@ void Module::dropAllReferences() { unsigned Module::getDwarfVersion() const { auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version")); if (!Val) - return dwarf::DWARF_VERSION; + return 0; + return cast<ConstantInt>(Val->getValue())->getZExtValue(); +} + +unsigned Module::getCodeViewFlag() const { + auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("CodeView")); + if (!Val) + return 0; return cast<ConstantInt>(Val->getValue())->getZExtValue(); } @@ -471,7 +463,7 @@ Comdat *Module::getOrInsertComdat(StringRef Name) { PICLevel::Level Module::getPICLevel() const { auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("PIC Level")); - if (Val == NULL) + if (!Val) return PICLevel::Default; return static_cast<PICLevel::Level>( @@ -481,3 +473,15 @@ PICLevel::Level Module::getPICLevel() const { void Module::setPICLevel(PICLevel::Level PL) { addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); } + +void Module::setMaximumFunctionCount(uint64_t Count) { + addModuleFlag(ModFlagBehavior::Error, "MaxFunctionCount", Count); +} + +Optional<uint64_t> Module::getMaximumFunctionCount() { + auto 
*Val = + cast_or_null<ConstantAsMetadata>(getModuleFlag("MaxFunctionCount")); + if (!Val) + return None; + return cast<ConstantInt>(Val->getValue())->getZExtValue(); +} diff --git a/contrib/llvm/lib/IR/Statepoint.cpp b/contrib/llvm/lib/IR/Statepoint.cpp index 83ee611..27a990e 100644 --- a/contrib/llvm/lib/IR/Statepoint.cpp +++ b/contrib/llvm/lib/IR/Statepoint.cpp @@ -40,20 +40,7 @@ bool llvm::isStatepoint(const Value &inst) { } bool llvm::isGCRelocate(const ImmutableCallSite &CS) { - if (!CS.getInstruction()) { - // This is not a call site - return false; - } - - return isGCRelocate(CS.getInstruction()); -} -bool llvm::isGCRelocate(const Value *inst) { - if (const CallInst *call = dyn_cast<CallInst>(inst)) { - if (const Function *F = call->getCalledFunction()) { - return F->getIntrinsicID() == Intrinsic::experimental_gc_relocate; - } - } - return false; + return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction()); } bool llvm::isGCResult(const ImmutableCallSite &CS) { @@ -67,10 +54,7 @@ bool llvm::isGCResult(const ImmutableCallSite &CS) { bool llvm::isGCResult(const Value *inst) { if (const CallInst *call = dyn_cast<CallInst>(inst)) { if (Function *F = call->getCalledFunction()) { - return (F->getIntrinsicID() == Intrinsic::experimental_gc_result_int || - F->getIntrinsicID() == Intrinsic::experimental_gc_result_float || - F->getIntrinsicID() == Intrinsic::experimental_gc_result_ptr || - F->getIntrinsicID() == Intrinsic::experimental_gc_result); + return F->getIntrinsicID() == Intrinsic::experimental_gc_result; } } return false; diff --git a/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h index a18f982..50573d8 100644 --- a/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h +++ b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h @@ -24,77 +24,73 @@ namespace llvm { /// setSymTabObject - This is called when (f.e.) the parent of a basic block /// changes. This requires us to remove all the instruction symtab entries from /// the current function and reinsert them into the new function. -template<typename ValueSubClass, typename ItemParentClass> -template<typename TPtr> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::setSymTabObject(TPtr *Dest, TPtr Src) { +template <typename ValueSubClass> +template <typename TPtr> +void SymbolTableListTraits<ValueSubClass>::setSymTabObject(TPtr *Dest, + TPtr Src) { // Get the old symtab and value list before doing the assignment. - ValueSymbolTable *OldST = TraitsClass::getSymTab(getListOwner()); + ValueSymbolTable *OldST = getSymTab(getListOwner()); // Do it. *Dest = Src; // Get the new SymTab object. - ValueSymbolTable *NewST = TraitsClass::getSymTab(getListOwner()); + ValueSymbolTable *NewST = getSymTab(getListOwner()); // If there is nothing to do, quick exit. if (OldST == NewST) return; // Move all the elements from the old symtab to the new one. - iplist<ValueSubClass> &ItemList = TraitsClass::getList(getListOwner()); + ListTy &ItemList = getList(getListOwner()); if (ItemList.empty()) return; if (OldST) { // Remove all entries from the previous symtab. - for (typename iplist<ValueSubClass>::iterator I = ItemList.begin(); - I != ItemList.end(); ++I) + for (auto I = ItemList.begin(); I != ItemList.end(); ++I) if (I->hasName()) OldST->removeValueName(I->getValueName()); } if (NewST) { // Add all of the items to the new symtab. 
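The setSymTabObject logic in this hunk moves every named element from the old owner's symbol table into the new owner's when a list changes parents; the loop that follows re-adds the names to the new table. A self-contained sketch of the overall pattern, with a plain std::map standing in for ValueSymbolTable (all names here are illustrative):

#include <map>
#include <string>
#include <vector>

struct Item { std::string Name; };
using Registry = std::map<std::string, Item *>;

void moveNames(std::vector<Item *> &List, Registry *OldST, Registry *NewST) {
  if (OldST == NewST || List.empty())
    return; // nothing to transfer
  for (Item *I : List) {
    if (I->Name.empty())
      continue; // unnamed items never live in a symbol table
    if (OldST)
      OldST->erase(I->Name);
    if (NewST)
      (*NewST)[I->Name] = I; // the real code also re-uniquifies on collision
  }
}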
- for (typename iplist<ValueSubClass>::iterator I = ItemList.begin(); - I != ItemList.end(); ++I) + for (auto I = ItemList.begin(); I != ItemList.end(); ++I) if (I->hasName()) - NewST->reinsertValue(I); + NewST->reinsertValue(&*I); } } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::addNodeToList(ValueSubClass *V) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::addNodeToList(ValueSubClass *V) { assert(!V->getParent() && "Value already in a container!!"); ItemParentClass *Owner = getListOwner(); V->setParent(Owner); if (V->hasName()) - if (ValueSymbolTable *ST = TraitsClass::getSymTab(Owner)) + if (ValueSymbolTable *ST = getSymTab(Owner)) ST->reinsertValue(V); } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::removeNodeFromList(ValueSubClass *V) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::removeNodeFromList( + ValueSubClass *V) { V->setParent(nullptr); if (V->hasName()) - if (ValueSymbolTable *ST = TraitsClass::getSymTab(getListOwner())) + if (ValueSymbolTable *ST = getSymTab(getListOwner())) ST->removeValueName(V->getValueName()); } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::transferNodesFromList(ilist_traits<ValueSubClass> &L2, - ilist_iterator<ValueSubClass> first, - ilist_iterator<ValueSubClass> last) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::transferNodesFromList( + SymbolTableListTraits &L2, ilist_iterator<ValueSubClass> first, + ilist_iterator<ValueSubClass> last) { // We only have to do work here if transferring instructions between BBs ItemParentClass *NewIP = getListOwner(), *OldIP = L2.getListOwner(); if (NewIP == OldIP) return; // No work to do at all... // We only have to update symbol table entries if we are transferring the // instructions to a different symtab object... - ValueSymbolTable *NewST = TraitsClass::getSymTab(NewIP); - ValueSymbolTable *OldST = TraitsClass::getSymTab(OldIP); + ValueSymbolTable *NewST = getSymTab(NewIP); + ValueSymbolTable *OldST = getSymTab(OldIP); if (NewST != OldST) { for (; first != last; ++first) { ValueSubClass &V = *first; diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp index a9ca800..4c1baf5 100644 --- a/contrib/llvm/lib/IR/Type.cpp +++ b/contrib/llvm/lib/IR/Type.cpp @@ -35,6 +35,7 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case LabelTyID : return getLabelTy(C); case MetadataTyID : return getMetadataTy(C); case X86_MMXTyID : return getX86_MMXTy(C); + case TokenTyID : return getTokenTy(C); default: return nullptr; } @@ -42,16 +43,10 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { /// getScalarType - If this is a vector type, return the element type, /// otherwise return this. -Type *Type::getScalarType() { - if (VectorType *VTy = dyn_cast<VectorType>(this)) +Type *Type::getScalarType() const { + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType(); - return this; -} - -const Type *Type::getScalarType() const { - if (const VectorType *VTy = dyn_cast<VectorType>(this)) - return VTy->getElementType(); - return this; + return const_cast<Type*>(this); } /// isIntegerTy - Return true if this is an IntegerType of the specified width. 
@@ -74,8 +69,8 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { // Vector -> Vector conversions are always lossless if the two vector types // have the same size, otherwise not. Also, 64-bit vector types can be // converted to x86mmx. - if (const VectorType *thisPTy = dyn_cast<VectorType>(this)) { - if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty)) + if (auto *thisPTy = dyn_cast<VectorType>(this)) { + if (auto *thatPTy = dyn_cast<VectorType>(Ty)) return thisPTy->getBitWidth() == thatPTy->getBitWidth(); if (Ty->getTypeID() == Type::X86_MMXTyID && thisPTy->getBitWidth() == 64) @@ -83,7 +78,7 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { } if (this->getTypeID() == Type::X86_MMXTyID) - if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty)) + if (auto *thatPTy = dyn_cast<VectorType>(Ty)) if (thatPTy->getBitWidth() == 64) return true; @@ -91,8 +86,8 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { // remaining and ptr->ptr. Just select the lossless conversions. Everything // else is not lossless. Conservatively assume we can't losslessly convert // between pointers with different address spaces. - if (const PointerType *PTy = dyn_cast<PointerType>(this)) { - if (const PointerType *OtherPTy = dyn_cast<PointerType>(Ty)) + if (auto *PTy = dyn_cast<PointerType>(this)) { + if (auto *OtherPTy = dyn_cast<PointerType>(Ty)) return PTy->getAddressSpace() == OtherPTy->getAddressSpace(); return false; } @@ -100,14 +95,12 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { } bool Type::isEmptyTy() const { - const ArrayType *ATy = dyn_cast<ArrayType>(this); - if (ATy) { + if (auto *ATy = dyn_cast<ArrayType>(this)) { unsigned NumElements = ATy->getNumElements(); return NumElements == 0 || ATy->getElementType()->isEmptyTy(); } - const StructType *STy = dyn_cast<StructType>(this); - if (STy) { + if (auto *STy = dyn_cast<StructType>(this)) { unsigned NumElements = STy->getNumElements(); for (unsigned i = 0; i < NumElements; ++i) if (!STy->getElementType(i)->isEmptyTy()) @@ -144,7 +137,7 @@ unsigned Type::getScalarSizeInBits() const { /// is only valid on floating point types. If the FP type does not /// have a stable mantissa (e.g. ppc long double), this method returns -1. int Type::getFPMantissaWidth() const { - if (const VectorType *VTy = dyn_cast<VectorType>(this)) + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType()->getFPMantissaWidth(); assert(isFloatingPointTy() && "Not a floating point type!"); if (getTypeID() == HalfTyID) return 11; @@ -159,66 +152,17 @@ int Type::getFPMantissaWidth() const { /// isSizedDerivedType - Derived types like structures and arrays are sized /// iff all of the members of the type are sized as well. Since asking for /// their size is relatively uncommon, move this operation out of line. 
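The isSizedDerivedType hunk that follows threads a Visited set through the query precisely so that self-referential aggregates cannot recurse forever; a type found already in the set is conservatively reported unsized, matching the insert-failure check in StructType::isSized further down. A self-contained sketch of the guard, with a toy type in place of llvm::Type:

#include <set>
#include <vector>

struct Ty {
  bool Opaque = false;
  std::vector<Ty *> Elems; // element types of an aggregate
};

bool isSized(Ty *T, std::set<Ty *> &Visited) {
  if (T->Opaque)
    return false;
  // A type already on the current query path would recurse forever; report
  // it unsized, as the Visited check in the diff does. (The real code also
  // caches a known-sized bit per struct, so a shared but genuinely sized
  // member does not trip this guard; that cache is omitted here.)
  if (!Visited.insert(T).second)
    return false;
  for (Ty *E : T->Elems)
    if (!isSized(E, Visited))
      return false;
  return true;
}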
-bool Type::isSizedDerivedType(SmallPtrSetImpl<const Type*> *Visited) const { - if (const ArrayType *ATy = dyn_cast<ArrayType>(this)) +bool Type::isSizedDerivedType(SmallPtrSetImpl<Type*> *Visited) const { + if (auto *ATy = dyn_cast<ArrayType>(this)) return ATy->getElementType()->isSized(Visited); - if (const VectorType *VTy = dyn_cast<VectorType>(this)) + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType()->isSized(Visited); return cast<StructType>(this)->isSized(Visited); } //===----------------------------------------------------------------------===// -// Subclass Helper Methods -//===----------------------------------------------------------------------===// - -unsigned Type::getIntegerBitWidth() const { - return cast<IntegerType>(this)->getBitWidth(); -} - -bool Type::isFunctionVarArg() const { - return cast<FunctionType>(this)->isVarArg(); -} - -Type *Type::getFunctionParamType(unsigned i) const { - return cast<FunctionType>(this)->getParamType(i); -} - -unsigned Type::getFunctionNumParams() const { - return cast<FunctionType>(this)->getNumParams(); -} - -StringRef Type::getStructName() const { - return cast<StructType>(this)->getName(); -} - -unsigned Type::getStructNumElements() const { - return cast<StructType>(this)->getNumElements(); -} - -Type *Type::getStructElementType(unsigned N) const { - return cast<StructType>(this)->getElementType(N); -} - -Type *Type::getSequentialElementType() const { - return cast<SequentialType>(this)->getElementType(); -} - -uint64_t Type::getArrayNumElements() const { - return cast<ArrayType>(this)->getNumElements(); -} - -unsigned Type::getVectorNumElements() const { - return cast<VectorType>(this)->getNumElements(); -} - -unsigned Type::getPointerAddressSpace() const { - return cast<PointerType>(getScalarType())->getAddressSpace(); -} - - -//===----------------------------------------------------------------------===// // Primitive 'Type' data //===----------------------------------------------------------------------===// @@ -228,6 +172,7 @@ Type *Type::getHalfTy(LLVMContext &C) { return &C.pImpl->HalfTy; } Type *Type::getFloatTy(LLVMContext &C) { return &C.pImpl->FloatTy; } Type *Type::getDoubleTy(LLVMContext &C) { return &C.pImpl->DoubleTy; } Type *Type::getMetadataTy(LLVMContext &C) { return &C.pImpl->MetadataTy; } +Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; } Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; } Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } @@ -345,7 +290,7 @@ FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params, assert(isValidReturnType(Result) && "invalid return type for function"); setSubclassData(IsVarArgs); - SubTys[0] = const_cast<Type*>(Result); + SubTys[0] = Result; for (unsigned i = 0, e = Params.size(); i != e; ++i) { assert(isValidArgumentType(Params[i]) && @@ -428,12 +373,14 @@ void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) { if (isPacked) setSubclassData(getSubclassData() | SCDB_Packed); - unsigned NumElements = Elements.size(); - Type **Elts = getContext().pImpl->TypeAllocator.Allocate<Type*>(NumElements); - memcpy(Elts, Elements.data(), sizeof(Elements[0]) * NumElements); - - ContainedTys = Elts; - NumContainedTys = NumElements; + NumContainedTys = Elements.size(); + + if (Elements.empty()) { + ContainedTys = nullptr; + return; + } + + ContainedTys = Elements.copy(getContext().pImpl->TypeAllocator).data(); } void 
StructType::setName(StringRef Name) { @@ -470,7 +417,6 @@ void StructType::setName(StringRef Name) { do { TempStr.resize(NameSize + 1); - TmpStream.resync(); TmpStream << getContext().pImpl->NamedStructTypesUniqueID++; IterBool = getContext().pImpl->NamedStructTypes.insert( @@ -556,13 +502,13 @@ StructType *StructType::create(StringRef Name, Type *type, ...) { return Ret; } -bool StructType::isSized(SmallPtrSetImpl<const Type*> *Visited) const { +bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const { if ((getSubclassData() & SCDB_IsSized) != 0) return true; if (isOpaque()) return false; - if (Visited && !Visited->insert(this).second) + if (Visited && !Visited->insert(const_cast<StructType*>(this)).second) return false; // Okay, our struct is sized if all of the elements are, but if one of the @@ -602,22 +548,19 @@ void StructType::setBody(Type *type, ...) { bool StructType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); + !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && + !ElemTy->isTokenTy(); } /// isLayoutIdentical - Return true if this is layout identical to the /// specified struct. bool StructType::isLayoutIdentical(StructType *Other) const { if (this == Other) return true; - - if (isPacked() != Other->isPacked() || - getNumElements() != Other->getNumElements()) + + if (isPacked() != Other->isPacked()) return false; - if (!getNumElements()) - return true; - - return std::equal(element_begin(), element_end(), Other->element_begin()); + return elements() == Other->elements(); } /// getTypeByName - Return the type with the specified name, or null if there @@ -631,8 +574,8 @@ StructType *Module::getTypeByName(StringRef Name) const { // CompositeType Implementation //===----------------------------------------------------------------------===// -Type *CompositeType::getTypeAtIndex(const Value *V) { - if (StructType *STy = dyn_cast<StructType>(this)) { +Type *CompositeType::getTypeAtIndex(const Value *V) const { + if (auto *STy = dyn_cast<StructType>(this)) { unsigned Idx = (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue(); assert(indexValid(Idx) && "Invalid structure index!"); @@ -641,16 +584,18 @@ Type *CompositeType::getTypeAtIndex(const Value *V) { return cast<SequentialType>(this)->getElementType(); } -Type *CompositeType::getTypeAtIndex(unsigned Idx) { - if (StructType *STy = dyn_cast<StructType>(this)) { + +Type *CompositeType::getTypeAtIndex(unsigned Idx) const{ + if (auto *STy = dyn_cast<StructType>(this)) { assert(indexValid(Idx) && "Invalid structure index!"); return STy->getElementType(Idx); } - + return cast<SequentialType>(this)->getElementType(); } + bool CompositeType::indexValid(const Value *V) const { - if (const StructType *STy = dyn_cast<StructType>(this)) { + if (auto *STy = dyn_cast<StructType>(this)) { // Structure indexes require (vectors of) 32-bit integer constants. In the // vector case all of the indices must be equal. if (!V->getType()->getScalarType()->isIntegerTy(32)) @@ -667,7 +612,7 @@ bool CompositeType::indexValid(const Value *V) const { } bool CompositeType::indexValid(unsigned Idx) const { - if (const StructType *STy = dyn_cast<StructType>(this)) + if (auto *STy = dyn_cast<StructType>(this)) return Idx < STy->getNumElements(); // Sequential types can be indexed by any integer. 
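Stepping back to the isLayoutIdentical hunk above: the manual element-count check plus std::equal collapses into a single comparison of elements() because range equality compares lengths before contents. The same semantics can be demonstrated with any standard sequence:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> A{1, 2, 3}, B{1, 2, 3}, C{1, 2};
  assert(A == B);    // same length, same contents
  assert(!(A == C)); // different lengths compare unequal, with no overrun
  return 0;
}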
return true; @@ -683,10 +628,9 @@ ArrayType::ArrayType(Type *ElType, uint64_t NumEl) NumElements = NumEl; } -ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) { - Type *ElementType = const_cast<Type*>(elementType); +ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) { assert(isValidElementType(ElementType) && "Invalid type for array element!"); - + LLVMContextImpl *pImpl = ElementType->getContext().pImpl; ArrayType *&Entry = pImpl->ArrayTypes[std::make_pair(ElementType, NumElements)]; @@ -698,7 +642,8 @@ ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) { bool ArrayType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); + !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && + !ElemTy->isTokenTy(); } //===----------------------------------------------------------------------===// @@ -710,8 +655,7 @@ VectorType::VectorType(Type *ElType, unsigned NumEl) NumElements = NumEl; } -VectorType *VectorType::get(Type *elementType, unsigned NumElements) { - Type *ElementType = const_cast<Type*>(elementType); +VectorType *VectorType::get(Type *ElementType, unsigned NumElements) { assert(NumElements > 0 && "#Elements of a VectorType must be greater than 0"); assert(isValidElementType(ElementType) && "Element type of a VectorType must " "be an integer, floating point, or " @@ -761,13 +705,13 @@ PointerType::PointerType(Type *E, unsigned AddrSpace) assert(oldNCT == NumContainedTys && "bitfield written out of bounds?"); } -PointerType *Type::getPointerTo(unsigned addrs) { - return PointerType::get(this, addrs); +PointerType *Type::getPointerTo(unsigned addrs) const { + return PointerType::get(const_cast<Type*>(this), addrs); } bool PointerType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy(); + !ElemTy->isMetadataTy() && !ElemTy->isTokenTy(); } bool PointerType::isLoadableOrStorableType(Type *ElemTy) { diff --git a/contrib/llvm/lib/IR/TypeFinder.cpp b/contrib/llvm/lib/IR/TypeFinder.cpp index 7accc5b..b5bdab0 100644 --- a/contrib/llvm/lib/IR/TypeFinder.cpp +++ b/contrib/llvm/lib/IR/TypeFinder.cpp @@ -44,19 +44,13 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) { incorporateType(FI->getType()); - if (FI->hasPrefixData()) - incorporateValue(FI->getPrefixData()); - - if (FI->hasPrologueData()) - incorporateValue(FI->getPrologueData()); - - if (FI->hasPersonalityFn()) - incorporateValue(FI->getPersonalityFn()); + for (const Use &U : FI->operands()) + incorporateValue(U.get()); // First incorporate the arguments. 
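ArrayType::get in the hunk above shows the uniquing pattern used throughout the type system: the context owns one canonical object per (element type, count) key, so structural equality degenerates to pointer equality. A self-contained sketch (the registry and names are illustrative, not LLVM's):

#include <cstdint>
#include <map>
#include <utility>

struct ArrTy {
  const void *Elem;
  uint64_t N;
};

using TypeCache = std::map<std::pair<const void *, uint64_t>, ArrTy *>;

ArrTy *getArrayTy(TypeCache &Ctx, const void *Elem, uint64_t N) {
  ArrTy *&Entry = Ctx[{Elem, N}]; // reference into the map, as in the diff
  if (!Entry)
    Entry = new ArrTy{Elem, N};   // the real code allocates from the context
  return Entry;                   // same key always yields the same pointer
}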
for (Function::const_arg_iterator AI = FI->arg_begin(), AE = FI->arg_end(); AI != AE; ++AI) - incorporateValue(AI); + incorporateValue(&*AI); for (Function::const_iterator BB = FI->begin(), E = FI->end(); BB != E;++BB) @@ -85,7 +79,7 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { - const NamedMDNode *NMD = I; + const NamedMDNode *NMD = &*I; for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) incorporateMDNode(NMD->getOperand(i)); } diff --git a/contrib/llvm/lib/IR/User.cpp b/contrib/llvm/lib/IR/User.cpp index 522722d..a75abe6 100644 --- a/contrib/llvm/lib/IR/User.cpp +++ b/contrib/llvm/lib/IR/User.cpp @@ -87,22 +87,70 @@ void User::growHungoffUses(unsigned NewNumUses, bool IsPhi) { Use::zap(OldOps, OldOps + OldNumUses, true); } + +// This is a private struct used by `User` to track the co-allocated descriptor +// section. +struct DescriptorInfo { + intptr_t SizeInBytes; +}; + +ArrayRef<const uint8_t> User::getDescriptor() const { + auto MutableARef = const_cast<User *>(this)->getDescriptor(); + return {MutableARef.begin(), MutableARef.end()}; +} + +MutableArrayRef<uint8_t> User::getDescriptor() { + assert(HasDescriptor && "Don't call otherwise!"); + assert(!HasHungOffUses && "Invariant!"); + + auto *DI = reinterpret_cast<DescriptorInfo *>(getIntrusiveOperands()) - 1; + assert(DI->SizeInBytes != 0 && "Should not have had a descriptor otherwise!"); + + return MutableArrayRef<uint8_t>( + reinterpret_cast<uint8_t *>(DI) - DI->SizeInBytes, DI->SizeInBytes); +} + //===----------------------------------------------------------------------===// // User operator new Implementations //===----------------------------------------------------------------------===// -void *User::operator new(size_t Size, unsigned Us) { +void *User::allocateFixedOperandUser(size_t Size, unsigned Us, + unsigned DescBytes) { assert(Us < (1u << NumUserOperandsBits) && "Too many operands"); - void *Storage = ::operator new(Size + sizeof(Use) * Us); - Use *Start = static_cast<Use*>(Storage); + + static_assert(sizeof(DescriptorInfo) % sizeof(void *) == 0, "Required below"); + + unsigned DescBytesToAllocate = + DescBytes == 0 ? 
0 : (DescBytes + sizeof(DescriptorInfo)); + assert(DescBytesToAllocate % sizeof(void *) == 0 && + "We need this to satisfy alignment constraints for Uses"); + + uint8_t *Storage = static_cast<uint8_t *>( + ::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate)); + Use *Start = reinterpret_cast<Use *>(Storage + DescBytesToAllocate); Use *End = Start + Us; User *Obj = reinterpret_cast<User*>(End); Obj->NumUserOperands = Us; Obj->HasHungOffUses = false; + Obj->HasDescriptor = DescBytes != 0; Use::initTags(Start, End); + + if (DescBytes != 0) { + auto *DescInfo = reinterpret_cast<DescriptorInfo *>(Storage + DescBytes); + DescInfo->SizeInBytes = DescBytes; + } + return Obj; } +void *User::operator new(size_t Size, unsigned Us) { + return allocateFixedOperandUser(Size, Us, 0); +} + +void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) { + return allocateFixedOperandUser(Size, Us, DescBytes); +} + void *User::operator new(size_t Size) { // Allocate space for a single Use* void *Storage = ::operator new(Size + sizeof(Use *)); @@ -110,6 +158,7 @@ void *User::operator new(size_t Size) { User *Obj = reinterpret_cast<User *>(HungOffOperandList + 1); Obj->NumUserOperands = 0; Obj->HasHungOffUses = true; + Obj->HasDescriptor = false; *HungOffOperandList = nullptr; return Obj; } @@ -123,11 +172,20 @@ void User::operator delete(void *Usr) { // use a Use[] allocated prior to the user. User *Obj = static_cast<User *>(Usr); if (Obj->HasHungOffUses) { + assert(!Obj->HasDescriptor && "not supported!"); + Use **HungOffOperandList = static_cast<Use **>(Usr) - 1; // drop the hung off uses. Use::zap(*HungOffOperandList, *HungOffOperandList + Obj->NumUserOperands, /* Delete */ true); ::operator delete(HungOffOperandList); + } else if (Obj->HasDescriptor) { + Use *UseBegin = static_cast<Use *>(Usr) - Obj->NumUserOperands; + Use::zap(UseBegin, UseBegin + Obj->NumUserOperands, /* Delete */ false); + + auto *DI = reinterpret_cast<DescriptorInfo *>(UseBegin) - 1; + uint8_t *Storage = reinterpret_cast<uint8_t *>(DI) - DI->SizeInBytes; + ::operator delete(Storage); } else { Use *Storage = static_cast<Use *>(Usr) - Obj->NumUserOperands; Use::zap(Storage, Storage + Obj->NumUserOperands, diff --git a/contrib/llvm/lib/IR/Value.cpp b/contrib/llvm/lib/IR/Value.cpp index f554d59..eb9deb6 100644 --- a/contrib/llvm/lib/IR/Value.cpp +++ b/contrib/llvm/lib/IR/Value.cpp @@ -314,6 +314,16 @@ void Value::takeName(Value *V) { } #ifndef NDEBUG +void Value::assertModuleIsMaterialized() const { + const GlobalValue *GV = dyn_cast<GlobalValue>(this); + if (!GV) + return; + const Module *M = GV->getParent(); + if (!M) + return; + assert(M->isMaterialized()); +} + static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr, Constant *C) { if (!Cache.insert(Expr).second) @@ -490,8 +500,7 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, return V; Offset = GEPOffset; V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast || - Operator::getOpcode(V) == Instruction::AddrSpaceCast) { + } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast<Operator>(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { V = GA->getAliasee(); diff --git a/contrib/llvm/lib/IR/ValueSymbolTable.cpp b/contrib/llvm/lib/IR/ValueSymbolTable.cpp index e10142d..deb6e75 100644 --- a/contrib/llvm/lib/IR/ValueSymbolTable.cpp +++ b/contrib/llvm/lib/IR/ValueSymbolTable.cpp @@ -32,6 +32,24 @@ ValueSymbolTable::~ValueSymbolTable() { 
#endif } +ValueName *ValueSymbolTable::makeUniqueName(Value *V, + SmallString<256> &UniqueName) { + unsigned BaseSize = UniqueName.size(); + while (1) { + // Trim any suffix off and append the next number. + UniqueName.resize(BaseSize); + raw_svector_ostream S(UniqueName); + if (isa<GlobalValue>(V)) + S << "."; + S << ++LastUnique; + + // Try insert the vmap entry with this suffix. + auto IterBool = vmap.insert(std::make_pair(UniqueName, V)); + if (IterBool.second) + return &*IterBool.first; + } +} + // Insert a value into the symbol table with the specified name... // void ValueSymbolTable::reinsertValue(Value* V) { @@ -49,21 +67,8 @@ void ValueSymbolTable::reinsertValue(Value* V) { // The name is already used, just free it so we can allocate a new name. V->getValueName()->Destroy(); - unsigned BaseSize = UniqueName.size(); - while (1) { - // Trim any suffix off and append the next number. - UniqueName.resize(BaseSize); - raw_svector_ostream(UniqueName) << "." << ++LastUnique; - - // Try insert the vmap entry with this suffix. 
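makeUniqueName above factors out the retry loop that both callers previously duplicated: trim back to the base name, append the next counter value, and keep going until the map insert succeeds. A self-contained sketch with a std::map standing in for the vmap (names here are illustrative; the diff additionally adds the "." separator only for GlobalValues, which the sketch does unconditionally):

#include <map>
#include <string>

std::string makeUnique(std::map<std::string, int> &Table,
                       const std::string &Base, int &LastUnique, int V) {
  std::string Name;
  do {
    // Trim any previous suffix and try the next number.
    Name = Base + "." + std::to_string(++LastUnique);
  } while (!Table.insert({Name, V}).second);
  return Name; // first suffix that did not collide
}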
- auto IterBool = vmap.insert(std::make_pair(UniqueName, V)); - if (IterBool.second) { - // DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << - // "\n"); - return &*IterBool.first; - } - } + return makeUniqueName(V, UniqueName); } diff --git a/contrib/llvm/lib/IR/ValueTypes.cpp b/contrib/llvm/lib/IR/ValueTypes.cpp index d95de39..f293230 100644 --- a/contrib/llvm/lib/IR/ValueTypes.cpp +++ b/contrib/llvm/lib/IR/ValueTypes.cpp @@ -19,6 +19,11 @@ #include "llvm/Support/ErrorHandling.h" using namespace llvm; +EVT EVT::changeExtendedTypeToInteger() const { + LLVMContext &Context = LLVMTy->getContext(); + return getIntegerVT(Context, getSizeInBits()); +} + EVT EVT::changeExtendedVectorElementTypeToInteger() const { LLVMContext &Context = LLVMTy->getContext(); EVT IntTy = getIntegerVT(Context, getVectorElementType().getSizeInBits()); @@ -83,6 +88,10 @@ bool EVT::isExtended1024BitVector() const { return isExtendedVector() && getExtendedSizeInBits() == 1024; } +bool EVT::isExtended2048BitVector() const { + return isExtendedVector() && getExtendedSizeInBits() == 2048; +} + EVT EVT::getExtendedVectorElementType() const { assert(isExtended() && "Type is not extended!"); return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType()); @@ -134,6 +143,8 @@ std::string EVT::getEVTString() const { case MVT::v16i1: return "v16i1"; case MVT::v32i1: return "v32i1"; case MVT::v64i1: return "v64i1"; + case MVT::v512i1: return "v512i1"; + case MVT::v1024i1: return "v1024i1"; case MVT::v1i8: return "v1i8"; case MVT::v2i8: return "v2i8"; case MVT::v4i8: return "v4i8"; @@ -141,22 +152,29 @@ std::string EVT::getEVTString() const { case MVT::v16i8: return "v16i8"; case MVT::v32i8: return "v32i8"; case MVT::v64i8: return "v64i8"; + case MVT::v128i8: return "v128i8"; + case MVT::v256i8: return "v256i8"; case MVT::v1i16: return "v1i16"; case MVT::v2i16: return "v2i16"; case MVT::v4i16: return "v4i16"; case MVT::v8i16: return "v8i16"; case MVT::v16i16: return "v16i16"; case MVT::v32i16: return "v32i16"; + case MVT::v64i16: return "v64i16"; + case MVT::v128i16: return "v128i16"; case MVT::v1i32: return "v1i32"; case MVT::v2i32: return "v2i32"; case MVT::v4i32: return "v4i32"; case MVT::v8i32: return "v8i32"; case MVT::v16i32: return "v16i32"; + case MVT::v32i32: return "v32i32"; + case MVT::v64i32: return "v64i32"; case MVT::v1i64: return "v1i64"; case MVT::v2i64: return "v2i64"; case MVT::v4i64: return "v4i64"; case MVT::v8i64: return "v8i64"; case MVT::v16i64: return "v16i64"; + case MVT::v32i64: return "v32i64"; case MVT::v1i128: return "v1i128"; case MVT::v1f32: return "v1f32"; case MVT::v2f32: return "v2f32"; @@ -203,6 +221,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16); case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32); case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64); + case MVT::v512i1: return VectorType::get(Type::getInt1Ty(Context), 512); + case MVT::v1024i1: return VectorType::get(Type::getInt1Ty(Context), 1024); case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1); case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2); case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4); @@ -210,22 +230,29 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v16i8: return VectorType::get(Type::getInt8Ty(Context), 16); case MVT::v32i8: return VectorType::get(Type::getInt8Ty(Context), 32); case MVT::v64i8: return 
VectorType::get(Type::getInt8Ty(Context), 64); + case MVT::v128i8: return VectorType::get(Type::getInt8Ty(Context), 128); + case MVT::v256i8: return VectorType::get(Type::getInt8Ty(Context), 256); case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1); case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2); case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4); case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8); case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16); case MVT::v32i16: return VectorType::get(Type::getInt16Ty(Context), 32); + case MVT::v64i16: return VectorType::get(Type::getInt16Ty(Context), 64); + case MVT::v128i16: return VectorType::get(Type::getInt16Ty(Context), 128); case MVT::v1i32: return VectorType::get(Type::getInt32Ty(Context), 1); case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2); case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4); case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8); case MVT::v16i32: return VectorType::get(Type::getInt32Ty(Context), 16); + case MVT::v32i32: return VectorType::get(Type::getInt32Ty(Context), 32); + case MVT::v64i32: return VectorType::get(Type::getInt32Ty(Context), 64); case MVT::v1i64: return VectorType::get(Type::getInt64Ty(Context), 1); case MVT::v2i64: return VectorType::get(Type::getInt64Ty(Context), 2); case MVT::v4i64: return VectorType::get(Type::getInt64Ty(Context), 4); case MVT::v8i64: return VectorType::get(Type::getInt64Ty(Context), 8); case MVT::v16i64: return VectorType::get(Type::getInt64Ty(Context), 16); + case MVT::v32i64: return VectorType::get(Type::getInt64Ty(Context), 32); case MVT::v1i128: return VectorType::get(Type::getInt128Ty(Context), 1); case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2); case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4); diff --git a/contrib/llvm/lib/IR/Verifier.cpp b/contrib/llvm/lib/IR/Verifier.cpp index 2a0a4ff..9198b0e 100644 --- a/contrib/llvm/lib/IR/Verifier.cpp +++ b/contrib/llvm/lib/IR/Verifier.cpp @@ -39,13 +39,13 @@ // only by the unwind edge of an invoke instruction. // * A landingpad instruction must be the first non-PHI instruction in the // block. -// * All landingpad instructions must use the same personality function with -// the same function. +// * Landingpad instructions must be in a function with a personality function. // * All other things that are tested by asserts spread about the code... // //===----------------------------------------------------------------------===// #include "llvm/IR/Verifier.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -92,6 +92,16 @@ struct VerifierSupport { : OS(OS), M(nullptr), Broken(false) {} private: + template <class NodeTy> void Write(const ilist_iterator<NodeTy> &I) { + Write(&*I); + } + + void Write(const Module *M) { + if (!M) + return; + OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n"; + } + void Write(const Value *V) { if (!V) return; @@ -136,6 +146,11 @@ private: OS << *C; } + template <typename T> void Write(ArrayRef<T> Vs) { + for (const T &V : Vs) + Write(V); + } + template <typename T1, typename... Ts> void WriteTs(const T1 &V1, const Ts &... Vs) { Write(V1); @@ -184,6 +199,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport { /// \brief Track unresolved string-based type references. 
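The WriteTs overloads added to VerifierSupport above use the standard C++11 pack-recursion pattern: the variadic overload peels off the first argument and recurses on the remainder, and an empty overload terminates the chain. A self-contained sketch:

#include <iostream>

void writeAll() {} // base case: nothing left to print

template <typename T1, typename... Ts>
void writeAll(const T1 &V1, const Ts &... Vs) {
  std::cout << V1 << '\n'; // handle the head of the pack...
  writeAll(Vs...);         // ...then recurse on the tail
}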
SmallDenseMap<const MDString *, const MDNode *, 32> UnresolvedTypeRefs; + /// \brief The result type for a landingpad. + Type *LandingPadResultTy; + /// \brief Whether we've seen a call to @llvm.localescape in this function /// already. bool SawFrameEscape; @@ -192,9 +210,19 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport { /// given function and the largest index passed to llvm.localrecover. DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo; + // Maps catchswitches and cleanuppads that unwind to siblings to the + // terminators that indicate the unwind, used to detect cycles therein. + MapVector<Instruction *, TerminatorInst *> SiblingFuncletInfo; + + /// Cache of constants visited in search of ConstantExprs. + SmallPtrSet<const Constant *, 32> ConstantExprVisited; + + void checkAtomicMemAccessSize(const Module *M, Type *Ty, + const Instruction *I); public: explicit Verifier(raw_ostream &OS) - : VerifierSupport(OS), Context(nullptr), SawFrameEscape(false) {} + : VerifierSupport(OS), Context(nullptr), LandingPadResultTy(nullptr), + SawFrameEscape(false) {} bool verify(const Function &F) { M = F.getParent(); @@ -227,8 +255,11 @@ public: Broken = false; // FIXME: We strip const here because the inst visitor strips const. visit(const_cast<Function &>(F)); + verifySiblingFuncletUnwinds(); InstsInThisBlock.clear(); + LandingPadResultTy = nullptr; SawFrameEscape = false; + SiblingFuncletInfo.clear(); return !Broken; } @@ -297,12 +328,12 @@ private: void visitFunction(const Function &F); void visitBasicBlock(BasicBlock &BB); void visitRangeMetadata(Instruction& I, MDNode* Range, Type* Ty); + void visitDereferenceableMetadata(Instruction& I, MDNode* MD); template <class Ty> bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); #include "llvm/IR/Metadata.def" void visitDIScope(const DIScope &N); - void visitDIDerivedTypeBase(const DIDerivedTypeBase &N); void visitDIVariable(const DIVariable &N); void visitDILexicalBlockBase(const DILexicalBlockBase &N); void visitDITemplateParameter(const DITemplateParameter &N); @@ -379,7 +410,14 @@ private: void visitAllocaInst(AllocaInst &AI); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitEHPadPredecessors(Instruction &I); void visitLandingPadInst(LandingPadInst &LPI); + void visitCatchPadInst(CatchPadInst &CPI); + void visitCatchReturnInst(CatchReturnInst &CatchReturn); + void visitCleanupPadInst(CleanupPadInst &CPI); + void visitFuncletPadInst(FuncletPadInst &FPI); + void visitCatchSwitchInst(CatchSwitchInst &CatchSwitch); + void visitCleanupReturnInst(CleanupReturnInst &CRI); void VerifyCallSite(CallSite CS); void verifyMustTailCall(CallInst &CI); @@ -399,9 +437,11 @@ private: void VerifyFunctionMetadata( const SmallVector<std::pair<unsigned, MDNode *>, 4> MDs); - void VerifyConstantExprBitcastType(const ConstantExpr *CE); + void visitConstantExprsRecursively(const Constant *EntryC); + void visitConstantExpr(const ConstantExpr *CE); void VerifyStatepoint(ImmutableCallSite CS); void verifyFrameRecoverIndices(); + void verifySiblingFuncletUnwinds(); // Module-level debug info verification... 
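SiblingFuncletInfo above is a MapVector, presumably chosen because it iterates in insertion order, keeping the verifier's walk and diagnostics deterministic where a pointer-keyed DenseMap would not be. The underlying idea pairs a map for lookup with a vector that remembers insertion order; a minimal sketch (illustrative, not LLVM's MapVector):

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

template <typename K, typename V> struct OrderedMap {
  std::map<K, std::size_t> Index;    // key -> position in Data
  std::vector<std::pair<K, V>> Data; // preserves insertion order

  V &operator[](const K &Key) {
    auto It = Index.find(Key);
    if (It != Index.end())
      return Data[It->second].second;
    Index.emplace(Key, Data.size());
    Data.emplace_back(Key, V());
    return Data.back().second;
  }
};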
void verifyTypeRefs(); @@ -524,25 +564,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { } // Walk any aggregate initializers looking for bitcasts between address spaces - SmallPtrSet<const Value *, 4> Visited; - SmallVector<const Value *, 4> WorkStack; - WorkStack.push_back(cast<Value>(GV.getInitializer())); - - while (!WorkStack.empty()) { - const Value *V = WorkStack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - if (const User *U = dyn_cast<User>(V)) { - WorkStack.append(U->op_begin(), U->op_end()); - } - - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { - VerifyConstantExprBitcastType(CE); - if (Broken) - return; - } - } + visitConstantExprsRecursively(GV.getInitializer()); visitGlobalValue(GV); } @@ -556,7 +578,8 @@ void Verifier::visitAliaseeSubExpr(const GlobalAlias &GA, const Constant &C) { void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited, const GlobalAlias &GA, const Constant &C) { if (const auto *GV = dyn_cast<GlobalValue>(&C)) { - Assert(!GV->isDeclaration(), "Alias must point to a definition", &GA); + Assert(!GV->isDeclarationForLinker(), "Alias must point to a definition", + &GA); if (const auto *GA2 = dyn_cast<GlobalAlias>(GV)) { Assert(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA); @@ -571,7 +594,7 @@ void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited, } if (const auto *CE = dyn_cast<ConstantExpr>(&C)) - VerifyConstantExprBitcastType(CE); + visitConstantExprsRecursively(CE); for (const Use &U : C.operands()) { Value *V = &*U; @@ -779,39 +802,10 @@ void Verifier::visitDIBasicType(const DIBasicType &N) { "invalid tag", &N); } -void Verifier::visitDIDerivedTypeBase(const DIDerivedTypeBase &N) { +void Verifier::visitDIDerivedType(const DIDerivedType &N) { // Common scope checks. visitDIScope(N); - Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); - Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, - N.getBaseType()); - - // FIXME: Sink this into the subclass verifies. - if (!N.getFile() || N.getFile()->getFilename().empty()) { - // Check whether the filename is allowed to be empty. - uint16_t Tag = N.getTag(); - Assert( - Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_pointer_type || - Tag == dwarf::DW_TAG_ptr_to_member_type || - Tag == dwarf::DW_TAG_reference_type || - Tag == dwarf::DW_TAG_rvalue_reference_type || - Tag == dwarf::DW_TAG_restrict_type || - Tag == dwarf::DW_TAG_array_type || - Tag == dwarf::DW_TAG_enumeration_type || - Tag == dwarf::DW_TAG_subroutine_type || - Tag == dwarf::DW_TAG_inheritance || Tag == dwarf::DW_TAG_friend || - Tag == dwarf::DW_TAG_structure_type || - Tag == dwarf::DW_TAG_member || Tag == dwarf::DW_TAG_typedef, - "derived/composite type requires a filename", &N, N.getFile()); - } -} - -void Verifier::visitDIDerivedType(const DIDerivedType &N) { - // Common derived type checks. 
- visitDIDerivedTypeBase(N); - Assert(N.getTag() == dwarf::DW_TAG_typedef || N.getTag() == dwarf::DW_TAG_pointer_type || N.getTag() == dwarf::DW_TAG_ptr_to_member_type || @@ -828,6 +822,10 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) { Assert(isTypeRef(N, N.getExtraData()), "invalid pointer to member type", &N, N.getExtraData()); } + + Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); + Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, + N.getBaseType()); } static bool hasConflictingReferenceFlags(unsigned Flags) { @@ -845,27 +843,34 @@ void Verifier::visitTemplateParams(const MDNode &N, const Metadata &RawParams) { } void Verifier::visitDICompositeType(const DICompositeType &N) { - // Common derived type checks. - visitDIDerivedTypeBase(N); + // Common scope checks. + visitDIScope(N); Assert(N.getTag() == dwarf::DW_TAG_array_type || N.getTag() == dwarf::DW_TAG_structure_type || N.getTag() == dwarf::DW_TAG_union_type || N.getTag() == dwarf::DW_TAG_enumeration_type || - N.getTag() == dwarf::DW_TAG_subroutine_type || N.getTag() == dwarf::DW_TAG_class_type, "invalid tag", &N); + Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); + Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, + N.getBaseType()); + Assert(!N.getRawElements() || isa<MDTuple>(N.getRawElements()), "invalid composite elements", &N, N.getRawElements()); Assert(isTypeRef(N, N.getRawVTableHolder()), "invalid vtable holder", &N, N.getRawVTableHolder()); - Assert(!N.getRawElements() || isa<MDTuple>(N.getRawElements()), - "invalid composite elements", &N, N.getRawElements()); Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); if (auto *Params = N.getRawTemplateParams()) visitTemplateParams(N, *Params); + + if (N.getTag() == dwarf::DW_TAG_class_type || + N.getTag() == dwarf::DW_TAG_union_type) { + Assert(N.getFile() && !N.getFile()->getFilename().empty(), + "class/union requires a filename", &N, N.getFile()); + } } void Verifier::visitDISubroutineType(const DISubroutineType &N) { @@ -885,6 +890,7 @@ void Verifier::visitDIFile(const DIFile &N) { } void Verifier::visitDICompileUnit(const DICompileUnit &N) { + Assert(N.isDistinct(), "compile units must be distinct", &N); Assert(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N); // Don't bother verifying the compilation directory or producer string @@ -928,6 +934,12 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) { Op); } } + if (auto *Array = N.getRawMacros()) { + Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getMacros()->operands()) { + Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op); + } + } } void Verifier::visitDISubprogram(const DISubprogram &N) { @@ -937,13 +949,6 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { Assert(isa<DISubroutineType>(T), "invalid subroutine type", &N, T); Assert(isTypeRef(N, N.getRawContainingType()), "invalid containing type", &N, N.getRawContainingType()); - if (auto *RawF = N.getRawFunction()) { - auto *FMD = dyn_cast<ConstantAsMetadata>(RawF); - auto *F = FMD ? FMD->getValue() : nullptr; - auto *FT = F ? 
dyn_cast<PointerType>(F->getType()) : nullptr; - Assert(F && FT && isa<FunctionType>(FT->getElementType()), - "invalid function", &N, F, FT); - } if (auto *Params = N.getRawTemplateParams()) visitTemplateParams(N, *Params); if (auto *S = N.getRawDeclaration()) { @@ -961,40 +966,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); - auto *F = N.getFunction(); - if (!F) - return; - - // Check that all !dbg attachments lead to back to N (or, at least, another - // subprogram that describes the same function). - // - // FIXME: Check this incrementally while visiting !dbg attachments. - // FIXME: Only check when N is the canonical subprogram for F. - SmallPtrSet<const MDNode *, 32> Seen; - for (auto &BB : *F) - for (auto &I : BB) { - // Be careful about using DILocation here since we might be dealing with - // broken code (this is the Verifier after all). - DILocation *DL = - dyn_cast_or_null<DILocation>(I.getDebugLoc().getAsMDNode()); - if (!DL) - continue; - if (!Seen.insert(DL).second) - continue; - - DILocalScope *Scope = DL->getInlinedAtScope(); - if (Scope && !Seen.insert(Scope).second) - continue; - - DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr; - if (SP && !Seen.insert(SP).second) - continue; - - // FIXME: Once N is canonical, check "SP == &N". - Assert(SP->describes(F), - "!dbg attachment points at wrong subprogram for function", &N, F, - &I, DL, Scope, SP); - } + if (N.isDefinition()) + Assert(N.isDistinct(), "subprogram definitions must be distinct", &N); } void Verifier::visitDILexicalBlockBase(const DILexicalBlockBase &N) { @@ -1020,6 +993,30 @@ void Verifier::visitDINamespace(const DINamespace &N) { Assert(isa<DIScope>(S), "invalid scope ref", &N, S); } +void Verifier::visitDIMacro(const DIMacro &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_define || + N.getMacinfoType() == dwarf::DW_MACINFO_undef, + "invalid macinfo type", &N); + Assert(!N.getName().empty(), "anonymous macro", &N); + if (!N.getValue().empty()) { + assert(N.getValue().data()[0] != ' ' && "Macro value has a space prefix"); + } +} + +void Verifier::visitDIMacroFile(const DIMacroFile &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_start_file, + "invalid macinfo type", &N); + if (auto *F = N.getRawFile()) + Assert(isa<DIFile>(F), "invalid file", &N, F); + + if (auto *Array = N.getRawElements()) { + Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getElements()->operands()) { + Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op); + } + } +} + void Verifier::visitDIModule(const DIModule &N) { Assert(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N); Assert(!N.getName().empty(), "anonymous module", &N); @@ -1075,9 +1072,7 @@ void Verifier::visitDILocalVariable(const DILocalVariable &N) { // Checks common to all variables. 
visitDIVariable(N); - Assert(N.getTag() == dwarf::DW_TAG_auto_variable || - N.getTag() == dwarf::DW_TAG_arg_variable, - "invalid tag", &N); + Assert(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N); Assert(N.getRawScope() && isa<DILocalScope>(N.getRawScope()), "local variable requires a valid scope", &N, N.getRawScope()); } @@ -1274,7 +1269,10 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx, I->getKindAsEnum() == Attribute::OptimizeNone || I->getKindAsEnum() == Attribute::JumpTable || I->getKindAsEnum() == Attribute::Convergent || - I->getKindAsEnum() == Attribute::ArgMemOnly) { + I->getKindAsEnum() == Attribute::ArgMemOnly || + I->getKindAsEnum() == Attribute::NoRecurse || + I->getKindAsEnum() == Attribute::InaccessibleMemOnly || + I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly) { if (!isFunction) { CheckFailed("Attribute '" + I->getAsString() + "' only applies to functions!", V); @@ -1365,7 +1363,7 @@ void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty, V); if (PointerType *PTy = dyn_cast<PointerType>(Ty)) { - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!PTy->getElementType()->isSized(&Visited)) { Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) && !Attrs.hasAttribute(Idx, Attribute::InAlloca), @@ -1445,6 +1443,18 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs, "Attributes 'readnone and readonly' are incompatible!", V); Assert( + !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InaccessibleMemOrArgMemOnly)), + "Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V); + + Assert( + !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InaccessibleMemOnly)), + "Attributes 'readnone and inaccessiblememonly' are incompatible!", V); + + Assert( !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline) && Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AlwaysInline)), @@ -1501,7 +1511,35 @@ void Verifier::VerifyFunctionMetadata( } } -void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) { +void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { + if (!ConstantExprVisited.insert(EntryC).second) + return; + + SmallVector<const Constant *, 16> Stack; + Stack.push_back(EntryC); + + while (!Stack.empty()) { + const Constant *C = Stack.pop_back_val(); + + // Check this constant expression. + if (const auto *CE = dyn_cast<ConstantExpr>(C)) + visitConstantExpr(CE); + + // Visit all sub-expressions. + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast<Constant>(U); + if (!OpC) + continue; + if (isa<GlobalValue>(OpC)) + continue; // Global values get visited separately. 
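The traversal in progress here, visitConstantExprsRecursively, replaces the per-initializer recursive walk with one explicit stack plus a verifier-wide visited set, so a constant expression shared by many initializers is checked once rather than once per path through the DAG. The generic shape of the pattern, self-contained:

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Ops;
};

void visitAll(Node *Entry, std::set<Node *> &Visited) {
  if (!Visited.insert(Entry).second)
    return; // already fully processed by an earlier call
  std::vector<Node *> Stack{Entry};
  while (!Stack.empty()) {
    Node *N = Stack.back();
    Stack.pop_back();
    // ... check N here ...
    for (Node *Op : N->Ops)
      if (Visited.insert(Op).second) // push each node at most once
        Stack.push_back(Op);
  }
}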
+ if (!ConstantExprVisited.insert(OpC).second) + continue; + Stack.push_back(OpC); + } + } +} + +void Verifier::visitConstantExpr(const ConstantExpr *CE) { if (CE->getOpcode() != Instruction::BitCast) return; @@ -1554,17 +1592,11 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) { &CI); const Value *Target = CS.getArgument(2); - const PointerType *PT = dyn_cast<PointerType>(Target->getType()); + auto *PT = dyn_cast<PointerType>(Target->getType()); Assert(PT && PT->getElementType()->isFunctionTy(), "gc.statepoint callee must be of function pointer type", &CI, Target); FunctionType *TargetFuncType = cast<FunctionType>(PT->getElementType()); - if (NumPatchBytes) - Assert(isa<ConstantPointerNull>(Target->stripPointerCasts()), - "gc.statepoint must have null as call target if number of patchable " - "bytes is non zero", - &CI); - const Value *NumCallArgsV = CS.getArgument(3); Assert(isa<ConstantInt>(NumCallArgsV), "gc.statepoint number of arguments to underlying call " @@ -1642,14 +1674,14 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) { const CallInst *Call = dyn_cast<const CallInst>(U); Assert(Call, "illegal use of statepoint token", &CI, U); if (!Call) continue; - Assert(isGCRelocate(Call) || isGCResult(Call), + Assert(isa<GCRelocateInst>(Call) || isGCResult(Call), "gc.result or gc.relocate are the only value uses " "of a gc.statepoint", &CI, U); if (isGCResult(Call)) { Assert(Call->getArgOperand(0) == &CI, "gc.result connected to wrong gc.statepoint", &CI, Call); - } else if (isGCRelocate(Call)) { + } else if (isa<GCRelocateInst>(Call)) { Assert(Call->getArgOperand(0) == &CI, "gc.relocate connected to wrong gc.statepoint", &CI, Call); } @@ -1678,6 +1710,59 @@ void Verifier::verifyFrameRecoverIndices() { } } +static Instruction *getSuccPad(TerminatorInst *Terminator) { + BasicBlock *UnwindDest; + if (auto *II = dyn_cast<InvokeInst>(Terminator)) + UnwindDest = II->getUnwindDest(); + else if (auto *CSI = dyn_cast<CatchSwitchInst>(Terminator)) + UnwindDest = CSI->getUnwindDest(); + else + UnwindDest = cast<CleanupReturnInst>(Terminator)->getUnwindDest(); + return UnwindDest->getFirstNonPHI(); +} + +void Verifier::verifySiblingFuncletUnwinds() { + SmallPtrSet<Instruction *, 8> Visited; + SmallPtrSet<Instruction *, 8> Active; + for (const auto &Pair : SiblingFuncletInfo) { + Instruction *PredPad = Pair.first; + if (Visited.count(PredPad)) + continue; + Active.insert(PredPad); + TerminatorInst *Terminator = Pair.second; + do { + Instruction *SuccPad = getSuccPad(Terminator); + if (Active.count(SuccPad)) { + // Found a cycle; report error + Instruction *CyclePad = SuccPad; + SmallVector<Instruction *, 8> CycleNodes; + do { + CycleNodes.push_back(CyclePad); + TerminatorInst *CycleTerminator = SiblingFuncletInfo[CyclePad]; + if (CycleTerminator != CyclePad) + CycleNodes.push_back(CycleTerminator); + CyclePad = getSuccPad(CycleTerminator); + } while (CyclePad != SuccPad); + Assert(false, "EH pads can't handle each other's exceptions", + ArrayRef<Instruction *>(CycleNodes)); + } + // Don't re-walk a node we've already checked + if (!Visited.insert(SuccPad).second) + break; + // Walk to this successor if it has a map entry. + PredPad = SuccPad; + auto TermI = SiblingFuncletInfo.find(PredPad); + if (TermI == SiblingFuncletInfo.end()) + break; + Terminator = TermI->second; + Active.insert(PredPad); + } while (true); + // Each node only has one successor, so we've walked all the active + // nodes' successors. + Active.clear(); + } +} + // visitFunction - Verify that a function is ok. 
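verifySiblingFuncletUnwinds above is cycle detection in a graph where every pad has at most one unwind successor: Active holds the chain currently being walked (reaching it again means a cycle), while Visited holds everything already cleared, so chains that merge are not re-walked. A self-contained sketch over integer node ids (illustrative, not LLVM types):

#include <map>
#include <set>

bool hasCycle(const std::map<int, int> &Succ) {
  std::set<int> Visited;
  for (const auto &P : Succ) {
    if (Visited.count(P.first))
      continue;
    std::set<int> Active{P.first}; // nodes on the chain being walked now
    int N = P.first;
    while (true) {
      auto It = Succ.find(N);
      if (It == Succ.end())
        break;           // chain leaves the graph: no cycle here
      N = It->second;
      if (Active.count(N))
        return true;     // walked back onto the current chain
      if (!Visited.insert(N).second)
        break;           // joined an already-cleared chain
      Active.insert(N);
    }
  }
  return false;
}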
// void Verifier::visitFunction(const Function &F) { @@ -1743,17 +1828,33 @@ void Verifier::visitFunction(const Function &F) { FT->getParamType(i)); Assert(I->getType()->isFirstClassType(), "Function arguments must have first-class types!", I); - if (!isLLVMdotName) + if (!isLLVMdotName) { Assert(!I->getType()->isMetadataTy(), "Function takes metadata but isn't an intrinsic", I, &F); + Assert(!I->getType()->isTokenTy(), + "Function takes token but isn't an intrinsic", I, &F); + } } + if (!isLLVMdotName) + Assert(!F.getReturnType()->isTokenTy(), + "Function returns a token but isn't an intrinsic", &F); + // Get the function metadata attachments. SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; F.getAllMetadata(MDs); assert(F.hasMetadata() != MDs.empty() && "Bit out-of-sync"); VerifyFunctionMetadata(MDs); + // Check validity of the personality function + if (F.hasPersonalityFn()) { + auto *Per = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts()); + if (Per) + Assert(Per->getParent() == F.getParent(), + "Referencing personality function in another module!", + &F, F.getParent(), Per, Per->getParent()); + } + if (F.isMaterializable()) { // Function has a body somewhere we can't see. Assert(MDs.empty(), "unmaterialized function cannot have metadata", &F, @@ -1782,13 +1883,27 @@ void Verifier::visitFunction(const Function &F) { } // Visit metadata attachments. - for (const auto &I : MDs) + for (const auto &I : MDs) { + // Verify that the attachment is legal. + switch (I.first) { + default: + break; + case LLVMContext::MD_dbg: + Assert(isa<DISubprogram>(I.second), + "function !dbg attachment must be a subprogram", &F, I.second); + break; + } + + // Verify the metadata itself. visitMDNode(*I.second); + } } // If this function is actually an intrinsic, verify that it is only used in // direct call/invokes, never having its "address taken". - if (F.getIntrinsicID()) { + // Only do this if the module is materialized, otherwise we don't have all the + // uses. + if (F.getIntrinsicID() && F.getParent()->isMaterialized()) { const User *U; if (F.hasAddressTaken(&U)) Assert(0, "Invalid user of intrinsic instruction!", U); @@ -1798,6 +1913,44 @@ void Verifier::visitFunction(const Function &F) { (F.isDeclaration() && F.hasExternalLinkage()) || F.hasAvailableExternallyLinkage(), "Function is marked as dllimport, but not external.", &F); + + auto *N = F.getSubprogram(); + if (!N) + return; + + // Check that all !dbg attachments lead back to N (or, at least, another + // subprogram that describes the same function). + // + // FIXME: Check this incrementally while visiting !dbg attachments. + // FIXME: Only check when N is the canonical subprogram for F. + SmallPtrSet<const MDNode *, 32> Seen; + for (auto &BB : F) + for (auto &I : BB) { + // Be careful about using DILocation here since we might be dealing with + // broken code (this is the Verifier after all). + DILocation *DL = + dyn_cast_or_null<DILocation>(I.getDebugLoc().getAsMDNode()); + if (!DL) + continue; + if (!Seen.insert(DL).second) + continue; + + DILocalScope *Scope = DL->getInlinedAtScope(); + if (Scope && !Seen.insert(Scope).second) + continue; + + DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr; + + // Scope and SP could be the same MDNode and we don't want to skip + // validation in that case + if (SP && ((Scope != SP) && !Seen.insert(SP).second)) + continue; + + // FIXME: Once N is canonical, check "SP == &N". 
+ Assert(SP->describes(&F), + "!dbg attachment points at wrong subprogram for function", N, &F, + &I, DL, Scope, SP); + } } // verifyBasicBlock - Verify that a basic block is well formed... @@ -2194,6 +2347,9 @@ void Verifier::visitPHINode(PHINode &PN) { isa<PHINode>(--BasicBlock::iterator(&PN)), "PHI nodes not grouped at top of basic block!", &PN, PN.getParent()); + // Check that a PHI doesn't yield a Token. + Assert(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!"); + // Check that all of the values of the PHI node have the same type as the // result, and that the incoming blocks are really basic blocks. for (Value *IncValue : PN.incoming_values()) { @@ -2296,16 +2452,44 @@ void Verifier::VerifyCallSite(CallSite CS) { // Verify that there's no metadata unless it's a direct call to an intrinsic. if (CS.getCalledFunction() == nullptr || !CS.getCalledFunction()->getName().startswith("llvm.")) { - for (FunctionType::param_iterator PI = FTy->param_begin(), - PE = FTy->param_end(); PI != PE; ++PI) - Assert(!(*PI)->isMetadataTy(), + for (Type *ParamTy : FTy->params()) { + Assert(!ParamTy->isMetadataTy(), "Function has metadata parameter but isn't an intrinsic", I); + Assert(!ParamTy->isTokenTy(), + "Function has token parameter but isn't an intrinsic", I); + } } + // Verify that indirect calls don't return tokens. + if (CS.getCalledFunction() == nullptr) + Assert(!FTy->getReturnType()->isTokenTy(), + "Return type cannot be token for indirect call!"); + if (Function *F = CS.getCalledFunction()) if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) visitIntrinsicCallSite(ID, CS); + // Verify that a callsite has at most one "deopt" and one "funclet" operand + // bundle. + bool FoundDeoptBundle = false, FoundFuncletBundle = false; + for (unsigned i = 0, e = CS.getNumOperandBundles(); i < e; ++i) { + OperandBundleUse BU = CS.getOperandBundleAt(i); + uint32_t Tag = BU.getTagID(); + if (Tag == LLVMContext::OB_deopt) { + Assert(!FoundDeoptBundle, "Multiple deopt operand bundles", I); + FoundDeoptBundle = true; + } + if (Tag == LLVMContext::OB_funclet) { + Assert(!FoundFuncletBundle, "Multiple funclet operand bundles", I); + FoundFuncletBundle = true; + Assert(BU.Inputs.size() == 1, + "Expected exactly one funclet bundle operand", I); + Assert(isa<FuncletPadInst>(BU.Inputs.front()), + "Funclet bundle operands should correspond to a FuncletPadInst", + I); + } + } + visitInstruction(*I); } @@ -2406,10 +2590,12 @@ void Verifier::visitCallInst(CallInst &CI) { void Verifier::visitInvokeInst(InvokeInst &II) { VerifyCallSite(&II); - // Verify that there is a landingpad instruction as the first non-PHI - // instruction of the 'unwind' destination. - Assert(II.getUnwindDest()->isLandingPad(), - "The unwind destination does not have a landingpad instruction!", &II); + // Verify that the first non-PHI instruction of the unwind destination is an + // exception handling instruction. 
+ Assert( + II.getUnwindDest()->isEHPad(), + "The unwind destination does not have an exception handling instruction!", + &II); visitTerminatorInst(II); } @@ -2622,6 +2808,14 @@ void Verifier::visitRangeMetadata(Instruction& I, } } +void Verifier::checkAtomicMemAccessSize(const Module *M, Type *Ty, + const Instruction *I) { + unsigned Size = M->getDataLayout().getTypeSizeInBits(Ty); + Assert(Size >= 8, "atomic memory access' size must be byte-sized", Ty, I); + Assert(!(Size & (Size - 1)), + "atomic memory access' operand must have a power-of-two size", Ty, I); +} + void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert(PTy, "Load operand must be a pointer.", &LI); @@ -2633,14 +2827,12 @@ void Verifier::visitLoadInst(LoadInst &LI) { "Load cannot have Release ordering", &LI); Assert(LI.getAlignment() != 0, "Atomic load must specify explicit alignment", &LI); - if (!ElTy->isPointerTy()) { - Assert(ElTy->isIntegerTy(), "atomic load operand must have integer type!", - &LI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "atomic load operand must be power-of-two byte-sized integer", &LI, - ElTy); - } + Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() || + ElTy->isFloatingPointTy(), + "atomic load operand must have integer, pointer, or floating point " + "type!", + ElTy, &LI); + checkAtomicMemAccessSize(M, ElTy, &LI); } else { Assert(LI.getSynchScope() == CrossThread, "Non-atomic load cannot have SynchronizationScope specified", &LI); @@ -2662,14 +2854,12 @@ void Verifier::visitStoreInst(StoreInst &SI) { "Store cannot have Acquire ordering", &SI); Assert(SI.getAlignment() != 0, "Atomic store must specify explicit alignment", &SI); - if (!ElTy->isPointerTy()) { - Assert(ElTy->isIntegerTy(), - "atomic store operand must have integer type!", &SI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "atomic store operand must be power-of-two byte-sized integer", - &SI, ElTy); - } + Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() || + ElTy->isFloatingPointTy(), + "atomic store operand must have integer, pointer, or floating point " + "type!", + ElTy, &SI); + checkAtomicMemAccessSize(M, ElTy, &SI); } else { Assert(SI.getSynchScope() == CrossThread, "Non-atomic store cannot have SynchronizationScope specified", &SI); @@ -2678,7 +2868,7 @@ void Verifier::visitStoreInst(StoreInst &SI) { } void Verifier::visitAllocaInst(AllocaInst &AI) { - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; PointerType *PTy = AI.getType(); Assert(PTy->getAddressSpace() == 0, "Allocation instruction pointer not in the generic address space!", @@ -2716,9 +2906,7 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) { Type *ElTy = PTy->getElementType(); Assert(ElTy->isIntegerTy(), "cmpxchg operand must have integer type!", &CXI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "cmpxchg operand must be power-of-two byte-sized integer", &CXI, ElTy); + checkAtomicMemAccessSize(M, ElTy, &CXI); Assert(ElTy == CXI.getOperand(1)->getType(), "Expected value type does not match pointer operand type!", &CXI, ElTy); @@ -2737,10 +2925,7 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) { Type *ElTy = PTy->getElementType(); Assert(ElTy->isIntegerTy(), "atomicrmw operand must have integer type!", &RMWI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 
&& !(Size & (Size - 1)), - "atomicrmw operand must be power-of-two byte-sized integer", &RMWI, - ElTy); + checkAtomicMemAccessSize(M, ElTy, &RMWI); Assert(ElTy == RMWI.getOperand(1)->getType(), "Argument value type does not match pointer operand type!", &RMWI, ElTy); @@ -2777,23 +2962,98 @@ void Verifier::visitInsertValueInst(InsertValueInst &IVI) { visitInstruction(IVI); } -void Verifier::visitLandingPadInst(LandingPadInst &LPI) { - BasicBlock *BB = LPI.getParent(); +static Value *getParentPad(Value *EHPad) { + if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad)) + return FPI->getParentPad(); + + return cast<CatchSwitchInst>(EHPad)->getParentPad(); +} + +void Verifier::visitEHPadPredecessors(Instruction &I) { + assert(I.isEHPad()); + + BasicBlock *BB = I.getParent(); + Function *F = BB->getParent(); + + Assert(BB != &F->getEntryBlock(), "EH pad cannot be in entry block.", &I); + + if (auto *LPI = dyn_cast<LandingPadInst>(&I)) { + // The landingpad instruction defines its parent as a landing pad block. The + // landing pad block may be branched to only by the unwind edge of an + // invoke. + for (BasicBlock *PredBB : predecessors(BB)) { + const auto *II = dyn_cast<InvokeInst>(PredBB->getTerminator()); + Assert(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, + "Block containing LandingPadInst must be jumped to " + "only by the unwind edge of an invoke.", + LPI); + } + return; + } + if (auto *CPI = dyn_cast<CatchPadInst>(&I)) { + if (!pred_empty(BB)) + Assert(BB->getUniquePredecessor() == CPI->getCatchSwitch()->getParent(), + "Block containing CatchPadInst must be jumped to " + "only by its catchswitch.", + CPI); + Assert(BB != CPI->getCatchSwitch()->getUnwindDest(), + "Catchswitch cannot unwind to one of its catchpads", + CPI->getCatchSwitch(), CPI); + return; + } + + // Verify that each pred has a legal terminator with a legal to/from EH + // pad relationship. + Instruction *ToPad = &I; + Value *ToPadParent = getParentPad(ToPad); + for (BasicBlock *PredBB : predecessors(BB)) { + TerminatorInst *TI = PredBB->getTerminator(); + Value *FromPad; + if (auto *II = dyn_cast<InvokeInst>(TI)) { + Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB, + "EH pad must be jumped to via an unwind edge", ToPad, II); + if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet)) + FromPad = Bundle->Inputs[0]; + else + FromPad = ConstantTokenNone::get(II->getContext()); + } else if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { + FromPad = CRI->getCleanupPad(); + Assert(FromPad != ToPadParent, "A cleanupret must exit its cleanup", CRI); + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) { + FromPad = CSI; + } else { + Assert(false, "EH pad must be jumped to via an unwind edge", ToPad, TI); + } + + // The edge may exit from zero or more nested pads. + for (;; FromPad = getParentPad(FromPad)) { + Assert(FromPad != ToPad, + "EH pad cannot handle exceptions raised within it", FromPad, TI); + if (FromPad == ToPadParent) { + // This is a legal unwind edge. + break; + } + Assert(!isa<ConstantTokenNone>(FromPad), + "A single unwind edge may only enter one EH pad", TI); + } + } +} +void Verifier::visitLandingPadInst(LandingPadInst &LPI) { // The landingpad instruction is ill-formed if it doesn't have any clauses and // isn't a cleanup. Assert(LPI.getNumClauses() > 0 || LPI.isCleanup(), "LandingPadInst needs at least one clause or to be a cleanup.", &LPI); - // The landingpad instruction defines its parent as a landing pad block.
The - // landing pad block may be branched to only by the unwind edge of an invoke. - for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { - const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator()); - Assert(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, - "Block containing LandingPadInst must be jumped to " - "only by the unwind edge of an invoke.", + visitEHPadPredecessors(LPI); + + if (!LandingPadResultTy) + LandingPadResultTy = LPI.getType(); + else + Assert(LandingPadResultTy == LPI.getType(), + "The landingpad instruction should have a consistent result type " + "inside a function.", &LPI); - } Function *F = LPI.getParent()->getParent(); Assert(F->hasPersonalityFn(), @@ -2820,6 +3080,269 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { visitInstruction(LPI); } +void Verifier::visitCatchPadInst(CatchPadInst &CPI) { + visitEHPadPredecessors(CPI); + + BasicBlock *BB = CPI.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CatchPadInst needs to be in a function with a personality.", &CPI); + + Assert(isa<CatchSwitchInst>(CPI.getParentPad()), + "CatchPadInst needs to be directly nested in a CatchSwitchInst.", + CPI.getParentPad()); + + // The catchpad instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CPI, + "CatchPadInst not the first non-PHI instruction in the block.", &CPI); + + visitFuncletPadInst(CPI); +} + +void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) { + Assert(isa<CatchPadInst>(CatchReturn.getOperand(0)), + "CatchReturnInst needs to be provided a CatchPad", &CatchReturn, + CatchReturn.getOperand(0)); + + visitTerminatorInst(CatchReturn); +} + +void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) { + visitEHPadPredecessors(CPI); + + BasicBlock *BB = CPI.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CleanupPadInst needs to be in a function with a personality.", &CPI); + + // The cleanuppad instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CPI, + "CleanupPadInst not the first non-PHI instruction in the block.", + &CPI); + + auto *ParentPad = CPI.getParentPad(); + Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad), + "CleanupPadInst has an invalid parent.", &CPI); + + visitFuncletPadInst(CPI); +} + +void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { + User *FirstUser = nullptr; + Value *FirstUnwindPad = nullptr; + SmallVector<FuncletPadInst *, 8> Worklist({&FPI}); + while (!Worklist.empty()) { + FuncletPadInst *CurrentPad = Worklist.pop_back_val(); + Value *UnresolvedAncestorPad = nullptr; + for (User *U : CurrentPad->users()) { + BasicBlock *UnwindDest; + if (auto *CRI = dyn_cast<CleanupReturnInst>(U)) { + UnwindDest = CRI->getUnwindDest(); + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(U)) { + // We allow catchswitch unwind to caller to nest + // within an outer pad that unwinds somewhere else, + // because catchswitch doesn't have a nounwind variant. + // See e.g. SimplifyCFGOpt::SimplifyUnreachable. + if (CSI->unwindsToCaller()) + continue; + UnwindDest = CSI->getUnwindDest(); + } else if (auto *II = dyn_cast<InvokeInst>(U)) { + UnwindDest = II->getUnwindDest(); + } else if (isa<CallInst>(U)) { + // Calls which don't unwind may be found inside funclet + // pads that unwind somewhere else. We don't *require* + // such calls to be annotated nounwind. 
+ continue; + } else if (auto *CPI = dyn_cast<CleanupPadInst>(U)) { + // The unwind dest for a cleanup can only be found by + // recursive search. Add it to the worklist, and we'll + // search for its first use that determines where it unwinds. + Worklist.push_back(CPI); + continue; + } else { + Assert(isa<CatchReturnInst>(U), "Bogus funclet pad use", U); + continue; + } + + Value *UnwindPad; + bool ExitsFPI; + if (UnwindDest) { + UnwindPad = UnwindDest->getFirstNonPHI(); + Value *UnwindParent = getParentPad(UnwindPad); + // Ignore unwind edges that don't exit CurrentPad. + if (UnwindParent == CurrentPad) + continue; + // Determine whether the original funclet pad is exited, + // and if we are scanning nested pads determine how many + // of them are exited so we can stop searching their + // children. + Value *ExitedPad = CurrentPad; + ExitsFPI = false; + do { + if (ExitedPad == &FPI) { + ExitsFPI = true; + // Now we can resolve any ancestors of CurrentPad up to + // FPI, but not including FPI since we need to make sure + // to check all direct users of FPI for consistency. + UnresolvedAncestorPad = &FPI; + break; + } + Value *ExitedParent = getParentPad(ExitedPad); + if (ExitedParent == UnwindParent) { + // ExitedPad is the ancestor-most pad which this unwind + // edge exits, so we can resolve up to it, meaning that + // ExitedParent is the first ancestor still unresolved. + UnresolvedAncestorPad = ExitedParent; + break; + } + ExitedPad = ExitedParent; + } while (!isa<ConstantTokenNone>(ExitedPad)); + } else { + // Unwinding to caller exits all pads. + UnwindPad = ConstantTokenNone::get(FPI.getContext()); + ExitsFPI = true; + UnresolvedAncestorPad = &FPI; + } + + if (ExitsFPI) { + // This unwind edge exits FPI. Make sure it agrees with other + // such edges. + if (FirstUser) { + Assert(UnwindPad == FirstUnwindPad, "Unwind edges out of a funclet " + "pad must have the same unwind " + "dest", + &FPI, U, FirstUser); + } else { + FirstUser = U; + FirstUnwindPad = UnwindPad; + // Record cleanup sibling unwinds for verifySiblingFuncletUnwinds + if (isa<CleanupPadInst>(&FPI) && !isa<ConstantTokenNone>(UnwindPad) && + getParentPad(UnwindPad) == getParentPad(&FPI)) + SiblingFuncletInfo[&FPI] = cast<TerminatorInst>(U); + } + } + // Make sure we visit all uses of FPI, but for nested pads stop as + // soon as we know where they unwind to. + if (CurrentPad != &FPI) + break; + } + if (UnresolvedAncestorPad) { + if (CurrentPad == UnresolvedAncestorPad) { + // When CurrentPad is FPI itself, we don't mark it as resolved even if + // we've found an unwind edge that exits it, because we need to verify + // all direct uses of FPI. + assert(CurrentPad == &FPI); + continue; + } + // Pop off the worklist any nested pads that we've found an unwind + // destination for. The pads on the worklist are the uncles, + // great-uncles, etc. of CurrentPad. We've found an unwind destination + // for all ancestors of CurrentPad up to but not including + // UnresolvedAncestorPad. + Value *ResolvedPad = CurrentPad; + while (!Worklist.empty()) { + Value *UnclePad = Worklist.back(); + Value *AncestorPad = getParentPad(UnclePad); + // Walk ResolvedPad up the ancestor list until we either find the + // uncle's parent or the last resolved ancestor. 
+ while (ResolvedPad != AncestorPad) { + Value *ResolvedParent = getParentPad(ResolvedPad); + if (ResolvedParent == UnresolvedAncestorPad) { + break; + } + ResolvedPad = ResolvedParent; + } + // If the resolved ancestor search didn't find the uncle's parent, + // then the uncle is not yet resolved. + if (ResolvedPad != AncestorPad) + break; + // This uncle is resolved, so pop it from the worklist. + Worklist.pop_back(); + } + } + } + + if (FirstUnwindPad) { + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FPI.getParentPad())) { + BasicBlock *SwitchUnwindDest = CatchSwitch->getUnwindDest(); + Value *SwitchUnwindPad; + if (SwitchUnwindDest) + SwitchUnwindPad = SwitchUnwindDest->getFirstNonPHI(); + else + SwitchUnwindPad = ConstantTokenNone::get(FPI.getContext()); + Assert(SwitchUnwindPad == FirstUnwindPad, + "Unwind edges out of a catch must have the same unwind dest as " + "the parent catchswitch", + &FPI, FirstUser, CatchSwitch); + } + } + + visitInstruction(FPI); +} + +void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { + visitEHPadPredecessors(CatchSwitch); + + BasicBlock *BB = CatchSwitch.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CatchSwitchInst needs to be in a function with a personality.", + &CatchSwitch); + + // The catchswitch instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CatchSwitch, + "CatchSwitchInst not the first non-PHI instruction in the block.", + &CatchSwitch); + + auto *ParentPad = CatchSwitch.getParentPad(); + Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad), + "CatchSwitchInst has an invalid parent.", ParentPad); + + if (BasicBlock *UnwindDest = CatchSwitch.getUnwindDest()) { + Instruction *I = UnwindDest->getFirstNonPHI(); + Assert(I->isEHPad() && !isa<LandingPadInst>(I), + "CatchSwitchInst must unwind to an EH block which is not a " + "landingpad.", + &CatchSwitch); + + // Record catchswitch sibling unwinds for verifySiblingFuncletUnwinds + if (getParentPad(I) == ParentPad) + SiblingFuncletInfo[&CatchSwitch] = &CatchSwitch; + } + + Assert(CatchSwitch.getNumHandlers() != 0, + "CatchSwitchInst cannot have empty handler list", &CatchSwitch); + + for (BasicBlock *Handler : CatchSwitch.handlers()) { + Assert(isa<CatchPadInst>(Handler->getFirstNonPHI()), + "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); + } + + visitTerminatorInst(CatchSwitch); +} + +void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { + Assert(isa<CleanupPadInst>(CRI.getOperand(0)), + "CleanupReturnInst needs to be provided a CleanupPad", &CRI, + CRI.getOperand(0)); + + if (BasicBlock *UnwindDest = CRI.getUnwindDest()) { + Instruction *I = UnwindDest->getFirstNonPHI(); + Assert(I->isEHPad() && !isa<LandingPadInst>(I), + "CleanupReturnInst must unwind to an EH block which is not a " + "landingpad.", + &CRI); + } + + visitTerminatorInst(CRI); +} + void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { Instruction *Op = cast<Instruction>(I.getOperand(i)); // If we have an invalid invoke, don't try to compute the dominance.
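The checkAtomicMemAccessSize() helper added a few hunks above collapses the previously duplicated open-coded checks into one byte-size-and-power-of-two test; clearing the lowest set bit with Size & (Size - 1) yields zero exactly for powers of two. A self-contained sketch of the idiom (hypothetical helper name, not part of this patch):

#include <cassert>
#include <cstdint>

// A nonzero value has a single set bit exactly when it is a power of two;
// subtracting one turns that bit off and sets every bit below it, so the
// AND of the two values is zero only in that case.
bool isByteSizedPowerOfTwo(uint64_t SizeInBits) {
  return SizeInBits >= 8 && (SizeInBits & (SizeInBits - 1)) == 0;
}

int main() {
  assert(isByteSizedPowerOfTwo(8));   // i8
  assert(isByteSizedPowerOfTwo(64));  // i64, double
  assert(!isByteSizedPowerOfTwo(24)); // byte-sized, but not a power of two
  assert(!isByteSizedPowerOfTwo(1));  // i1 is smaller than a byte
  return 0;
}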
@@ -2835,6 +3358,19 @@ void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { "Instruction does not dominate all uses!", Op, &I); } +void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { + Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null " + "apply only to pointer types", &I); + Assert(isa<LoadInst>(I), + "dereferenceable, dereferenceable_or_null apply only to load" + " instructions, use attributes for calls or invokes", &I); + Assert(MD->getNumOperands() == 1, "dereferenceable, dereferenceable_or_null " + "take one operand!", &I); + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(MD->getOperand(0)); + Assert(CI && CI->getType()->isIntegerTy(64), "dereferenceable, " + "dereferenceable_or_null metadata value must be an i64!", &I); +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { @@ -2903,7 +3439,7 @@ void Verifier::visitInstruction(Instruction &I) { " donothing or patchpoint", &I); Assert(F->getParent() == M, "Referencing function in another module!", - &I); + &I, M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) { Assert(OpBB->getParent() == BB->getParent(), "Referring to a basic block in another function!", &I); @@ -2911,7 +3447,7 @@ void Verifier::visitInstruction(Instruction &I) { Assert(OpArg->getParent() == BB->getParent(), "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) { - Assert(GV->getParent() == M, "Referencing global in another module!", &I); + Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent()); } else if (isa<Instruction>(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa<InlineAsm>(I.getOperand(i))) { @@ -2922,22 +3458,7 @@ void Verifier::visitInstruction(Instruction &I) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an // illegal bitcast (inttoptr <constant int> ) - SmallVector<const ConstantExpr *, 4> Stack; - SmallPtrSet<const ConstantExpr *, 4> Visited; - Stack.push_back(CE); - - while (!Stack.empty()) { - const ConstantExpr *V = Stack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - VerifyConstantExprBitcastType(V); - - for (unsigned I = 0, N = V->getNumOperands(); I != N; ++I) { - if (ConstantExpr *Op = dyn_cast<ConstantExpr>(V->getOperand(I))) - Stack.push_back(Op); - } - } + visitConstantExprsRecursively(CE); } } } @@ -2971,6 +3492,28 @@ void Verifier::visitInstruction(Instruction &I) { &I); } + if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable)) + visitDereferenceableMetadata(I, MD); + + if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable_or_null)) + visitDereferenceableMetadata(I, MD); + + if (MDNode *AlignMD = I.getMetadata(LLVMContext::MD_align)) { + Assert(I.getType()->isPointerTy(), "align applies only to pointer types", + &I); + Assert(isa<LoadInst>(I), "align applies only to load instructions, " + "use attributes for calls or invokes", &I); + Assert(AlignMD->getNumOperands() == 1, "align takes one operand!", &I); + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(AlignMD->getOperand(0)); + Assert(CI && CI->getType()->isIntegerTy(64), + "align metadata value must be an i64!", &I); + uint64_t Align = CI->getZExtValue(); + Assert(isPowerOf2_64(Align), + "align metadata value must be a power of 2!", &I); + Assert(Align <= Value::MaximumAlignment, + 
"alignment is larger that implementation defined limit", &I); + } + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { Assert(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N); @@ -2998,6 +3541,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty, case IITDescriptor::Void: return !Ty->isVoidTy(); case IITDescriptor::VarArg: return true; case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::Token: return !Ty->isTokenTy(); case IITDescriptor::Metadata: return !Ty->isMetadataTy(); case IITDescriptor::Half: return !Ty->isHalfTy(); case IITDescriptor::Float: return !Ty->isFloatTy(); @@ -3321,9 +3865,6 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { VerifyStatepoint(CS); break; - case Intrinsic::experimental_gc_result_int: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_ptr: case Intrinsic::experimental_gc_result: { Assert(CS.getParent()->getParent()->hasGC(), "Enclosing function does not use GC.", CS); @@ -3339,9 +3880,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { // Assert that result type matches wrapped callee. const Value *Target = StatepointCS.getArgument(2); - const PointerType *PT = cast<PointerType>(Target->getType()); - const FunctionType *TargetFuncType = - cast<FunctionType>(PT->getElementType()); + auto *PT = cast<PointerType>(Target->getType()); + auto *TargetFuncType = cast<FunctionType>(PT->getElementType()); Assert(CS.getType() == TargetFuncType->getReturnType(), "gc.result result type does not match wrapped callee", CS); break; @@ -3349,22 +3889,22 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { case Intrinsic::experimental_gc_relocate: { Assert(CS.getNumArgOperands() == 3, "wrong number of arguments", CS); + Assert(isa<PointerType>(CS.getType()->getScalarType()), + "gc.relocate must return a pointer or a vector of pointers", CS); + // Check that this relocate is correctly tied to the statepoint // This is case for relocate on the unwinding path of an invoke statepoint - if (ExtractValueInst *ExtractValue = - dyn_cast<ExtractValueInst>(CS.getArgOperand(0))) { - Assert(isa<LandingPadInst>(ExtractValue->getAggregateOperand()), - "gc relocate on unwind path incorrectly linked to the statepoint", - CS); + if (LandingPadInst *LandingPad = + dyn_cast<LandingPadInst>(CS.getArgOperand(0))) { const BasicBlock *InvokeBB = - ExtractValue->getParent()->getUniquePredecessor(); + LandingPad->getParent()->getUniquePredecessor(); // Landingpad relocates should have only one predecessor with invoke // statepoint terminator Assert(InvokeBB, "safepoints should have unique landingpads", - ExtractValue->getParent()); + LandingPad->getParent()); Assert(InvokeBB->getTerminator(), "safepoint block should be well formed", InvokeBB); Assert(isStatepoint(InvokeBB->getTerminator()), @@ -3381,8 +3921,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { // Verify rest of the relocate arguments - GCRelocateOperands Ops(CS); - ImmutableCallSite StatepointCS(Ops.getStatepoint()); + ImmutableCallSite StatepointCS( + cast<GCRelocateInst>(*CS.getInstruction()).getStatepoint()); // Both the base and derived must be piped through the safepoint Value* Base = CS.getArgOperand(1); @@ -3434,20 +3974,29 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { "'gc parameters' section of the statepoint call", CS); - // Relocated value must be a pointer type, but gc_relocate does not need to return the - // same 
pointer type as the relocated pointer. It can be casted to the correct type later - // if it's desired. However, they must have the same address space. - GCRelocateOperands Operands(CS); - Assert(Operands.getDerivedPtr()->getType()->isPointerTy(), + // Relocated value must be either a pointer type or vector-of-pointer type, + // but gc_relocate does not need to return the same pointer type as the + // relocated pointer. It can be casted to the correct type later if it's + // desired. However, they must have the same address space and 'vectorness' + GCRelocateInst &Relocate = cast<GCRelocateInst>(*CS.getInstruction()); + Assert(Relocate.getDerivedPtr()->getType()->getScalarType()->isPointerTy(), "gc.relocate: relocated value must be a gc pointer", CS); - // gc_relocate return type must be a pointer type, and is verified earlier in - // VerifyIntrinsicType(). - Assert(cast<PointerType>(CS.getType())->getAddressSpace() == - cast<PointerType>(Operands.getDerivedPtr()->getType())->getAddressSpace(), + auto ResultType = CS.getType(); + auto DerivedType = Relocate.getDerivedPtr()->getType(); + Assert(ResultType->isVectorTy() == DerivedType->isVectorTy(), + "gc.relocate: vector relocates to vector and pointer to pointer", CS); + Assert(ResultType->getPointerAddressSpace() == + DerivedType->getPointerAddressSpace(), "gc.relocate: relocating a pointer shouldn't change its address space", CS); break; } + case Intrinsic::eh_exceptioncode: + case Intrinsic::eh_exceptionpointer: { + Assert(isa<CatchPadInst>(CS.getArgOperand(0)), + "eh.exceptionpointer argument must be a catchpad", CS); + break; + } }; } @@ -3598,7 +4147,7 @@ void Verifier::verifyTypeRefs() { for (auto *CU : CUs->operands()) if (auto Ts = cast<DICompileUnit>(CU)->getRetainedTypes()) for (DIType *Op : Ts) - if (auto *T = dyn_cast<DICompositeType>(Op)) + if (auto *T = dyn_cast_or_null<DICompositeType>(Op)) if (auto *S = T->getRawIdentifier()) { UnresolvedTypeRefs.erase(S); TypeRefs.insert(std::make_pair(S, T)); diff --git a/contrib/llvm/lib/IRReader/IRReader.cpp b/contrib/llvm/lib/IRReader/IRReader.cpp index 43fee65..9b243fc 100644 --- a/contrib/llvm/lib/IRReader/IRReader.cpp +++ b/contrib/llvm/lib/IRReader/IRReader.cpp @@ -31,11 +31,11 @@ static const char *const TimeIRParsingName = "Parse IR"; static std::unique_ptr<Module> getLazyIRModule(std::unique_ptr<MemoryBuffer> Buffer, SMDiagnostic &Err, - LLVMContext &Context) { + LLVMContext &Context, bool ShouldLazyLoadMetadata) { if (isBitcode((const unsigned char *)Buffer->getBufferStart(), (const unsigned char *)Buffer->getBufferEnd())) { - ErrorOr<std::unique_ptr<Module>> ModuleOrErr = - getLazyBitcodeModule(std::move(Buffer), Context); + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = getLazyBitcodeModule( + std::move(Buffer), Context, ShouldLazyLoadMetadata); if (std::error_code EC = ModuleOrErr.getError()) { Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); @@ -49,7 +49,8 @@ getLazyIRModule(std::unique_ptr<MemoryBuffer> Buffer, SMDiagnostic &Err, std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, - LLVMContext &Context) { + LLVMContext &Context, + bool ShouldLazyLoadMetadata) { ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename); if (std::error_code EC = FileOrErr.getError()) { @@ -58,7 +59,8 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename, return nullptr; } - return getLazyIRModule(std::move(FileOrErr.get()), Err, Context); + return 
getLazyIRModule(std::move(FileOrErr.get()), Err, Context, + ShouldLazyLoadMetadata); } std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, diff --git a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp index 25ae4ac..66df23b 100644 --- a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/CodeGen/ParallelCG.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/Config/config.h" #include "llvm/IR/Constants.h" @@ -63,47 +64,15 @@ const char* LTOCodeGenerator::getVersionString() { #endif } -static void handleLTODiagnostic(const DiagnosticInfo &DI) { - DiagnosticPrinterRawOStream DP(errs()); - DI.print(DP); - errs() << "\n"; -} - -LTOCodeGenerator::LTOCodeGenerator() - : Context(getGlobalContext()), IRLinker(new Module("ld-temp.o", Context), - handleLTODiagnostic) { - initializeLTOPasses(); -} - -LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr<LLVMContext> Context) - : OwnedContext(std::move(Context)), Context(*OwnedContext), - IRLinker(new Module("ld-temp.o", *OwnedContext), handleLTODiagnostic) { +LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) + : Context(Context), MergedModule(new Module("ld-temp.o", Context)), + TheLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } -void LTOCodeGenerator::destroyMergedModule() { - if (OwnedModule) { - assert(IRLinker.getModule() == &OwnedModule->getModule() && - "The linker's module should be the same as the owned module"); - delete OwnedModule; - OwnedModule = nullptr; - } else if (IRLinker.getModule()) - IRLinker.deleteModule(); -} - -LTOCodeGenerator::~LTOCodeGenerator() { - destroyMergedModule(); +LTOCodeGenerator::~LTOCodeGenerator() {} - delete TargetMach; - TargetMach = nullptr; - - for (std::vector<char *>::iterator I = CodegenOptions.begin(), - E = CodegenOptions.end(); - I != E; ++I) - free(*I); -} - -// Initialize LTO passes. Please keep this funciton in sync with +// Initialize LTO passes. Please keep this function in sync with // PassManagerBuilder::populateLTOPassManager(), and make sure all LTO // passes are initialized. 
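The LTOCodeGenerator rework in this file hands the LLVMContext in from the caller, keeps the merged module and Linker inside the codegen object, reports failures through emitError() and the diagnostic handler instead of std::string out-parameters, and moves the opt-level mapping into setOptLevel(). A hypothetical driver built only from the signatures visible in this patch (error handling abbreviated; a sketch, not the actual llvm-lto tool):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/LTO/LTOCodeGenerator.h"
#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/MemoryBuffer.h"

using namespace llvm;

std::unique_ptr<MemoryBuffer> linkAndCompile(LLVMContext &Ctx,
                                             ArrayRef<const char *> Paths,
                                             TargetOptions Opts) {
  LTOCodeGenerator CG(Ctx); // merged module and Linker now live inside CG
  for (const char *Path : Paths) {
    ErrorOr<std::unique_ptr<LTOModule>> M =
        LTOModule::createFromFile(Ctx, Path, Opts);
    if (!M)
      return nullptr; // createFromFile reports via ErrorOr, not errMsg
    if (!CG.addModule(M->get())) // linkInModule takes the module from Mod
      return nullptr;
  }
  CG.setOptLevel(2); // mapped to CodeGenOpt::Default by setOptLevel()
  return CG.compile(/*DisableVerify=*/false, /*DisableInline=*/false,
                    /*DisableGVNLoadPRE=*/false,
                    /*DisableVectorization=*/false);
}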
void LTOCodeGenerator::initializeLTOPasses() { @@ -120,11 +89,12 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeGlobalDCEPass(R); initializeArgPromotionPass(R); initializeJumpThreadingPass(R); - initializeSROAPass(R); + initializeSROALegacyPassPass(R); initializeSROA_DTPass(R); initializeSROA_SSAUpPass(R); - initializeFunctionAttrsPass(R); - initializeGlobalsModRefPass(R); + initializePostOrderFunctionAttrsPass(R); + initializeReversePostOrderFunctionAttrsPass(R); + initializeGlobalsAAWrapperPassPass(R); initializeLICMPass(R); initializeMergedLoadStoreMotionPass(R); initializeGVNPass(R); @@ -133,41 +103,39 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeCFGSimplifyPassPass(R); } -bool LTOCodeGenerator::addModule(LTOModule *mod) { - assert(&mod->getModule().getContext() == &Context && +bool LTOCodeGenerator::addModule(LTOModule *Mod) { + assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - bool ret = IRLinker.linkInModule(&mod->getModule()); + bool ret = TheLinker->linkInModule(Mod->takeModule()); - const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs(); + const std::vector<const char *> &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) AsmUndefinedRefs[undefs[i]] = 1; return !ret; } -void LTOCodeGenerator::setModule(LTOModule *Mod) { +void LTOCodeGenerator::setModule(std::unique_ptr<LTOModule> Mod) { assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - // Delete the old merged module. - destroyMergedModule(); AsmUndefinedRefs.clear(); - OwnedModule = Mod; - IRLinker.setModule(&Mod->getModule()); + MergedModule = Mod->takeModule(); + TheLinker = make_unique<Linker>(*MergedModule); const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) AsmUndefinedRefs[Undefs[I]] = 1; } -void LTOCodeGenerator::setTargetOptions(TargetOptions options) { - Options = options; +void LTOCodeGenerator::setTargetOptions(TargetOptions Options) { + this->Options = Options; } -void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) { - switch (debug) { +void LTOCodeGenerator::setDebugInfo(lto_debug_model Debug) { + switch (Debug) { case LTO_DEBUG_MODEL_NONE: EmitDwarfDebugInfo = false; return; @@ -179,21 +147,26 @@ void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) { llvm_unreachable("Unknown debug format!"); } -void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) { - switch (model) { - case LTO_CODEGEN_PIC_MODEL_STATIC: - case LTO_CODEGEN_PIC_MODEL_DYNAMIC: - case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: - case LTO_CODEGEN_PIC_MODEL_DEFAULT: - CodeModel = model; - return; +void LTOCodeGenerator::setOptLevel(unsigned Level) { + OptLevel = Level; + switch (OptLevel) { + case 0: + CGOptLevel = CodeGenOpt::None; + break; + case 1: + CGOptLevel = CodeGenOpt::Less; + break; + case 2: + CGOptLevel = CodeGenOpt::Default; + break; + case 3: + CGOptLevel = CodeGenOpt::Aggressive; + break; } - llvm_unreachable("Unknown PIC model!"); } -bool LTOCodeGenerator::writeMergedModules(const char *path, - std::string &errMsg) { - if (!determineTarget(errMsg)) +bool LTOCodeGenerator::writeMergedModules(const char *Path) { + if (!determineTarget()) return false; // mark which symbols can not be internalized @@ -201,20 +174,22 @@ bool LTOCodeGenerator::writeMergedModules(const char *path, // create output file std::error_code EC; - tool_output_file Out(path, EC, sys::fs::F_None); + tool_output_file Out(Path, EC, 
sys::fs::F_None); if (EC) { - errMsg = "could not open bitcode file for writing: "; - errMsg += path; + std::string ErrMsg = "could not open bitcode file for writing: "; + ErrMsg += Path; + emitError(ErrMsg); return false; } // write bitcode to it - WriteBitcodeToFile(IRLinker.getModule(), Out.os(), ShouldEmbedUselists); + WriteBitcodeToFile(MergedModule.get(), Out.os(), ShouldEmbedUselists); Out.os().close(); if (Out.os().has_error()) { - errMsg = "could not write bitcode file: "; - errMsg += path; + std::string ErrMsg = "could not write bitcode file: "; + ErrMsg += Path; + emitError(ErrMsg); Out.os().clear_error(); return false; } @@ -223,22 +198,25 @@ bool LTOCodeGenerator::writeMergedModules(const char *path, return true; } -bool LTOCodeGenerator::compileOptimizedToFile(const char **name, - std::string &errMsg) { - // make unique temp .o file to put generated object file +bool LTOCodeGenerator::compileOptimizedToFile(const char **Name) { + // make unique temp output file to put generated code SmallString<128> Filename; int FD; + + const char *Extension = + (FileType == TargetMachine::CGFT_AssemblyFile ? "s" : "o"); + std::error_code EC = - sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename); + sys::fs::createTemporaryFile("lto-llvm", Extension, FD, Filename); if (EC) { - errMsg = EC.message(); + emitError(EC.message()); return false; } // generate object file tool_output_file objFile(Filename.c_str(), FD); - bool genResult = compileOptimized(objFile.os(), errMsg); + bool genResult = compileOptimized(&objFile.os()); objFile.os().close(); if (objFile.os().has_error()) { objFile.os().clear_error(); @@ -253,21 +231,21 @@ bool LTOCodeGenerator::compileOptimizedToFile(const char **name, } NativeObjectPath = Filename.c_str(); - *name = NativeObjectPath.c_str(); + *Name = NativeObjectPath.c_str(); return true; } std::unique_ptr<MemoryBuffer> -LTOCodeGenerator::compileOptimized(std::string &errMsg) { +LTOCodeGenerator::compileOptimized() { const char *name; - if (!compileOptimizedToFile(&name, errMsg)) + if (!compileOptimizedToFile(&name)) return nullptr; // read .o file into memory buffer ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getFile(name, -1, false); if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); + emitError(EC.message()); sys::fs::remove(NativeObjectPath); return nullptr; } @@ -278,66 +256,51 @@ LTOCodeGenerator::compileOptimized(std::string &errMsg) { return std::move(*BufferOrErr); } - -bool LTOCodeGenerator::compile_to_file(const char **name, - bool disableInline, - bool disableGVNLoadPRE, - bool disableVectorization, - std::string &errMsg) { - if (!optimize(disableInline, disableGVNLoadPRE, - disableVectorization, errMsg)) +bool LTOCodeGenerator::compile_to_file(const char **Name, bool DisableVerify, + bool DisableInline, + bool DisableGVNLoadPRE, + bool DisableVectorization) { + if (!optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, + DisableVectorization)) return false; - return compileOptimizedToFile(name, errMsg); + return compileOptimizedToFile(Name); } std::unique_ptr<MemoryBuffer> -LTOCodeGenerator::compile(bool disableInline, bool disableGVNLoadPRE, - bool disableVectorization, std::string &errMsg) { - if (!optimize(disableInline, disableGVNLoadPRE, - disableVectorization, errMsg)) +LTOCodeGenerator::compile(bool DisableVerify, bool DisableInline, + bool DisableGVNLoadPRE, bool DisableVectorization) { + if (!optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, + DisableVectorization)) return nullptr; - 
return compileOptimized(errMsg); + return compileOptimized(); } -bool LTOCodeGenerator::determineTarget(std::string &errMsg) { +bool LTOCodeGenerator::determineTarget() { if (TargetMach) return true; - std::string TripleStr = IRLinker.getModule()->getTargetTriple(); - if (TripleStr.empty()) + std::string TripleStr = MergedModule->getTargetTriple(); + if (TripleStr.empty()) { TripleStr = sys::getDefaultTargetTriple(); + MergedModule->setTargetTriple(TripleStr); + } llvm::Triple Triple(TripleStr); // create target machine from info for merged modules - const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); - if (!march) + std::string ErrMsg; + const Target *march = TargetRegistry::lookupTarget(TripleStr, ErrMsg); + if (!march) { + emitError(ErrMsg); return false; - - // The relocation model is actually a static member of TargetMachine and - // needs to be set before the TargetMachine is instantiated. - Reloc::Model RelocModel = Reloc::Default; - switch (CodeModel) { - case LTO_CODEGEN_PIC_MODEL_STATIC: - RelocModel = Reloc::Static; - break; - case LTO_CODEGEN_PIC_MODEL_DYNAMIC: - RelocModel = Reloc::PIC_; - break; - case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: - RelocModel = Reloc::DynamicNoPIC; - break; - case LTO_CODEGEN_PIC_MODEL_DEFAULT: - // RelocModel is already the default, so leave it that way. - break; } // Construct LTOModule, hand over ownership of module and target. Use MAttr as // the default set of features. SubtargetFeatures Features(MAttr); Features.getDefaultSubtargetFeatures(Triple); - std::string FeatureStr = Features.getString(); + FeatureStr = Features.getString(); // Set a default CPU for Darwin triples. if (MCpu.empty() && Triple.isOSDarwin()) { if (Triple.getArch() == llvm::Triple::x86_64) @@ -348,25 +311,9 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "cyclone"; } - CodeGenOpt::Level CGOptLevel; - switch (OptLevel) { - case 0: - CGOptLevel = CodeGenOpt::None; - break; - case 1: - CGOptLevel = CodeGenOpt::Less; - break; - case 2: - CGOptLevel = CodeGenOpt::Default; - break; - case 3: - CGOptLevel = CodeGenOpt::Aggressive; - break; - } - - TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options, - RelocModel, CodeModel::Default, - CGOptLevel); + TargetMach.reset(march->createTargetMachine(TripleStr, MCpu, FeatureStr, + Options, RelocModel, + CodeModel::Default, CGOptLevel)); return true; } @@ -453,7 +400,6 @@ static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls, void LTOCodeGenerator::applyScopeRestrictions() { if (ScopeRestrictionsDone || !ShouldInternalize) return; - Module *mergedModule = IRLinker.getModule(); // Start off with a verification pass. 
legacy::PassManager passes; @@ -467,20 +413,17 @@ void LTOCodeGenerator::applyScopeRestrictions() { TargetLibraryInfoImpl TLII(Triple(TargetMach->getTargetTriple())); TargetLibraryInfo TLI(TLII); - accumulateAndSortLibcalls(Libcalls, TLI, *mergedModule, *TargetMach); + accumulateAndSortLibcalls(Libcalls, TLI, *MergedModule, *TargetMach); - for (Module::iterator f = mergedModule->begin(), - e = mergedModule->end(); f != e; ++f) - applyRestriction(*f, Libcalls, MustPreserveList, AsmUsed, Mangler); - for (Module::global_iterator v = mergedModule->global_begin(), - e = mergedModule->global_end(); v != e; ++v) - applyRestriction(*v, Libcalls, MustPreserveList, AsmUsed, Mangler); - for (Module::alias_iterator a = mergedModule->alias_begin(), - e = mergedModule->alias_end(); a != e; ++a) - applyRestriction(*a, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (Function &f : *MergedModule) + applyRestriction(f, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (GlobalVariable &v : MergedModule->globals()) + applyRestriction(v, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (GlobalAlias &a : MergedModule->aliases()) + applyRestriction(a, Libcalls, MustPreserveList, AsmUsed, Mangler); GlobalVariable *LLVMCompilerUsed = - mergedModule->getGlobalVariable("llvm.compiler.used"); + MergedModule->getGlobalVariable("llvm.compiler.used"); findUsedValues(LLVMCompilerUsed, AsmUsed); if (LLVMCompilerUsed) LLVMCompilerUsed->eraseFromParent(); @@ -495,7 +438,7 @@ void LTOCodeGenerator::applyScopeRestrictions() { llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size()); LLVMCompilerUsed = - new llvm::GlobalVariable(*mergedModule, ATy, false, + new llvm::GlobalVariable(*MergedModule, ATy, false, llvm::GlobalValue::AppendingLinkage, llvm::ConstantArray::get(ATy, asmUsed2), "llvm.compiler.used"); @@ -506,21 +449,18 @@ void LTOCodeGenerator::applyScopeRestrictions() { passes.add(createInternalizePass(MustPreserveList)); // apply scope restrictions - passes.run(*mergedModule); + passes.run(*MergedModule); ScopeRestrictionsDone = true; } /// Optimize merged modules using various IPO passes -bool LTOCodeGenerator::optimize(bool DisableInline, +bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline, bool DisableGVNLoadPRE, - bool DisableVectorization, - std::string &errMsg) { - if (!this->determineTarget(errMsg)) + bool DisableVectorization) { + if (!this->determineTarget()) return false; - Module *mergedModule = IRLinker.getModule(); - // Mark which symbols can not be internalized this->applyScopeRestrictions(); @@ -528,7 +468,7 @@ bool LTOCodeGenerator::optimize(bool DisableInline, legacy::PassManager passes; // Add an appropriate DataLayout instance for this module... - mergedModule->setDataLayout(*TargetMach->getDataLayout()); + MergedModule->setDataLayout(TargetMach->createDataLayout()); passes.add( createTargetTransformInfoWrapperPass(TargetMach->getTargetIRAnalysis())); @@ -542,60 +482,57 @@ bool LTOCodeGenerator::optimize(bool DisableInline, PMB.Inliner = createFunctionInliningPass(); PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple); PMB.OptLevel = OptLevel; - PMB.VerifyInput = true; - PMB.VerifyOutput = true; + PMB.VerifyInput = !DisableVerify; + PMB.VerifyOutput = !DisableVerify; PMB.populateLTOPassManager(passes); // Run our queue of passes all at once now, efficiently. 
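For reference, the PassManagerBuilder configuration that optimize() assembles here can be restated as a standalone sketch; the DataLayout, TargetLibraryInfo, and TargetTransformInfo wiring of the real code is omitted, and the Verify flag plays the role of !DisableVerify from the patch:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

// Build and run a minimal LTO pipeline the way optimize() does.
void runLTOPipeline(Module &M, unsigned OptLevel, bool Verify) {
  legacy::PassManager PM;
  PassManagerBuilder PMB;
  PMB.Inliner = createFunctionInliningPass();
  PMB.OptLevel = OptLevel;
  PMB.VerifyInput = Verify;  // the patch sets these to !DisableVerify
  PMB.VerifyOutput = Verify;
  PMB.populateLTOPassManager(PM);
  PM.run(M);
}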
- passes.run(*mergedModule); + passes.run(*MergedModule); return true; } -bool LTOCodeGenerator::compileOptimized(raw_pwrite_stream &out, - std::string &errMsg) { - if (!this->determineTarget(errMsg)) +bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) { + if (!this->determineTarget()) return false; - Module *mergedModule = IRLinker.getModule(); - - legacy::PassManager codeGenPasses; + legacy::PassManager preCodeGenPasses; // If the bitcode files contain ARC code and were compiled with optimization, // the ObjCARCContractPass must be run, so do it unconditionally here. - codeGenPasses.add(createObjCARCContractPass()); - - if (TargetMach->addPassesToEmitFile(codeGenPasses, out, - TargetMachine::CGFT_ObjectFile)) { - errMsg = "target file type not supported"; - return false; - } - - // Run the code generator, and write assembly file - codeGenPasses.run(*mergedModule); + preCodeGenPasses.add(createObjCARCContractPass()); + preCodeGenPasses.run(*MergedModule); + + // Do code generation. We need to preserve the module in case the client calls + // writeMergedModules() after compilation, but we only need to allow this at + // parallelism level 1. This is achieved by having splitCodeGen return the + // original module at parallelism level 1 which we then assign back to + // MergedModule. + MergedModule = + splitCodeGen(std::move(MergedModule), Out, MCpu, FeatureStr, Options, + RelocModel, CodeModel::Default, CGOptLevel, FileType); return true; } /// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging /// LTO problems. -void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) { - for (std::pair<StringRef, StringRef> o = getToken(options); - !o.first.empty(); o = getToken(o.second)) { - // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add - // that. - if (CodegenOptions.empty()) - CodegenOptions.push_back(strdup("libLLVMLTO")); - CodegenOptions.push_back(strdup(o.first.str().c_str())); - } +void LTOCodeGenerator::setCodeGenDebugOptions(const char *Options) { + for (std::pair<StringRef, StringRef> o = getToken(Options); !o.first.empty(); + o = getToken(o.second)) + CodegenOptions.push_back(o.first); } void LTOCodeGenerator::parseCodeGenDebugOptions() { // if options were requested, set them - if (!CodegenOptions.empty()) - cl::ParseCommandLineOptions(CodegenOptions.size(), - const_cast<char **>(&CodegenOptions[0])); + if (!CodegenOptions.empty()) { + // ParseCommandLineOptions() expects argv[0] to be program name. 
+ std::vector<const char *> CodegenArgv(1, "libLLVMLTO"); + for (std::string &Arg : CodegenOptions) + CodegenArgv.push_back(Arg.c_str()); + cl::ParseCommandLineOptions(CodegenArgv.size(), CodegenArgv.data()); + } } void LTOCodeGenerator::DiagnosticHandler(const DiagnosticInfo &DI, @@ -645,3 +582,20 @@ LTOCodeGenerator::setDiagnosticHandler(lto_diagnostic_handler_t DiagHandler, Context.setDiagnosticHandler(LTOCodeGenerator::DiagnosticHandler, this, /* RespectFilters */ true); } + +namespace { +class LTODiagnosticInfo : public DiagnosticInfo { + const Twine &Msg; +public: + LTODiagnosticInfo(const Twine &DiagMsg, DiagnosticSeverity Severity=DS_Error) + : DiagnosticInfo(DK_Linker, Severity), Msg(DiagMsg) {} + void print(DiagnosticPrinter &DP) const override { DP << Msg; } +}; +} + +void LTOCodeGenerator::emitError(const std::string &ErrMsg) { + if (DiagHandler) + (*DiagHandler)(LTO_DS_ERROR, ErrMsg.c_str(), DiagContext); + else + Context.diagnose(LTODiagnosticInfo(ErrMsg)); +} diff --git a/contrib/llvm/lib/LTO/LTOModule.cpp b/contrib/llvm/lib/LTO/LTOModule.cpp index 53ed417..409b949 100644 --- a/contrib/llvm/lib/LTO/LTOModule.cpp +++ b/contrib/llvm/lib/LTO/LTOModule.cpp @@ -91,106 +91,97 @@ bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer, return StringRef(Triple).startswith(TriplePrefix); } -LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options, - std::string &errMsg) { +std::string LTOModule::getProducerString(MemoryBuffer *Buffer) { + ErrorOr<MemoryBufferRef> BCOrErr = + IRObjectFile::findBitcodeInMemBuffer(Buffer->getMemBufferRef()); + if (!BCOrErr) + return ""; + LLVMContext Context; + return getBitcodeProducerString(*BCOrErr, Context); +} + +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromFile(LLVMContext &Context, const char *path, + TargetOptions options) { ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getFile(path); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg) { - return createFromOpenFileSlice(fd, path, size, 0, options, errMsg); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options) { + return createFromOpenFileSlice(Context, fd, path, size, 0, options); } -LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromOpenFileSlice(LLVMContext &Context, int fd, + const char *path, size_t map_size, + off_t offset, TargetOptions options) { ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return 
makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path) { - return createInContext(mem, length, options, errMsg, path, - &getGlobalContext()); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromBuffer(LLVMContext &Context, const void *mem, + size_t length, TargetOptions options, + StringRef path) { + return createInContext(mem, length, options, path, &Context); } -LTOModule *LTOModule::createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, - StringRef path) { - return createInContext(mem, length, options, errMsg, path, nullptr); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createInLocalContext(const void *mem, size_t length, + TargetOptions options, StringRef path) { + return createInContext(mem, length, options, path, nullptr); } -LTOModule *LTOModule::createInContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path, - LLVMContext *Context) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createInContext(const void *mem, size_t length, + TargetOptions options, StringRef path, + LLVMContext *Context) { StringRef Data((const char *)mem, length); MemoryBufferRef Buffer(Data, path); - return makeLTOModule(Buffer, options, errMsg, Context); + return makeLTOModule(Buffer, options, Context); } -static std::unique_ptr<Module> parseBitcodeFileImpl(MemoryBufferRef Buffer, - LLVMContext &Context, - bool ShouldBeLazy, - std::string &ErrMsg) { +static ErrorOr<std::unique_ptr<Module>> +parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context, + bool ShouldBeLazy) { // Find the buffer. ErrorOr<MemoryBufferRef> MBOrErr = IRObjectFile::findBitcodeInMemBuffer(Buffer); - if (std::error_code EC = MBOrErr.getError()) { - ErrMsg = EC.message(); - return nullptr; - } - - std::function<void(const DiagnosticInfo &)> DiagnosticHandler = - [&ErrMsg](const DiagnosticInfo &DI) { - raw_string_ostream Stream(ErrMsg); - DiagnosticPrinterRawOStream DP(Stream); - DI.print(DP); - }; + if (std::error_code EC = MBOrErr.getError()) + return EC; if (!ShouldBeLazy) { // Parse the full file. - ErrorOr<std::unique_ptr<Module>> M = - parseBitcodeFile(*MBOrErr, Context, DiagnosticHandler); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> M = parseBitcodeFile(*MBOrErr, Context); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } // Parse lazily. 
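parseBitcodeFileImpl() above parses lazily exactly when the module is only being used for symbol extraction; combined with the ShouldLazyLoadMetadata flag threaded through IRReader earlier in this diff, such a client defers both function bodies and metadata. A small usage sketch of the new flag (assumed typical use, not taken from the patch):

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"

using namespace llvm;

// Open a module for symbol scanning only: bodies stay unmaterialized and,
// with the new flag, metadata parsing is deferred as well.
std::unique_ptr<Module> loadForSymbols(StringRef Path, LLVMContext &Ctx) {
  SMDiagnostic Err;
  return getLazyIRFileModule(Path, Err, Ctx,
                             /*ShouldLazyLoadMetadata=*/true);
}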
std::unique_ptr<MemoryBuffer> LightweightBuf = MemoryBuffer::getMemBuffer(*MBOrErr, false); - ErrorOr<std::unique_ptr<Module>> M = - getLazyBitcodeModule(std::move(LightweightBuf), Context, - DiagnosticHandler, true /*ShouldLazyLoadMetadata*/); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> M = getLazyBitcodeModule( + std::move(LightweightBuf), Context, true /*ShouldLazyLoadMetadata*/); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } -LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, - TargetOptions options, std::string &errMsg, - LLVMContext *Context) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context) { std::unique_ptr<LLVMContext> OwnedContext; if (!Context) { OwnedContext = llvm::make_unique<LLVMContext>(); @@ -199,11 +190,12 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, // If we own a context, we know this is being used only for symbol // extraction, not linking. Be lazy in that case. - std::unique_ptr<Module> M = parseBitcodeFileImpl( - Buffer, *Context, - /* ShouldBeLazy */ static_cast<bool>(OwnedContext), errMsg); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> MOrErr = + parseBitcodeFileImpl(Buffer, *Context, + /* ShouldBeLazy */ static_cast<bool>(OwnedContext)); + if (std::error_code EC = MOrErr.getError()) + return EC; + std::unique_ptr<Module> &M = *MOrErr; std::string TripleStr = M->getTargetTriple(); if (TripleStr.empty()) @@ -211,9 +203,10 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, llvm::Triple Triple(TripleStr); // find machine architecture for this module + std::string errMsg; const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); if (!march) - return nullptr; + return std::unique_ptr<LTOModule>(nullptr); // construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; @@ -232,25 +225,21 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr, options); - M->setDataLayout(*target->getDataLayout()); + M->setDataLayout(target->createDataLayout()); std::unique_ptr<object::IRObjectFile> IRObj( new object::IRObjectFile(Buffer, std::move(M))); - LTOModule *Ret; + std::unique_ptr<LTOModule> Ret; if (OwnedContext) - Ret = new LTOModule(std::move(IRObj), target, std::move(OwnedContext)); + Ret.reset(new LTOModule(std::move(IRObj), target, std::move(OwnedContext))); else - Ret = new LTOModule(std::move(IRObj), target); - - if (Ret->parseSymbols(errMsg)) { - delete Ret; - return nullptr; - } + Ret.reset(new LTOModule(std::move(IRObj), target)); + Ret->parseSymbols(); Ret->parseMetadata(); - return Ret; + return std::move(Ret); } /// Create a MemoryBuffer from a memory range with an optional name. @@ -583,9 +572,7 @@ void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, info.symbol = decl; } -/// parseSymbols - Parse the symbols from the module and model-level ASM and add -/// them to either the defined or undefined lists. 
-bool LTOModule::parseSymbols(std::string &errMsg) { +void LTOModule::parseSymbols() { for (auto &Sym : IRFile->symbols()) { const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); uint32_t Flags = Sym.getFlags(); @@ -640,8 +627,6 @@ bool LTOModule::parseSymbols(std::string &errMsg) { NameAndAttributes info = u->getValue(); _symbols.push_back(info); } - - return false; } /// parseMetadata - Parse metadata from the module diff --git a/contrib/llvm/lib/LibDriver/LibDriver.cpp b/contrib/llvm/lib/LibDriver/LibDriver.cpp index b33a22f..3ae5434 100644 --- a/contrib/llvm/lib/LibDriver/LibDriver.cpp +++ b/contrib/llvm/lib/LibDriver/LibDriver.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link2 /lib. +// bitcode files. Used by llvm-lib and lld-link /lib. // //===----------------------------------------------------------------------===// @@ -51,7 +51,7 @@ static const llvm::opt::OptTable::Info infoTable[] = { class LibOptTable : public llvm::opt::OptTable { public: - LibOptTable() : OptTable(infoTable, llvm::array_lengthof(infoTable), true) {} + LibOptTable() : OptTable(infoTable, true) {} }; } @@ -102,7 +102,7 @@ static Optional<std::string> findInputFile(StringRef File, int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) { SmallVector<const char *, 20> NewArgs(ArgsArr.begin(), ArgsArr.end()); BumpPtrAllocator Alloc; - BumpPtrStringSaver Saver(Alloc); + StringSaver Saver(Alloc); cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); ArgsArr = NewArgs; @@ -135,14 +135,13 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) { llvm::errs() << Arg->getValue() << ": no such file or directory\n"; return 1; } - Members.emplace_back(Saver.save(*Path), - llvm::sys::path::filename(Arg->getValue())); + Members.emplace_back(Saver.save(*Path)); } std::pair<StringRef, std::error_code> Result = llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, /*WriteSymtab=*/true, object::Archive::K_GNU, - /*Deterministic*/ true); + /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); if (Result.second) { if (Result.first.empty()) diff --git a/contrib/llvm/lib/LibDriver/Options.td b/contrib/llvm/lib/LibDriver/Options.td index 0aa1aff..5a56ef7 100644 --- a/contrib/llvm/lib/LibDriver/Options.td +++ b/contrib/llvm/lib/LibDriver/Options.td @@ -12,6 +12,8 @@ class P<string name, string help> : def libpath: P<"libpath", "Object file search path">; def out : P<"out", "Path to file to write output">; +def llvmlibthin : F<"llvmlibthin">; + //============================================================================== // The flags below do nothing. They are defined only for lib.exe compatibility. //============================================================================== diff --git a/contrib/llvm/lib/Linker/IRMover.cpp b/contrib/llvm/lib/Linker/IRMover.cpp new file mode 100644 index 0000000..8dd59f9 --- /dev/null +++ b/contrib/llvm/lib/Linker/IRMover.cpp @@ -0,0 +1,1698 @@ +//===- lib/Linker/IRMover.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker/IRMover.h"
+#include "LinkDiagnosticInfo.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GVMaterializer.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TypeMap implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class TypeMapTy : public ValueMapTypeRemapper {
+  /// This is a mapping from a source type to a destination type to use.
+  DenseMap<Type *, Type *> MappedTypes;
+
+  /// When checking to see if two subgraphs are isomorphic, we speculatively
+  /// add types to MappedTypes, but keep track of them here in case we need to
+  /// roll back.
+  SmallVector<Type *, 16> SpeculativeTypes;
+
+  SmallVector<StructType *, 16> SpeculativeDstOpaqueTypes;
+
+  /// This is a list of non-opaque structs in the source module that are mapped
+  /// to an opaque struct in the destination module.
+  SmallVector<StructType *, 16> SrcDefinitionsToResolve;
+
+  /// This is the set of opaque types in the destination module that are
+  /// getting a body from the source module.
+  SmallPtrSet<StructType *, 16> DstResolvedOpaqueTypes;
+
+public:
+  TypeMapTy(IRMover::IdentifiedStructTypeSet &DstStructTypesSet)
+      : DstStructTypesSet(DstStructTypesSet) {}
+
+  IRMover::IdentifiedStructTypeSet &DstStructTypesSet;
+
+  /// Indicate that the specified type in the destination module is
+  /// conceptually equivalent to the specified type in the source module.
+  void addTypeMapping(Type *DstTy, Type *SrcTy);
+
+  /// Produce a body for an opaque type in the dest module from a type
+  /// definition in the source module.
+  void linkDefinedTypeBodies();
+
+  /// Return the mapped type to use for the specified input type from the
+  /// source module.
+  Type *get(Type *SrcTy);
+  Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
+
+  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
+
+  FunctionType *get(FunctionType *T) {
+    return cast<FunctionType>(get((Type *)T));
+  }
+
+private:
+  Type *remapType(Type *SrcTy) override { return get(SrcTy); }
+
+  bool areTypesIsomorphic(Type *DstTy, Type *SrcTy);
+};
+}
+
+void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) {
+  assert(SpeculativeTypes.empty());
+  assert(SpeculativeDstOpaqueTypes.empty());
+
+  // Check to see if these types are recursively isomorphic and establish a
+  // mapping between them if so.
+  if (!areTypesIsomorphic(DstTy, SrcTy)) {
+    // Oops, they aren't isomorphic. Just discard this request by rolling back
+    // any speculative mappings we've established.
+    for (Type *Ty : SpeculativeTypes)
+      MappedTypes.erase(Ty);
+
+    SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() -
+                                   SpeculativeDstOpaqueTypes.size());
+    for (StructType *Ty : SpeculativeDstOpaqueTypes)
+      DstResolvedOpaqueTypes.erase(Ty);
+  } else {
+    for (Type *Ty : SpeculativeTypes)
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        if (STy->hasName())
+          STy->setName("");
+  }
+  SpeculativeTypes.clear();
+  SpeculativeDstOpaqueTypes.clear();
+}
+
+/// Recursively walk this pair of types, returning true if they are isomorphic,
+/// false if they are not.
+bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { + // Two types with differing kinds are clearly not isomorphic. + if (DstTy->getTypeID() != SrcTy->getTypeID()) + return false; + + // If we have an entry in the MappedTypes table, then we have our answer. + Type *&Entry = MappedTypes[SrcTy]; + if (Entry) + return Entry == DstTy; + + // Two identical types are clearly isomorphic. Remember this + // non-speculatively. + if (DstTy == SrcTy) { + Entry = DstTy; + return true; + } + + // Okay, we have two types with identical kinds that we haven't seen before. + + // If this is an opaque struct type, special case it. + if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) { + // Mapping an opaque type to any struct, just keep the dest struct. + if (SSTy->isOpaque()) { + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + return true; + } + + // Mapping a non-opaque source type to an opaque dest. If this is the first + // type that we're mapping onto this destination type then we succeed. Keep + // the dest, but fill it in later. If this is the second (different) type + // that we're trying to map onto the same opaque type then we fail. + if (cast<StructType>(DstTy)->isOpaque()) { + // We can only map one source type onto the opaque destination type. + if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second) + return false; + SrcDefinitionsToResolve.push_back(SSTy); + SpeculativeTypes.push_back(SrcTy); + SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy)); + Entry = DstTy; + return true; + } + } + + // If the number of subtypes disagree between the two types, then we fail. + if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) + return false; + + // Fail if any of the extra properties (e.g. array size) of the type disagree. + if (isa<IntegerType>(DstTy)) + return false; // bitwidth disagrees. + if (PointerType *PT = dyn_cast<PointerType>(DstTy)) { + if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace()) + return false; + + } else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) { + if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg()) + return false; + } else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) { + StructType *SSTy = cast<StructType>(SrcTy); + if (DSTy->isLiteral() != SSTy->isLiteral() || + DSTy->isPacked() != SSTy->isPacked()) + return false; + } else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) { + if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) + return false; + } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { + if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements()) + return false; + } + + // Otherwise, we speculate that these two types will line up and recursively + // check the subelements. + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + + for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) + if (!areTypesIsomorphic(DstTy->getContainedType(I), + SrcTy->getContainedType(I))) + return false; + + // If everything seems to have lined up, then everything is great. + return true; +} + +void TypeMapTy::linkDefinedTypeBodies() { + SmallVector<Type *, 16> Elements; + for (StructType *SrcSTy : SrcDefinitionsToResolve) { + StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]); + assert(DstSTy->isOpaque()); + + // Map the body of the source type over to a new body for the dest type. 
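+    // An illustrative pair of types (hypothetical names, not from any real
+    // module): if the source defines
+    //   %T = type { i32, %U* }
+    // while the destination only has the opaque declaration
+    //   %T = type opaque
+    // then each source element is remapped through get() below and the
+    // destination %T finally receives the translated body.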
+    Elements.resize(SrcSTy->getNumElements());
+    for (unsigned I = 0, E = Elements.size(); I != E; ++I)
+      Elements[I] = get(SrcSTy->getElementType(I));
+
+    DstSTy->setBody(Elements, SrcSTy->isPacked());
+    DstStructTypesSet.switchToNonOpaque(DstSTy);
+  }
+  SrcDefinitionsToResolve.clear();
+  DstResolvedOpaqueTypes.clear();
+}
+
+void TypeMapTy::finishType(StructType *DTy, StructType *STy,
+                           ArrayRef<Type *> ETypes) {
+  DTy->setBody(ETypes, STy->isPacked());
+
+  // Steal STy's name.
+  if (STy->hasName()) {
+    SmallString<16> TmpName = STy->getName();
+    STy->setName("");
+    DTy->setName(TmpName);
+  }
+
+  DstStructTypesSet.addNonOpaque(DTy);
+}
+
+Type *TypeMapTy::get(Type *Ty) {
+  SmallPtrSet<StructType *, 8> Visited;
+  return get(Ty, Visited);
+}
+
+Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
+  // If we already have an entry for this type, return it.
+  Type **Entry = &MappedTypes[Ty];
+  if (*Entry)
+    return *Entry;
+
+  // These are types that LLVM itself will unique.
+  bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
+
+#ifndef NDEBUG
+  if (!IsUniqued) {
+    for (auto &Pair : MappedTypes) {
+      assert(!(Pair.first != Ty && Pair.second == Ty) &&
+             "mapping to a source type");
+    }
+  }
+#endif
+
+  if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
+    StructType *DTy = StructType::create(Ty->getContext());
+    return *Entry = DTy;
+  }
+
+  // If this is not a recursive type, then just map all of the elements and
+  // then rebuild the type from inside out.
+  SmallVector<Type *, 4> ElementTypes;
+
+  // If there are no element types to map, then the type is itself. This is
+  // true for the anonymous {} struct, things like 'float', integers, etc.
+  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
+    return *Entry = Ty;
+
+  // Remap all of the elements, keeping track of whether any of them change.
+  bool AnyChange = false;
+  ElementTypes.resize(Ty->getNumContainedTypes());
+  for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
+    ElementTypes[I] = get(Ty->getContainedType(I), Visited);
+    AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
+  }
+
+  // If we found our type while recursively processing stuff, just use it.
+  Entry = &MappedTypes[Ty];
+  if (*Entry) {
+    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
+      if (DTy->isOpaque()) {
+        auto *STy = cast<StructType>(Ty);
+        finishType(DTy, STy, ElementTypes);
+      }
+    }
+    return *Entry;
+  }
+
+  // If all of the element types mapped directly over and the type is not
+  // a named struct, then the type is usable as-is.
+  if (!AnyChange && IsUniqued)
+    return *Entry = Ty;
+
+  // Otherwise, rebuild a modified type.
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unknown derived type to remap");
+  case Type::ArrayTyID:
+    return *Entry = ArrayType::get(ElementTypes[0],
+                                   cast<ArrayType>(Ty)->getNumElements());
+  case Type::VectorTyID:
+    return *Entry = VectorType::get(ElementTypes[0],
+                                    cast<VectorType>(Ty)->getNumElements());
+  case Type::PointerTyID:
+    return *Entry = PointerType::get(ElementTypes[0],
+                                     cast<PointerType>(Ty)->getAddressSpace());
+  case Type::FunctionTyID:
+    return *Entry = FunctionType::get(ElementTypes[0],
+                                      makeArrayRef(ElementTypes).slice(1),
+                                      cast<FunctionType>(Ty)->isVarArg());
+  case Type::StructTyID: {
+    auto *STy = cast<StructType>(Ty);
+    bool IsPacked = STy->isPacked();
+    if (IsUniqued)
+      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
+
+    // If the type is opaque, we can just use it directly.
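+    // For example (illustrative): an opaque source declaration such as
+    //   %handle = type opaque
+    // carries no body to translate, so the source type itself is reused as
+    // the destination type and recorded in the opaque set.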
+    if (STy->isOpaque()) {
+      DstStructTypesSet.addOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    if (StructType *OldT =
+            DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
+      STy->setName("");
+      return *Entry = OldT;
+    }
+
+    if (!AnyChange) {
+      DstStructTypesSet.addNonOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    StructType *DTy = StructType::create(Ty->getContext());
+    finishType(DTy, STy, ElementTypes);
+    return *Entry = DTy;
+  }
+  }
+}
+
+LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
+                                       const Twine &Msg)
+    : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {}
+void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
+
+//===----------------------------------------------------------------------===//
+// IRLinker implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class IRLinker;
+
+/// Creates prototypes for functions that are lazily linked on the fly. This
+/// speeds up linking for modules with many lazily linked functions of which
+/// few get used.
+class GlobalValueMaterializer final : public ValueMaterializer {
+  IRLinker *TheIRLinker;
+
+public:
+  GlobalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
+  Value *materializeDeclFor(Value *V) override;
+  void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
+  Metadata *mapTemporaryMetadata(Metadata *MD) override;
+  void replaceTemporaryMetadata(const Metadata *OrigMD,
+                                Metadata *NewMD) override;
+  bool isMetadataNeeded(Metadata *MD) override;
+};
+
+class LocalValueMaterializer final : public ValueMaterializer {
+  IRLinker *TheIRLinker;
+
+public:
+  LocalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
+  Value *materializeDeclFor(Value *V) override;
+  void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
+  Metadata *mapTemporaryMetadata(Metadata *MD) override;
+  void replaceTemporaryMetadata(const Metadata *OrigMD,
+                                Metadata *NewMD) override;
+  bool isMetadataNeeded(Metadata *MD) override;
+};
+
+/// This is responsible for keeping track of the state used for moving data
+/// from SrcM to DstM.
+class IRLinker {
+  Module &DstM;
+  Module &SrcM;
+
+  std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor;
+
+  TypeMapTy TypeMap;
+  GlobalValueMaterializer GValMaterializer;
+  LocalValueMaterializer LValMaterializer;
+
+  /// Mapping of values from what they used to be in Src, to what they are now
+  /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead
+  /// due to the use of Value handles which the Linker doesn't actually need,
+  /// but this allows us to reuse the ValueMapper code.
+  ValueToValueMapTy ValueMap;
+  ValueToValueMapTy AliasValueMap;
+
+  DenseSet<GlobalValue *> ValuesToLink;
+  std::vector<GlobalValue *> Worklist;
+
+  void maybeAdd(GlobalValue *GV) {
+    if (ValuesToLink.insert(GV).second)
+      Worklist.push_back(GV);
+  }
+
+  /// Set to true when all global value body linking is complete (including
+  /// lazy linking). Used to prevent metadata linking from creating new
+  /// references.
+  bool DoneLinkingBodies = false;
+
+  bool HasError = false;
+
+  /// Flag indicating that we are just linking metadata (after function
+  /// importing).
+  bool IsMetadataLinkingPostpass;
+
+  /// Flags to pass to value mapper invocations.
+  RemapFlags ValueMapperFlags = RF_MoveDistinctMDs;
+
+  /// Association between metadata values created during bitcode parsing and
+  /// the value id.
+  /// Used to correlate temporary metadata created during
+  /// function importing with the final metadata parsed during the subsequent
+  /// metadata linking postpass.
+  DenseMap<const Metadata *, unsigned> MetadataToIDs;
+
+  /// Association between metadata value id and temporary metadata that
+  /// remains unmapped after function importing. Saved during function
+  /// importing and consumed during the metadata linking postpass.
+  DenseMap<unsigned, MDNode *> *ValIDToTempMDMap;
+
+  /// Set of subprogram metadata that does not need to be linked into the
+  /// destination module, because the functions were not imported directly
+  /// or via an inlined body in an imported function.
+  SmallPtrSet<const Metadata *, 16> UnneededSubprograms;
+
+  /// Handles cloning of a global value from the source module into
+  /// the destination module, including setting the attributes and visibility.
+  GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, bool ForDefinition);
+
+  /// Helper method for setting a message and returning an error code.
+  bool emitError(const Twine &Message) {
+    SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message));
+    HasError = true;
+    return true;
+  }
+
+  void emitWarning(const Twine &Message) {
+    SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Warning, Message));
+  }
+
+  /// Check whether we should be linking metadata from the source module.
+  bool shouldLinkMetadata() {
+    // ValIDToTempMDMap is non-null both when we are lazily linking metadata
+    // (e.g. during function importing) and during the later metadata linking
+    // postpass; only the postpass should actually link the metadata.
+    return ValIDToTempMDMap == nullptr || IsMetadataLinkingPostpass;
+  }
+
+  /// Given a global in the source module, return the global in the
+  /// destination module that is being linked to, if any.
+  GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) {
+    // If the source has no name it can't link. If it has local linkage,
+    // there is no name match-up going on.
+    if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise see if we have a match in the destination module's symtab.
+    GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName());
+    if (!DGV)
+      return nullptr;
+
+    // If we found a global with the same name in the dest module, but it has
+    // internal linkage, we are really not doing any linkage here.
+    if (DGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise, we do in fact link to the destination global.
+    return DGV;
+  }
+
+  void computeTypeMapping();
+
+  Constant *linkAppendingVarProto(GlobalVariable *DstGV,
+                                  const GlobalVariable *SrcGV);
+
+  bool shouldLink(GlobalValue *DGV, GlobalValue &SGV);
+  Constant *linkGlobalValueProto(GlobalValue *GV, bool ForAlias);
+
+  bool linkModuleFlagsMetadata();
+
+  void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
+  bool linkFunctionBody(Function &Dst, Function &Src);
+  void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
+  bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
+
+  /// Functions that take care of cloning a specific global value type
+  /// into the destination module.
+ GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar); + Function *copyFunctionProto(const Function *SF); + GlobalValue *copyGlobalAliasProto(const GlobalAlias *SGA); + + void linkNamedMDNodes(); + + /// Populate the UnneededSubprograms set with the DISubprogram metadata + /// from the source module that we don't need to link into the dest module, + /// because the functions were not imported directly or via an inlined body + /// in an imported function. + void findNeededSubprograms(ValueToValueMapTy &ValueMap); + + /// The value mapper leaves nulls in the list of subprograms for any + /// in the UnneededSubprograms map. Strip those out after metadata linking. + void stripNullSubprograms(); + +public: + IRLinker(Module &DstM, IRMover::IdentifiedStructTypeSet &Set, Module &SrcM, + ArrayRef<GlobalValue *> ValuesToLink, + std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr, + bool IsMetadataLinkingPostpass = false) + : DstM(DstM), SrcM(SrcM), AddLazyFor(AddLazyFor), TypeMap(Set), + GValMaterializer(this), LValMaterializer(this), + IsMetadataLinkingPostpass(IsMetadataLinkingPostpass), + ValIDToTempMDMap(ValIDToTempMDMap) { + for (GlobalValue *GV : ValuesToLink) + maybeAdd(GV); + + // If appropriate, tell the value mapper that it can expect to see + // temporary metadata. + if (!shouldLinkMetadata()) + ValueMapperFlags = ValueMapperFlags | RF_HaveUnmaterializedMetadata; + } + + ~IRLinker() { + // In the case where we are not linking metadata, we unset the CanReplace + // flag on all temporary metadata in the MetadataToIDs map to ensure + // none was replaced while being a map key. Now that we are destructing + // the map, set the flag back to true, so that it is replaceable during + // metadata linking. + if (!shouldLinkMetadata()) { + for (auto MDI : MetadataToIDs) { + Metadata *MD = const_cast<Metadata *>(MDI.first); + MDNode *Node = dyn_cast<MDNode>(MD); + assert((Node && Node->isTemporary()) && + "Found non-temp metadata in map when not linking metadata"); + Node->setCanReplace(true); + } + } + } + + bool run(); + Value *materializeDeclFor(Value *V, bool ForAlias); + void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias); + + /// Save the mapping between the given temporary metadata and its metadata + /// value id. Used to support metadata linking as a postpass for function + /// importing. + Metadata *mapTemporaryMetadata(Metadata *MD); + + /// Replace any temporary metadata saved for the source metadata's id with + /// the new non-temporary metadata. Used when metadata linking as a postpass + /// for function importing. + void replaceTemporaryMetadata(const Metadata *OrigMD, Metadata *NewMD); + + /// Indicates whether we need to map the given metadata into the destination + /// module. Used to prevent linking of metadata only needed by functions not + /// linked into the dest module. + bool isMetadataNeeded(Metadata *MD); +}; +} + +/// The LLVM SymbolTable class autorenames globals that conflict in the symbol +/// table. This is good for all clients except for us. Go through the trouble +/// to force this back. +static void forceRenaming(GlobalValue *GV, StringRef Name) { + // If the global doesn't force its name or if it already has the right name, + // there is nothing for us to do. + if (GV->hasLocalLinkage() || GV->getName() == Name) + return; + + Module *M = GV->getParent(); + + // If there is a conflict, rename the conflict. 
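+  // For example (hypothetical names): if GV must be named "foo" but an
+  // unrelated global already owns "foo", GV takes that name and the old
+  // global is renamed by the symbol table to an autorenamed variant such as
+  // "foo.1".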
+  if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
+    GV->takeName(ConflictGV);
+    ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
+    assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
+  } else {
+    GV->setName(Name); // Force the name back
+  }
+}
+
+Value *GlobalValueMaterializer::materializeDeclFor(Value *V) {
+  return TheIRLinker->materializeDeclFor(V, false);
+}
+
+void GlobalValueMaterializer::materializeInitFor(GlobalValue *New,
+                                                 GlobalValue *Old) {
+  TheIRLinker->materializeInitFor(New, Old, false);
+}
+
+Metadata *GlobalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
+  return TheIRLinker->mapTemporaryMetadata(MD);
+}
+
+void GlobalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                                       Metadata *NewMD) {
+  TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
+}
+
+bool GlobalValueMaterializer::isMetadataNeeded(Metadata *MD) {
+  return TheIRLinker->isMetadataNeeded(MD);
+}
+
+Value *LocalValueMaterializer::materializeDeclFor(Value *V) {
+  return TheIRLinker->materializeDeclFor(V, true);
+}
+
+void LocalValueMaterializer::materializeInitFor(GlobalValue *New,
+                                                GlobalValue *Old) {
+  TheIRLinker->materializeInitFor(New, Old, true);
+}
+
+Metadata *LocalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
+  return TheIRLinker->mapTemporaryMetadata(MD);
+}
+
+void LocalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                                      Metadata *NewMD) {
+  TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
+}
+
+bool LocalValueMaterializer::isMetadataNeeded(Metadata *MD) {
+  return TheIRLinker->isMetadataNeeded(MD);
+}
+
+Value *IRLinker::materializeDeclFor(Value *V, bool ForAlias) {
+  auto *SGV = dyn_cast<GlobalValue>(V);
+  if (!SGV)
+    return nullptr;
+
+  return linkGlobalValueProto(SGV, ForAlias);
+}
+
+void IRLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old,
+                                  bool ForAlias) {
+  // If we already created the body, just return.
+  if (auto *F = dyn_cast<Function>(New)) {
+    if (!F->isDeclaration())
+      return;
+  } else if (auto *V = dyn_cast<GlobalVariable>(New)) {
+    if (V->hasInitializer())
+      return;
+  } else {
+    auto *A = cast<GlobalAlias>(New);
+    if (A->getAliasee())
+      return;
+  }
+
+  if (ForAlias || shouldLink(New, *Old))
+    linkGlobalValueBody(*New, *Old);
+}
+
+Metadata *IRLinker::mapTemporaryMetadata(Metadata *MD) {
+  if (!ValIDToTempMDMap)
+    return nullptr;
+  // If this temporary metadata has a value id recorded during function
+  // parsing, record that in the ValIDToTempMDMap if one was provided.
+  if (MetadataToIDs.count(MD)) {
+    unsigned Idx = MetadataToIDs[MD];
+    // Check if we created a temp MD when importing a different function from
+    // this module. If so, reuse the same temporary metadata; otherwise add
+    // this temporary metadata to the map.
+    if (!ValIDToTempMDMap->count(Idx)) {
+      MDNode *Node = cast<MDNode>(MD);
+      assert(Node->isTemporary());
+      (*ValIDToTempMDMap)[Idx] = Node;
+    }
+    return (*ValIDToTempMDMap)[Idx];
+  }
+  return nullptr;
+}
+
+void IRLinker::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                        Metadata *NewMD) {
+  if (!ValIDToTempMDMap)
+    return;
+#ifndef NDEBUG
+  auto *N = dyn_cast_or_null<MDNode>(NewMD);
+  assert(!N || !N->isTemporary());
+#endif
+  // If a mapping between metadata value ids and temporary metadata
+  // created during function importing was provided, and the source
+  // metadata has a value id recorded during metadata parsing, replace
+  // the temporary metadata with the final mapped metadata now.
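+  // An illustrative walk-through (hypothetical ids): if source node !42 was
+  // recorded under value id 7, and a temporary node was handed out for id 7
+  // while importing a function body, then mapping the real !42 replaces all
+  // uses of that temporary, deletes it, and drops id 7 from the map.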
+  if (MetadataToIDs.count(OrigMD)) {
+    unsigned Idx = MetadataToIDs[OrigMD];
+    // Nothing to do if we didn't need to create a temporary metadata during
+    // function importing.
+    if (!ValIDToTempMDMap->count(Idx))
+      return;
+    MDNode *TempMD = (*ValIDToTempMDMap)[Idx];
+    TempMD->replaceAllUsesWith(NewMD);
+    MDNode::deleteTemporary(TempMD);
+    ValIDToTempMDMap->erase(Idx);
+  }
+}
+
+bool IRLinker::isMetadataNeeded(Metadata *MD) {
+  // Currently only DISubprogram metadata is marked as being unneeded.
+  if (UnneededSubprograms.empty())
+    return true;
+  MDNode *Node = dyn_cast<MDNode>(MD);
+  if (!Node)
+    return true;
+  DISubprogram *SP = getDISubprogram(Node);
+  if (!SP)
+    return true;
+  return !UnneededSubprograms.count(SP);
+}
+
+/// Copy the prototype of a global variable from the source module into the
+/// dest module.
+GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
+  // No linking to be performed or linking from the source: simply create an
+  // identical version of the symbol over in the dest module... the
+  // initializer will be filled in later by LinkGlobalInits.
+  GlobalVariable *NewDGV =
+      new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()),
+                         SGVar->isConstant(), GlobalValue::ExternalLinkage,
+                         /*init*/ nullptr, SGVar->getName(),
+                         /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
+                         SGVar->getType()->getAddressSpace());
+  NewDGV->setAlignment(SGVar->getAlignment());
+  return NewDGV;
+}
+
+/// Link the function in the source module into the destination module if
+/// needed, setting up mapping information.
+Function *IRLinker::copyFunctionProto(const Function *SF) {
+  // If there is no linkage to be performed or we are linking from the source,
+  // bring SF over.
+  return Function::Create(TypeMap.get(SF->getFunctionType()),
+                          GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+}
+
+/// Set up prototypes for any aliases that come over from the source module.
+GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
+  // If there is no linkage to be performed or we're linking from the source,
+  // bring over SGA.
+  auto *Ty = TypeMap.get(SGA->getValueType());
+  return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+                             GlobalValue::ExternalLinkage, SGA->getName(),
+                             &DstM);
+}
+
+GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
+                                            bool ForDefinition) {
+  GlobalValue *NewGV;
+  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
+    NewGV = copyGlobalVariableProto(SGVar);
+  } else if (auto *SF = dyn_cast<Function>(SGV)) {
+    NewGV = copyFunctionProto(SF);
+  } else {
+    if (ForDefinition)
+      NewGV = copyGlobalAliasProto(cast<GlobalAlias>(SGV));
+    else
+      NewGV = new GlobalVariable(
+          DstM, TypeMap.get(SGV->getType()->getElementType()),
+          /*isConstant*/ false, GlobalValue::ExternalLinkage,
+          /*init*/ nullptr, SGV->getName(),
+          /*insertbefore*/ nullptr, SGV->getThreadLocalMode(),
+          SGV->getType()->getAddressSpace());
+  }
+
+  if (ForDefinition)
+    NewGV->setLinkage(SGV->getLinkage());
+  else if (SGV->hasExternalWeakLinkage() || SGV->hasWeakLinkage() ||
+           SGV->hasLinkOnceLinkage())
+    NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+  NewGV->copyAttributesFrom(SGV);
+
+  // Remove these copied constants in case this stays a declaration, since
+  // they point to the source module. If the def is linked the values will
+  // be mapped in during linkFunctionBody.
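+  // For example (illustrative): a prototype copied from a source function
+  // declared with a personality such as @__gxx_personality_v0 must not keep
+  // that operand, since the referenced global still belongs to the source
+  // module; it is re-mapped if the body is actually linked.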
+ if (auto *NewF = dyn_cast<Function>(NewGV)) { + NewF->setPersonalityFn(nullptr); + NewF->setPrefixData(nullptr); + NewF->setPrologueData(nullptr); + } + + return NewGV; +} + +/// Loop over all of the linked values to compute type mappings. For example, +/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct +/// types 'Foo' but one got renamed when the module was loaded into the same +/// LLVMContext. +void IRLinker::computeTypeMapping() { + for (GlobalValue &SGV : SrcM.globals()) { + GlobalValue *DGV = getLinkedToGlobal(&SGV); + if (!DGV) + continue; + + if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + continue; + } + + // Unify the element type of appending arrays. + ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType()); + ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType()); + TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); + } + + for (GlobalValue &SGV : SrcM) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + for (GlobalValue &SGV : SrcM.aliases()) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + // Incorporate types by name, scanning all the types in the source module. + // At this point, the destination module may have a type "%foo = { i32 }" for + // example. When the source module got loaded into the same LLVMContext, if + // it had the same type, it would have been renamed to "%foo.42 = { i32 }". + std::vector<StructType *> Types = SrcM.getIdentifiedStructTypes(); + for (StructType *ST : Types) { + if (!ST->hasName()) + continue; + + // Check to see if there is a dot in the name followed by a digit. + size_t DotPos = ST->getName().rfind('.'); + if (DotPos == 0 || DotPos == StringRef::npos || + ST->getName().back() == '.' || + !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1]))) + continue; + + // Check to see if the destination module has a struct with the prefix name. + StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); + if (!DST) + continue; + + // Don't use it if this actually came from the source module. They're in + // the same LLVMContext after all. Also don't use it unless the type is + // actually used in the destination module. This can happen in situations + // like this: + // + // Module A Module B + // -------- -------- + // %Z = type { %A } %B = type { %C.1 } + // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } + // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } + // %C = type { i8* } %B.3 = type { %C.1 } + // + // When we link Module B with Module A, the '%B' in Module B is + // used. However, that would then use '%C.1'. But when we process '%C.1', + // we prefer to take the '%C' version. So we are then left with both + // '%C.1' and '%C' being used for the same types. This leads to some + // variables using one type and some using the other. + if (TypeMap.DstStructTypesSet.hasType(DST)) + TypeMap.addTypeMapping(DST, ST); + } + + // Now that we have discovered all of the type equivalences, get a body for + // any 'opaque' types in the dest module that are now resolved. 
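+  // For example (illustrative): if the destination's "%foo = type opaque" was
+  // mapped to the source's defined "%foo.42 = type { i32 }", this is where
+  // the destination %foo finally receives the body { i32 }.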
+  TypeMap.linkDefinedTypeBodies();
+}
+
+static void getArrayElements(const Constant *C,
+                             SmallVectorImpl<Constant *> &Dest) {
+  unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
+
+  for (unsigned i = 0; i != NumElements; ++i)
+    Dest.push_back(C->getAggregateElement(i));
+}
+
+/// If there were any appending global variables, link them together now.
+/// Return nullptr on error.
+Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
+                                          const GlobalVariable *SrcGV) {
+  Type *EltTy = cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()))
+                    ->getElementType();
+
+  StringRef Name = SrcGV->getName();
+  bool IsNewStructor = false;
+  bool IsOldStructor = false;
+  if (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") {
+    if (cast<StructType>(EltTy)->getNumElements() == 3)
+      IsNewStructor = true;
+    else
+      IsOldStructor = true;
+  }
+
+  PointerType *VoidPtrTy = Type::getInt8Ty(SrcGV->getContext())->getPointerTo();
+  if (IsOldStructor) {
+    auto &ST = *cast<StructType>(EltTy);
+    Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+    EltTy = StructType::get(SrcGV->getContext(), Tys, false);
+  }
+
+  if (DstGV) {
+    ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
+
+    if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) {
+      emitError(
+          "Linking globals named '" + SrcGV->getName() +
+          "': can only link appending global with another appending global!");
+      return nullptr;
+    }
+
+    // Check to see that the two arrays agree on type.
+    if (EltTy != DstTy->getElementType()) {
+      emitError("Appending variables with different element types!");
+      return nullptr;
+    }
+    if (DstGV->isConstant() != SrcGV->isConstant()) {
+      emitError("Appending variables linked with different const'ness!");
+      return nullptr;
+    }
+
+    if (DstGV->getAlignment() != SrcGV->getAlignment()) {
+      emitError(
+          "Appending variables with different alignment need to be linked!");
+      return nullptr;
+    }
+
+    if (DstGV->getVisibility() != SrcGV->getVisibility()) {
+      emitError(
+          "Appending variables with different visibility need to be linked!");
+      return nullptr;
+    }
+
+    if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) {
+      emitError(
+          "Appending variables with different unnamed_addr need to be linked!");
+      return nullptr;
+    }
+
+    if (StringRef(DstGV->getSection()) != SrcGV->getSection()) {
+      emitError(
+          "Appending variables with different section name need to be linked!");
+      return nullptr;
+    }
+  }
+
+  SmallVector<Constant *, 16> DstElements;
+  if (DstGV)
+    getArrayElements(DstGV->getInitializer(), DstElements);
+
+  SmallVector<Constant *, 16> SrcElements;
+  getArrayElements(SrcGV->getInitializer(), SrcElements);
+
+  if (IsNewStructor)
+    SrcElements.erase(
+        std::remove_if(SrcElements.begin(), SrcElements.end(),
+                       [this](Constant *E) {
+                         auto *Key = dyn_cast<GlobalValue>(
+                             E->getAggregateElement(2)->stripPointerCasts());
+                         if (!Key)
+                           return false;
+                         GlobalValue *DGV = getLinkedToGlobal(Key);
+                         return !shouldLink(DGV, *Key);
+                       }),
+        SrcElements.end());
+  uint64_t NewSize = DstElements.size() + SrcElements.size();
+  ArrayType *NewType = ArrayType::get(EltTy, NewSize);
+
+  // Create the new global variable.
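+  // For example (illustrative): appending a 2-element llvm.global_ctors
+  // array to a 3-element one yields NewSize == 5, so a fresh [5 x EltTy]
+  // global is created here and filled in below, after which the old
+  // destination array is RAUW'd and erased.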
+ GlobalVariable *NG = new GlobalVariable( + DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), + /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), + SrcGV->getType()->getAddressSpace()); + + NG->copyAttributesFrom(SrcGV); + forceRenaming(NG, SrcGV->getName()); + + Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + + // Stop recursion. + ValueMap[SrcGV] = Ret; + + for (auto *V : SrcElements) { + Constant *NewV; + if (IsOldStructor) { + auto *S = cast<ConstantStruct>(V); + auto *E1 = MapValue(S->getOperand(0), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer); + auto *E2 = MapValue(S->getOperand(1), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer); + Value *Null = Constant::getNullValue(VoidPtrTy); + NewV = + ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr); + } else { + NewV = + MapValue(V, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer); + } + DstElements.push_back(NewV); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + + // Replace any uses of the two global variables with uses of the new + // global. + if (DstGV) { + DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); + DstGV->eraseFromParent(); + } + + return Ret; +} + +static bool useExistingDest(GlobalValue &SGV, GlobalValue *DGV, + bool ShouldLink) { + if (!DGV) + return false; + + if (SGV.isDeclaration()) + return true; + + if (DGV->isDeclarationForLinker() && !SGV.isDeclarationForLinker()) + return false; + + if (ShouldLink) + return false; + + return true; +} + +bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) { + // Already imported all the values. Just map to the Dest value + // in case it is referenced in the metadata. + if (IsMetadataLinkingPostpass) { + assert(!ValuesToLink.count(&SGV) && + "Source value unexpectedly requested for link during metadata link"); + return false; + } + + if (ValuesToLink.count(&SGV)) + return true; + + if (SGV.hasLocalLinkage()) + return true; + + if (DGV && !DGV->isDeclaration()) + return false; + + if (SGV.hasAvailableExternallyLinkage()) + return true; + + if (DoneLinkingBodies) + return false; + + AddLazyFor(SGV, [this](GlobalValue &GV) { maybeAdd(&GV); }); + return ValuesToLink.count(&SGV); +} + +Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) { + GlobalValue *DGV = getLinkedToGlobal(SGV); + + bool ShouldLink = shouldLink(DGV, *SGV); + + // just missing from map + if (ShouldLink) { + auto I = ValueMap.find(SGV); + if (I != ValueMap.end()) + return cast<Constant>(I->second); + + I = AliasValueMap.find(SGV); + if (I != AliasValueMap.end()) + return cast<Constant>(I->second); + } + + DGV = nullptr; + if (ShouldLink || !ForAlias) + DGV = getLinkedToGlobal(SGV); + + // Handle the ultra special appending linkage case first. + assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); + if (SGV->hasAppendingLinkage()) + return linkAppendingVarProto(cast_or_null<GlobalVariable>(DGV), + cast<GlobalVariable>(SGV)); + + GlobalValue *NewGV; + if (useExistingDest(*SGV, DGV, ShouldLink)) { + NewGV = DGV; + } else { + // If we are done linking global value bodies (i.e. we are performing + // metadata linking), don't link in the global value due to this + // reference, simply map it to null. 
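+    // (Illustrative: during the metadata-only phase, a debug info node that
+    // still names a never-imported global resolves to a null operand here
+    // rather than pulling a fresh declaration into DstM.)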
+ if (DoneLinkingBodies) + return nullptr; + + NewGV = copyGlobalValueProto(SGV, ShouldLink); + if (!ForAlias) + forceRenaming(NewGV, SGV->getName()); + } + if (ShouldLink || ForAlias) { + if (const Comdat *SC = SGV->getComdat()) { + if (auto *GO = dyn_cast<GlobalObject>(NewGV)) { + Comdat *DC = DstM.getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SC->getSelectionKind()); + GO->setComdat(DC); + } + } + } + + if (!ShouldLink && ForAlias) + NewGV->setLinkage(GlobalValue::InternalLinkage); + + Constant *C = NewGV; + if (DGV) + C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType())); + + if (DGV && NewGV != DGV) { + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); + DGV->eraseFromParent(); + } + + return C; +} + +/// Update the initializers in the Dest module now that all globals that may be +/// referenced are in Dest. +void IRLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { + // Figure out what the initializer looks like in the dest module. + Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); +} + +/// Copy the source function over into the dest function and fix up references +/// to values. At this point we know that Dest is an external function, and +/// that Src is not. +bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) { + assert(Dst.isDeclaration() && !Src.isDeclaration()); + + // Materialize if needed. + if (std::error_code EC = Src.materialize()) + return emitError(EC.message()); + + if (!shouldLinkMetadata()) + // This is only supported for lazy links. Do after materialization of + // a function and before remapping metadata on instructions below + // in RemapInstruction, as the saved mapping is used to handle + // the temporary metadata hanging off instructions. + SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, + /* OnlyTempMD = */ true); + + // Link in the prefix data. + if (Src.hasPrefixData()) + Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); + + // Link in the prologue data. + if (Src.hasPrologueData()) + Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, + ValueMapperFlags, &TypeMap, + &GValMaterializer)); + + // Link in the personality function. + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, + ValueMapperFlags, &TypeMap, + &GValMaterializer)); + + // Go through and convert function arguments over, remembering the mapping. + Function::arg_iterator DI = Dst.arg_begin(); + for (Argument &Arg : Src.args()) { + DI->setName(Arg.getName()); // Copy the name over. + + // Add a mapping to our mapping. + ValueMap[&Arg] = &*DI; + ++DI; + } + + // Copy over the metadata attachments. + SmallVector<std::pair<unsigned, MDNode *>, 8> MDs; + Src.getAllMetadata(MDs); + for (const auto &I : MDs) + Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); + + // Splice the body of the source function into the dest function. + Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); + + // At this point, all of the instructions and values of the function are now + // copied over. The only problem is that they are still referencing values in + // the Source function as operands. Loop through all of the operands of the + // functions and patch them up to point to the local versions. 
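+  // For example (illustrative): a spliced "call void @helper()" still
+  // references the source module's @helper; remapping rewrites it to the
+  // destination module's copy, materializing a declaration on demand.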
+  for (BasicBlock &BB : Dst)
+    for (Instruction &I : BB)
+      RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries | ValueMapperFlags,
+                       &TypeMap, &GValMaterializer);
+
+  // There is no need to map the arguments anymore.
+  for (Argument &Arg : Src.args())
+    ValueMap.erase(&Arg);
+
+  return false;
+}
+
+void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
+  Constant *Aliasee = Src.getAliasee();
+  Constant *Val = MapValue(Aliasee, AliasValueMap, ValueMapperFlags, &TypeMap,
+                           &LValMaterializer);
+  Dst.setAliasee(Val);
+}
+
+bool IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
+  if (auto *F = dyn_cast<Function>(&Src))
+    return linkFunctionBody(cast<Function>(Dst), *F);
+  if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) {
+    linkGlobalInit(cast<GlobalVariable>(Dst), *GVar);
+    return false;
+  }
+  linkAliasBody(cast<GlobalAlias>(Dst), cast<GlobalAlias>(Src));
+  return false;
+}
+
+void IRLinker::findNeededSubprograms(ValueToValueMapTy &ValueMap) {
+  // Track unneeded nodes to make it simpler to handle the case
+  // where we are checking if an already-mapped SP is needed.
+  NamedMDNode *CompileUnits = SrcM.getNamedMetadata("llvm.dbg.cu");
+  if (!CompileUnits)
+    return;
+  for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) {
+    auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I));
+    assert(CU && "Expected valid compile unit");
+    // Ensure that we don't remove subprograms referenced by DIImportedEntity.
+    // It is not legal to have a DIImportedEntity with a null entity or scope.
+    // FIXME: The DISubprogram for functions not linked in but kept due to
+    // being referenced by a DIImportedEntity should also have their
+    // IsDefinition flag unset.
+    SmallPtrSet<DISubprogram *, 8> ImportedEntitySPs;
+    for (auto *IE : CU->getImportedEntities()) {
+      if (auto *SP = dyn_cast<DISubprogram>(IE->getEntity()))
+        ImportedEntitySPs.insert(SP);
+      if (auto *SP = dyn_cast<DISubprogram>(IE->getScope()))
+        ImportedEntitySPs.insert(SP);
+    }
+    for (auto *Op : CU->getSubprograms()) {
+      // Unless we were doing function importing and deferred metadata linking,
+      // any needed SPs should have been mapped as they would be reached
+      // from the function linked in (either on the function itself for linked
+      // function bodies, or from DILocation on inlined instructions).
+      assert(!(ValueMap.MD()[Op] && IsMetadataLinkingPostpass) &&
+             "DISubprogram shouldn't be mapped yet");
+      if (!ValueMap.MD()[Op] && !ImportedEntitySPs.count(Op))
+        UnneededSubprograms.insert(Op);
+    }
+  }
+  if (!IsMetadataLinkingPostpass)
+    return;
+  // In the case of metadata linking as a postpass (e.g. for function
+  // importing), see which DISubprogram MD from the source has an associated
+  // temporary metadata node, which means the SP was needed by an imported
+  // function.
+  for (auto MDI : MetadataToIDs) {
+    const MDNode *Node = dyn_cast<MDNode>(MDI.first);
+    if (!Node)
+      continue;
+    DISubprogram *SP = getDISubprogram(Node);
+    if (!SP || !ValIDToTempMDMap->count(MDI.second))
+      continue;
+    UnneededSubprograms.erase(SP);
+  }
+}
+
+// Squash null subprograms from compile unit subprogram lists.
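+// (Illustrative: a compile unit whose subprogram list was !{!1, !2, !3} may
+// come out of mapping as !{!1, null, !3} when !2 was unneeded; the list is
+// rebuilt below as !{!1, !3}.)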
+void IRLinker::stripNullSubprograms() { + NamedMDNode *CompileUnits = DstM.getNamedMetadata("llvm.dbg.cu"); + if (!CompileUnits) + return; + for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) { + auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I)); + assert(CU && "Expected valid compile unit"); + + SmallVector<Metadata *, 16> NewSPs; + NewSPs.reserve(CU->getSubprograms().size()); + bool FoundNull = false; + for (DISubprogram *SP : CU->getSubprograms()) { + if (!SP) { + FoundNull = true; + continue; + } + NewSPs.push_back(SP); + } + if (FoundNull) + CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs)); + } +} + +/// Insert all of the named MDNodes in Src into the Dest module. +void IRLinker::linkNamedMDNodes() { + findNeededSubprograms(ValueMap); + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + for (const NamedMDNode &NMD : SrcM.named_metadata()) { + // Don't link module flags here. Do them separately. + if (&NMD == SrcModFlags) + continue; + NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); + // Add Src elements into Dest node. + for (const MDNode *op : NMD.operands()) + DestNMD->addOperand(MapMetadata( + op, ValueMap, ValueMapperFlags | RF_NullMapMissingGlobalValues, + &TypeMap, &GValMaterializer)); + } + stripNullSubprograms(); +} + +/// Merge the linker flags in Src into the Dest module. +bool IRLinker::linkModuleFlagsMetadata() { + // If the source module has no module flags, we are done. + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + if (!SrcModFlags) + return false; + + // If the destination module doesn't have module flags yet, then just copy + // over the source module's flags. + NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); + if (DstModFlags->getNumOperands() == 0) { + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) + DstModFlags->addOperand(SrcModFlags->getOperand(I)); + + return false; + } + + // First build a map of the existing module flags and requirements. + DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags; + SmallSetVector<MDNode *, 16> Requirements; + for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { + MDNode *Op = DstModFlags->getOperand(I); + ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0)); + MDString *ID = cast<MDString>(Op->getOperand(1)); + + if (Behavior->getZExtValue() == Module::Require) { + Requirements.insert(cast<MDNode>(Op->getOperand(2))); + } else { + Flags[ID] = std::make_pair(Op, I); + } + } + + // Merge in the flags from the source module, and also collect its set of + // requirements. + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { + MDNode *SrcOp = SrcModFlags->getOperand(I); + ConstantInt *SrcBehavior = + mdconst::extract<ConstantInt>(SrcOp->getOperand(0)); + MDString *ID = cast<MDString>(SrcOp->getOperand(1)); + MDNode *DstOp; + unsigned DstIndex; + std::tie(DstOp, DstIndex) = Flags.lookup(ID); + unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); + + // If this is a requirement, add it and continue. + if (SrcBehaviorValue == Module::Require) { + // If the destination module does not already have this requirement, add + // it. + if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) { + DstModFlags->addOperand(SrcOp); + } + continue; + } + + // If there is no existing flag with this ID, just add it. 
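+    // For example (illustrative): a source-only flag such as
+    //   !{i32 1, !"PIC Level", i32 2}
+    // is appended to DstModFlags verbatim and indexed under its ID so that
+    // later occurrences merge against it.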
+ if (!DstOp) { + Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); + DstModFlags->addOperand(SrcOp); + continue; + } + + // Otherwise, perform a merge. + ConstantInt *DstBehavior = + mdconst::extract<ConstantInt>(DstOp->getOperand(0)); + unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + + // If either flag has override behavior, handle it first. + if (DstBehaviorValue == Module::Override) { + // Diagnose inconsistent flags which both have override behavior. + if (SrcBehaviorValue == Module::Override && + SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting override values"); + } + continue; + } else if (SrcBehaviorValue == Module::Override) { + // Update the destination flag to that of the source. + DstModFlags->setOperand(DstIndex, SrcOp); + Flags[ID].first = SrcOp; + continue; + } + + // Diagnose inconsistent merge behavior types. + if (SrcBehaviorValue != DstBehaviorValue) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting behaviors"); + continue; + } + + auto replaceDstValue = [&](MDNode *New) { + Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); + DstModFlags->setOperand(DstIndex, Flag); + Flags[ID].first = Flag; + }; + + // Perform the merge for standard behavior types. + switch (SrcBehaviorValue) { + case Module::Require: + case Module::Override: + llvm_unreachable("not possible"); + case Module::Error: { + // Emit an error if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Warning: { + // Emit a warning if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitWarning("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Append: { + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + SmallVector<Metadata *, 8> MDs; + MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); + MDs.append(DstValue->op_begin(), DstValue->op_end()); + MDs.append(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), MDs)); + break; + } + case Module::AppendUnique: { + SmallSetVector<Metadata *, 16> Elts; + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + Elts.insert(DstValue->op_begin(), DstValue->op_end()); + Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), + makeArrayRef(Elts.begin(), Elts.end()))); + break; + } + } + } + + // Check all of the requirements. + for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { + MDNode *Requirement = Requirements[I]; + MDString *Flag = cast<MDString>(Requirement->getOperand(0)); + Metadata *ReqValue = Requirement->getOperand(1); + + MDNode *Op = Flags[Flag].first; + if (!Op || Op->getOperand(2) != ReqValue) { + emitError("linking module flags '" + Flag->getString() + + "': does not have the required value"); + continue; + } + } + + return HasError; +} + +// This function returns true if the triples match. +static bool triplesMatch(const Triple &T0, const Triple &T1) { + // If vendor is apple, ignore the version number. 
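+  // For example (illustrative): "x86_64-apple-macosx10.10.0" and
+  // "x86_64-apple-macosx10.11.0" are treated as matching here; mergeTriples()
+  // below then prefers the triple carrying the higher OS version.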
+  if (T0.getVendor() == Triple::Apple)
+    return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() &&
+           T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS();
+
+  return T0 == T1;
+}
+
+// This function returns the merged triple.
+static std::string mergeTriples(const Triple &SrcTriple,
+                                const Triple &DstTriple) {
+  // If vendor is apple, pick the triple with the larger version number.
+  if (SrcTriple.getVendor() == Triple::Apple)
+    if (DstTriple.isOSVersionLT(SrcTriple))
+      return SrcTriple.str();
+
+  return DstTriple.str();
+}
+
+bool IRLinker::run() {
+  // Inherit the target data from the source module if the destination module
+  // doesn't have one already.
+  if (DstM.getDataLayout().isDefault())
+    DstM.setDataLayout(SrcM.getDataLayout());
+
+  if (SrcM.getDataLayout() != DstM.getDataLayout()) {
+    emitWarning("Linking two modules of different data layouts: '" +
+                SrcM.getModuleIdentifier() + "' is '" +
+                SrcM.getDataLayoutStr() + "' whereas '" +
+                DstM.getModuleIdentifier() + "' is '" +
+                DstM.getDataLayoutStr() + "'\n");
+  }
+
+  // Copy the target triple from the source to dest if the dest's is empty.
+  if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty())
+    DstM.setTargetTriple(SrcM.getTargetTriple());
+
+  Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple());
+
+  if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+    emitWarning("Linking two modules of different target triples: '" +
+                SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() +
+                "' whereas '" + DstM.getModuleIdentifier() + "' is '" +
+                DstM.getTargetTriple() + "'\n");
+
+  DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
+
+  // Append the module inline asm string.
+  if (!SrcM.getModuleInlineAsm().empty()) {
+    if (DstM.getModuleInlineAsm().empty())
+      DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm());
+    else
+      DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
+                              SrcM.getModuleInlineAsm());
+  }
+
+  // Loop over all of the linked values to compute type mappings.
+  computeTypeMapping();
+
+  std::reverse(Worklist.begin(), Worklist.end());
+  while (!Worklist.empty()) {
+    GlobalValue *GV = Worklist.back();
+    Worklist.pop_back();
+
+    // Already mapped.
+    if (ValueMap.find(GV) != ValueMap.end() ||
+        AliasValueMap.find(GV) != AliasValueMap.end())
+      continue;
+
+    assert(!GV->isDeclaration());
+    MapValue(GV, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer);
+    if (HasError)
+      return true;
+  }
+
+  // Note that we are done linking global value bodies. This prevents
+  // metadata linking from creating new references.
+  DoneLinkingBodies = true;
+
+  // Remap all of the named MDNodes in Src into the DstM module. We do this
+  // after linking GlobalValues so that MDNodes that reference GlobalValues
+  // are properly remapped.
+  if (shouldLinkMetadata()) {
+    // Even if just linking metadata we should link decls above in case
+    // any are referenced by metadata. IRLinker::shouldLink ensures that
+    // we don't actually link anything from source.
+    if (IsMetadataLinkingPostpass) {
+      // Ensure metadata materialized
+      if (SrcM.getMaterializer()->materializeMetadata())
+        return true;
+      SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+                                               /* OnlyTempMD = */ false);
+    }
+
+    linkNamedMDNodes();
+
+    if (IsMetadataLinkingPostpass) {
+      // Handle anything left in the ValIDToTempMDMap, such as metadata nodes
+      // not reached by the dbg.cu NamedMD (i.e. only reached from
+      // instructions).
+ // Walk the MetadataToIDs once to find the set of new (imported) MD + // that still has corresponding temporary metadata, and invoke metadata + // mapping on each one. + for (auto MDI : MetadataToIDs) { + if (!ValIDToTempMDMap->count(MDI.second)) + continue; + MapMetadata(MDI.first, ValueMap, ValueMapperFlags, &TypeMap, + &GValMaterializer); + } + assert(ValIDToTempMDMap->empty()); + } + + // Merge the module flags into the DstM module. + if (linkModuleFlagsMetadata()) + return true; + } + + return false; +} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P) + : ETypes(E), IsPacked(P) {} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) + : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { + if (IsPacked != That.IsPacked) + return false; + if (ETypes != That.ETypes) + return false; + return true; +} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { + return !this->operator==(That); +} + +StructType *IRMover::StructTypeKeyInfo::getEmptyKey() { + return DenseMapInfo<StructType *>::getEmptyKey(); +} + +StructType *IRMover::StructTypeKeyInfo::getTombstoneKey() { + return DenseMapInfo<StructType *>::getTombstoneKey(); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { + return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), + Key.IsPacked); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const StructType *ST) { + return getHashValue(KeyTy(ST)); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const KeyTy &LHS, + const StructType *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey()) + return false; + return LHS == KeyTy(RHS); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const StructType *LHS, + const StructType *RHS) { + if (RHS == getEmptyKey()) + return LHS == getEmptyKey(); + + if (RHS == getTombstoneKey()) + return LHS == getTombstoneKey(); + + return KeyTy(LHS) == KeyTy(RHS); +} + +void IRMover::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); +} + +void IRMover::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); + bool Removed = OpaqueStructTypes.erase(Ty); + (void)Removed; + assert(Removed); +} + +void IRMover::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { + assert(Ty->isOpaque()); + OpaqueStructTypes.insert(Ty); +} + +StructType * +IRMover::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes, + bool IsPacked) { + IRMover::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); + auto I = NonOpaqueStructTypes.find_as(Key); + if (I == NonOpaqueStructTypes.end()) + return nullptr; + return *I; +} + +bool IRMover::IdentifiedStructTypeSet::hasType(StructType *Ty) { + if (Ty->isOpaque()) + return OpaqueStructTypes.count(Ty); + auto I = NonOpaqueStructTypes.find(Ty); + if (I == NonOpaqueStructTypes.end()) + return false; + return *I == Ty; +} + +IRMover::IRMover(Module &M) : Composite(M) { + TypeFinder StructTypes; + StructTypes.run(M, true); + for (StructType *Ty : StructTypes) { + if (Ty->isOpaque()) + IdentifiedStructTypes.addOpaque(Ty); + else + IdentifiedStructTypes.addNonOpaque(Ty); + } +} + +bool IRMover::move( + Module &Src, ArrayRef<GlobalValue *> ValuesToLink, + std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap, + bool IsMetadataLinkingPostpass) { + IRLinker 
TheIRLinker(Composite, IdentifiedStructTypes, Src, ValuesToLink, + AddLazyFor, ValIDToTempMDMap, IsMetadataLinkingPostpass); + bool RetCode = TheIRLinker.run(); + Composite.dropTriviallyDeadConstantArrays(); + return RetCode; +} diff --git a/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h b/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h new file mode 100644 index 0000000..d91f19c --- /dev/null +++ b/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h @@ -0,0 +1,25 @@ +//===- LinkDiagnosticInfo.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H +#define LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H + +#include "llvm/IR/DiagnosticInfo.h" + +namespace llvm { +class LinkDiagnosticInfo : public DiagnosticInfo { + const Twine &Msg; + +public: + LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); + void print(DiagnosticPrinter &DP) const override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Linker/LinkModules.cpp b/contrib/llvm/lib/Linker/LinkModules.cpp index f090680..6ffa71e 100644 --- a/contrib/llvm/lib/Linker/LinkModules.cpp +++ b/contrib/llvm/lib/Linker/LinkModules.cpp @@ -12,447 +12,69 @@ //===----------------------------------------------------------------------===// #include "llvm/Linker/Linker.h" +#include "LinkDiagnosticInfo.h" #include "llvm-c/Linker.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/Triple.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/TypeFinder.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include <cctype> -#include <tuple> using namespace llvm; - -//===----------------------------------------------------------------------===// -// TypeMap implementation. -//===----------------------------------------------------------------------===// - namespace { -class TypeMapTy : public ValueMapTypeRemapper { - /// This is a mapping from a source type to a destination type to use. - DenseMap<Type*, Type*> MappedTypes; - - /// When checking to see if two subgraphs are isomorphic, we speculatively - /// add types to MappedTypes, but keep track of them here in case we need to - /// roll back. - SmallVector<Type*, 16> SpeculativeTypes; - - SmallVector<StructType*, 16> SpeculativeDstOpaqueTypes; - - /// This is a list of non-opaque structs in the source module that are mapped - /// to an opaque struct in the destination module. - SmallVector<StructType*, 16> SrcDefinitionsToResolve; - - /// This is the set of opaque types in the destination modules who are - /// getting a body from the source module. 
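// A standalone model (hypothetical names, plain C++) of the pattern the new
// header enables: a severity-tagged diagnostic object is handed to a handler
// owned by the context, mirroring how emitError()/emitWarning() below call
// LLVMContext::diagnose() with a LinkDiagnosticInfo.
#include <functional>
#include <iostream>
#include <string>

enum Severity { SevError, SevWarning };

struct Diagnostic {
  Severity Sev;
  std::string Msg;
};

struct DiagContext {
  std::function<void(const Diagnostic &)> Handler = [](const Diagnostic &D) {
    std::cerr << (D.Sev == SevError ? "error: " : "warning: ") << D.Msg << '\n';
  };
  void diagnose(const Diagnostic &D) { Handler(D); }
};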
- SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes; - -public: - TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet) - : DstStructTypesSet(DstStructTypesSet) {} - - Linker::IdentifiedStructTypeSet &DstStructTypesSet; - /// Indicate that the specified type in the destination module is conceptually - /// equivalent to the specified type in the source module. - void addTypeMapping(Type *DstTy, Type *SrcTy); - - /// Produce a body for an opaque type in the dest module from a type - /// definition in the source module. - void linkDefinedTypeBodies(); - - /// Return the mapped type to use for the specified input type from the - /// source module. - Type *get(Type *SrcTy); - Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited); - - void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes); - - FunctionType *get(FunctionType *T) { - return cast<FunctionType>(get((Type *)T)); - } - - /// Dump out the type map for debugging purposes. - void dump() const { - for (auto &Pair : MappedTypes) { - dbgs() << "TypeMap: "; - Pair.first->print(dbgs()); - dbgs() << " => "; - Pair.second->print(dbgs()); - dbgs() << '\n'; - } - } - -private: - Type *remapType(Type *SrcTy) override { return get(SrcTy); } - - bool areTypesIsomorphic(Type *DstTy, Type *SrcTy); -}; -} - -void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) { - assert(SpeculativeTypes.empty()); - assert(SpeculativeDstOpaqueTypes.empty()); - - // Check to see if these types are recursively isomorphic and establish a - // mapping between them if so. - if (!areTypesIsomorphic(DstTy, SrcTy)) { - // Oops, they aren't isomorphic. Just discard this request by rolling out - // any speculative mappings we've established. - for (Type *Ty : SpeculativeTypes) - MappedTypes.erase(Ty); - - SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() - - SpeculativeDstOpaqueTypes.size()); - for (StructType *Ty : SpeculativeDstOpaqueTypes) - DstResolvedOpaqueTypes.erase(Ty); - } else { - for (Type *Ty : SpeculativeTypes) - if (auto *STy = dyn_cast<StructType>(Ty)) - if (STy->hasName()) - STy->setName(""); - } - SpeculativeTypes.clear(); - SpeculativeDstOpaqueTypes.clear(); -} - -/// Recursively walk this pair of types, returning true if they are isomorphic, -/// false if they are not. -bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { - // Two types with differing kinds are clearly not isomorphic. - if (DstTy->getTypeID() != SrcTy->getTypeID()) - return false; - - // If we have an entry in the MappedTypes table, then we have our answer. - Type *&Entry = MappedTypes[SrcTy]; - if (Entry) - return Entry == DstTy; - - // Two identical types are clearly isomorphic. Remember this - // non-speculatively. - if (DstTy == SrcTy) { - Entry = DstTy; - return true; - } - - // Okay, we have two types with identical kinds that we haven't seen before. - - // If this is an opaque struct type, special case it. - if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) { - // Mapping an opaque type to any struct, just keep the dest struct. - if (SSTy->isOpaque()) { - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - return true; - } - - // Mapping a non-opaque source type to an opaque dest. If this is the first - // type that we're mapping onto this destination type then we succeed. Keep - // the dest, but fill it in later. If this is the second (different) type - // that we're trying to map onto the same opaque type then we fail. 
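// A standalone sketch (hypothetical names) of the speculate-then-roll-back
// pattern in addTypeMapping above: every trial mapping is journaled, so a
// failed isomorphism check can erase exactly the entries it added, while a
// successful one just clears the journal.
#include <map>
#include <vector>

struct SpeculativeTypeMap {
  std::map<int, int> Mapped;   // stands in for MappedTypes
  std::vector<int> Journal;    // stands in for SpeculativeTypes

  bool tryMap(int Src, int Dst) {
    auto It = Mapped.find(Src);
    if (It != Mapped.end())
      return It->second == Dst; // already decided; must agree
    Mapped[Src] = Dst;
    Journal.push_back(Src);     // remember in case we must undo
    return true;
  }
  void rollback() {             // the types were not isomorphic
    for (int Src : Journal)
      Mapped.erase(Src);
    Journal.clear();
  }
  void commit() { Journal.clear(); }
};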
- if (cast<StructType>(DstTy)->isOpaque()) { - // We can only map one source type onto the opaque destination type. - if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second) - return false; - SrcDefinitionsToResolve.push_back(SSTy); - SpeculativeTypes.push_back(SrcTy); - SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy)); - Entry = DstTy; - return true; - } - } - - // If the number of subtypes disagree between the two types, then we fail. - if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) - return false; - - // Fail if any of the extra properties (e.g. array size) of the type disagree. - if (isa<IntegerType>(DstTy)) - return false; // bitwidth disagrees. - if (PointerType *PT = dyn_cast<PointerType>(DstTy)) { - if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace()) - return false; - - } else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) { - if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg()) - return false; - } else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) { - StructType *SSTy = cast<StructType>(SrcTy); - if (DSTy->isLiteral() != SSTy->isLiteral() || - DSTy->isPacked() != SSTy->isPacked()) - return false; - } else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) { - if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) - return false; - } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { - if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements()) - return false; - } - - // Otherwise, we speculate that these two types will line up and recursively - // check the subelements. - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - - for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) - if (!areTypesIsomorphic(DstTy->getContainedType(I), - SrcTy->getContainedType(I))) - return false; - - // If everything seems to have lined up, then everything is great. - return true; -} - -void TypeMapTy::linkDefinedTypeBodies() { - SmallVector<Type*, 16> Elements; - for (StructType *SrcSTy : SrcDefinitionsToResolve) { - StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]); - assert(DstSTy->isOpaque()); - - // Map the body of the source type over to a new body for the dest type. - Elements.resize(SrcSTy->getNumElements()); - for (unsigned I = 0, E = Elements.size(); I != E; ++I) - Elements[I] = get(SrcSTy->getElementType(I)); - - DstSTy->setBody(Elements, SrcSTy->isPacked()); - DstStructTypesSet.switchToNonOpaque(DstSTy); - } - SrcDefinitionsToResolve.clear(); - DstResolvedOpaqueTypes.clear(); -} - -void TypeMapTy::finishType(StructType *DTy, StructType *STy, - ArrayRef<Type *> ETypes) { - DTy->setBody(ETypes, STy->isPacked()); - - // Steal STy's name. - if (STy->hasName()) { - SmallString<16> TmpName = STy->getName(); - STy->setName(""); - DTy->setName(TmpName); - } - - DstStructTypesSet.addNonOpaque(DTy); -} - -Type *TypeMapTy::get(Type *Ty) { - SmallPtrSet<StructType *, 8> Visited; - return get(Ty, Visited); -} - -Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) { - // If we already have an entry for this type, return it. - Type **Entry = &MappedTypes[Ty]; - if (*Entry) - return *Entry; - - // These are types that LLVM itself will unique. 
-  bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
-
-#ifndef NDEBUG
-  if (!IsUniqued) {
-    for (auto &Pair : MappedTypes) {
-      assert(!(Pair.first != Ty && Pair.second == Ty) &&
-             "mapping to a source type");
-    }
-  }
-#endif
-
-  if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
-    StructType *DTy = StructType::create(Ty->getContext());
-    return *Entry = DTy;
-  }
-
-  // If this is not a recursive type, then just map all of the elements and
-  // then rebuild the type from inside out.
-  SmallVector<Type *, 4> ElementTypes;
-
-  // If there are no element types to map, then the type is itself. This is
-  // true for the anonymous {} struct, things like 'float', integers, etc.
-  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
-    return *Entry = Ty;
-
-  // Remap all of the elements, keeping track of whether any of them change.
-  bool AnyChange = false;
-  ElementTypes.resize(Ty->getNumContainedTypes());
-  for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
-    ElementTypes[I] = get(Ty->getContainedType(I), Visited);
-    AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
-  }
-
-  // If we found our type while recursively processing stuff, just use it.
-  Entry = &MappedTypes[Ty];
-  if (*Entry) {
-    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
-      if (DTy->isOpaque()) {
-        auto *STy = cast<StructType>(Ty);
-        finishType(DTy, STy, ElementTypes);
-      }
-    }
-    return *Entry;
-  }
-
-  // If all of the element types mapped directly over and the type is not
-  // a named struct, then the type is usable as-is.
-  if (!AnyChange && IsUniqued)
-    return *Entry = Ty;
-
-  // Otherwise, rebuild a modified type.
-  switch (Ty->getTypeID()) {
-  default:
-    llvm_unreachable("unknown derived type to remap");
-  case Type::ArrayTyID:
-    return *Entry = ArrayType::get(ElementTypes[0],
-                                   cast<ArrayType>(Ty)->getNumElements());
-  case Type::VectorTyID:
-    return *Entry = VectorType::get(ElementTypes[0],
-                                    cast<VectorType>(Ty)->getNumElements());
-  case Type::PointerTyID:
-    return *Entry = PointerType::get(ElementTypes[0],
-                                     cast<PointerType>(Ty)->getAddressSpace());
-  case Type::FunctionTyID:
-    return *Entry = FunctionType::get(ElementTypes[0],
-                                      makeArrayRef(ElementTypes).slice(1),
-                                      cast<FunctionType>(Ty)->isVarArg());
-  case Type::StructTyID: {
-    auto *STy = cast<StructType>(Ty);
-    bool IsPacked = STy->isPacked();
-    if (IsUniqued)
-      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
-
-    // If the type is opaque, we can just use it directly.
-    if (STy->isOpaque()) {
-      DstStructTypesSet.addOpaque(STy);
-      return *Entry = Ty;
-    }
-
-    if (StructType *OldT =
-            DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
-      STy->setName("");
-      return *Entry = OldT;
-    }
-
-    if (!AnyChange) {
-      DstStructTypesSet.addNonOpaque(STy);
-      return *Entry = Ty;
-    }
-
-    StructType *DTy = StructType::create(Ty->getContext());
-    finishType(DTy, STy, ElementTypes);
-    return *Entry = DTy;
-  }
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// ModuleLinker implementation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-class ModuleLinker;
-
-/// Creates prototypes for functions that are lazily linked on the fly. This
-/// speeds up linking for modules with many lazily linked functions of which
-/// few get used.
-class ValueMaterializerTy : public ValueMaterializer { - TypeMapTy &TypeMap; - Module *DstM; - std::vector<GlobalValue *> &LazilyLinkGlobalValues; - -public: - ValueMaterializerTy(TypeMapTy &TypeMap, Module *DstM, - std::vector<GlobalValue *> &LazilyLinkGlobalValues) - : ValueMaterializer(), TypeMap(TypeMap), DstM(DstM), - LazilyLinkGlobalValues(LazilyLinkGlobalValues) {} - - Value *materializeValueFor(Value *V) override; -}; - -class LinkDiagnosticInfo : public DiagnosticInfo { - const Twine &Msg; - -public: - LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); - void print(DiagnosticPrinter &DP) const override; -}; -LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity, - const Twine &Msg) - : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {} -void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } /// This is an implementation class for the LinkModules function, which is the /// entrypoint for this file. class ModuleLinker { - Module *DstM, *SrcM; - - TypeMapTy TypeMap; - ValueMaterializerTy ValMaterializer; + IRMover &Mover; + Module &SrcM; - /// Mapping of values from what they used to be in Src, to what they are now - /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead - /// due to the use of Value handles which the Linker doesn't actually need, - /// but this allows us to reuse the ValueMapper code. - ValueToValueMapTy ValueMap; + SetVector<GlobalValue *> ValuesToLink; + StringSet<> Internalize; - struct AppendingVarInfo { - GlobalVariable *NewGV; // New aggregate global in dest module. - const Constant *DstInit; // Old initializer from dest module. - const Constant *SrcInit; // Old initializer from src module. - }; - - std::vector<AppendingVarInfo> AppendingVars; + /// For symbol clashes, prefer those from Src. + unsigned Flags; - // Set of items not to link in from source. - SmallPtrSet<const Value *, 16> DoNotLinkFromSource; + /// Function index passed into ModuleLinker for using in function + /// importing/exporting handling. + const FunctionInfoIndex *ImportIndex; - // Vector of GlobalValues to lazily link in. - std::vector<GlobalValue *> LazilyLinkGlobalValues; + /// Functions to import from source module, all other functions are + /// imported as declarations instead of definitions. + DenseSet<const GlobalValue *> *FunctionsToImport; - /// Functions that have replaced other functions. - SmallPtrSet<const Function *, 16> OverridingFunctions; + /// Set to true if the given FunctionInfoIndex contains any functions + /// from this source module, in which case we must conservatively assume + /// that any of its functions may be imported into another module + /// as part of a different backend compilation process. + bool HasExportedFunctions = false; - DiagnosticHandlerFunction DiagnosticHandler; + /// Association between metadata value id and temporary metadata that + /// remains unmapped after function importing. Saved during function + /// importing and consumed during the metadata linking postpass. + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap; - /// For symbol clashes, prefer those from Src. - bool OverrideFromSrc; + /// Used as the callback for lazy linking. + /// The mover has just hit GV and we have to decide if it, and other members + /// of the same comdat, should be linked. Every member to be linked is passed + /// to Add. 
+ void addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add); -public: - ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, - DiagnosticHandlerFunction DiagnosticHandler, - bool OverrideFromSrc) - : DstM(dstM), SrcM(srcM), TypeMap(Set), - ValMaterializer(TypeMap, DstM, LazilyLinkGlobalValues), - DiagnosticHandler(DiagnosticHandler), OverrideFromSrc(OverrideFromSrc) { + bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; } + bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; } + bool shouldInternalizeLinkedSymbols() { + return Flags & Linker::InternalizeLinkedSymbols; } - bool run(); - -private: bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src); - /// Helper method for setting a message and returning an error code. + /// Should we have mover and linker error diag info? bool emitError(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message)); return true; } - void emitWarning(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); - } - - bool getComdatLeader(Module *M, StringRef ComdatName, + bool getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar); bool computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Src, @@ -463,17 +85,20 @@ private: ComdatsChosen; bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK, bool &LinkFromSrc); + // Keep track of the global value members of each comdat in source. + DenseMap<const Comdat *, std::vector<GlobalValue *>> ComdatMembers; /// Given a global in the source module, return the global in the /// destination module that is being linked to, if any. GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) { + Module &DstM = Mover.getModule(); // If the source has no name it can't link. If it has local linkage, // there is no name match-up going on. - if (!SrcGV->hasName() || SrcGV->hasLocalLinkage()) + if (!SrcGV->hasName() || GlobalValue::isLocalLinkage(SrcGV->getLinkage())) return nullptr; // Otherwise see if we have a match in the destination module's symtab. - GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName()); + GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName()); if (!DGV) return nullptr; @@ -486,139 +111,305 @@ private: return DGV; } - void computeTypeMapping(); - - void upgradeMismatchedGlobalArray(StringRef Name); - void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); - bool linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV); + /// Helper method to check if we are importing from the current source + /// module. + bool isPerformingImport() const { return FunctionsToImport != nullptr; } - bool linkGlobalValueProto(GlobalValue *GV); - bool linkModuleFlagsMetadata(); + /// If we are importing from the source module, checks if we should + /// import SGV as a definition, otherwise import as a declaration. 
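// A minimal sketch of the flag tests above: the linker options travel as a
// single bitmask, so each policy query is one mask-and-test. The enumerator
// values here are illustrative; only the access pattern mirrors the code above.
#include <cstdint>

namespace flagsketch {
enum : uint32_t {
  None = 0,
  OverrideFromSrc = 1u << 0,
  LinkOnlyNeeded = 1u << 1,
  InternalizeLinkedSymbols = 1u << 2,
};

inline bool shouldOverrideFromSrc(uint32_t Flags) {
  return (Flags & OverrideFromSrc) != 0;
}
} // namespace flagsketch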
+ bool doImportAsDefinition(const GlobalValue *SGV); - void linkAppendingVarInit(const AppendingVarInfo &AVI); - - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); - bool linkFunctionBody(Function &Dst, Function &Src); - void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); - bool linkGlobalValueBody(GlobalValue &Src); +public: + ModuleLinker(IRMover &Mover, Module &SrcM, unsigned Flags, + const FunctionInfoIndex *Index = nullptr, + DenseSet<const GlobalValue *> *FunctionsToImport = nullptr, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr) + : Mover(Mover), SrcM(SrcM), Flags(Flags), ImportIndex(Index), + FunctionsToImport(FunctionsToImport), + ValIDToTempMDMap(ValIDToTempMDMap) { + assert((ImportIndex || !FunctionsToImport) && + "Expect a FunctionInfoIndex when importing"); + // If we have a FunctionInfoIndex but no function to import, + // then this is the primary module being compiled in a ThinLTO + // backend compilation, and we need to see if it has functions that + // may be exported to another backend compilation. + if (ImportIndex && !FunctionsToImport) + HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM); + assert((ValIDToTempMDMap || !FunctionsToImport) && + "Function importing must provide a ValIDToTempMDMap"); + } - void linkNamedMDNodes(); - void stripReplacedSubprograms(); + bool run(); }; -} - -/// The LLVM SymbolTable class autorenames globals that conflict in the symbol -/// table. This is good for all clients except for us. Go through the trouble -/// to force this back. -static void forceRenaming(GlobalValue *GV, StringRef Name) { - // If the global doesn't force its name or if it already has the right name, - // there is nothing for us to do. - if (GV->hasLocalLinkage() || GV->getName() == Name) - return; - Module *M = GV->getParent(); +/// Class to handle necessary GlobalValue changes required by ThinLTO including +/// linkage changes and any necessary renaming. +class ThinLTOGlobalProcessing { + /// The Module which we are exporting or importing functions from. + Module &M; + + /// Function index passed in for function importing/exporting handling. + const FunctionInfoIndex *ImportIndex; + + /// Functions to import from this module, all other functions will be + /// imported as declarations instead of definitions. + DenseSet<const GlobalValue *> *FunctionsToImport; + + /// Set to true if the given FunctionInfoIndex contains any functions + /// from this source module, in which case we must conservatively assume + /// that any of its functions may be imported into another module + /// as part of a different backend compilation process. + bool HasExportedFunctions = false; + + /// Populated during ThinLTO global processing with locals promoted + /// to global scope in an exporting module, which now need to be linked + /// in if calling from the ModuleLinker. + SetVector<GlobalValue *> NewExportedValues; + + /// Check if we should promote the given local value to global scope. + bool doPromoteLocalToGlobal(const GlobalValue *SGV); + + /// Helper methods to check if we are importing from or potentially + /// exporting from the current source module. + bool isPerformingImport() const { return FunctionsToImport != nullptr; } + bool isModuleExporting() const { return HasExportedFunctions; } + + /// If we are importing from the source module, checks if we should + /// import SGV as a definition, otherwise import as a declaration. 
+  bool doImportAsDefinition(const GlobalValue *SGV);
+
+  /// Get the name for SGV that should be used in the linked destination
+  /// module. Specifically, this handles the case where we need to rename
+  /// a local that is being promoted to global scope.
+  std::string getName(const GlobalValue *SGV);
+
+  /// Process globals so that they can be used in ThinLTO. This includes
+  /// promoting local variables so that they can be referenced externally by
+  /// thin lto imported globals and converting strong external globals to
+  /// available_externally.
+  void processGlobalsForThinLTO();
+  void processGlobalForThinLTO(GlobalValue &GV);
+
+  /// Get the new linkage for SGV that should be used in the linked destination
+  /// module. Specifically, for ThinLTO importing or exporting it may need
+  /// to be adjusted.
+  GlobalValue::LinkageTypes getLinkage(const GlobalValue *SGV);

-  // If there is a conflict, rename the conflict.
-  if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
-    GV->takeName(ConflictGV);
-    ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
-    assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
-  } else {
-    GV->setName(Name); // Force the name back
+public:
+  ThinLTOGlobalProcessing(
+      Module &M, const FunctionInfoIndex *Index,
+      DenseSet<const GlobalValue *> *FunctionsToImport = nullptr)
+      : M(M), ImportIndex(Index), FunctionsToImport(FunctionsToImport) {
+    // If we have a FunctionInfoIndex but no function to import,
+    // then this is the primary module being compiled in a ThinLTO
+    // backend compilation, and we need to see if it has functions that
+    // may be exported to another backend compilation.
+    if (!FunctionsToImport)
+      HasExportedFunctions = ImportIndex->hasExportedFunctions(M);
   }
-}

-/// copy additional attributes (those not needed to construct a GlobalValue)
-/// from the SrcGV to the DestGV.
-static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
-  DestGV->copyAttributesFrom(SrcGV);
-  forceRenaming(DestGV, SrcGV->getName());
+  bool run();
+
+  /// Access the promoted globals that are now exported and need to be linked.
+  SetVector<GlobalValue *> &getNewExportedValues() { return NewExportedValues; }
+};
 }

-static bool isLessConstraining(GlobalValue::VisibilityTypes a,
-                               GlobalValue::VisibilityTypes b) {
-  if (a == GlobalValue::HiddenVisibility)
-    return false;
-  if (b == GlobalValue::HiddenVisibility)
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+static bool
+doImportAsDefinitionImpl(const GlobalValue *SGV,
+                         DenseSet<const GlobalValue *> *FunctionsToImport) {
+  auto *GA = dyn_cast<GlobalAlias>(SGV);
+  if (GA) {
+    if (GA->hasWeakAnyLinkage())
+      return false;
+    const GlobalObject *GO = GA->getBaseObject();
+    if (!GO->hasLinkOnceODRLinkage())
+      return false;
+    return doImportAsDefinitionImpl(GO, FunctionsToImport);
+  }
+  // Always import GlobalVariable definitions, except for the special
+  // case of WeakAny which are imported as ExternalWeak declarations
+  // (see comments in ModuleLinker::getLinkage). The linkage changes
+  // described in ModuleLinker::getLinkage ensure the correct behavior (e.g.
+  // global variables with external linkage are transformed to
+  // available_externally definitions, which are ultimately turned into
+  // declarations after the EliminateAvailableExternally pass).
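// A standalone sketch (hypothetical toy types) of the decision implemented
// by doImportAsDefinitionImpl above: aliases defer to their base object,
// weak_any is never imported as a definition, variable definitions are, and
// functions only when explicitly requested for import.
#include <set>
#include <string>

namespace importsketch {
enum Kind { Var, Func, Alias };

struct GV {
  Kind K;
  std::string Name;
  bool IsDeclaration;
  bool WeakAny;
  bool BaseIsLinkOnceODR;  // meaningful for aliases only
  const GV *Base = nullptr;
};

inline bool importAsDefinition(const GV &G,
                               const std::set<std::string> &ToImport) {
  if (G.K == Alias)
    return !G.WeakAny && G.BaseIsLinkOnceODR && G.Base &&
           importAsDefinition(*G.Base, ToImport);
  if (G.K == Var)
    return !G.IsDeclaration && !G.WeakAny;
  return ToImport.count(G.Name) != 0; // functions: only if requested
}
} // namespace importsketch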
+ if (isa<GlobalVariable>(SGV) && !SGV->isDeclaration() && + !SGV->hasWeakAnyLinkage()) return true; - if (a == GlobalValue::ProtectedVisibility) - return false; - if (b == GlobalValue::ProtectedVisibility) + // Only import the function requested for importing. + auto *SF = dyn_cast<Function>(SGV); + if (SF && FunctionsToImport->count(SF)) return true; + // Otherwise no. return false; } -/// Loop through the global variables in the src module and merge them into the -/// dest module. -static GlobalVariable *copyGlobalVariableProto(TypeMapTy &TypeMap, Module &DstM, - const GlobalVariable *SGVar) { - // No linking to be performed or linking from the source: simply create an - // identical version of the symbol over in the dest module... the - // initializer will be filled in later by LinkGlobalInits. - GlobalVariable *NewDGV = new GlobalVariable( - DstM, TypeMap.get(SGVar->getType()->getElementType()), - SGVar->isConstant(), SGVar->getLinkage(), /*init*/ nullptr, - SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), - SGVar->getType()->getAddressSpace()); - - return NewDGV; +bool ThinLTOGlobalProcessing::doImportAsDefinition(const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; + return doImportAsDefinitionImpl(SGV, FunctionsToImport); } -/// Link the function in the source module into the destination module if -/// needed, setting up mapping information. -static Function *copyFunctionProto(TypeMapTy &TypeMap, Module &DstM, - const Function *SF) { - // If there is no linkage to be performed or we are linking from the source, - // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(), - SF->getName(), &DstM); +bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; + return doImportAsDefinitionImpl(SGV, FunctionsToImport); } -/// Set up prototypes for any aliases that come over from the source module. -static GlobalAlias *copyGlobalAliasProto(TypeMapTy &TypeMap, Module &DstM, - const GlobalAlias *SGA) { - // If there is no linkage to be performed or we're linking from the source, - // bring over SGA. - auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType())); - return GlobalAlias::create(PTy, SGA->getLinkage(), SGA->getName(), &DstM); -} +bool ThinLTOGlobalProcessing::doPromoteLocalToGlobal(const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); + // Both the imported references and the original local variable must + // be promoted. + if (!isPerformingImport() && !isModuleExporting()) + return false; + + // Local const variables never need to be promoted unless they are address + // taken. The imported uses can simply use the clone created in this module. + // For now we are conservative in determining which variables are not + // address taken by checking the unnamed addr flag. To be more aggressive, + // the address taken information must be checked earlier during parsing + // of the module and recorded in the function index for use when importing + // from that module. 
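// A sketch of the promotion rename described above. The exact promoted-name
// format is owned by FunctionInfoIndex::getGlobalNameForLocal; assuming it is
// of the form "<name>.llvm.<moduleId>", the point is that the suffix comes
// from a per-module id, so locals named "f" in two different source modules
// cannot collide after promotion.
#include <cstdint>
#include <string>

static std::string promotedName(const std::string &LocalName,
                                uint64_t ModuleId) {
  // Hypothetical format; see getGlobalNameForLocal for the real one.
  return LocalName + ".llvm." + std::to_string(ModuleId);
}
// e.g. promotedName("f", 3) == "f.llvm.3"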
+  auto *GVar = dyn_cast<GlobalVariable>(SGV);
+  if (GVar && GVar->isConstant() && GVar->hasUnnamedAddr())
+    return false;
-static GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, Module &DstM,
-                                         const GlobalValue *SGV) {
-  GlobalValue *NewGV;
-  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV))
-    NewGV = copyGlobalVariableProto(TypeMap, DstM, SGVar);
-  else if (auto *SF = dyn_cast<Function>(SGV))
-    NewGV = copyFunctionProto(TypeMap, DstM, SF);
-  else
-    NewGV = copyGlobalAliasProto(TypeMap, DstM, cast<GlobalAlias>(SGV));
-  copyGVAttributes(NewGV, SGV);
-  return NewGV;
+  // Eventually we only need to promote functions in the exporting module that
+  // are referenced by a potentially exported function (i.e. one that is in the
+  // function index).
+  return true;
 }

-Value *ValueMaterializerTy::materializeValueFor(Value *V) {
-  auto *SGV = dyn_cast<GlobalValue>(V);
-  if (!SGV)
-    return nullptr;
+std::string ThinLTOGlobalProcessing::getName(const GlobalValue *SGV) {
+  // For locals that must be promoted to global scope, ensure that
+  // the promoted name uniquely identifies the copy in the original module,
+  // using the ID assigned during combined index creation. When importing,
+  // we rename all locals (not just those that are promoted) in order to
+  // avoid naming conflicts between locals imported from different modules.
+  if (SGV->hasLocalLinkage() &&
+      (doPromoteLocalToGlobal(SGV) || isPerformingImport()))
+    return FunctionInfoIndex::getGlobalNameForLocal(
+        SGV->getName(),
+        ImportIndex->getModuleId(SGV->getParent()->getModuleIdentifier()));
+  return SGV->getName();
+}
+
+GlobalValue::LinkageTypes
+ThinLTOGlobalProcessing::getLinkage(const GlobalValue *SGV) {
+  // Any local variable that is referenced by an exported function needs
+  // to be promoted to global scope. Since we don't currently know which
+  // functions reference which local variables/functions, we must treat
+  // all as potentially exported if this module is exporting anything.
+  if (isModuleExporting()) {
+    if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV))
+      return GlobalValue::ExternalLinkage;
+    return SGV->getLinkage();
+  }
+
+  // Otherwise, if we aren't importing, no linkage change is needed.
+  if (!isPerformingImport())
+    return SGV->getLinkage();
+
+  switch (SGV->getLinkage()) {
+  case GlobalValue::ExternalLinkage:
+    // External definitions are converted to available_externally
+    // definitions upon import, so that they are available for inlining
+    // and/or optimization, but are turned into declarations later
+    // during the EliminateAvailableExternally pass.
+    if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+      return GlobalValue::AvailableExternallyLinkage;
+    // An imported external declaration stays external.
+    return SGV->getLinkage();
+
+  case GlobalValue::AvailableExternallyLinkage:
+    // An imported available_externally definition converts
+    // to external if imported as a declaration.
+    if (!doImportAsDefinition(SGV))
+      return GlobalValue::ExternalLinkage;
+    // An imported available_externally declaration stays that way.
+    return SGV->getLinkage();
+
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+    // These both stay the same when importing the definition.
+    // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage(); + + case GlobalValue::WeakAnyLinkage: + // Can't import weak_any definitions correctly, or we might change the + // program semantics, since the linker will pick the first weak_any + // definition and importing would change the order they are seen by the + // linker. The module linking caller needs to enforce this. + assert(!doImportAsDefinition(SGV)); + // If imported as a declaration, it becomes external_weak. + return GlobalValue::ExternalWeakLinkage; + + case GlobalValue::WeakODRLinkage: + // For weak_odr linkage, there is a guarantee that all copies will be + // equivalent, so the issue described above for weak_any does not exist, + // and the definition can be imported. It can be treated similarly + // to an imported externally visible global value. + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + + case GlobalValue::AppendingLinkage: + // It would be incorrect to import an appending linkage variable, + // since it would cause global constructors/destructors to be + // executed multiple times. This should have already been handled + // by linkIfNeeded, and we will assert in shouldLinkFromSource + // if we try to import, so we simply return AppendingLinkage. + return GlobalValue::AppendingLinkage; + + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + // If we are promoting the local to global scope, it is handled + // similarly to a normal externally visible global. + if (doPromoteLocalToGlobal(SGV)) { + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + } + // A non-promoted imported local definition stays local. + // The ThinLTO pass will eventually force-import their definitions. + return SGV->getLinkage(); - GlobalValue *DGV = copyGlobalValueProto(TypeMap, *DstM, SGV); + case GlobalValue::ExternalWeakLinkage: + // External weak doesn't apply to definitions, must be a declaration. + assert(!doImportAsDefinition(SGV)); + // Linkage stays external_weak. + return SGV->getLinkage(); - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast<GlobalObject>(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } + case GlobalValue::CommonLinkage: + // Linkage stays common on definitions. + // The ThinLTO pass will eventually force-import their definitions. 
+ return SGV->getLinkage(); } - LazilyLinkGlobalValues.push_back(SGV); - return DGV; + llvm_unreachable("unknown linkage type"); +} + +static GlobalValue::VisibilityTypes +getMinVisibility(GlobalValue::VisibilityTypes A, + GlobalValue::VisibilityTypes B) { + if (A == GlobalValue::HiddenVisibility || B == GlobalValue::HiddenVisibility) + return GlobalValue::HiddenVisibility; + if (A == GlobalValue::ProtectedVisibility || + B == GlobalValue::ProtectedVisibility) + return GlobalValue::ProtectedVisibility; + return GlobalValue::DefaultVisibility; } -bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName, +bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar) { - const GlobalValue *GVal = M->getNamedValue(ComdatName); + const GlobalValue *GVal = M.getNamedValue(ComdatName); if (const auto *GA = dyn_cast_or_null<GlobalAlias>(GVal)) { GVal = GA->getBaseObject(); if (!GVal) @@ -641,6 +432,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Dst, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); // The ability to mix Comdat::SelectionKind::Any with // Comdat::SelectionKind::Largest is a behavior that comes from COFF. bool DstAnyOrLargest = Dst == Comdat::SelectionKind::Any || @@ -677,8 +469,8 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, getComdatLeader(SrcM, ComdatName, SrcGV)) return true; - const DataLayout &DstDL = DstM->getDataLayout(); - const DataLayout &SrcDL = SrcM->getDataLayout(); + const DataLayout &DstDL = DstM.getDataLayout(); + const DataLayout &SrcDL = SrcM.getDataLayout(); uint64_t DstSize = DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType()); uint64_t SrcSize = @@ -708,9 +500,10 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, bool ModuleLinker::getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); Comdat::SelectionKind SSK = SrcC->getSelectionKind(); StringRef ComdatName = SrcC->getName(); - Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable(); + Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable(); Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName); if (DstCI == ComdatSymTab.end()) { @@ -729,14 +522,17 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC, bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src) { + // Should we unconditionally use the Src? - if (OverrideFromSrc) { + if (shouldOverrideFromSrc()) { LinkFromSrc = true; return false; } // We always have to add Src if it has appending linkage. if (Src.hasAppendingLinkage()) { + // Should have prevented importing for appending linkage in linkIfNeeded. + assert(!isPerformingImport()); LinkFromSrc = true; return false; } @@ -744,6 +540,28 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, bool SrcIsDeclaration = Src.isDeclarationForLinker(); bool DestIsDeclaration = Dest.isDeclarationForLinker(); + if (isPerformingImport()) { + if (isa<Function>(&Src)) { + // For functions, LinkFromSrc iff this is a function requested + // for importing. For variables, decide below normally. + LinkFromSrc = FunctionsToImport->count(&Src); + return false; + } + + // Check if this is an alias with an already existing definition + // in Dest, which must have come from a prior importing pass from + // the same Src module. 
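// The getMinVisibility() helper above picks the most constraining of two
// visibilities (hidden beats protected beats default). A standalone
// restatement with illustrative enumerator values ordered by strictness:
#include <algorithm>

namespace vissketch {
enum Visibility { Default = 0, Protected = 1, Hidden = 2 };

inline Visibility minVisibility(Visibility A, Visibility B) {
  // Larger value = more constraining, so the "minimum" visibility is max().
  return std::max(A, B);
}
} // namespace vissketch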
Unlike imported function and variable + // definitions, which are imported as available_externally and are + // not definitions for the linker, that is not a valid linkage for + // imported aliases which must be definitions. Simply use the existing + // Dest copy. + if (isa<GlobalAlias>(&Src) && !DestIsDeclaration) { + assert(isa<GlobalAlias>(&Dest)); + LinkFromSrc = false; + return false; + } + } + if (SrcIsDeclaration) { // If Src is external or if both Src & Dest are external.. Just link the // external globals, we aren't adding anything. @@ -753,7 +571,12 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, return false; } // If the Dest is weak, use the source linkage. - LinkFromSrc = Dest.hasExternalWeakLinkage(); + if (Dest.hasExternalWeakLinkage()) { + LinkFromSrc = true; + return false; + } + // Link an available_externally over a declaration. + LinkFromSrc = !Src.isDeclaration() && Dest.isDeclaration(); return false; } @@ -808,730 +631,122 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, "': symbol multiply defined!"); } -/// Loop over all of the linked values to compute type mappings. For example, -/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct -/// types 'Foo' but one got renamed when the module was loaded into the same -/// LLVMContext. -void ModuleLinker::computeTypeMapping() { - for (GlobalValue &SGV : SrcM->globals()) { - GlobalValue *DGV = getLinkedToGlobal(&SGV); - if (!DGV) - continue; - - if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - continue; - } - - // Unify the element type of appending arrays. - ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType()); - ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType()); - TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); - } - - for (GlobalValue &SGV : *SrcM) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - for (GlobalValue &SGV : SrcM->aliases()) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - // Incorporate types by name, scanning all the types in the source module. - // At this point, the destination module may have a type "%foo = { i32 }" for - // example. When the source module got loaded into the same LLVMContext, if - // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes(); - for (StructType *ST : Types) { - if (!ST->hasName()) - continue; - - // Check to see if there is a dot in the name followed by a digit. - size_t DotPos = ST->getName().rfind('.'); - if (DotPos == 0 || DotPos == StringRef::npos || - ST->getName().back() == '.' || - !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1]))) - continue; +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); - // Check to see if the destination module has a struct with the prefix name. - StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)); - if (!DST) - continue; - - // Don't use it if this actually came from the source module. They're in - // the same LLVMContext after all. Also don't use it unless the type is - // actually used in the destination module. 
This can happen in situations - // like this: - // - // Module A Module B - // -------- -------- - // %Z = type { %A } %B = type { %C.1 } - // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } - // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } - // %C = type { i8* } %B.3 = type { %C.1 } - // - // When we link Module B with Module A, the '%B' in Module B is - // used. However, that would then use '%C.1'. But when we process '%C.1', - // we prefer to take the '%C' version. So we are then left with both - // '%C.1' and '%C' being used for the same types. This leads to some - // variables using one type and some using the other. - if (TypeMap.DstStructTypesSet.hasType(DST)) - TypeMap.addTypeMapping(DST, ST); - } - - // Now that we have discovered all of the type equivalences, get a body for - // any 'opaque' types in the dest module that are now resolved. - TypeMap.linkDefinedTypeBodies(); -} + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; -static void upgradeGlobalArray(GlobalVariable *GV) { - ArrayType *ATy = cast<ArrayType>(GV->getType()->getElementType()); - StructType *OldTy = cast<StructType>(ATy->getElementType()); - assert(OldTy->getNumElements() == 2 && "Expected to upgrade from 2 elements"); - - // Get the upgraded 3 element type. - PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo(); - Type *Tys[3] = {OldTy->getElementType(0), OldTy->getElementType(1), - VoidPtrTy}; - StructType *NewTy = StructType::get(GV->getContext(), Tys, false); - - // Build new constants with a null third field filled in. - Constant *OldInitC = GV->getInitializer(); - ConstantArray *OldInit = dyn_cast<ConstantArray>(OldInitC); - if (!OldInit && !isa<ConstantAggregateZero>(OldInitC)) - // Invalid initializer; give up. - return; - std::vector<Constant *> Initializers; - if (OldInit && OldInit->getNumOperands()) { - Value *Null = Constant::getNullValue(VoidPtrTy); - for (Use &U : OldInit->operands()) { - ConstantStruct *Init = cast<ConstantStruct>(U.get()); - Initializers.push_back(ConstantStruct::get( - NewTy, Init->getOperand(0), Init->getOperand(1), Null, nullptr)); + if (DGV && !GV.hasLocalLinkage() && !GV.hasAppendingLinkage()) { + auto *DGVar = dyn_cast<GlobalVariable>(DGV); + auto *SGVar = dyn_cast<GlobalVariable>(&GV); + if (DGVar && SGVar) { + if (DGVar->isDeclaration() && SGVar->isDeclaration() && + (!DGVar->isConstant() || !SGVar->isConstant())) { + DGVar->setConstant(false); + SGVar->setConstant(false); + } + if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) { + unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment()); + SGVar->setAlignment(Align); + DGVar->setAlignment(Align); + } } - } - assert(Initializers.size() == ATy->getNumElements() && - "Failed to copy all array elements"); - - // Replace the old GV with a new one. - ATy = ArrayType::get(NewTy, Initializers.size()); - Constant *NewInit = ConstantArray::get(ATy, Initializers); - GlobalVariable *NewGV = new GlobalVariable( - *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "", - GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(), - GV->isExternallyInitialized()); - NewGV->copyAttributesFrom(GV); - NewGV->takeName(GV); - assert(GV->use_empty() && "program cannot use initializer list"); - GV->eraseFromParent(); -} -void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { - // Look for the global arrays. 
- auto *DstGV = dyn_cast_or_null<GlobalVariable>(DstM->getNamedValue(Name)); - if (!DstGV) - return; - auto *SrcGV = dyn_cast_or_null<GlobalVariable>(SrcM->getNamedValue(Name)); - if (!SrcGV) - return; + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); - // Check if the types already match. - auto *DstTy = cast<ArrayType>(DstGV->getType()->getElementType()); - auto *SrcTy = - cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType())); - if (DstTy == SrcTy) - return; - - // Grab the element types. We can only upgrade an array of a two-field - // struct. Only bother if the other one has three-fields. - auto *DstEltTy = cast<StructType>(DstTy->getElementType()); - auto *SrcEltTy = cast<StructType>(SrcTy->getElementType()); - if (DstEltTy->getNumElements() == 2 && SrcEltTy->getNumElements() == 3) { - upgradeGlobalArray(DstGV); - return; + bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr(); + DGV->setUnnamedAddr(HasUnnamedAddr); + GV.setUnnamedAddr(HasUnnamedAddr); } - if (DstEltTy->getNumElements() == 3 && SrcEltTy->getNumElements() == 2) - upgradeGlobalArray(SrcGV); - - // We can't upgrade any other differences. -} - -void ModuleLinker::upgradeMismatchedGlobals() { - upgradeMismatchedGlobalArray("llvm.global_ctors"); - upgradeMismatchedGlobalArray("llvm.global_dtors"); -} - -/// If there were any appending global variables, link them together now. -/// Return true on error. -bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV) { - - if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) - return emitError("Linking globals named '" + SrcGV->getName() + - "': can only link appending global with another appending global!"); - - ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType()); - ArrayType *SrcTy = - cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType())); - Type *EltTy = DstTy->getElementType(); - - // Check to see that they two arrays agree on type. - if (EltTy != SrcTy->getElementType()) - return emitError("Appending variables with different element types!"); - if (DstGV->isConstant() != SrcGV->isConstant()) - return emitError("Appending variables linked with different const'ness!"); - - if (DstGV->getAlignment() != SrcGV->getAlignment()) - return emitError( - "Appending variables with different alignment need to be linked!"); - - if (DstGV->getVisibility() != SrcGV->getVisibility()) - return emitError( - "Appending variables with different visibility need to be linked!"); - - if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) - return emitError( - "Appending variables with different unnamed_addr need to be linked!"); - - if (StringRef(DstGV->getSection()) != SrcGV->getSection()) - return emitError( - "Appending variables with different section name need to be linked!"); - - uint64_t NewSize = DstTy->getNumElements() + SrcTy->getNumElements(); - ArrayType *NewType = ArrayType::get(EltTy, NewSize); - - // Create the new global variable. - GlobalVariable *NG = - new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(), - DstGV->getLinkage(), /*init*/nullptr, /*name*/"", DstGV, - DstGV->getThreadLocalMode(), - DstGV->getType()->getAddressSpace()); - // Propagate alignment, visibility and section info. 
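// What the appending-linkage merge above boils down to, as a standalone
// sketch: the destination array is rebuilt with |Dst| + |Src| elements and
// the source entries concatenated behind the destination's (think of two
// llvm.global_ctors arrays becoming one).
#include <vector>

template <typename T>
std::vector<T> appendArrays(const std::vector<T> &Dst,
                            const std::vector<T> &Src) {
  std::vector<T> Merged;
  Merged.reserve(Dst.size() + Src.size());
  Merged.insert(Merged.end(), Dst.begin(), Dst.end());
  Merged.insert(Merged.end(), Src.begin(), Src.end());
  return Merged;
}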
- copyGVAttributes(NG, DstGV); - - AppendingVarInfo AVI; - AVI.NewGV = NG; - AVI.DstInit = DstGV->getInitializer(); - AVI.SrcInit = SrcGV->getInitializer(); - AppendingVars.push_back(AVI); - - // Replace any uses of the two global variables with uses of the new - // global. - ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); - - DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); - DstGV->eraseFromParent(); - - // Track the source variable so we don't try to link it. - DoNotLinkFromSource.insert(SrcGV); - - return false; -} + // Don't want to append to global_ctors list, for example, when we + // are importing for ThinLTO, otherwise the global ctors and dtors + // get executed multiple times for local variables (the latter causing + // double frees). + if (GV.hasAppendingLinkage() && isPerformingImport()) + return false; -bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { - GlobalValue *DGV = getLinkedToGlobal(SGV); + if (isPerformingImport() && !doImportAsDefinition(&GV)) + return false; - // Handle the ultra special appending linkage case first. - if (DGV && DGV->hasAppendingLinkage()) - return linkAppendingVarProto(cast<GlobalVariable>(DGV), - cast<GlobalVariable>(SGV)); + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) + return false; - bool LinkFromSrc = true; - Comdat *C = nullptr; - GlobalValue::VisibilityTypes Visibility = SGV->getVisibility(); - bool HasUnnamedAddr = SGV->hasUnnamedAddr(); + if (GV.isDeclaration()) + return false; - if (const Comdat *SC = SGV->getComdat()) { + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; Comdat::SelectionKind SK; std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - C = DstM->getOrInsertComdat(SC->getName()); - C->setSelectionKind(SK); - } else if (DGV) { - if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) - return true; - } - - if (!LinkFromSrc) { - // Track the source global so that we don't attempt to copy it over when - // processing global initializers. - DoNotLinkFromSource.insert(SGV); - - if (DGV) - // Make sure to remember this mapping. - ValueMap[SGV] = - ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); - } - - if (DGV) { - Visibility = isLessConstraining(Visibility, DGV->getVisibility()) - ? DGV->getVisibility() - : Visibility; - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - } - - if (!LinkFromSrc && !DGV) + if (LinkFromSrc) + ValuesToLink.insert(&GV); return false; - - GlobalValue *NewGV; - if (!LinkFromSrc) { - NewGV = DGV; - } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. 
- if (!DGV && !OverrideFromSrc && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - NewGV = copyGlobalValueProto(TypeMap, *DstM, SGV); - - if (DGV && isa<Function>(DGV)) - if (auto *NewF = dyn_cast<Function>(NewGV)) - OverridingFunctions.insert(NewF); - } - - NewGV->setUnnamedAddr(HasUnnamedAddr); - NewGV->setVisibility(Visibility); - - if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) { - if (C) - NewGO->setComdat(C); - - if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) - NewGO->setAlignment(std::max(DGV->getAlignment(), SGV->getAlignment())); - } - - if (auto *NewGVar = dyn_cast<GlobalVariable>(NewGV)) { - auto *DGVar = dyn_cast_or_null<GlobalVariable>(DGV); - auto *SGVar = dyn_cast<GlobalVariable>(SGV); - if (DGVar && SGVar && DGVar->isDeclaration() && SGVar->isDeclaration() && - (!DGVar->isConstant() || !SGVar->isConstant())) - NewGVar->setConstant(false); - } - - // Make sure to remember this mapping. - if (NewGV != DGV) { - if (DGV) { - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); - DGV->eraseFromParent(); - } - ValueMap[SGV] = NewGV; - } - - return false; -} - -static void getArrayElements(const Constant *C, - SmallVectorImpl<Constant *> &Dest) { - unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) { - // Merge the initializer. - SmallVector<Constant *, 16> DstElements; - getArrayElements(AVI.DstInit, DstElements); - - SmallVector<Constant *, 16> SrcElements; - getArrayElements(AVI.SrcInit, SrcElements); - - ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType()); - - StringRef Name = AVI.NewGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast<StructType>(NewType->getElementType())->getNumElements() == 3; - - for (auto *V : SrcElements) { - if (IsNewStructor) { - Constant *Key = V->getAggregateElement(2); - if (DoNotLinkFromSource.count(Key)) - continue; - } - DstElements.push_back( - MapValue(V, ValueMap, RF_None, &TypeMap, &ValMaterializer)); } - if (IsNewStructor) { - NewType = ArrayType::get(NewType->getElementType(), DstElements.size()); - AVI.NewGV->mutateType(PointerType::get(NewType, 0)); - } - - AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements)); -} -/// Update the initializers in the Dest module now that all globals that may be -/// referenced are in Dest. -void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { - // Figure out what the initializer looks like in the dest module. - Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, RF_None, &TypeMap, - &ValMaterializer)); -} - -/// Copy the source function over into the dest function and fix up references -/// to values. At this point we know that Dest is an external function, and -/// that Src is not. -bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) { - assert(Dst.isDeclaration() && !Src.isDeclaration()); - - // Materialize if needed. - if (std::error_code EC = Src.materialize()) - return emitError(EC.message()); - - // Link in the prefix data. - if (Src.hasPrefixData()) - Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, RF_None, &TypeMap, - &ValMaterializer)); - - // Link in the prologue data. 
- if (Src.hasPrologueData()) - Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, RF_None, - &TypeMap, &ValMaterializer)); - - // Link in the personality function. - if (Src.hasPersonalityFn()) - Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, RF_None, - &TypeMap, &ValMaterializer)); - - // Go through and convert function arguments over, remembering the mapping. - Function::arg_iterator DI = Dst.arg_begin(); - for (Argument &Arg : Src.args()) { - DI->setName(Arg.getName()); // Copy the name over. - - // Add a mapping to our mapping. - ValueMap[&Arg] = DI; - ++DI; - } - - // Copy over the metadata attachments. - SmallVector<std::pair<unsigned, MDNode *>, 8> MDs; - Src.getAllMetadata(MDs); - for (const auto &I : MDs) - Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, RF_None, &TypeMap, - &ValMaterializer)); - - // Splice the body of the source function into the dest function. - Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); - - // At this point, all of the instructions and values of the function are now - // copied over. The only problem is that they are still referencing values in - // the Source function as operands. Loop through all of the operands of the - // functions and patch them up to point to the local versions. - for (BasicBlock &BB : Dst) - for (Instruction &I : BB) - RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries, &TypeMap, - &ValMaterializer); - - // There is no need to map the arguments anymore. - for (Argument &Arg : Src.args()) - ValueMap.erase(&Arg); - - Src.dematerialize(); - return false; -} - -void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { - Constant *Aliasee = Src.getAliasee(); - Constant *Val = - MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer); - Dst.setAliasee(Val); -} - -bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { - Value *Dst = ValueMap[&Src]; - assert(Dst); - if (auto *F = dyn_cast<Function>(&Src)) - return linkFunctionBody(cast<Function>(*Dst), *F); - if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) { - linkGlobalInit(cast<GlobalVariable>(*Dst), *GVar); - return false; - } - linkAliasBody(cast<GlobalAlias>(*Dst), cast<GlobalAlias>(Src)); + bool LinkFromSrc = true; + if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV)) + return true; + if (LinkFromSrc) + ValuesToLink.insert(&GV); return false; } -/// Insert all of the named MDNodes in Src into the Dest module. -void ModuleLinker::linkNamedMDNodes() { - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - for (const NamedMDNode &NMD : SrcM->named_metadata()) { - // Don't link module flags here. Do them separately. - if (&NMD == SrcModFlags) - continue; - NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(NMD.getName()); - // Add Src elements into Dest node. - for (const MDNode *op : NMD.operands()) - DestNMD->addOperand( - MapMetadata(op, ValueMap, RF_None, &TypeMap, &ValMaterializer)); - } -} - -/// Drop DISubprograms that have been superseded. -/// -/// FIXME: this creates an asymmetric result: we strip functions from losing -/// subprograms in DstM, but leave losing subprograms in SrcM. -/// TODO: Remove this logic once the backend can correctly determine canonical -/// subprograms. -void ModuleLinker::stripReplacedSubprograms() { - // Avoid quadratic runtime by returning early when there's nothing to do. 
- if (OverridingFunctions.empty()) +void ModuleLinker::addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add) { + // Add these to the internalize list + if (!GV.hasLinkOnceLinkage()) return; - // Move the functions now, so the set gets cleared even on early returns. - auto Functions = std::move(OverridingFunctions); - OverridingFunctions.clear(); + if (shouldInternalizeLinkedSymbols()) + Internalize.insert(GV.getName()); + Add(GV); - // Drop functions from subprograms if they've been overridden by the new - // compile unit. - NamedMDNode *CompileUnits = DstM->getNamedMetadata("llvm.dbg.cu"); - if (!CompileUnits) + const Comdat *SC = GV.getComdat(); + if (!SC) + return; + for (GlobalValue *GV2 : ComdatMembers[SC]) { + if (!GV2->hasLocalLinkage() && shouldInternalizeLinkedSymbols()) + Internalize.insert(GV2->getName()); + Add(*GV2); + } +} + +void ThinLTOGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { + if (GV.hasLocalLinkage() && + (doPromoteLocalToGlobal(&GV) || isPerformingImport())) { + GV.setName(getName(&GV)); + GV.setLinkage(getLinkage(&GV)); + if (!GV.hasLocalLinkage()) + GV.setVisibility(GlobalValue::HiddenVisibility); + if (isModuleExporting()) + NewExportedValues.insert(&GV); return; - for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) { - auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I)); - assert(CU && "Expected valid compile unit"); - - for (DISubprogram *SP : CU->getSubprograms()) { - if (!SP || !SP->getFunction() || !Functions.count(SP->getFunction())) - continue; - - // Prevent DebugInfoFinder from tagging this as the canonical subprogram, - // since the canonical one is in the incoming module. - SP->replaceFunction(nullptr); - } - } -} - -/// Merge the linker flags in Src into the Dest module. -bool ModuleLinker::linkModuleFlagsMetadata() { - // If the source module has no module flags, we are done. - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - if (!SrcModFlags) return false; - - // If the destination module doesn't have module flags yet, then just copy - // over the source module's flags. - NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); - if (DstModFlags->getNumOperands() == 0) { - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) - DstModFlags->addOperand(SrcModFlags->getOperand(I)); - - return false; - } - - // First build a map of the existing module flags and requirements. - DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags; - SmallSetVector<MDNode*, 16> Requirements; - for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { - MDNode *Op = DstModFlags->getOperand(I); - ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0)); - MDString *ID = cast<MDString>(Op->getOperand(1)); - - if (Behavior->getZExtValue() == Module::Require) { - Requirements.insert(cast<MDNode>(Op->getOperand(2))); - } else { - Flags[ID] = std::make_pair(Op, I); - } - } - - // Merge in the flags from the source module, and also collect its set of - // requirements. - bool HasErr = false; - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { - MDNode *SrcOp = SrcModFlags->getOperand(I); - ConstantInt *SrcBehavior = - mdconst::extract<ConstantInt>(SrcOp->getOperand(0)); - MDString *ID = cast<MDString>(SrcOp->getOperand(1)); - MDNode *DstOp; - unsigned DstIndex; - std::tie(DstOp, DstIndex) = Flags.lookup(ID); - unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); - - // If this is a requirement, add it and continue. 
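processGlobalForThinLTO above promotes a local that must become visible across module boundaries: it renames the value, switches it to a non-local linkage, and gives it hidden visibility so the promoted name stays out of the dynamic symbol table. A sketch of that shape; promoteLocalName and the .llvm.<module-id> suffix are illustrative assumptions, not the actual getName()/getLinkage() logic:

#include <iostream>
#include <string>

// Hypothetical rename scheme for promoting a local (internal) symbol so it
// can be referenced across modules; the ".llvm.<module-id>" suffix is an
// illustration, not necessarily the exact format getName() produces.
static std::string promoteLocalName(const std::string &Name,
                                    const std::string &ModuleId) {
  return Name + ".llvm." + ModuleId;
}

struct ToyGlobal {
  std::string Name;
  bool IsLocal = true;   // internal linkage
  bool IsHidden = false; // hidden visibility
};

// Mirrors the shape of processGlobalForThinLTO: rename, make the symbol
// external, and restrict visibility to hidden so the promoted name does
// not leak into the dynamic symbol table.
static void promote(ToyGlobal &GV, const std::string &ModuleId) {
  GV.Name = promoteLocalName(GV.Name, ModuleId);
  GV.IsLocal = false;
  GV.IsHidden = true;
}

int main() {
  ToyGlobal G{"counter"};
  promote(G, "a1b2c3");
  std::cout << G.Name << " local=" << G.IsLocal
            << " hidden=" << G.IsHidden << "\n"; // counter.llvm.a1b2c3 local=0 hidden=1
}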
- if (SrcBehaviorValue == Module::Require) { - // If the destination module does not already have this requirement, add - // it. - if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) { - DstModFlags->addOperand(SrcOp); - } - continue; - } - - // If there is no existing flag with this ID, just add it. - if (!DstOp) { - Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); - DstModFlags->addOperand(SrcOp); - continue; - } - - // Otherwise, perform a merge. - ConstantInt *DstBehavior = - mdconst::extract<ConstantInt>(DstOp->getOperand(0)); - unsigned DstBehaviorValue = DstBehavior->getZExtValue(); - - // If either flag has override behavior, handle it first. - if (DstBehaviorValue == Module::Override) { - // Diagnose inconsistent flags which both have override behavior. - if (SrcBehaviorValue == Module::Override && - SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting override values"); - } - continue; - } else if (SrcBehaviorValue == Module::Override) { - // Update the destination flag to that of the source. - DstModFlags->setOperand(DstIndex, SrcOp); - Flags[ID].first = SrcOp; - continue; - } - - // Diagnose inconsistent merge behavior types. - if (SrcBehaviorValue != DstBehaviorValue) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting behaviors"); - continue; - } - - auto replaceDstValue = [&](MDNode *New) { - Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps); - DstModFlags->setOperand(DstIndex, Flag); - Flags[ID].first = Flag; - }; - - // Perform the merge for standard behavior types. - switch (SrcBehaviorValue) { - case Module::Require: - case Module::Override: llvm_unreachable("not possible"); - case Module::Error: { - // Emit an error if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Warning: { - // Emit a warning if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - emitWarning("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Append: { - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); - MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); - SmallVector<Metadata *, 8> MDs; - MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); - MDs.append(DstValue->op_begin(), DstValue->op_end()); - MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM->getContext(), MDs)); - break; - } - case Module::AppendUnique: { - SmallSetVector<Metadata *, 16> Elts; - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); - MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); - Elts.insert(DstValue->op_begin(), DstValue->op_end()); - Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM->getContext(), - makeArrayRef(Elts.begin(), Elts.end()))); - break; - } - } - } - - // Check all of the requirements. 
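The switch above merges one flag at a time across a small behavior lattice. A self-contained restatement, with flag values modeled as string lists; Require is omitted because it is verified as a constraint after merging rather than merged itself, and the conflicting-double-Override diagnostic is elided:

#include <algorithm>
#include <stdexcept>
#include <string>
#include <vector>

// Toy reimplementation of the per-flag merge above. Flag values are modeled
// as string lists (a single-element list stands in for a scalar value).
enum Behavior { Error, Warning, Override, Append, AppendUnique };

using Value = std::vector<std::string>;

static Value mergeFlag(Behavior B, const Value &Dst, const Value &Src) {
  switch (B) {
  case Error:
    if (Dst != Src)
      throw std::runtime_error("IDs have conflicting values");
    return Dst;
  case Warning:
    // The real code only warns on a mismatch; the destination value stays.
    return Dst;
  case Override:
    // Simplified: the source wins (the Dst-has-Override and conflicting
    // double-Override cases are handled before the switch in the original).
    return Src;
  case Append: {
    Value V = Dst;
    V.insert(V.end(), Src.begin(), Src.end());
    return V;
  }
  case AppendUnique: {
    Value V = Dst;
    for (const auto &E : Src)
      if (std::find(V.begin(), V.end(), E) == V.end())
        V.push_back(E);
    return V;
  }
  }
  return Dst;
}

int main() {
  Value A{"x", "y"}, B{"y", "z"};
  Value U = mergeFlag(AppendUnique, A, B); // {"x", "y", "z"}
  return U.size() == 3 ? 0 : 1;
}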
- for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { - MDNode *Requirement = Requirements[I]; - MDString *Flag = cast<MDString>(Requirement->getOperand(0)); - Metadata *ReqValue = Requirement->getOperand(1); - - MDNode *Op = Flags[Flag].first; - if (!Op || Op->getOperand(2) != ReqValue) { - HasErr |= emitError("linking module flags '" + Flag->getString() + - "': does not have the required value"); - continue; - } } - - return HasErr; + GV.setLinkage(getLinkage(&GV)); } -// This function returns true if the triples match. -static bool triplesMatch(const Triple &T0, const Triple &T1) { - // If vendor is apple, ignore the version number. - if (T0.getVendor() == Triple::Apple) - return T0.getArch() == T1.getArch() && - T0.getSubArch() == T1.getSubArch() && - T0.getVendor() == T1.getVendor() && - T0.getOS() == T1.getOS(); - - return T0 == T1; +void ThinLTOGlobalProcessing::processGlobalsForThinLTO() { + for (GlobalVariable &GV : M.globals()) + processGlobalForThinLTO(GV); + for (Function &SF : M) + processGlobalForThinLTO(SF); + for (GlobalAlias &GA : M.aliases()) + processGlobalForThinLTO(GA); } -// This function returns the merged triple. -static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) { - // If vendor is apple, pick the triple with the larger version number. - if (SrcTriple.getVendor() == Triple::Apple) - if (DstTriple.isOSVersionLT(SrcTriple)) - return SrcTriple.str(); - - return DstTriple.str(); +bool ThinLTOGlobalProcessing::run() { + processGlobalsForThinLTO(); + return false; } bool ModuleLinker::run() { - assert(DstM && "Null destination module"); - assert(SrcM && "Null source module"); - - // Inherit the target data from the source module if the destination module - // doesn't have one already. - if (DstM->getDataLayout().isDefault()) - DstM->setDataLayout(SrcM->getDataLayout()); - - if (SrcM->getDataLayout() != DstM->getDataLayout()) { - emitWarning("Linking two modules of different data layouts: '" + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getDataLayoutStr() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getDataLayoutStr() + "'\n"); - } - - // Copy the target triple from the source to dest if the dest's is empty. - if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty()) - DstM->setTargetTriple(SrcM->getTargetTriple()); - - Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple()); - - if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) - emitWarning("Linking two modules of different target triples: " + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getTargetTriple() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getTargetTriple() + "'\n"); - - DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple)); - - // Append the module inline asm string. - if (!SrcM->getModuleInlineAsm().empty()) { - if (DstM->getModuleInlineAsm().empty()) - DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm()); - else - DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+ - SrcM->getModuleInlineAsm()); - } - - // Loop over all of the linked values to compute type mappings. 
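mergeTriples, removed above (presumably re-homed with the rest of the moved linker logic), has a single special case: for the Apple vendor the triple carrying the larger OS version wins, and otherwise the destination triple is kept. A toy equivalent over pre-parsed version tuples; ToyTriple is an assumption standing in for llvm::Triple:

#include <string>
#include <tuple>

struct ToyTriple {
  std::string Str;                     // e.g. "x86_64-apple-macosx10.11"
  bool IsApple;                        // vendor == Apple
  std::tuple<int, int, int> OSVersion; // major, minor, micro
};

// Same decision rule as mergeTriples above: Apple triples compete on OS
// version, everything else keeps the destination.
static const ToyTriple &mergeTriples(const ToyTriple &Src,
                                     const ToyTriple &Dst) {
  if (Src.IsApple && Dst.OSVersion < Src.OSVersion)
    return Src;
  return Dst;
}

int main() {
  ToyTriple Src{"x86_64-apple-macosx10.11", true, {10, 11, 0}};
  ToyTriple Dst{"x86_64-apple-macosx10.9", true, {10, 9, 0}};
  return mergeTriples(Src, Dst).Str == Src.Str ? 0 : 1; // Src wins
}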
- computeTypeMapping(); - - ComdatsChosen.clear(); - for (const auto &SMEC : SrcM->getComdatSymbolTable()) { + for (const auto &SMEC : SrcM.getComdatSymbolTable()) { const Comdat &C = SMEC.getValue(); if (ComdatsChosen.count(&C)) continue; @@ -1542,233 +757,95 @@ bool ModuleLinker::run() { ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc); } - // Upgrade mismatched global arrays. - upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM.globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM.aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). - for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + for (GlobalVariable &GV : SrcM.globals()) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. - for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + for (GlobalAlias &GA : SrcM.aliases()) + if (linkIfNeeded(GA)) return true; - for (const AppendingVarInfo &AppendingVar : AppendingVars) - linkAppendingVarInit(AppendingVar); - - for (const auto &Entry : DstM->getComdatSymbolTable()) { - const Comdat &C = Entry.getValue(); - if (C.getSelectionKind() == Comdat::Any) - continue; - const GlobalValue *GV = SrcM->getNamedValue(C.getName()); - if (GV) - MapValue(GV, ValueMap, RF_None, &TypeMap, &ValMaterializer); - } - - // Strip replaced subprograms before mapping any metadata -- so that we're - // not changing metadata from the source module (note that - // linkGlobalValueBody() eventually calls RemapInstruction() and therefore - // MapMetadata()) -- but after linking global value protocols -- so that - // OverridingFunctions has been built. - stripReplacedSubprograms(); - - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) + if (ImportIndex) { + ThinLTOGlobalProcessing ThinLTOProcessing(SrcM, ImportIndex, + FunctionsToImport); + if (ThinLTOProcessing.run()) return true; + for (auto *GV : ThinLTOProcessing.getNewExportedValues()) + ValuesToLink.insert(GV); } - // Resolve all uses of aliases with aliasees. - for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) + for (unsigned I = 0; I < ValuesToLink.size(); ++I) { + GlobalValue *GV = ValuesToLink[I]; + const Comdat *SC = GV->getComdat(); + if (!SC) continue; - linkGlobalValueBody(Src); + for (GlobalValue *GV2 : ComdatMembers[SC]) + ValuesToLink.insert(GV2); } - // Remap all of the named MDNodes in Src into the DstM module. 
We do this - // after linking GlobalValues so that MDNodes that reference GlobalValues - // are properly remapped. - linkNamedMDNodes(); - - // Merge the module flags into the DstM module. - if (linkModuleFlagsMetadata()) - return true; - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); + if (shouldInternalizeLinkedSymbols()) { + for (GlobalValue *GV : ValuesToLink) + Internalize.insert(GV->getName()); } - // Process vector of lazily linked in functions. - while (!LazilyLinkGlobalValues.empty()) { - GlobalValue *SGV = LazilyLinkGlobalValues.back(); - LazilyLinkGlobalValues.pop_back(); - - assert(!SGV->isDeclaration() && "users should not pass down decls"); - if (linkGlobalValueBody(*SGV)) - return true; + if (Mover.move(SrcM, ValuesToLink.getArrayRef(), + [this](GlobalValue &GV, IRMover::ValueAdder Add) { + addLazyFor(GV, Add); + }, + ValIDToTempMDMap, false)) + return true; + Module &DstM = Mover.getModule(); + for (auto &P : Internalize) { + GlobalValue *GV = DstM.getNamedValue(P.first()); + GV->setLinkage(GlobalValue::InternalLinkage); } return false; } -Linker::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P) - : ETypes(E), IsPacked(P) {} - -Linker::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) - : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} - -bool Linker::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { - if (IsPacked != That.IsPacked) - return false; - if (ETypes != That.ETypes) - return false; - return true; -} - -bool Linker::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { - return !this->operator==(That); -} - -StructType *Linker::StructTypeKeyInfo::getEmptyKey() { - return DenseMapInfo<StructType *>::getEmptyKey(); -} - -StructType *Linker::StructTypeKeyInfo::getTombstoneKey() { - return DenseMapInfo<StructType *>::getTombstoneKey(); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { - return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), - Key.IsPacked); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const StructType *ST) { - return getHashValue(KeyTy(ST)); -} - -bool Linker::StructTypeKeyInfo::isEqual(const KeyTy &LHS, - const StructType *RHS) { - if (RHS == getEmptyKey() || RHS == getTombstoneKey()) - return false; - return LHS == KeyTy(RHS); -} - -bool Linker::StructTypeKeyInfo::isEqual(const StructType *LHS, - const StructType *RHS) { - if (RHS == getEmptyKey()) - return LHS == getEmptyKey(); - - if (RHS == getTombstoneKey()) - return LHS == getTombstoneKey(); - - return KeyTy(LHS) == KeyTy(RHS); -} - -void Linker::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); -} +Linker::Linker(Module &M) : Mover(M) {} -void Linker::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); - bool Removed = OpaqueStructTypes.erase(Ty); - (void)Removed; - assert(Removed); +bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags, + const FunctionInfoIndex *Index, + DenseSet<const GlobalValue *> *FunctionsToImport, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) { + ModuleLinker ModLinker(Mover, *Src, Flags, Index, FunctionsToImport, + ValIDToTempMDMap); + 
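The comdat-closure loop in ModuleLinker::run above walks ValuesToLink by index on purpose: inserting a comdat member can grow the SetVector during the walk, and the newly appended members must themselves be visited. A minimal illustration of the idiom; ToySetVector and the string-keyed comdat map are stand-ins, not LLVM types:

#include <cstdio>
#include <map>
#include <string>
#include <unordered_set>
#include <vector>

// Minimal stand-in for llvm::SetVector: stable order plus O(1) dedup.
struct ToySetVector {
  std::vector<std::string> Order;
  std::unordered_set<std::string> Seen;
  void insert(const std::string &V) {
    if (Seen.insert(V).second)
      Order.push_back(V);
  }
};

int main() {
  // Comdat members that must be linked together with a given value.
  std::map<std::string, std::vector<std::string>> ComdatMembers = {
      {"f", {"f", "g"}}, {"g", {"f", "g"}}};

  ToySetVector ValuesToLink;
  ValuesToLink.insert("f");

  // Index-based loop, exactly because insert() may grow Order while we
  // iterate; iterators into a growing std::vector would be invalidated.
  for (size_t I = 0; I < ValuesToLink.Order.size(); ++I) {
    std::string GV = ValuesToLink.Order[I]; // copy: push_back may reallocate
    auto It = ComdatMembers.find(GV);
    if (It == ComdatMembers.end())
      continue;
    for (const auto &Member : It->second)
      ValuesToLink.insert(Member);
  }

  for (const auto &V : ValuesToLink.Order)
    std::puts(V.c_str()); // prints f, then g
}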
return ModLinker.run(); } -void Linker::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { - assert(Ty->isOpaque()); - OpaqueStructTypes.insert(Ty); +bool Linker::linkInModuleForCAPI(Module &Src) { + ModuleLinker ModLinker(Mover, Src, 0, nullptr, nullptr); + return ModLinker.run(); } -StructType * -Linker::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes, - bool IsPacked) { - Linker::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); - auto I = NonOpaqueStructTypes.find_as(Key); - if (I == NonOpaqueStructTypes.end()) - return nullptr; - return *I; -} - -bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { - if (Ty->isOpaque()) - return OpaqueStructTypes.count(Ty); - auto I = NonOpaqueStructTypes.find(Ty); - if (I == NonOpaqueStructTypes.end()) - return false; - return *I == Ty; -} - -void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - this->Composite = M; - this->DiagnosticHandler = DiagnosticHandler; - - TypeFinder StructTypes; - StructTypes.run(*M, true); - for (StructType *Ty : StructTypes) { - if (Ty->isOpaque()) - IdentifiedStructTypes.addOpaque(Ty); - else - IdentifiedStructTypes.addNonOpaque(Ty); - } -} - -Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - init(M, DiagnosticHandler); -} - -Linker::Linker(Module *M) { - init(M, [this](const DiagnosticInfo &DI) { - Composite->getContext().diagnose(DI); - }); -} - -Linker::~Linker() { -} - -void Linker::deleteModule() { - delete Composite; - Composite = nullptr; -} - -bool Linker::linkInModule(Module *Src, bool OverrideSymbols) { - ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, - DiagnosticHandler, OverrideSymbols); - bool RetCode = TheLinker.run(); - Composite->dropTriviallyDeadConstantArrays(); - return RetCode; -} - -void Linker::setModule(Module *Dst) { - init(Dst, DiagnosticHandler); +bool Linker::linkInMetadata(Module &Src, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) { + SetVector<GlobalValue *> ValuesToLink; + if (Mover.move( + Src, ValuesToLink.getArrayRef(), + [this](GlobalValue &GV, IRMover::ValueAdder Add) { assert(false); }, + ValIDToTempMDMap, true)) + return true; + return false; } //===----------------------------------------------------------------------===// @@ -1780,34 +857,52 @@ void Linker::setModule(Module *Dst) { /// true is returned and ErrorMsg (if not null) is set to indicate the problem. /// Upon failure, the Dest module could be in a modified state, and shouldn't be /// relied on to be consistent. -bool Linker::LinkModules(Module *Dest, Module *Src, - DiagnosticHandlerFunction DiagnosticHandler) { - Linker L(Dest, DiagnosticHandler); - return L.linkInModule(Src); +bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src, + unsigned Flags) { + Linker L(Dest); + return L.linkInModule(std::move(Src), Flags); } -bool Linker::LinkModules(Module *Dest, Module *Src) { - Linker L(Dest); - return L.linkInModule(Src); +bool llvm::renameModuleForThinLTO(Module &M, const FunctionInfoIndex *Index) { + ThinLTOGlobalProcessing ThinLTOProcessing(M, Index); + return ThinLTOProcessing.run(); } //===----------------------------------------------------------------------===// // C API. 
//===----------------------------------------------------------------------===// +static void diagnosticHandler(const DiagnosticInfo &DI, void *C) { + auto *Message = reinterpret_cast<std::string *>(C); + raw_string_ostream Stream(*Message); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); +} + LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, LLVMLinkerMode Unused, char **OutMessages) { Module *D = unwrap(Dest); + LLVMContext &Ctx = D->getContext(); + + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); std::string Message; - raw_string_ostream Stream(Message); - DiagnosticPrinterRawOStream DP(Stream); + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); + + Linker L(*D); + Module *M = unwrap(Src); + LLVMBool Result = L.linkInModuleForCAPI(*M); - LLVMBool Result = Linker::LinkModules( - D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - if (OutMessages && Result) { - Stream.flush(); + if (OutMessages && Result) *OutMessages = strdup(Message.c_str()); - } return Result; } + +LLVMBool LLVMLinkModules2(LLVMModuleRef Dest, LLVMModuleRef Src) { + Module *D = unwrap(Dest); + std::unique_ptr<Module> M(unwrap(Src)); + return Linker::linkModules(*D, std::move(M)); +} diff --git a/contrib/llvm/lib/MC/ConstantPools.cpp b/contrib/llvm/lib/MC/ConstantPools.cpp index f7649fb..9643b75 100644 --- a/contrib/llvm/lib/MC/ConstantPools.cpp +++ b/contrib/llvm/lib/MC/ConstantPools.cpp @@ -29,17 +29,17 @@ void ConstantPool::emitEntries(MCStreamer &Streamer) { I != E; ++I) { Streamer.EmitCodeAlignment(I->Size); // align naturally Streamer.EmitLabel(I->Label); - Streamer.EmitValue(I->Value, I->Size); + Streamer.EmitValue(I->Value, I->Size, I->Loc); } Streamer.EmitDataRegion(MCDR_DataRegionEnd); Entries.clear(); } const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context, - unsigned Size) { + unsigned Size, SMLoc Loc) { MCSymbol *CPEntryLabel = Context.createTempSymbol(); - Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size)); + Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size, Loc)); return MCSymbolRefExpr::create(CPEntryLabel, Context); } @@ -90,8 +90,8 @@ void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) { const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer, const MCExpr *Expr, - unsigned Size) { + unsigned Size, SMLoc Loc) { MCSection *Section = Streamer.getCurrentSection().first; return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext(), - Size); + Size, Loc); } diff --git a/contrib/llvm/lib/MC/ELFObjectWriter.cpp b/contrib/llvm/lib/MC/ELFObjectWriter.cpp index e925bc2..e6552be 100644 --- a/contrib/llvm/lib/MC/ELFObjectWriter.cpp +++ b/contrib/llvm/lib/MC/ELFObjectWriter.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/ELF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/StringSaver.h" #include <vector> using namespace llvm; @@ -106,7 +107,9 @@ class ELFObjectWriter : public MCObjectWriter { /// @name Symbol Table Data /// @{ - StringTableBuilder StrTabBuilder; + BumpPtrAllocator Alloc; + StringSaver VersionSymSaver{Alloc}; + StringTableBuilder StrTabBuilder{StringTableBuilder::ELF}; /// @} @@ -157,9 +160,9 @@ class ELFObjectWriter : public MCObjectWriter { template <typename T> void write(T Val) { if (IsLittleEndian) - 
support::endian::Writer<support::little>(OS).write(Val); + support::endian::Writer<support::little>(getStream()).write(Val); else - support::endian::Writer<support::big>(OS).write(Val); + support::endian::Writer<support::big>(getStream()).write(Val); } void writeHeader(const MCAssembler &Asm); @@ -232,7 +235,7 @@ class ELFObjectWriter : public MCObjectWriter { } void ELFObjectWriter::align(unsigned Alignment) { - uint64_t Padding = OffsetToAlignment(OS.tell(), Alignment); + uint64_t Padding = OffsetToAlignment(getStream().tell(), Alignment); WriteZeros(Padding); } @@ -447,9 +450,6 @@ void ELFObjectWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, ELFSymbolData &MSD, const MCAsmLayout &Layout) { const auto &Symbol = cast<MCSymbolELF>(*MSD.Symbol); - assert((!Symbol.getFragment() || - (Symbol.getFragment()->getParent() == &Symbol.getSection())) && - "The symbol's section doesn't match the fragment's symbol"); const MCSymbolELF *Base = cast_or_null<MCSymbolELF>(Layout.getBaseSymbol(Symbol)); @@ -630,28 +630,36 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, // In general, ELF has no relocations for -B. It can only represent (A + C) // or (A + C - R). If B = R + K and the relocation is not pcrel, we can // replace B to implement it: (A - R - K + C) - if (IsPCRel) - Asm.getContext().reportFatalError( + if (IsPCRel) { + Asm.getContext().reportError( Fixup.getLoc(), "No relocation available to represent this relative expression"); + return; + } const auto &SymB = cast<MCSymbolELF>(RefB->getSymbol()); - if (SymB.isUndefined()) - Asm.getContext().reportFatalError( + if (SymB.isUndefined()) { + Asm.getContext().reportError( Fixup.getLoc(), Twine("symbol '") + SymB.getName() + "' can not be undefined in a subtraction expression"); + return; + } assert(!SymB.isAbsolute() && "Should have been folded"); const MCSection &SecB = SymB.getSection(); - if (&SecB != &FixupSection) - Asm.getContext().reportFatalError( + if (&SecB != &FixupSection) { + Asm.getContext().reportError( Fixup.getLoc(), "Cannot represent a difference across sections"); + return; + } - if (::isWeak(SymB)) - Asm.getContext().reportFatalError( + if (::isWeak(SymB)) { + Asm.getContext().reportError( Fixup.getLoc(), "Cannot represent a subtraction with a weak symbol"); + return; + } uint64_t SymBOffset = Layout.getSymbolOffset(SymB); uint64_t K = SymBOffset - FixupOffset; @@ -764,7 +772,7 @@ void ELFObjectWriter::computeSymbolTable( SymbolTableIndex = addToSectionTable(SymtabSection); align(SymtabSection->getAlignment()); - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); // The first entry is the undefined symbol entry. Writer.writeSymbol(0, 0, 0, 0, 0, 0, false); @@ -784,8 +792,10 @@ void ELFObjectWriter::computeSymbolTable( Renames.count(&Symbol))) continue; - if (Symbol.isTemporary() && Symbol.isUndefined()) - Ctx.reportFatalError(SMLoc(), "Undefined temporary"); + if (Symbol.isTemporary() && Symbol.isUndefined()) { + Ctx.reportError(SMLoc(), "Undefined temporary symbol"); + continue; + } ELFSymbolData MSD; MSD.Symbol = cast<MCSymbolELF>(&Symbol); @@ -850,13 +860,15 @@ void ELFObjectWriter::computeSymbolTable( Buf += Name.substr(0, Pos); unsigned Skip = MSD.SectionIndex == ELF::SHN_UNDEF ? 
2 : 1; Buf += Name.substr(Pos + Skip); - Name = Buf; + Name = VersionSymSaver.save(Buf.c_str()); } } // Sections have their own string table - if (Symbol.getType() != ELF::STT_SECTION) - MSD.Name = StrTabBuilder.add(Name); + if (Symbol.getType() != ELF::STT_SECTION) { + MSD.Name = Name; + StrTabBuilder.add(Name); + } if (Local) LocalSymbolData.push_back(MSD); @@ -878,7 +890,7 @@ void ELFObjectWriter::computeSymbolTable( for (const std::string &Name : FileNames) StrTabBuilder.add(Name); - StrTabBuilder.finalize(StringTableBuilder::ELF); + StrTabBuilder.finalize(); for (const std::string &Name : FileNames) Writer.writeSymbol(StrTabBuilder.getOffset(Name), @@ -911,7 +923,7 @@ void ELFObjectWriter::computeSymbolTable( assert(MSD.Symbol->getBinding() != ELF::STB_LOCAL); } - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[SymtabSection] = std::make_pair(SecStart, SecEnd); ArrayRef<uint32_t> ShndxIndexes = Writer.getShndxIndexes(); @@ -921,12 +933,12 @@ void ELFObjectWriter::computeSymbolTable( } assert(SymtabShndxSectionIndex != 0); - SecStart = OS.tell(); + SecStart = getStream().tell(); const MCSectionELF *SymtabShndxSection = SectionTable[SymtabShndxSectionIndex - 1]; for (uint32_t Index : ShndxIndexes) write(Index); - SecEnd = OS.tell(); + SecEnd = getStream().tell(); SectionOffsets[SymtabShndxSection] = std::make_pair(SecStart, SecEnd); } @@ -957,31 +969,6 @@ ELFObjectWriter::createRelocationSection(MCContext &Ctx, return RelaSection; } -static SmallVector<char, 128> -getUncompressedData(const MCAsmLayout &Layout, - const MCSection::FragmentListType &Fragments) { - SmallVector<char, 128> UncompressedData; - for (const MCFragment &F : Fragments) { - const SmallVectorImpl<char> *Contents; - switch (F.getKind()) { - case MCFragment::FT_Data: - Contents = &cast<MCDataFragment>(F).getContents(); - break; - case MCFragment::FT_Dwarf: - Contents = &cast<MCDwarfLineAddrFragment>(F).getContents(); - break; - case MCFragment::FT_DwarfFrame: - Contents = &cast<MCDwarfCallFrameFragment>(F).getContents(); - break; - default: - llvm_unreachable( - "Not expecting any other fragment types in a debug_* section"); - } - UncompressedData.append(Contents->begin(), Contents->end()); - } - return UncompressedData; -} - // Include the debug info compression header: // "ZLIB" followed by 8 bytes representing the uncompressed size of the section, // useful for consumers to preallocate a buffer to decompress into. @@ -1016,27 +1003,29 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, return; } - // Gather the uncompressed data from all the fragments. 
- const MCSection::FragmentListType &Fragments = Section.getFragmentList(); - SmallVector<char, 128> UncompressedData = - getUncompressedData(Layout, Fragments); + SmallVector<char, 128> UncompressedData; + raw_svector_ostream VecOS(UncompressedData); + raw_pwrite_stream &OldStream = getStream(); + setStream(VecOS); + Asm.writeSectionData(&Section, Layout); + setStream(OldStream); SmallVector<char, 128> CompressedContents; zlib::Status Success = zlib::compress( StringRef(UncompressedData.data(), UncompressedData.size()), CompressedContents); if (Success != zlib::StatusOK) { - Asm.writeSectionData(&Section, Layout); + getStream() << UncompressedData; return; } if (!prependCompressionHeader(UncompressedData.size(), CompressedContents)) { - Asm.writeSectionData(&Section, Layout); + getStream() << UncompressedData; return; } Asm.getContext().renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str()); - OS << CompressedContents; + getStream() << CompressedContents; } void ELFObjectWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, @@ -1061,8 +1050,13 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec) { std::vector<ELFRelocationEntry> &Relocs = Relocations[&Sec]; - // Sort the relocation entries. Most targets just sort by Offset, but some - // (e.g., MIPS) have additional constraints. + // We record relocations by pushing to the end of a vector. Reverse the vector + // to get the relocations in the order they were created. + // In most cases that is not important, but it can be for special sections + // (.eh_frame) or specific relocations (TLS optimizations on SystemZ). + std::reverse(Relocs.begin(), Relocs.end()); + + // Sort the relocation entries. MIPS needs this. TargetObjectWriter->sortRelocs(Asm, Relocs); for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { @@ -1100,7 +1094,7 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm, const MCSectionELF *ELFObjectWriter::createStringTable(MCContext &Ctx) { const MCSectionELF *StrtabSection = SectionTable[StringTableIndex - 1]; - OS << StrTabBuilder.data(); + getStream() << StrTabBuilder.data(); return StrtabSection; } @@ -1209,12 +1203,12 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(Section.getAlignment()); // Remember the offset into the file for this section. - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSymbolELF *SignatureSymbol = Section.getGroup(); writeSectionData(Asm, Section, Layout); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[&Section] = std::make_pair(SecStart, SecEnd); MCSectionELF *RelSection = createRelocationSection(Ctx, Section); @@ -1246,7 +1240,7 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(Group->getAlignment()); // Remember the offset into the file for this section. - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSymbol *SignatureSymbol = Group->getGroup(); assert(SignatureSymbol); @@ -1256,7 +1250,7 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, write(SecIndex); } - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[Group] = std::make_pair(SecStart, SecEnd); } @@ -1267,25 +1261,25 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(RelSection->getAlignment()); // Remember the offset into the file for this section. 
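The rewritten path above redirects the object writer's stream into a buffer, compresses it, and falls back to writing the uncompressed bytes when zlib fails or the header cannot be usefully prepended. The header itself was described earlier: the magic ZLIB followed by the uncompressed size as an 8-byte integer so consumers can preallocate the output buffer. A sketch of building it; the most-significant-byte-first layout follows the usual GNU .zdebug convention and is an assumption here:

#include <cstdint>
#include <cstring>
#include <vector>

// Build the compressed-debug-section header described above: the 4-byte
// magic "ZLIB" followed by the uncompressed size as a big-endian uint64.
static std::vector<char> makeCompressionHeader(uint64_t UncompressedSize) {
  std::vector<char> Header(4 + 8);
  std::memcpy(Header.data(), "ZLIB", 4);
  for (int I = 0; I < 8; ++I) // most significant byte first
    Header[4 + I] = char((UncompressedSize >> (8 * (7 - I))) & 0xff);
  return Header;
}

int main() {
  std::vector<char> H = makeCompressionHeader(0x0102);
  // "ZLIB" + 00 00 00 00 00 00 01 02
  return (H[3] == 'B' && H[10] == 0x01 && H[11] == 0x02) ? 0 : 1;
}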
- uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); writeRelocations(Asm, *RelSection->getAssociatedSection()); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd); } { - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSectionELF *Sec = createStringTable(Ctx); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[Sec] = std::make_pair(SecStart, SecEnd); } uint64_t NaturalAlignment = is64Bit() ? 8 : 4; align(NaturalAlignment); - const unsigned SectionHeaderOffset = OS.tell(); + const unsigned SectionHeaderOffset = getStream().tell(); // ... then the section header table ... writeSectionHeader(Layout, SectionIndexMap, SectionOffsets); @@ -1301,19 +1295,19 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, uint64_t Val = SectionHeaderOffset; if (sys::IsLittleEndianHost != IsLittleEndian) sys::swapByteOrder(Val); - OS.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), - offsetof(ELF::Elf64_Ehdr, e_shoff)); + getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), + offsetof(ELF::Elf64_Ehdr, e_shoff)); NumSectionsOffset = offsetof(ELF::Elf64_Ehdr, e_shnum); } else { uint32_t Val = SectionHeaderOffset; if (sys::IsLittleEndianHost != IsLittleEndian) sys::swapByteOrder(Val); - OS.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), - offsetof(ELF::Elf32_Ehdr, e_shoff)); + getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), + offsetof(ELF::Elf32_Ehdr, e_shoff)); NumSectionsOffset = offsetof(ELF::Elf32_Ehdr, e_shnum); } - OS.pwrite(reinterpret_cast<char *>(&NumSections), sizeof(NumSections), - NumSectionsOffset); + getStream().pwrite(reinterpret_cast<char *>(&NumSections), + sizeof(NumSections), NumSectionsOffset); } bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( diff --git a/contrib/llvm/lib/MC/MCAsmBackend.cpp b/contrib/llvm/lib/MC/MCAsmBackend.cpp index 36c65b7..fcf139b 100644 --- a/contrib/llvm/lib/MC/MCAsmBackend.cpp +++ b/contrib/llvm/lib/MC/MCAsmBackend.cpp @@ -16,6 +16,10 @@ MCAsmBackend::MCAsmBackend() : HasDataInCodeSupport(false) {} MCAsmBackend::~MCAsmBackend() {} +bool MCAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const { + return false; +} + const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { static const MCFixupKindInfo Builtins[] = { {"FK_Data_1", 0, 8, 0}, diff --git a/contrib/llvm/lib/MC/MCAsmInfo.cpp b/contrib/llvm/lib/MC/MCAsmInfo.cpp index 100dc7c..36e10b3 100644 --- a/contrib/llvm/lib/MC/MCAsmInfo.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfo.cpp @@ -157,3 +157,9 @@ bool MCAsmInfo::isValidUnquotedName(StringRef Name) const { return true; } + +bool MCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + // FIXME: Does .section .bss/.data/.text work everywhere?? + return SectionName == ".text" || SectionName == ".data" || + (SectionName == ".bss" && !usesELFSectionDirectiveForBSS()); +} diff --git a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp index 97fc76a..5b9dd20 100644 --- a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp @@ -37,8 +37,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { UseIntegratedAssembler = true; - // FIXME: For now keep the previous behavior, AShr. Need to double-check - // other COFF-targeting assemblers and change this if necessary. + // At least MSVC inline-asm does AShr. 
UseLogicalShr = false; } diff --git a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp index bb90ff2..ae9486d 100644 --- a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp @@ -93,9 +93,4 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { UseIntegratedAssembler = true; SetDirectiveSuppressesReloc = true; - - // FIXME: For now keep the previous behavior, AShr, matching the previous - // behavior of as(1) (both -q and -Q: resp. LLVM and gas v1.38). - // If/when this changes, the AArch64 Darwin special case can go away. - UseLogicalShr = false; } diff --git a/contrib/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm/lib/MC/MCAsmStreamer.cpp index 227c937..c99ce77 100644 --- a/contrib/llvm/lib/MC/MCAsmStreamer.cpp +++ b/contrib/llvm/lib/MC/MCAsmStreamer.cpp @@ -1,4 +1,4 @@ -//===- lib/MC/MCAsmStreamer.cpp - Text Assembly Output --------------------===// +//===- lib/MC/MCAsmStreamer.cpp - Text Assembly Output ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -29,9 +29,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include <cctype> + using namespace llvm; namespace { @@ -78,6 +80,9 @@ public: } EmitCommentsAndEOL(); } + + void EmitSyntaxDirective() override; + void EmitCommentsAndEOL(); /// isVerboseAsm - Return true if this streamer supports verbose assembly at @@ -160,7 +165,7 @@ public: void EmitBytes(StringRef Data) override; void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc = SMLoc()) override; + SMLoc Loc = SMLoc()) override; void EmitIntValue(uint64_t Value, unsigned Size) override; void EmitULEB128Value(const MCExpr *Value) override; @@ -181,7 +186,7 @@ public: void EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit = 0) override; - bool EmitValueToOffset(const MCExpr *Offset, + void emitValueToOffset(const MCExpr *Offset, unsigned char Value = 0) override; void EmitFileDirective(StringRef Filename) override; @@ -207,6 +212,8 @@ public: void EmitCFISameValue(int64_t Register) override; void EmitCFIRelOffset(int64_t Register, int64_t Offset) override; void EmitCFIAdjustCfaOffset(int64_t Adjustment) override; + void EmitCFIEscape(StringRef Values) override; + void EmitCFIGnuArgsSize(int64_t Size) override; void EmitCFISignalFrame() override; void EmitCFIUndefined(int64_t Register) override; void EmitCFIRegister(int64_t Register1, int64_t Register2) override; @@ -233,6 +240,9 @@ public: void EmitBundleLock(bool AlignToEnd) override; void EmitBundleUnlock() override; + bool EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) override; + /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. @@ -250,15 +260,9 @@ public: void MCAsmStreamer::AddComment(const Twine &T) { if (!IsVerboseAsm) return; - // Make sure that CommentStream is flushed. - CommentStream.flush(); - T.toVector(CommentToEmit); // Each comment goes on its own line. CommentToEmit.push_back('\n'); - - // Tell the comment stream that the vector changed underneath it. 
- CommentStream.resync(); } void MCAsmStreamer::EmitCommentsAndEOL() { @@ -267,7 +271,6 @@ void MCAsmStreamer::EmitCommentsAndEOL() { return; } - CommentStream.flush(); StringRef Comments = CommentToEmit; assert(Comments.back() == '\n' && @@ -282,8 +285,6 @@ void MCAsmStreamer::EmitCommentsAndEOL() { } while (!Comments.empty()); CommentToEmit.clear(); - // Tell the comment stream that the vector changed underneath it. - CommentStream.resync(); } static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { @@ -372,6 +373,8 @@ void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) { void MCAsmStreamer::EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor, unsigned Update) { switch (Kind) { + case MCVM_WatchOSVersionMin: OS << "\t.watchos_version_min"; break; + case MCVM_TvOSVersionMin: OS << "\t.tvos_version_min"; break; case MCVM_IOSVersionMin: OS << "\t.ios_version_min"; break; case MCVM_OSXVersionMin: OS << "\t.macosx_version_min"; break; } @@ -480,6 +483,14 @@ void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { EmitEOL(); } +void MCAsmStreamer::EmitSyntaxDirective() { + if (MAI->getAssemblerDialect() == 1) + OS << "\t.intel_syntax noprefix\n"; + // FIXME: Currently emit unprefix'ed registers. + // The intel_syntax directive has one optional argument + // which may have a value of prefix or noprefix. +} + void MCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) { OS << "\t.def\t "; Symbol->print(OS, MAI); @@ -531,9 +542,6 @@ void MCAsmStreamer::emitELFSize(MCSymbolELF *Symbol, const MCExpr *Value) { void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - // Common symbols do not belong to any actual section. - AssignSection(Symbol, nullptr); - OS << "\t.comm\t"; Symbol->print(OS, MAI); OS << ',' << Size; @@ -553,9 +561,6 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, /// @param Size - The size of the common symbol. void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlign) { - // Common symbols do not belong to any actual section. - AssignSection(Symbol, nullptr); - OS << "\t.lcomm\t"; Symbol->print(OS, MAI); OS << ',' << Size; @@ -579,7 +584,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { if (Symbol) - AssignSection(Symbol, Section); + AssignFragment(Symbol, &Section->getDummyFragment()); // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; @@ -603,7 +608,7 @@ void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, // e.g. _a. void MCAsmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - AssignSection(Symbol, Section); + AssignFragment(Symbol, &Section->getDummyFragment()); assert(Symbol && "Symbol shouldn't be NULL!"); // Instead of using the Section we'll just use the shortcut.
@@ -654,7 +659,6 @@ static void PrintQuotedString(StringRef Data, raw_ostream &OS) { OS << '"'; } - void MCAsmStreamer::EmitBytes(StringRef Data) { assert(getCurrentSection().first && "Cannot emit contents before setting section!"); @@ -685,7 +689,7 @@ void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) { } void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { assert(Size <= 8 && "Invalid size"); assert(getCurrentSection().first && "Cannot emit contents before setting section!"); @@ -776,7 +780,6 @@ void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) { EmitEOL(); } - /// EmitFill - Emit NumBytes bytes worth of the value specified by /// FillValue. This implements directives such as '.space'. void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { @@ -856,17 +859,15 @@ void MCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment, 1, MaxBytesToEmit); } -bool MCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, +void MCAsmStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) { // FIXME: Verify that Offset is associated with the current section. OS << ".org "; Offset->print(OS, MAI); OS << ", " << (unsigned)Value; EmitEOL(); - return false; } - void MCAsmStreamer::EmitFileDirective(StringRef Filename) { assert(MAI->hasSingleParameterDotFile()); OS << "\t.file\t"; @@ -1014,6 +1015,32 @@ void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) { EmitEOL(); } +static void PrintCFIEscape(llvm::formatted_raw_ostream &OS, StringRef Values) { + OS << "\t.cfi_escape "; + if (!Values.empty()) { + size_t e = Values.size() - 1; + for (size_t i = 0; i < e; ++i) + OS << format("0x%02x", uint8_t(Values[i])) << ", "; + OS << format("0x%02x", uint8_t(Values[e])); + } +} + +void MCAsmStreamer::EmitCFIEscape(StringRef Values) { + MCStreamer::EmitCFIEscape(Values); + PrintCFIEscape(OS, Values); + EmitEOL(); +} + +void MCAsmStreamer::EmitCFIGnuArgsSize(int64_t Size) { + MCStreamer::EmitCFIGnuArgsSize(Size); + + uint8_t Buffer[16] = { dwarf::DW_CFA_GNU_args_size }; + unsigned Len = encodeULEB128(Size, Buffer + 1) + 1; + + PrintCFIEscape(OS, StringRef((const char *)&Buffer[0], Len)); + EmitEOL(); +} + void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { MCStreamer::EmitCFIDefCfaRegister(Register); OS << "\t.cfi_def_cfa_register "; @@ -1203,7 +1230,7 @@ void MCAsmStreamer::EmitWinCFIPushFrame(bool Code) { EmitEOL(); } -void MCAsmStreamer::EmitWinCFIEndProlog(void) { +void MCAsmStreamer::EmitWinCFIEndProlog() { MCStreamer::EmitWinCFIEndProlog(); OS << "\t.seh_endprologue"; @@ -1217,7 +1244,6 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); Emitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // If we are showing fixups, create symbolic markers in the encoded // representation. We do this by making a per-bit map to the fixup item index, @@ -1334,6 +1360,19 @@ void MCAsmStreamer::EmitBundleUnlock() { EmitEOL(); } +bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc) { + OS << "\t.reloc "; + Offset.print(OS, MAI); + OS << ", " << Name; + if (Expr) { + OS << ", "; + Expr->print(OS, MAI); + } + EmitEOL(); + return false; +} + /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. 
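EmitCFIGnuArgsSize above materializes DW_CFA_GNU_args_size plus a ULEB128-encoded operand and routes the raw bytes through .cfi_escape. The encoding is the standard base-128 scheme, seven payload bits per byte, low bits first, with the high bit flagging continuation; a standalone sketch (0x2e for DW_CFA_GNU_args_size comes from the DWARF GNU extensions):

#include <cassert>
#include <cstdint>
#include <vector>

// Standard ULEB128: 7 bits per byte, least significant group first, high
// bit set on every byte except the last.
static unsigned encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0);
  return Count;
}

int main() {
  // DW_CFA_GNU_args_size (0x2e) followed by its ULEB128 operand: the same
  // byte sequence the streamer above prints as ".cfi_escape 0x2e, ...".
  std::vector<uint8_t> Buf{0x2e};
  encodeULEB128(144, Buf);
  assert(Buf.size() == 3 && Buf[1] == 0x90 && Buf[2] == 0x01);
  return 0;
}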
diff --git a/contrib/llvm/lib/MC/MCAssembler.cpp b/contrib/llvm/lib/MC/MCAssembler.cpp index f53b589..15e82fa 100644 --- a/contrib/llvm/lib/MC/MCAssembler.cpp +++ b/contrib/llvm/lib/MC/MCAssembler.cpp @@ -64,272 +64,11 @@ STATISTIC(RelaxedInstructions, "Number of relaxed instructions"); /* *** */ -MCAsmLayout::MCAsmLayout(MCAssembler &Asm) - : Assembler(Asm), LastValidFragment() - { - // Compute the section layout order. Virtual sections must go last. - for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) - if (!it->isVirtualSection()) - SectionOrder.push_back(&*it); - for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) - if (it->isVirtualSection()) - SectionOrder.push_back(&*it); -} - -bool MCAsmLayout::isFragmentValid(const MCFragment *F) const { - const MCSection *Sec = F->getParent(); - const MCFragment *LastValid = LastValidFragment.lookup(Sec); - if (!LastValid) - return false; - assert(LastValid->getParent() == Sec); - return F->getLayoutOrder() <= LastValid->getLayoutOrder(); -} - -void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) { - // If this fragment wasn't already valid, we don't need to do anything. - if (!isFragmentValid(F)) - return; - - // Otherwise, reset the last valid fragment to the previous fragment - // (if this is the first fragment, it will be NULL). - LastValidFragment[F->getParent()] = F->getPrevNode(); -} - -void MCAsmLayout::ensureValid(const MCFragment *F) const { - MCSection *Sec = F->getParent(); - MCFragment *Cur = LastValidFragment[Sec]; - if (!Cur) - Cur = Sec->begin(); - else - Cur = Cur->getNextNode(); - - // Advance the layout position until the fragment is valid. - while (!isFragmentValid(F)) { - assert(Cur && "Layout bookkeeping error"); - const_cast<MCAsmLayout*>(this)->layoutFragment(Cur); - Cur = Cur->getNextNode(); - } -} - -uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { - ensureValid(F); - assert(F->Offset != ~UINT64_C(0) && "Address not set!"); - return F->Offset; -} - -// Simple getSymbolOffset helper for the non-varibale case. -static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbol &S, - bool ReportError, uint64_t &Val) { - if (!S.getFragment()) { - if (ReportError) - report_fatal_error("unable to evaluate offset to undefined symbol '" + - S.getName() + "'"); - return false; - } - Val = Layout.getFragmentOffset(S.getFragment()) + S.getOffset(); - return true; -} - -static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S, - bool ReportError, uint64_t &Val) { - if (!S.isVariable()) - return getLabelOffset(Layout, S, ReportError, Val); - - // If SD is a variable, evaluate it. 
- MCValue Target; - if (!S.getVariableValue()->evaluateAsRelocatable(Target, &Layout, nullptr)) - report_fatal_error("unable to evaluate offset for variable '" + - S.getName() + "'"); - - uint64_t Offset = Target.getConstant(); - - const MCSymbolRefExpr *A = Target.getSymA(); - if (A) { - uint64_t ValA; - if (!getLabelOffset(Layout, A->getSymbol(), ReportError, ValA)) - return false; - Offset += ValA; - } - - const MCSymbolRefExpr *B = Target.getSymB(); - if (B) { - uint64_t ValB; - if (!getLabelOffset(Layout, B->getSymbol(), ReportError, ValB)) - return false; - Offset -= ValB; - } - - Val = Offset; - return true; -} - -bool MCAsmLayout::getSymbolOffset(const MCSymbol &S, uint64_t &Val) const { - return getSymbolOffsetImpl(*this, S, false, Val); -} - -uint64_t MCAsmLayout::getSymbolOffset(const MCSymbol &S) const { - uint64_t Val; - getSymbolOffsetImpl(*this, S, true, Val); - return Val; -} - -const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const { - if (!Symbol.isVariable()) - return &Symbol; - - const MCExpr *Expr = Symbol.getVariableValue(); - MCValue Value; - if (!Expr->evaluateAsValue(Value, *this)) - llvm_unreachable("Invalid Expression"); - - const MCSymbolRefExpr *RefB = Value.getSymB(); - if (RefB) - Assembler.getContext().reportFatalError( - SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() + - "' could not be evaluated in a subtraction expression"); - - const MCSymbolRefExpr *A = Value.getSymA(); - if (!A) - return nullptr; - - const MCSymbol &ASym = A->getSymbol(); - const MCAssembler &Asm = getAssembler(); - if (ASym.isCommon()) { - // FIXME: we should probably add a SMLoc to MCExpr. - Asm.getContext().reportFatalError(SMLoc(), - "Common symbol " + ASym.getName() + - " cannot be used in assignment expr"); - } - - return &ASym; -} - -uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const { - // The size is the last fragment's end offset. - const MCFragment &F = Sec->getFragmentList().back(); - return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F); -} - -uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const { - // Virtual sections have no file size. - if (Sec->isVirtualSection()) - return 0; - - // Otherwise, the file size is the same as the address space size. - return getSectionAddressSize(Sec); -} - -uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler, - const MCFragment *F, - uint64_t FOffset, uint64_t FSize) { - uint64_t BundleSize = Assembler.getBundleAlignSize(); - assert(BundleSize > 0 && - "computeBundlePadding should only be called if bundling is enabled"); - uint64_t BundleMask = BundleSize - 1; - uint64_t OffsetInBundle = FOffset & BundleMask; - uint64_t EndOfFragment = OffsetInBundle + FSize; - - // There are two kinds of bundling restrictions: - // - // 1) For alignToBundleEnd(), add padding to ensure that the fragment will - // *end* on a bundle boundary. - // 2) Otherwise, check if the fragment would cross a bundle boundary. If it - // would, add padding until the end of the bundle so that the fragment - // will start in a new one. - if (F->alignToBundleEnd()) { - // Three possibilities here: - // - // A) The fragment just happens to end at a bundle boundary, so we're good. - // B) The fragment ends before the current bundle boundary: pad it just - // enough to reach the boundary. - // C) The fragment ends after the current bundle boundary: pad it until it - // reaches the end of the next bundle boundary. 
- // - // Note: this code could be made shorter with some modulo trickery, but it's - // intentionally kept in its more explicit form for simplicity. - if (EndOfFragment == BundleSize) - return 0; - else if (EndOfFragment < BundleSize) - return BundleSize - EndOfFragment; - else { // EndOfFragment > BundleSize - return 2 * BundleSize - EndOfFragment; - } - } else if (OffsetInBundle > 0 && EndOfFragment > BundleSize) - return BundleSize - OffsetInBundle; - else - return 0; -} - -/* *** */ - -void ilist_node_traits<MCFragment>::deleteNode(MCFragment *V) { - V->destroy(); -} - -MCFragment::MCFragment() : Kind(FragmentType(~0)), HasInstructions(false), - AlignToBundleEnd(false), BundlePadding(0) { -} - -MCFragment::~MCFragment() { } - -MCFragment::MCFragment(FragmentType Kind, bool HasInstructions, - uint8_t BundlePadding, MCSection *Parent) - : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false), - BundlePadding(BundlePadding), Parent(Parent), Atom(nullptr), - Offset(~UINT64_C(0)) { - if (Parent) - Parent->getFragmentList().push_back(this); -} - -void MCFragment::destroy() { - // First check if we are the sentinal. - if (Kind == FragmentType(~0)) { - delete this; - return; - } - - switch (Kind) { - case FT_Align: - delete cast<MCAlignFragment>(this); - return; - case FT_Data: - delete cast<MCDataFragment>(this); - return; - case FT_CompactEncodedInst: - delete cast<MCCompactEncodedInstFragment>(this); - return; - case FT_Fill: - delete cast<MCFillFragment>(this); - return; - case FT_Relaxable: - delete cast<MCRelaxableFragment>(this); - return; - case FT_Org: - delete cast<MCOrgFragment>(this); - return; - case FT_Dwarf: - delete cast<MCDwarfLineAddrFragment>(this); - return; - case FT_DwarfFrame: - delete cast<MCDwarfCallFrameFragment>(this); - return; - case FT_LEB: - delete cast<MCLEBFragment>(this); - return; - case FT_SafeSEH: - delete cast<MCSafeSEHFragment>(this); - return; - } -} - -/* *** */ - MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_, - MCCodeEmitter &Emitter_, MCObjectWriter &Writer_, - raw_ostream &OS_) + MCCodeEmitter &Emitter_, MCObjectWriter &Writer_) : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_), - OS(OS_), BundleAlignSize(0), RelaxAll(false), - SubsectionsViaSymbols(false), ELFHeaderEFlags(0) { + BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false), + IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) { VersionMinInfo.Major = 0; // Major version == 0 for "none specified" } @@ -347,6 +86,7 @@ void MCAssembler::reset() { BundleAlignSize = 0; RelaxAll = false; SubsectionsViaSymbols = false; + IncrementalLinkerCompatible = false; ELFHeaderEFlags = 0; LOHContainer.reset(); VersionMinInfo.Major = 0; @@ -358,6 +98,14 @@ void MCAssembler::reset() { getLOHContainer().reset(); } +bool MCAssembler::registerSection(MCSection &Section) { + if (Section.isRegistered()) + return false; + Sections.push_back(&Section); + Section.setIsRegistered(true); + return true; +} + bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { if (ThumbFuncs.count(Symbol)) return true; @@ -404,7 +152,7 @@ const MCSymbol *MCAssembler::getAtom(const MCSymbol &S) const { return &S; // Absolute and undefined symbols have no defining atom. 
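computeBundlePadding, deleted from MCAssembler.cpp above (the fragment machinery appears to move elsewhere in this commit), spells out its two bundling rules in its comment. A standalone restatement under the usual assumptions, a power-of-two bundle size and a fragment no larger than one bundle:

#include <cassert>
#include <cstdint>

// Restatement of the bundle-padding rules described in the removed comment.
static uint64_t computePadding(uint64_t BundleSize, bool AlignToBundleEnd,
                               uint64_t FOffset, uint64_t FSize) {
  uint64_t OffsetInBundle = FOffset & (BundleSize - 1);
  uint64_t EndOfFragment = OffsetInBundle + FSize;

  if (AlignToBundleEnd) {
    // Rule 1: pad so the fragment *ends* on a bundle boundary (cases A/B/C).
    if (EndOfFragment == BundleSize)
      return 0;                            // A: already ends on a boundary
    if (EndOfFragment < BundleSize)
      return BundleSize - EndOfFragment;   // B: pad up to this boundary
    return 2 * BundleSize - EndOfFragment; // C: pad into the next bundle
  }
  // Rule 2: only pad when the fragment would straddle a boundary, pushing
  // it to the start of the next bundle.
  if (OffsetInBundle > 0 && EndOfFragment > BundleSize)
    return BundleSize - OffsetInBundle;
  return 0;
}

int main() {
  assert(computePadding(16, false, 14, 4) == 2); // would straddle: pad to 16
  assert(computePadding(16, false, 0, 16) == 0); // exactly one bundle
  assert(computePadding(16, true, 4, 8) == 4);   // end at offset 16, not 12
  return 0;
}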
- if (!S.getFragment()) + if (!S.isInSection()) return nullptr; // Non-linker visible symbols in sections which can't be atomized have no @@ -426,8 +174,13 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, // probably merge the two into a single callback that tries to evaluate a // fixup and records a relocation if one is needed. const MCExpr *Expr = Fixup.getValue(); - if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) - getContext().reportFatalError(Fixup.getLoc(), "expected relocatable expression"); + if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) { + getContext().reportError(Fixup.getLoc(), "expected relocatable expression"); + // Claim to have completely evaluated the fixup, to prevent any further + // processing from being done. + Value = 0; + return true; + } bool IsPCRel = Backend.getFixupKindInfo( Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel; @@ -523,12 +276,19 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_Org: { const MCOrgFragment &OF = cast<MCOrgFragment>(F); - int64_t TargetLocation; - if (!OF.getOffset().evaluateAsAbsolute(TargetLocation, Layout)) + MCValue Value; + if (!OF.getOffset().evaluateAsValue(Value, Layout)) report_fatal_error("expected assembly-time absolute expression"); // FIXME: We need a way to communicate this error. uint64_t FragmentOffset = Layout.getFragmentOffset(&OF); + int64_t TargetLocation = Value.getConstant(); + if (const MCSymbolRefExpr *A = Value.getSymA()) { + uint64_t Val; + if (!Layout.getSymbolOffset(A->getSymbol(), Val)) + report_fatal_error("expected absolute expression"); + TargetLocation += Val; + } int64_t Size = TargetLocation - FragmentOffset; if (Size < 0 || Size >= 0x40000000) report_fatal_error("invalid .org offset '" + Twine(TargetLocation) + @@ -540,6 +300,8 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return cast<MCDwarfLineAddrFragment>(F).getContents().size(); case MCFragment::FT_DwarfFrame: return cast<MCDwarfCallFrameFragment>(F).getContents().size(); + case MCFragment::FT_Dummy: + llvm_unreachable("Should not have been added"); } llvm_unreachable("invalid fragment kind"); @@ -773,6 +535,8 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, OW->writeBytes(CF.getContents()); break; } + case MCFragment::FT_Dummy: + llvm_unreachable("Should not have been added"); } assert(OW->getStream().tell() - Start == FragmentSize && @@ -786,15 +550,14 @@ void MCAssembler::writeSectionData(const MCSection *Sec, assert(Layout.getSectionFileSize(Sec) == 0 && "Invalid size for section!"); // Check that contents are only things legal inside a virtual section. - for (MCSection::const_iterator it = Sec->begin(), ie = Sec->end(); it != ie; - ++it) { - switch (it->getKind()) { + for (const MCFragment &F : *Sec) { + switch (F.getKind()) { default: llvm_unreachable("Invalid fragment in virtual section!"); case MCFragment::FT_Data: { // Check that we aren't trying to write a non-zero contents (or fixups) // into a virtual section. This is to support clients which use standard // directives to fill the contents of virtual sections. 
- const MCDataFragment &DF = cast<MCDataFragment>(*it); + const MCDataFragment &DF = cast<MCDataFragment>(F); assert(DF.fixup_begin() == DF.fixup_end() && "Cannot have fixups in virtual section!"); for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i) @@ -810,13 +573,13 @@ void MCAssembler::writeSectionData(const MCSection *Sec, case MCFragment::FT_Align: // Check that we aren't trying to write a non-zero value into a virtual // section. - assert((cast<MCAlignFragment>(it)->getValueSize() == 0 || - cast<MCAlignFragment>(it)->getValue() == 0) && + assert((cast<MCAlignFragment>(F).getValueSize() == 0 || + cast<MCAlignFragment>(F).getValue() == 0) && "Invalid align in virtual section!"); break; case MCFragment::FT_Fill: - assert((cast<MCFillFragment>(it)->getValueSize() == 0 || - cast<MCFillFragment>(it)->getValue() == 0) && + assert((cast<MCFillFragment>(F).getValueSize() == 0 || + cast<MCFillFragment>(F).getValue() == 0) && "Invalid fill in virtual section!"); break; } @@ -828,9 +591,8 @@ void MCAssembler::writeSectionData(const MCSection *Sec, uint64_t Start = getWriter().getStream().tell(); (void)Start; - for (MCSection::const_iterator it = Sec->begin(), ie = Sec->end(); it != ie; - ++it) - writeFragment(*this, Layout, *it); + for (const MCFragment &F : *Sec) + writeFragment(*this, Layout, F); assert(getWriter().getStream().tell() - Start == Layout.getSectionAddressSize(Sec)); @@ -854,23 +616,20 @@ std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout, return std::make_pair(FixedValue, IsPCRel); } -void MCAssembler::Finish() { +void MCAssembler::layout(MCAsmLayout &Layout) { DEBUG_WITH_TYPE("mc-dump", { llvm::errs() << "assembler backend - pre-layout\n--\n"; dump(); }); - // Create the layout object. - MCAsmLayout Layout(*this); - // Create dummy fragments and assign section ordinals. unsigned SectionIndex = 0; - for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { + for (MCSection &Sec : *this) { // Create dummy fragments to eliminate any empty sections, this simplifies // layout. - if (it->getFragmentList().empty()) - new MCDataFragment(&*it); + if (Sec.getFragmentList().empty()) + new MCDataFragment(&Sec); - it->setOrdinal(SectionIndex++); + Sec.setOrdinal(SectionIndex++); } // Assign layout order indices to sections and fragments. @@ -879,9 +638,8 @@ void MCAssembler::Finish() { Sec->setLayoutOrder(i); unsigned FragmentIndex = 0; - for (MCSection::iterator iFrag = Sec->begin(), iFragEnd = Sec->end(); - iFrag != iFragEnd; ++iFrag) - iFrag->setLayoutOrder(FragmentIndex++); + for (MCFragment &Frag : *Sec) + Frag.setLayoutOrder(FragmentIndex++); } // Layout until everything fits. @@ -899,17 +657,14 @@ void MCAssembler::Finish() { llvm::errs() << "assembler backend - final-layout\n--\n"; dump(); }); - uint64_t StartOffset = OS.tell(); - // Allow the object writer a chance to perform post-layout binding (for // example, to set the index fields in the symbol data). getWriter().executePostLayoutBinding(*this, Layout); // Evaluate and apply the fixups, generating relocation entries as necessary. - for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { - for (MCSection::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; - ++it2) { - MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(it2); + for (MCSection &Sec : *this) { + for (MCFragment &Frag : Sec) { + MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(&Frag); // Data and relaxable fragments both have fixups. So only process // those here. 
// FIXME: Is there a better way to do this? MCEncodedFragmentWithFixups @@ -935,6 +690,15 @@ void MCAssembler::Finish() { } } } +} + +void MCAssembler::Finish() { + // Create the layout object. + MCAsmLayout Layout(*this); + layout(Layout); + + raw_ostream &OS = getWriter().getStream(); + uint64_t StartOffset = OS.tell(); // Write the object file. getWriter().writeObject(*this, Layout); @@ -960,9 +724,8 @@ bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F, if (!getBackend().mayNeedRelaxation(F->getInst())) return false; - for (MCRelaxableFragment::const_fixup_iterator it = F->fixup_begin(), - ie = F->fixup_end(); it != ie; ++it) - if (fixupNeedsRelaxation(*it, F, Layout)) + for (const MCFixup &Fixup : F->getFixups()) + if (fixupNeedsRelaxation(Fixup, F, Layout)) return true; return false; @@ -991,7 +754,6 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout, SmallString<256> Code; raw_svector_ostream VecOS(Code); getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, F.getSubtargetInfo()); - VecOS.flush(); // Update the fragment. F.setInst(Relaxed); @@ -1014,7 +776,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { encodeSLEB128(Value, OSE); else encodeULEB128(Value, OSE); - OSE.flush(); return OldSize != LF.getContents().size(); } @@ -1031,8 +792,8 @@ bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, SmallString<8> &Data = DF.getContents(); Data.clear(); raw_svector_ostream OSE(Data); - MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OSE); - OSE.flush(); + MCDwarfLineAddr::Encode(Context, getDWARFLinetableParams(), LineDelta, + AddrDelta, OSE); return OldSize != Data.size(); } @@ -1048,7 +809,6 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout, Data.clear(); raw_svector_ostream OSE(Data); MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE); - OSE.flush(); return OldSize != Data.size(); } @@ -1085,7 +845,7 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) { break; } if (RelaxedFrag && !FirstRelaxedFragment) - FirstRelaxedFragment = I; + FirstRelaxedFragment = &*I; } if (FirstRelaxedFragment) { Layout.invalidateFragmentsFrom(FirstRelaxedFragment); @@ -1113,158 +873,3 @@ void MCAssembler::finishLayout(MCAsmLayout &Layout) { Layout.getFragmentOffset(&*Layout.getSectionOrder()[i]->rbegin()); } } - -// Debugging methods - -namespace llvm { - -raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) { - OS << "<MCFixup" << " Offset:" << AF.getOffset() - << " Value:" << *AF.getValue() - << " Kind:" << AF.getKind() << ">"; - return OS; -} - -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MCFragment::dump() { - raw_ostream &OS = llvm::errs(); - - OS << "<"; - switch (getKind()) { - case MCFragment::FT_Align: OS << "MCAlignFragment"; break; - case MCFragment::FT_Data: OS << "MCDataFragment"; break; - case MCFragment::FT_CompactEncodedInst: - OS << "MCCompactEncodedInstFragment"; break; - case MCFragment::FT_Fill: OS << "MCFillFragment"; break; - case MCFragment::FT_Relaxable: OS << "MCRelaxableFragment"; break; - case MCFragment::FT_Org: OS << "MCOrgFragment"; break; - case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break; - case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break; - case MCFragment::FT_LEB: OS << "MCLEBFragment"; break; - case MCFragment::FT_SafeSEH: OS << "MCSafeSEHFragment"; break; - } - - OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder - << " Offset:" << Offset - << " HasInstructions:" << 
hasInstructions() - << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">"; - - switch (getKind()) { - case MCFragment::FT_Align: { - const MCAlignFragment *AF = cast<MCAlignFragment>(this); - if (AF->hasEmitNops()) - OS << " (emit nops)"; - OS << "\n "; - OS << " Alignment:" << AF->getAlignment() - << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize() - << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; - break; - } - case MCFragment::FT_Data: { - const MCDataFragment *DF = cast<MCDataFragment>(this); - OS << "\n "; - OS << " Contents:["; - const SmallVectorImpl<char> &Contents = DF->getContents(); - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { - if (i) OS << ","; - OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); - } - OS << "] (" << Contents.size() << " bytes)"; - - if (DF->fixup_begin() != DF->fixup_end()) { - OS << ",\n "; - OS << " Fixups:["; - for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(), - ie = DF->fixup_end(); it != ie; ++it) { - if (it != DF->fixup_begin()) OS << ",\n "; - OS << *it; - } - OS << "]"; - } - break; - } - case MCFragment::FT_CompactEncodedInst: { - const MCCompactEncodedInstFragment *CEIF = - cast<MCCompactEncodedInstFragment>(this); - OS << "\n "; - OS << " Contents:["; - const SmallVectorImpl<char> &Contents = CEIF->getContents(); - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { - if (i) OS << ","; - OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); - } - OS << "] (" << Contents.size() << " bytes)"; - break; - } - case MCFragment::FT_Fill: { - const MCFillFragment *FF = cast<MCFillFragment>(this); - OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize() - << " Size:" << FF->getSize(); - break; - } - case MCFragment::FT_Relaxable: { - const MCRelaxableFragment *F = cast<MCRelaxableFragment>(this); - OS << "\n "; - OS << " Inst:"; - F->getInst().dump_pretty(OS); - break; - } - case MCFragment::FT_Org: { - const MCOrgFragment *OF = cast<MCOrgFragment>(this); - OS << "\n "; - OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue(); - break; - } - case MCFragment::FT_Dwarf: { - const MCDwarfLineAddrFragment *OF = cast<MCDwarfLineAddrFragment>(this); - OS << "\n "; - OS << " AddrDelta:" << OF->getAddrDelta() - << " LineDelta:" << OF->getLineDelta(); - break; - } - case MCFragment::FT_DwarfFrame: { - const MCDwarfCallFrameFragment *CF = cast<MCDwarfCallFrameFragment>(this); - OS << "\n "; - OS << " AddrDelta:" << CF->getAddrDelta(); - break; - } - case MCFragment::FT_LEB: { - const MCLEBFragment *LF = cast<MCLEBFragment>(this); - OS << "\n "; - OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned(); - break; - } - case MCFragment::FT_SafeSEH: { - const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this); - OS << "\n "; - OS << " Sym:" << F->getSymbol(); - break; - } - } - OS << ">"; -} - -void MCAssembler::dump() { - raw_ostream &OS = llvm::errs(); - - OS << "<MCAssembler\n"; - OS << " Sections:[\n "; - for (iterator it = begin(), ie = end(); it != ie; ++it) { - if (it != begin()) OS << ",\n "; - it->dump(); - } - OS << "],\n"; - OS << " Symbols:["; - - for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { - if (it != symbol_begin()) OS << ",\n "; - OS << "("; - it->dump(); - OS << ", Index:" << it->getIndex() << ", "; - OS << ")"; - } - OS << "]>\n"; -} -#endif diff --git a/contrib/llvm/lib/MC/MCContext.cpp b/contrib/llvm/lib/MC/MCContext.cpp index a85796c..b5ad518 100644 --- 
a/contrib/llvm/lib/MC/MCContext.cpp +++ b/contrib/llvm/lib/MC/MCContext.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCSymbolMachO.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -41,7 +42,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false), GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4), AllowTemporaryLabels(true), DwarfCompileUnitID(0), - AutoReset(DoAutoReset) { + AutoReset(DoAutoReset), HadError(false) { std::error_code EC = llvm::sys::fs::current_path(CompilationDir); if (EC) @@ -62,9 +63,6 @@ MCContext::~MCContext() { // NOTE: The symbols are all allocated out of a bump pointer allocator, // we don't need to free them here. - - // If the stream for the .secure_log_unique directive was created free it. - delete (raw_ostream *)SecureLog; } //===----------------------------------------------------------------------===// @@ -73,13 +71,11 @@ MCContext::~MCContext() { void MCContext::reset() { // Call the destructors so the fragments are freed - for (auto &I : ELFUniquingMap) - I.second->~MCSectionELF(); - for (auto &I : COFFUniquingMap) - I.second->~MCSectionCOFF(); - for (auto &I : MachOUniquingMap) - I.second->~MCSectionMachO(); + COFFAllocator.DestroyAll(); + ELFAllocator.DestroyAll(); + MachOAllocator.DestroyAll(); + MCSubtargetAllocator.DestroyAll(); UsedNames.clear(); Symbols.clear(); SectionSymbols.clear(); @@ -103,6 +99,8 @@ void MCContext::reset() { DwarfLocSeen = false; GenDwarfForAssembly = false; GenDwarfFileNumber = 0; + + HadError = false; } //===----------------------------------------------------------------------===// @@ -294,8 +292,8 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section, Begin = createTempSymbol(BeginSymName, false); // Otherwise, return a new section. 
- return Entry = new (*this) MCSectionMachO(Segment, Section, TypeAndAttributes, - Reserved2, Kind, Begin); + return Entry = new (MachOAllocator.Allocate()) MCSectionMachO( + Segment, Section, TypeAndAttributes, Reserved2, Kind, Begin); } void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) { @@ -322,7 +320,7 @@ MCSectionELF *MCContext::createELFRelSection(StringRef Name, unsigned Type, bool Inserted; std::tie(I, Inserted) = ELFRelSecNames.insert(std::make_pair(Name, true)); - return new (*this) + return new (ELFAllocator.Allocate()) MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize, Group, true, nullptr, Associated); } @@ -367,15 +365,15 @@ MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type, if (BeginSymName) Begin = createTempSymbol(BeginSymName, false); - MCSectionELF *Result = - new (*this) MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, - GroupSym, UniqueID, Begin, Associated); + MCSectionELF *Result = new (ELFAllocator.Allocate()) + MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, + Begin, Associated); Entry.second = Result; return Result; } MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) { - MCSectionELF *Result = new (*this) + MCSectionELF *Result = new (ELFAllocator.Allocate()) MCSectionELF(".group", ELF::SHT_GROUP, 0, SectionKind::getReadOnly(), 4, Group, ~0, nullptr, nullptr); return Result; @@ -404,7 +402,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, Begin = createTempSymbol(BeginSymName, false); StringRef CachedName = Iter->first.SectionName; - MCSectionCOFF *Result = new (*this) MCSectionCOFF( + MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF( CachedName, Characteristics, COMDATSymbol, Selection, Kind, Begin); Iter->second = Result; @@ -441,6 +439,10 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec, COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE); } +MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) { + return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI); +} + //===----------------------------------------------------------------------===// // Dwarf Management //===----------------------------------------------------------------------===// @@ -472,14 +474,24 @@ void MCContext::finalizeDwarfSections(MCStreamer &MCOS) { [&](MCSection *Sec) { return !MCOS.mayHaveInstructions(*Sec); }); } -void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) const { - // If we have a source manager and a location, use it. Otherwise just - // use the generic report_fatal_error(). - if (!SrcMgr || Loc == SMLoc()) +//===----------------------------------------------------------------------===// +// Error Reporting +//===----------------------------------------------------------------------===// + +void MCContext::reportError(SMLoc Loc, const Twine &Msg) { + HadError = true; + + // If we have a source manager use it. Otherwise just use the generic + // report_fatal_error(). + if (!SrcMgr) report_fatal_error(Msg, false); // Use the source manager to print the message. SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg); +} + +void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) { + reportError(Loc, Msg); // If we reached here, we are failing ungracefully. 
Run the interrupt handlers // to make sure any special cleanups get done, in particular that we remove diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp index 716d76a..82063fb 100644 --- a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp +++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp @@ -125,7 +125,6 @@ void LLVMDisasmDispose(LLVMDisasmContextRef DCR){ static void emitComments(LLVMDisasmContext *DC, formatted_raw_ostream &FormattedOS) { // Flush the stream before taking its content. - DC->CommentStream.flush(); StringRef Comments = DC->CommentsToEmit.str(); // Get the default information for printing a comment. const MCAsmInfo *MAI = DC->getAsmInfo(); @@ -147,7 +146,6 @@ static void emitComments(LLVMDisasmContext *DC, // Tell the comment stream that the vector changed underneath it. DC->CommentsToEmit.clear(); - DC->CommentStream.resync(); } /// \brief Gets latency information for \p Inst from the itinerary @@ -261,7 +259,6 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes, return 0; case MCDisassembler::Success: { - Annotations.flush(); StringRef AnnotationsStr = Annotations.str(); SmallVector<char, 64> InsnStr; @@ -273,7 +270,6 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes, emitLatency(DC, Inst); emitComments(DC, FormattedOS); - OS.flush(); assert(OutStringSize != 0 && "Output buffer cannot be zero size"); size_t OutputSize = std::min(OutStringSize-1, InsnStr.size()); diff --git a/contrib/llvm/lib/MC/MCDwarf.cpp b/contrib/llvm/lib/MC/MCDwarf.cpp index c84c486..dafa768 100644 --- a/contrib/llvm/lib/MC/MCDwarf.cpp +++ b/contrib/llvm/lib/MC/MCDwarf.cpp @@ -27,26 +27,8 @@ #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -using namespace llvm; - -// Given a special op, return the address skip amount (in units of -// DWARF2_LINE_MIN_INSN_LENGTH. -#define SPECIAL_ADDR(op) (((op) - DWARF2_LINE_OPCODE_BASE)/DWARF2_LINE_RANGE) - -// The maximum address skip amount that can be encoded with a special op. -#define MAX_SPECIAL_ADDR_DELTA SPECIAL_ADDR(255) - -// First special line opcode - leave room for the standard opcodes. -// Note: If you want to change this, you'll have to update the -// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). -#define DWARF2_LINE_OPCODE_BASE 13 - -// Minimum line offset in a special line info. opcode. This value -// was chosen to give a reasonable range of values. -#define DWARF2_LINE_BASE -5 -// Range of line offsets in a special line info. opcode. -#define DWARF2_LINE_RANGE 14 +using namespace llvm; static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) { unsigned MinInsnLength = Context.getAsmInfo()->getMinInstAlignment(); @@ -197,7 +179,8 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, // // This emits the Dwarf file and the line tables. // -void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS) { +void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS, + MCDwarfLineTableParams Params) { MCContext &context = MCOS->getContext(); auto &LineTables = context.getMCDwarfLineTables(); @@ -212,14 +195,17 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS) { // Handle the rest of the Compile Units. 
for (const auto &CUIDTablePair : LineTables) - CUIDTablePair.second.EmitCU(MCOS); + CUIDTablePair.second.EmitCU(MCOS, Params); } -void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS) const { - MCOS.EmitLabel(Header.Emit(&MCOS, None).second); +void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS, + MCDwarfLineTableParams Params) const { + MCOS.EmitLabel(Header.Emit(&MCOS, Params, None).second); } -std::pair<MCSymbol *, MCSymbol *> MCDwarfLineTableHeader::Emit(MCStreamer *MCOS) const { +std::pair<MCSymbol *, MCSymbol *> +MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, + MCDwarfLineTableParams Params) const { static const char StandardOpcodeLengths[] = { 0, // length of DW_LNS_copy 1, // length of DW_LNS_advance_pc @@ -234,9 +220,10 @@ std::pair<MCSymbol *, MCSymbol *> MCDwarfLineTableHeader::Emit(MCStreamer *MCOS) 0, // length of DW_LNS_set_epilogue_begin 1 // DW_LNS_set_isa }; - assert(array_lengthof(StandardOpcodeLengths) == - (DWARF2_LINE_OPCODE_BASE - 1)); - return Emit(MCOS, StandardOpcodeLengths); + assert(array_lengthof(StandardOpcodeLengths) >= + (Params.DWARF2LineOpcodeBase - 1U)); + return Emit(MCOS, Params, makeArrayRef(StandardOpcodeLengths, + Params.DWARF2LineOpcodeBase - 1)); } static const MCExpr *forceExpAbs(MCStreamer &OS, const MCExpr* Expr) { @@ -256,9 +243,8 @@ static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) { } std::pair<MCSymbol *, MCSymbol *> -MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, +MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, ArrayRef<char> StandardOpcodeLengths) const { - MCContext &context = MCOS->getContext(); // Create a symbol at the beginning of the line table. @@ -293,8 +279,8 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, // Parameters of the state machine, are next. MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1); MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1); - MCOS->EmitIntValue(DWARF2_LINE_BASE, 1); - MCOS->EmitIntValue(DWARF2_LINE_RANGE, 1); + MCOS->EmitIntValue(Params.DWARF2LineBase, 1); + MCOS->EmitIntValue(Params.DWARF2LineRange, 1); MCOS->EmitIntValue(StandardOpcodeLengths.size() + 1, 1); // Standard opcode lengths @@ -329,8 +315,9 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, return std::make_pair(LineStartSym, LineEndSym); } -void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS) const { - MCSymbol *LineEndSym = Header.Emit(MCOS).second; +void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS, + MCDwarfLineTableParams Params) const { + MCSymbol *LineEndSym = Header.Emit(MCOS, Params).second; // Put out the line tables. for (const auto &LineSec : MCLineSections.getMCLineEntries()) @@ -416,21 +403,31 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory, } /// Utility function to emit the encoding to a streamer. -void MCDwarfLineAddr::Emit(MCStreamer *MCOS, int64_t LineDelta, - uint64_t AddrDelta) { +void MCDwarfLineAddr::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, + int64_t LineDelta, uint64_t AddrDelta) { MCContext &Context = MCOS->getContext(); SmallString<256> Tmp; raw_svector_ostream OS(Tmp); - MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OS); + MCDwarfLineAddr::Encode(Context, Params, LineDelta, AddrDelta, OS); MCOS->EmitBytes(OS.str()); } +/// Given a special op, return the address skip amount (in units of +/// DWARF2_LINE_MIN_INSN_LENGTH). 
+static uint64_t SpecialAddr(MCDwarfLineTableParams Params, uint64_t op) { + return (op - Params.DWARF2LineOpcodeBase) / Params.DWARF2LineRange; +} + /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas. -void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, - uint64_t AddrDelta, raw_ostream &OS) { +void MCDwarfLineAddr::Encode(MCContext &Context, MCDwarfLineTableParams Params, + int64_t LineDelta, uint64_t AddrDelta, + raw_ostream &OS) { uint64_t Temp, Opcode; bool NeedCopy = false; + // The maximum address skip amount that can be encoded with a special op. + uint64_t MaxSpecialAddrDelta = SpecialAddr(Params, 255); + // Scale the address delta by the minimum instruction length. AddrDelta = ScaleAddrDelta(Context, AddrDelta); @@ -438,7 +435,7 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the // end_sequence to emit the matrix entry. if (LineDelta == INT64_MAX) { - if (AddrDelta == MAX_SPECIAL_ADDR_DELTA) + if (AddrDelta == MaxSpecialAddrDelta) OS << char(dwarf::DW_LNS_const_add_pc); else if (AddrDelta) { OS << char(dwarf::DW_LNS_advance_pc); @@ -451,16 +448,16 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, } // Bias the line delta by the base. - Temp = LineDelta - DWARF2_LINE_BASE; + Temp = LineDelta - Params.DWARF2LineBase; // If the line increment is out of range of a special opcode, we must encode // it with DW_LNS_advance_line. - if (Temp >= DWARF2_LINE_RANGE) { + if (Temp >= Params.DWARF2LineRange) { OS << char(dwarf::DW_LNS_advance_line); encodeSLEB128(LineDelta, OS); LineDelta = 0; - Temp = 0 - DWARF2_LINE_BASE; + Temp = 0 - Params.DWARF2LineBase; NeedCopy = true; } @@ -471,19 +468,19 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, } // Bias the opcode by the special opcode base. - Temp += DWARF2_LINE_OPCODE_BASE; + Temp += Params.DWARF2LineOpcodeBase; // Avoid overflow when addr_delta is large. - if (AddrDelta < 256 + MAX_SPECIAL_ADDR_DELTA) { + if (AddrDelta < 256 + MaxSpecialAddrDelta) { // Try using a special opcode. - Opcode = Temp + AddrDelta * DWARF2_LINE_RANGE; + Opcode = Temp + AddrDelta * Params.DWARF2LineRange; if (Opcode <= 255) { OS << char(Opcode); return; } // Try using DW_LNS_const_add_pc followed by special op. - Opcode = Temp + (AddrDelta - MAX_SPECIAL_ADDR_DELTA) * DWARF2_LINE_RANGE; + Opcode = Temp + (AddrDelta - MaxSpecialAddrDelta) * Params.DWARF2LineRange; if (Opcode <= 255) { OS << char(dwarf::DW_LNS_const_add_pc); OS << char(Opcode); @@ -517,10 +514,14 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) { MCOS->EmitULEB128IntValue(1); MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit); MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1); - EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4); - if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1 && - MCOS->getContext().getDwarfVersion() >= 3) { - EmitAbbrev(MCOS, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4); + EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, context.getDwarfVersion() >= 4 + ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4); + if (context.getGenDwarfSectionSyms().size() > 1 && + context.getDwarfVersion() >= 3) { + EmitAbbrev(MCOS, dwarf::DW_AT_ranges, context.getDwarfVersion() >= 4 + ? 
dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4); } else { EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr); EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr); @@ -845,7 +846,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0); MCSymbol *AbbrevSectionSymbol = nullptr; MCSymbol *InfoSectionSymbol = nullptr; - MCSymbol *RangesSectionSymbol = NULL; + MCSymbol *RangesSectionSymbol = nullptr; // Create end symbols for each section, and remove empty sections MCOS->getContext().finalizeDwarfSections(*MCOS); @@ -998,38 +999,29 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol, } namespace { - class FrameEmitterImpl { - int CFAOffset; - int InitialCFAOffset; - bool IsEH; - const MCSymbol *SectionStart; - public: - FrameEmitterImpl(bool isEH) - : CFAOffset(0), InitialCFAOffset(0), IsEH(isEH), SectionStart(nullptr) { - } - - void setSectionStart(const MCSymbol *Label) { SectionStart = Label; } - - /// Emit the unwind information in a compact way. - void EmitCompactUnwind(MCObjectStreamer &streamer, - const MCDwarfFrameInfo &frame); - - const MCSymbol &EmitCIE(MCObjectStreamer &streamer, - const MCSymbol *personality, - unsigned personalityEncoding, - const MCSymbol *lsda, - bool IsSignalFrame, - unsigned lsdaEncoding, - bool IsSimple); - MCSymbol *EmitFDE(MCObjectStreamer &streamer, - const MCSymbol &cieStart, - const MCDwarfFrameInfo &frame); - void EmitCFIInstructions(MCObjectStreamer &streamer, - ArrayRef<MCCFIInstruction> Instrs, - MCSymbol *BaseLabel); - void EmitCFIInstruction(MCObjectStreamer &Streamer, - const MCCFIInstruction &Instr); - }; +class FrameEmitterImpl { + int CFAOffset = 0; + int InitialCFAOffset = 0; + bool IsEH; + MCObjectStreamer &Streamer; + +public: + FrameEmitterImpl(bool IsEH, MCObjectStreamer &Streamer) + : IsEH(IsEH), Streamer(Streamer) {} + + /// Emit the unwind information in a compact way. + void EmitCompactUnwind(const MCDwarfFrameInfo &frame); + + const MCSymbol &EmitCIE(const MCSymbol *personality, + unsigned personalityEncoding, const MCSymbol *lsda, + bool IsSignalFrame, unsigned lsdaEncoding, + bool IsSimple); + void EmitFDE(const MCSymbol &cieStart, const MCDwarfFrameInfo &frame, + bool LastInSection, const MCSymbol &SectionStart); + void EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs, + MCSymbol *BaseLabel); + void EmitCFIInstruction(const MCCFIInstruction &Instr); +}; } // end anonymous namespace @@ -1037,8 +1029,7 @@ static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) { Streamer.EmitIntValue(Encoding, 1); } -void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, - const MCCFIInstruction &Instr) { +void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { int dataAlignmentFactor = getDataAlignmentFactor(Streamer); auto *MRI = Streamer.getContext().getRegisterInfo(); @@ -1150,6 +1141,11 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1); return; } + case MCCFIInstruction::OpGnuArgsSize: { + Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1); + Streamer.EmitULEB128IntValue(Instr.getOffset()); + return; + } case MCCFIInstruction::OpEscape: Streamer.EmitBytes(Instr.getValues()); return; @@ -1158,8 +1154,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, } /// Emit frame instructions to describe the layout of the frame. 
-void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer, - ArrayRef<MCCFIInstruction> Instrs, +void FrameEmitterImpl::EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs, MCSymbol *BaseLabel) { for (unsigned i = 0, N = Instrs.size(); i < N; ++i) { const MCCFIInstruction &Instr = Instrs[i]; @@ -1171,18 +1166,17 @@ void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer, if (BaseLabel && Label) { MCSymbol *ThisSym = Label; if (ThisSym != BaseLabel) { - streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym); + Streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym); BaseLabel = ThisSym; } } - EmitCFIInstruction(streamer, Instr); + EmitCFIInstruction(Instr); } } /// Emit the unwind information in a compact way. -void FrameEmitterImpl::EmitCompactUnwind(MCObjectStreamer &Streamer, - const MCDwarfFrameInfo &Frame) { +void FrameEmitterImpl::EmitCompactUnwind(const MCDwarfFrameInfo &Frame) { MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); @@ -1254,39 +1248,39 @@ static unsigned getCIEVersion(bool IsEH, unsigned DwarfVersion) { case 3: return 3; case 4: + case 5: return 4; } llvm_unreachable("Unknown version"); } -const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, - const MCSymbol *personality, +const MCSymbol &FrameEmitterImpl::EmitCIE(const MCSymbol *personality, unsigned personalityEncoding, const MCSymbol *lsda, bool IsSignalFrame, unsigned lsdaEncoding, bool IsSimple) { - MCContext &context = streamer.getContext(); + MCContext &context = Streamer.getContext(); const MCRegisterInfo *MRI = context.getRegisterInfo(); const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); MCSymbol *sectionStart = context.createTempSymbol(); - streamer.EmitLabel(sectionStart); + Streamer.EmitLabel(sectionStart); MCSymbol *sectionEnd = context.createTempSymbol(); // Length - const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart, - *sectionEnd, 4); - emitAbsValue(streamer, Length, 4); + const MCExpr *Length = + MakeStartMinusEndExpr(Streamer, *sectionStart, *sectionEnd, 4); + emitAbsValue(Streamer, Length, 4); // CIE ID unsigned CIE_ID = IsEH ? 
0 : -1; - streamer.EmitIntValue(CIE_ID, 4); + Streamer.EmitIntValue(CIE_ID, 4); // Version uint8_t CIEVersion = getCIEVersion(IsEH, context.getDwarfVersion()); - streamer.EmitIntValue(CIEVersion, 1); + Streamer.EmitIntValue(CIEVersion, 1); // Augmentation String SmallString<8> Augmentation; @@ -1299,31 +1293,31 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, Augmentation += "R"; if (IsSignalFrame) Augmentation += "S"; - streamer.EmitBytes(Augmentation); + Streamer.EmitBytes(Augmentation); } - streamer.EmitIntValue(0, 1); + Streamer.EmitIntValue(0, 1); if (CIEVersion >= 4) { // Address Size - streamer.EmitIntValue(context.getAsmInfo()->getPointerSize(), 1); + Streamer.EmitIntValue(context.getAsmInfo()->getPointerSize(), 1); // Segment Descriptor Size - streamer.EmitIntValue(0, 1); + Streamer.EmitIntValue(0, 1); } // Code Alignment Factor - streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment()); + Streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment()); // Data Alignment Factor - streamer.EmitSLEB128IntValue(getDataAlignmentFactor(streamer)); + Streamer.EmitSLEB128IntValue(getDataAlignmentFactor(Streamer)); // Return Address Register if (CIEVersion == 1) { assert(MRI->getRARegister() <= 255 && "DWARF 2 encodes return_address_register in one byte"); - streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), IsEH), 1); + Streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), IsEH), 1); } else { - streamer.EmitULEB128IntValue( + Streamer.EmitULEB128IntValue( MRI->getDwarfRegNum(MRI->getRARegister(), IsEH)); } @@ -1335,28 +1329,28 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, // Personality Encoding augmentationLength += 1; // Personality - augmentationLength += getSizeForEncoding(streamer, personalityEncoding); + augmentationLength += getSizeForEncoding(Streamer, personalityEncoding); } if (lsda) augmentationLength += 1; // Encoding of the FDE pointers augmentationLength += 1; - streamer.EmitULEB128IntValue(augmentationLength); + Streamer.EmitULEB128IntValue(augmentationLength); // Augmentation Data (optional) if (personality) { // Personality Encoding - emitEncodingByte(streamer, personalityEncoding); + emitEncodingByte(Streamer, personalityEncoding); // Personality - EmitPersonality(streamer, *personality, personalityEncoding); + EmitPersonality(Streamer, *personality, personalityEncoding); } if (lsda) - emitEncodingByte(streamer, lsdaEncoding); + emitEncodingByte(Streamer, lsdaEncoding); // Encoding of the FDE pointers - emitEncodingByte(streamer, MOFI->getFDEEncoding()); + emitEncodingByte(Streamer, MOFI->getFDEEncoding()); } // Initial Instructions @@ -1365,22 +1359,23 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, if (!IsSimple) { const std::vector<MCCFIInstruction> &Instructions = MAI->getInitialFrameState(); - EmitCFIInstructions(streamer, Instructions, nullptr); + EmitCFIInstructions(Instructions, nullptr); } InitialCFAOffset = CFAOffset; // Padding - streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getPointerSize()); + Streamer.EmitValueToAlignment(IsEH ? 
4 : MAI->getPointerSize()); - streamer.EmitLabel(sectionEnd); + Streamer.EmitLabel(sectionEnd); return *sectionStart; } -MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, - const MCSymbol &cieStart, - const MCDwarfFrameInfo &frame) { - MCContext &context = streamer.getContext(); +void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart, + const MCDwarfFrameInfo &frame, + bool LastInSection, + const MCSymbol &SectionStart) { + MCContext &context = Streamer.getContext(); MCSymbol *fdeStart = context.createTempSymbol(); MCSymbol *fdeEnd = context.createTempSymbol(); const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); @@ -1388,107 +1383,103 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, CFAOffset = InitialCFAOffset; // Length - const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0); - emitAbsValue(streamer, Length, 4); + const MCExpr *Length = MakeStartMinusEndExpr(Streamer, *fdeStart, *fdeEnd, 0); + emitAbsValue(Streamer, Length, 4); - streamer.EmitLabel(fdeStart); + Streamer.EmitLabel(fdeStart); // CIE Pointer const MCAsmInfo *asmInfo = context.getAsmInfo(); if (IsEH) { - const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart, - 0); - emitAbsValue(streamer, offset, 4); + const MCExpr *offset = + MakeStartMinusEndExpr(Streamer, cieStart, *fdeStart, 0); + emitAbsValue(Streamer, offset, 4); } else if (!asmInfo->doesDwarfUseRelocationsAcrossSections()) { - const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart, - cieStart, 0); - emitAbsValue(streamer, offset, 4); + const MCExpr *offset = + MakeStartMinusEndExpr(Streamer, SectionStart, cieStart, 0); + emitAbsValue(Streamer, offset, 4); } else { - streamer.EmitSymbolValue(&cieStart, 4); + Streamer.EmitSymbolValue(&cieStart, 4); } // PC Begin unsigned PCEncoding = IsEH ? MOFI->getFDEEncoding() : (unsigned)dwarf::DW_EH_PE_absptr; - unsigned PCSize = getSizeForEncoding(streamer, PCEncoding); - emitFDESymbol(streamer, *frame.Begin, PCEncoding, IsEH); + unsigned PCSize = getSizeForEncoding(Streamer, PCEncoding); + emitFDESymbol(Streamer, *frame.Begin, PCEncoding, IsEH); // PC Range - const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin, - *frame.End, 0); - emitAbsValue(streamer, Range, PCSize); + const MCExpr *Range = + MakeStartMinusEndExpr(Streamer, *frame.Begin, *frame.End, 0); + emitAbsValue(Streamer, Range, PCSize); if (IsEH) { // Augmentation Data Length unsigned augmentationLength = 0; if (frame.Lsda) - augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding); + augmentationLength += getSizeForEncoding(Streamer, frame.LsdaEncoding); - streamer.EmitULEB128IntValue(augmentationLength); + Streamer.EmitULEB128IntValue(augmentationLength); // Augmentation Data if (frame.Lsda) - emitFDESymbol(streamer, *frame.Lsda, frame.LsdaEncoding, true); + emitFDESymbol(Streamer, *frame.Lsda, frame.LsdaEncoding, true); } // Call Frame Instructions - EmitCFIInstructions(streamer, frame.Instructions, frame.Begin); + EmitCFIInstructions(frame.Instructions, frame.Begin); // Padding - streamer.EmitValueToAlignment(PCSize); + // The size of a .eh_frame section has to be a multiple of the alignment + // since a null CIE is interpreted as the end. Old systems overaligned + // .eh_frame, so we do too and account for it in the last FDE. + unsigned Align = LastInSection ? 
asmInfo->getPointerSize() : PCSize; + Streamer.EmitValueToAlignment(Align); - return fdeEnd; + Streamer.EmitLabel(fdeEnd); } namespace { - struct CIEKey { - static const CIEKey getEmptyKey() { - return CIEKey(nullptr, 0, -1, false, false); - } - static const CIEKey getTombstoneKey() { - return CIEKey(nullptr, -1, 0, false, false); - } +struct CIEKey { + static const CIEKey getEmptyKey() { + return CIEKey(nullptr, 0, -1, false, false); + } + static const CIEKey getTombstoneKey() { + return CIEKey(nullptr, -1, 0, false, false); + } - CIEKey(const MCSymbol *Personality_, unsigned PersonalityEncoding_, - unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) - : Personality(Personality_), PersonalityEncoding(PersonalityEncoding_), - LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_), - IsSimple(IsSimple_) {} - const MCSymbol *Personality; - unsigned PersonalityEncoding; - unsigned LsdaEncoding; - bool IsSignalFrame; - bool IsSimple; - }; -} + CIEKey(const MCSymbol *Personality, unsigned PersonalityEncoding, + unsigned LsdaEncoding, bool IsSignalFrame, bool IsSimple) + : Personality(Personality), PersonalityEncoding(PersonalityEncoding), + LsdaEncoding(LsdaEncoding), IsSignalFrame(IsSignalFrame), + IsSimple(IsSimple) {} + const MCSymbol *Personality; + unsigned PersonalityEncoding; + unsigned LsdaEncoding; + bool IsSignalFrame; + bool IsSimple; +}; +} // anonymous namespace namespace llvm { - template <> - struct DenseMapInfo<CIEKey> { - static CIEKey getEmptyKey() { - return CIEKey::getEmptyKey(); - } - static CIEKey getTombstoneKey() { - return CIEKey::getTombstoneKey(); - } - static unsigned getHashValue(const CIEKey &Key) { - return static_cast<unsigned>(hash_combine(Key.Personality, - Key.PersonalityEncoding, - Key.LsdaEncoding, - Key.IsSignalFrame, - Key.IsSimple)); - } - static bool isEqual(const CIEKey &LHS, - const CIEKey &RHS) { - return LHS.Personality == RHS.Personality && - LHS.PersonalityEncoding == RHS.PersonalityEncoding && - LHS.LsdaEncoding == RHS.LsdaEncoding && - LHS.IsSignalFrame == RHS.IsSignalFrame && - LHS.IsSimple == RHS.IsSimple; - } - }; -} +template <> struct DenseMapInfo<CIEKey> { + static CIEKey getEmptyKey() { return CIEKey::getEmptyKey(); } + static CIEKey getTombstoneKey() { return CIEKey::getTombstoneKey(); } + static unsigned getHashValue(const CIEKey &Key) { + return static_cast<unsigned>( + hash_combine(Key.Personality, Key.PersonalityEncoding, Key.LsdaEncoding, + Key.IsSignalFrame, Key.IsSimple)); + } + static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) { + return LHS.Personality == RHS.Personality && + LHS.PersonalityEncoding == RHS.PersonalityEncoding && + LHS.LsdaEncoding == RHS.LsdaEncoding && + LHS.IsSignalFrame == RHS.IsSignalFrame && + LHS.IsSimple == RHS.IsSimple; + } +}; +} // namespace llvm void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, bool IsEH) { @@ -1496,7 +1487,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); - FrameEmitterImpl Emitter(IsEH); + FrameEmitterImpl Emitter(IsEH, Streamer); ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getDwarfFrameInfos(); // Emit the compact unwind info if available. 
@@ -1514,7 +1505,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, NeedsEHFrameSection |= Frame.CompactUnwindEncoding == MOFI->getCompactUnwindDwarfEHFrameOnly(); - Emitter.EmitCompactUnwind(Streamer, Frame); + Emitter.EmitCompactUnwind(Frame); } } @@ -1527,23 +1518,15 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, Streamer.SwitchSection(&Section); MCSymbol *SectionStart = Context.createTempSymbol(); Streamer.EmitLabel(SectionStart); - Emitter.setSectionStart(SectionStart); - MCSymbol *FDEEnd = nullptr; DenseMap<CIEKey, const MCSymbol *> CIEStarts; const MCSymbol *DummyDebugKey = nullptr; - NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); - for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) { - const MCDwarfFrameInfo &Frame = FrameArray[i]; - - // Emit the label from the previous iteration - if (FDEEnd) { - Streamer.EmitLabel(FDEEnd); - FDEEnd = nullptr; - } - - if (!NeedsEHFrameSection && Frame.CompactUnwindEncoding != + bool CanOmitDwarf = MOFI->getOmitDwarfIfHaveCompactUnwind(); + for (auto I = FrameArray.begin(), E = FrameArray.end(); I != E;) { + const MCDwarfFrameInfo &Frame = *I; + ++I; + if (CanOmitDwarf && Frame.CompactUnwindEncoding != MOFI->getCompactUnwindDwarfEHFrameOnly()) // Don't generate an EH frame if we don't need one. I.e., it's taken care // of by the compact unwind encoding. @@ -1553,18 +1536,12 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, Frame.LsdaEncoding, Frame.IsSignalFrame, Frame.IsSimple); const MCSymbol *&CIEStart = IsEH ? CIEStarts[Key] : DummyDebugKey; if (!CIEStart) - CIEStart = &Emitter.EmitCIE(Streamer, Frame.Personality, - Frame.PersonalityEncoding, Frame.Lsda, - Frame.IsSignalFrame, - Frame.LsdaEncoding, - Frame.IsSimple); + CIEStart = &Emitter.EmitCIE(Frame.Personality, Frame.PersonalityEncoding, + Frame.Lsda, Frame.IsSignalFrame, + Frame.LsdaEncoding, Frame.IsSimple); - FDEEnd = Emitter.EmitFDE(Streamer, *CIEStart, Frame); + Emitter.EmitFDE(*CIEStart, Frame, I == E, *SectionStart); } - - Streamer.EmitValueToAlignment(Context.getAsmInfo()->getPointerSize()); - if (FDEEnd) - Streamer.EmitLabel(FDEEnd); } void MCDwarfFrameEmitter::EmitAdvanceLoc(MCObjectStreamer &Streamer, diff --git a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp index bc0ba85..de645ca 100644 --- a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp +++ b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp @@ -29,23 +29,7 @@ bool MCELFObjectTargetWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return false; } -// ELF doesn't require relocations to be in any order. We sort by the Offset, -// just to match gnu as for easier comparison. The use type is an arbitrary way -// of making the sort deterministic. 
-static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) { - const ELFRelocationEntry &A = *AP; - const ELFRelocationEntry &B = *BP; - if (A.Offset != B.Offset) - return B.Offset - A.Offset; - if (B.Type != A.Type) - return A.Type - B.Type; - //llvm_unreachable("ELFRelocs might be unstable!"); - return 0; -} - - void MCELFObjectTargetWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { - array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel); } diff --git a/contrib/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm/lib/MC/MCELFStreamer.cpp index fe9ac21..06d161b 100644 --- a/contrib/llvm/lib/MC/MCELFStreamer.cpp +++ b/contrib/llvm/lib/MC/MCELFStreamer.cpp @@ -68,7 +68,6 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF, EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding)); Assembler.writeFragmentPadding(*EF, FSize, OW); - VecOS.flush(); delete OW; DF->getContents().append(Code.begin(), Code.end()); @@ -87,20 +86,10 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF, } void MCELFStreamer::InitSections(bool NoExecStack) { - // This emulates the same behavior of GNU as. This makes it easier - // to compare the output as the major sections are in the same order. MCContext &Ctx = getContext(); SwitchSection(Ctx.getObjectFileInfo()->getTextSection()); EmitCodeAlignment(4); - SwitchSection(Ctx.getObjectFileInfo()->getDataSection()); - EmitCodeAlignment(4); - - SwitchSection(Ctx.getObjectFileInfo()->getBSSSection()); - EmitCodeAlignment(4); - - SwitchSection(Ctx.getObjectFileInfo()->getTextSection()); - if (NoExecStack) SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); } @@ -112,7 +101,7 @@ void MCELFStreamer::EmitLabel(MCSymbol *S) { MCObjectStreamer::EmitLabel(Symbol); const MCSectionELF &Section = - static_cast<const MCSectionELF&>(Symbol->getSection()); + static_cast<const MCSectionELF &>(*getCurrentSectionOnly()); if (Section.getFlags() & ELF::SHF_TLS) Symbol->setType(ELF::STT_TLS); } @@ -134,7 +123,7 @@ void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { llvm_unreachable("invalid assembler flag!"); } -// If bundle aligment is used and there are any instructions in the section, it +// If bundle alignment is used and there are any instructions in the section, it // needs to be aligned to at least the bundle size. static void setSectionAlignmentForBundling(const MCAssembler &Assembler, MCSection *Section) { @@ -312,13 +301,20 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size, Symbol->setType(ELF::STT_OBJECT); if (Symbol->getBinding() == ELF::STB_LOCAL) { - MCSection *Section = getAssembler().getContext().getELFSection( + MCSection &Section = *getAssembler().getContext().getELFSection( ".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + MCSectionSubPair P = getCurrentSection(); + SwitchSection(&Section); - AssignSection(Symbol, Section); + EmitValueToAlignment(ByteAlignment, 0, 1, 0); + EmitLabel(Symbol); + EmitZeros(Size); - struct LocalCommon L = {Symbol, Size, ByteAlignment}; - LocalCommons.push_back(L); + // Update the maximum alignment of the section if necessary. 
+ if (ByteAlignment > Section.getAlignment()) + Section.setAlignment(ByteAlignment); + + SwitchSection(P.first, P.second); } else { if(Symbol->declareCommon(Size, ByteAlignment)) report_fatal_error("Symbol: " + Symbol->getName() + @@ -344,7 +340,7 @@ void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size, } void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { if (isBundleLocked()) report_fatal_error("Emitting values inside a locked bundle is forbidden"); fixSymbolsInTLSFixups(Value); @@ -480,7 +476,6 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); for (unsigned i = 0, e = Fixups.size(); i != e; ++i) fixSymbolsInTLSFixups(Fixups[i].getValue()); @@ -603,7 +598,7 @@ void MCELFStreamer::EmitBundleUnlock() { report_fatal_error("Empty bundle-locked group is forbidden"); // When the -mc-relax-all flag is used, we emit instructions to fragments - // stored on a stack. When the bundle unlock is emited, we pop a fragment + // stored on a stack. When the bundle unlock is emitted, we pop a fragment // from the stack a merge it to the one below. if (getAssembler().getRelaxAll()) { assert(!BundleGroups.empty() && "There are no bundle groups"); @@ -625,29 +620,6 @@ void MCELFStreamer::EmitBundleUnlock() { Sec.setBundleLockState(MCSection::NotBundleLocked); } -void MCELFStreamer::Flush() { - for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(), - e = LocalCommons.end(); - i != e; ++i) { - const MCSymbol &Symbol = *i->Symbol; - uint64_t Size = i->Size; - unsigned ByteAlignment = i->ByteAlignment; - MCSection &Section = Symbol.getSection(); - - getAssembler().registerSection(Section); - new MCAlignFragment(ByteAlignment, 0, 1, ByteAlignment, &Section); - - MCFragment *F = new MCFillFragment(0, 0, Size, &Section); - Symbol.setFragment(F); - - // Update the maximum alignment of the section if necessary. - if (ByteAlignment > Section.getAlignment()) - Section.setAlignment(ByteAlignment); - } - - LocalCommons.clear(); -} - void MCELFStreamer::FinishImpl() { // Ensure the last section gets aligned if necessary. MCSection *CurSection = getCurrentSectionOnly(); @@ -655,8 +627,6 @@ void MCELFStreamer::FinishImpl() { EmitFrames(nullptr); - Flush(); - this->MCObjectStreamer::FinishImpl(); } diff --git a/contrib/llvm/lib/MC/MCExpr.cpp b/contrib/llvm/lib/MC/MCExpr.cpp index a30ceec..748644b 100644 --- a/contrib/llvm/lib/MC/MCExpr.cpp +++ b/contrib/llvm/lib/MC/MCExpr.cpp @@ -43,7 +43,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { const MCSymbol &Sym = SRE.getSymbol(); // Parenthesize names that start with $ so that they don't look like // absolute names. 
- bool UseParens = Sym.getName()[0] == '$'; + bool UseParens = Sym.getName().size() && Sym.getName()[0] == '$'; if (UseParens) { OS << '('; Sym.print(OS, MAI); @@ -202,6 +202,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_SIZE: return "SIZE"; case VK_WEAKREF: return "WEAKREF"; case VK_ARM_NONE: return "none"; + case VK_ARM_GOT_PREL: return "GOT_PREL"; case VK_ARM_TARGET1: return "target1"; case VK_ARM_TARGET2: return "target2"; case VK_ARM_PREL31: return "prel31"; @@ -299,6 +300,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Hexagon_LD_PLT: return "LDPLT"; case VK_Hexagon_IE: return "IE"; case VK_Hexagon_IE_GOT: return "IEGOT"; + case VK_WebAssembly_FUNCTION: return "FUNCTION"; case VK_TPREL: return "tprel"; case VK_DTPREL: return "dtprel"; } @@ -311,7 +313,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("got", VK_GOT) .Case("gotoff", VK_GOTOFF) .Case("gotpcrel", VK_GOTPCREL) - .Case("got_prel", VK_GOTPCREL) .Case("gottpoff", VK_GOTTPOFF) .Case("indntpoff", VK_INDNTPOFF) .Case("ntpoff", VK_NTPOFF) @@ -382,7 +383,15 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) + .Case("gdgot", VK_Hexagon_GD_GOT) + .Case("gdplt", VK_Hexagon_GD_PLT) + .Case("iegot", VK_Hexagon_IE_GOT) + .Case("ie", VK_Hexagon_IE) + .Case("ldgot", VK_Hexagon_LD_GOT) + .Case("ldplt", VK_Hexagon_LD_PLT) + .Case("pcrel", VK_Hexagon_PCREL) .Case("none", VK_ARM_NONE) + .Case("got_prel", VK_ARM_GOT_PREL) .Case("target1", VK_ARM_TARGET1) .Case("target2", VK_ARM_TARGET2) .Case("prel31", VK_ARM_PREL31) @@ -477,7 +486,8 @@ static void AttemptToFoldSymbolOffsetDifference( if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet)) return; - if (SA.getFragment() == SB.getFragment()) { + if (SA.getFragment() == SB.getFragment() && !SA.isVariable() && + !SB.isVariable()) { Addend += (SA.getOffset() - SB.getOffset()); // Pointers to Thumb symbols need to have their low-bit set to allow @@ -583,11 +593,6 @@ EvaluateSymbolicAdd(const MCAssembler *Asm, const MCAsmLayout *Layout, const MCSymbolRefExpr *A = LHS_A ? LHS_A : RHS_A; const MCSymbolRefExpr *B = LHS_B ? LHS_B : RHS_B; - // If we have a negated symbol, then we must have also have a non-negated - // symbol in order to encode the expression. - if (B && !A) - return false; - Res = MCValue::get(A, B, Result_Cst); return true; } @@ -606,7 +611,7 @@ bool MCExpr::evaluateAsValue(MCValue &Res, const MCAsmLayout &Layout) const { true); } -static bool canExpand(const MCSymbol &Sym, const MCAssembler *Asm, bool InSet) { +static bool canExpand(const MCSymbol &Sym, bool InSet) { const MCExpr *Expr = Sym.getVariableValue(); const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr); if (Inner) { @@ -616,9 +621,7 @@ static bool canExpand(const MCSymbol &Sym, const MCAssembler *Asm, bool InSet) { if (InSet) return true; - if (!Asm) - return false; - return !Asm->getWriter().isWeak(Sym); + return !Sym.isInSection(); } bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, @@ -643,7 +646,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, // Evaluate recursively if this is a variable. 
if (Sym.isVariable() && SRE->getKind() == MCSymbolRefExpr::VK_None && - canExpand(Sym, Asm, InSet)) { + canExpand(Sym, InSet)) { bool IsMachO = SRE->hasSubsectionsViaSymbols(); if (Sym.getVariableValue()->evaluateAsRelocatableImpl( Res, Asm, Layout, Fixup, Addrs, InSet || IsMachO)) { @@ -739,7 +742,17 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, case MCBinaryExpr::AShr: Result = LHS >> RHS; break; case MCBinaryExpr::Add: Result = LHS + RHS; break; case MCBinaryExpr::And: Result = LHS & RHS; break; - case MCBinaryExpr::Div: Result = LHS / RHS; break; + case MCBinaryExpr::Div: + // Handle division by zero. gas just emits a warning and keeps going, + // we try to be stricter. + // FIXME: Currently the caller of this function has no way to understand + // we're bailing out because of 'division by zero'. Therefore, it will + // emit a 'expected relocatable expression' error. It would be nice to + // change this code to emit a better diagnostic. + if (RHS == 0) + return false; + Result = LHS / RHS; + break; case MCBinaryExpr::EQ: Result = LHS == RHS; break; case MCBinaryExpr::GT: Result = LHS > RHS; break; case MCBinaryExpr::GTE: Result = LHS >= RHS; break; @@ -765,45 +778,41 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, llvm_unreachable("Invalid assembly expression kind!"); } -MCSection *MCExpr::findAssociatedSection() const { +MCFragment *MCExpr::findAssociatedFragment() const { switch (getKind()) { case Target: // We never look through target specific expressions. - return cast<MCTargetExpr>(this)->findAssociatedSection(); + return cast<MCTargetExpr>(this)->findAssociatedFragment(); case Constant: - return MCSymbol::AbsolutePseudoSection; + return MCSymbol::AbsolutePseudoFragment; case SymbolRef: { const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this); const MCSymbol &Sym = SRE->getSymbol(); - - if (Sym.isDefined()) - return &Sym.getSection(); - - return nullptr; + return Sym.getFragment(); } case Unary: - return cast<MCUnaryExpr>(this)->getSubExpr()->findAssociatedSection(); + return cast<MCUnaryExpr>(this)->getSubExpr()->findAssociatedFragment(); case Binary: { const MCBinaryExpr *BE = cast<MCBinaryExpr>(this); - MCSection *LHS_S = BE->getLHS()->findAssociatedSection(); - MCSection *RHS_S = BE->getRHS()->findAssociatedSection(); + MCFragment *LHS_F = BE->getLHS()->findAssociatedFragment(); + MCFragment *RHS_F = BE->getRHS()->findAssociatedFragment(); - // If either section is absolute, return the other. - if (LHS_S == MCSymbol::AbsolutePseudoSection) - return RHS_S; - if (RHS_S == MCSymbol::AbsolutePseudoSection) - return LHS_S; + // If either is absolute, return the other. + if (LHS_F == MCSymbol::AbsolutePseudoFragment) + return RHS_F; + if (RHS_F == MCSymbol::AbsolutePseudoFragment) + return LHS_F; // Not always correct, but probably the best we can do without more context. if (BE->getOpcode() == MCBinaryExpr::Sub) - return MCSymbol::AbsolutePseudoSection; + return MCSymbol::AbsolutePseudoFragment; - // Otherwise, return the first non-null section. - return LHS_S ? LHS_S : RHS_S; + // Otherwise, return the first non-null fragment. + return LHS_F ? 
LHS_F : RHS_F;
   }
   }
diff --git a/contrib/llvm/lib/MC/MCFragment.cpp b/contrib/llvm/lib/MC/MCFragment.cpp
new file mode 100644
index 0000000..efdb704
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCFragment.cpp
@@ -0,0 +1,458 @@
+//===- lib/MC/MCFragment.cpp - Assembler Fragment Implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCFragment.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+using namespace llvm;
+
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
+    : Assembler(Asm), LastValidFragment()
+ {
+  // Compute the section layout order. Virtual sections must go last.
+  for (MCSection &Sec : Asm)
+    if (!Sec.isVirtualSection())
+      SectionOrder.push_back(&Sec);
+  for (MCSection &Sec : Asm)
+    if (Sec.isVirtualSection())
+      SectionOrder.push_back(&Sec);
+}
+
+bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
+  const MCSection *Sec = F->getParent();
+  const MCFragment *LastValid = LastValidFragment.lookup(Sec);
+  if (!LastValid)
+    return false;
+  assert(LastValid->getParent() == Sec);
+  return F->getLayoutOrder() <= LastValid->getLayoutOrder();
+}
+
+void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
+  // If this fragment wasn't already valid, we don't need to do anything.
+  if (!isFragmentValid(F))
+    return;
+
+  // Otherwise, reset the last valid fragment to the previous fragment
+  // (if this is the first fragment, it will be NULL).
+  LastValidFragment[F->getParent()] = F->getPrevNode();
+}
+
+void MCAsmLayout::ensureValid(const MCFragment *F) const {
+  MCSection *Sec = F->getParent();
+  MCSection::iterator I;
+  if (MCFragment *Cur = LastValidFragment[Sec])
+    I = ++MCSection::iterator(Cur);
+  else
+    I = Sec->begin();
+
+  // Advance the layout position until the fragment is valid.
+  while (!isFragmentValid(F)) {
+    assert(I != Sec->end() && "Layout bookkeeping error");
+    const_cast<MCAsmLayout *>(this)->layoutFragment(&*I);
+    ++I;
+  }
+}
+
+uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
+  ensureValid(F);
+  assert(F->Offset != ~UINT64_C(0) && "Address not set!");
+  return F->Offset;
+}
+
+// Simple getSymbolOffset helper for the non-variable case.
+static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbol &S,
+                           bool ReportError, uint64_t &Val) {
+  if (!S.getFragment()) {
+    if (ReportError)
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         S.getName() + "'");
+    return false;
+  }
+  Val = Layout.getFragmentOffset(S.getFragment()) + S.getOffset();
+  return true;
+}
+
+static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S,
+                                bool ReportError, uint64_t &Val) {
+  if (!S.isVariable())
+    return getLabelOffset(Layout, S, ReportError, Val);
+
+  // If SD is a variable, evaluate it.
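// Illustrative sketch, not part of the patch: for a plain label,
// getLabelOffset above computes "fragment offset + offset within the
// fragment", and the variable case that follows folds the evaluated value as
// "SymA - SymB + Constant". A reduced standalone model, with hypothetical
// types:
#include <cstdint>
namespace sketch {
struct Frag  { uint64_t Offset; };
struct Label { const Frag *F; uint64_t OffsetInFrag; };
inline uint64_t labelOffset(const Label &L) {
  return L.F->Offset + L.OffsetInFrag;          // cf. getLabelOffset
}
// Variable symbol S = A - B + C: offset(S) = offset(A) - offset(B) + C.
inline uint64_t variableOffset(const Label &A, const Label &B, int64_t C) {
  return labelOffset(A) - labelOffset(B) + C;   // cf. getSymbolOffsetImpl
}
} // namespace sketch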
+ MCValue Target; + if (!S.getVariableValue()->evaluateAsValue(Target, Layout)) + report_fatal_error("unable to evaluate offset for variable '" + + S.getName() + "'"); + + uint64_t Offset = Target.getConstant(); + + const MCSymbolRefExpr *A = Target.getSymA(); + if (A) { + uint64_t ValA; + if (!getLabelOffset(Layout, A->getSymbol(), ReportError, ValA)) + return false; + Offset += ValA; + } + + const MCSymbolRefExpr *B = Target.getSymB(); + if (B) { + uint64_t ValB; + if (!getLabelOffset(Layout, B->getSymbol(), ReportError, ValB)) + return false; + Offset -= ValB; + } + + Val = Offset; + return true; +} + +bool MCAsmLayout::getSymbolOffset(const MCSymbol &S, uint64_t &Val) const { + return getSymbolOffsetImpl(*this, S, false, Val); +} + +uint64_t MCAsmLayout::getSymbolOffset(const MCSymbol &S) const { + uint64_t Val; + getSymbolOffsetImpl(*this, S, true, Val); + return Val; +} + +const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const { + if (!Symbol.isVariable()) + return &Symbol; + + const MCExpr *Expr = Symbol.getVariableValue(); + MCValue Value; + if (!Expr->evaluateAsValue(Value, *this)) { + Assembler.getContext().reportError( + SMLoc(), "expression could not be evaluated"); + return nullptr; + } + + const MCSymbolRefExpr *RefB = Value.getSymB(); + if (RefB) { + Assembler.getContext().reportError( + SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() + + "' could not be evaluated in a subtraction expression"); + return nullptr; + } + + const MCSymbolRefExpr *A = Value.getSymA(); + if (!A) + return nullptr; + + const MCSymbol &ASym = A->getSymbol(); + const MCAssembler &Asm = getAssembler(); + if (ASym.isCommon()) { + // FIXME: we should probably add a SMLoc to MCExpr. + Asm.getContext().reportError(SMLoc(), + "Common symbol '" + ASym.getName() + + "' cannot be used in assignment expr"); + return nullptr; + } + + return &ASym; +} + +uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const { + // The size is the last fragment's end offset. + const MCFragment &F = Sec->getFragmentList().back(); + return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F); +} + +uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const { + // Virtual sections have no file size. + if (Sec->isVirtualSection()) + return 0; + + // Otherwise, the file size is the same as the address space size. + return getSectionAddressSize(Sec); +} + +uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler, + const MCFragment *F, + uint64_t FOffset, uint64_t FSize) { + uint64_t BundleSize = Assembler.getBundleAlignSize(); + assert(BundleSize > 0 && + "computeBundlePadding should only be called if bundling is enabled"); + uint64_t BundleMask = BundleSize - 1; + uint64_t OffsetInBundle = FOffset & BundleMask; + uint64_t EndOfFragment = OffsetInBundle + FSize; + + // There are two kinds of bundling restrictions: + // + // 1) For alignToBundleEnd(), add padding to ensure that the fragment will + // *end* on a bundle boundary. + // 2) Otherwise, check if the fragment would cross a bundle boundary. If it + // would, add padding until the end of the bundle so that the fragment + // will start in a new one. + if (F->alignToBundleEnd()) { + // Three possibilities here: + // + // A) The fragment just happens to end at a bundle boundary, so we're good. + // B) The fragment ends before the current bundle boundary: pad it just + // enough to reach the boundary. 
+  // C) The fragment ends after the current bundle boundary: pad it until it
+  //    reaches the end of the next bundle boundary.
+  //
+  // Note: this code could be made shorter with some modulo trickery, but it's
+  // intentionally kept in its more explicit form for simplicity.
+    if (EndOfFragment == BundleSize)
+      return 0;
+    else if (EndOfFragment < BundleSize)
+      return BundleSize - EndOfFragment;
+    else { // EndOfFragment > BundleSize
+      return 2 * BundleSize - EndOfFragment;
+    }
+  } else if (OffsetInBundle > 0 && EndOfFragment > BundleSize)
+    return BundleSize - OffsetInBundle;
+  else
+    return 0;
+}
+
+/* *** */
+
+void ilist_node_traits<MCFragment>::deleteNode(MCFragment *V) {
+  V->destroy();
+}
+
+MCFragment::MCFragment() : Kind(FragmentType(~0)), HasInstructions(false),
+                           AlignToBundleEnd(false), BundlePadding(0) {
+}
+
+MCFragment::~MCFragment() { }
+
+MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
+                       uint8_t BundlePadding, MCSection *Parent)
+    : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false),
+      BundlePadding(BundlePadding), Parent(Parent), Atom(nullptr),
+      Offset(~UINT64_C(0)) {
+  if (Parent && !isDummy())
+    Parent->getFragmentList().push_back(this);
+}
+
+void MCFragment::destroy() {
+  // First check if we are the sentinel.
+  if (Kind == FragmentType(~0)) {
+    delete this;
+    return;
+  }
+
+  switch (Kind) {
+    case FT_Align:
+      delete cast<MCAlignFragment>(this);
+      return;
+    case FT_Data:
+      delete cast<MCDataFragment>(this);
+      return;
+    case FT_CompactEncodedInst:
+      delete cast<MCCompactEncodedInstFragment>(this);
+      return;
+    case FT_Fill:
+      delete cast<MCFillFragment>(this);
+      return;
+    case FT_Relaxable:
+      delete cast<MCRelaxableFragment>(this);
+      return;
+    case FT_Org:
+      delete cast<MCOrgFragment>(this);
+      return;
+    case FT_Dwarf:
+      delete cast<MCDwarfLineAddrFragment>(this);
+      return;
+    case FT_DwarfFrame:
+      delete cast<MCDwarfCallFrameFragment>(this);
+      return;
+    case FT_LEB:
+      delete cast<MCLEBFragment>(this);
+      return;
+    case FT_SafeSEH:
+      delete cast<MCSafeSEHFragment>(this);
+      return;
+    case FT_Dummy:
+      delete cast<MCDummyFragment>(this);
+      return;
+  }
+}
+
+/* *** */
+
+// Debugging methods
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
+  OS << "<MCFixup" << " Offset:" << AF.getOffset()
+     << " Value:" << *AF.getValue()
+     << " Kind:" << AF.getKind() << ">";
+  return OS;
+}
+
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCFragment::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<";
+  switch (getKind()) {
+  case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+  case MCFragment::FT_Data:  OS << "MCDataFragment"; break;
+  case MCFragment::FT_CompactEncodedInst:
+    OS << "MCCompactEncodedInstFragment"; break;
+  case MCFragment::FT_Fill:  OS << "MCFillFragment"; break;
+  case MCFragment::FT_Relaxable:  OS << "MCRelaxableFragment"; break;
+  case MCFragment::FT_Org:   OS << "MCOrgFragment"; break;
+  case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break;
+  case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
+  case MCFragment::FT_LEB:   OS << "MCLEBFragment"; break;
+  case MCFragment::FT_SafeSEH:    OS << "MCSafeSEHFragment"; break;
+  case MCFragment::FT_Dummy:
+    OS << "MCDummyFragment";
+    break;
+  }
+
+  OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder
+     << " Offset:" << Offset
+     << " HasInstructions:" << hasInstructions()
+     << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">";
+
+  switch (getKind()) {
+  case
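// Worked example, not part of the patch: the padding rules implemented in
// computeBundlePadding above, assuming a 16-byte bundle:
//
//   Plain fragment, FOffset = 10, FSize = 8:
//     OffsetInBundle = 10, EndOfFragment = 18 > 16, so the fragment would
//     straddle a boundary; pad BundleSize - OffsetInBundle = 6 bytes so it
//     starts in a fresh bundle.
//
//   alignToBundleEnd fragment, FOffset = 4, FSize = 8:
//     EndOfFragment = 12 < 16; pad BundleSize - EndOfFragment = 4 bytes so
//     the fragment ends exactly on the boundary.
//
//   alignToBundleEnd fragment, FOffset = 4, FSize = 20:
//     EndOfFragment = 24 > 16; pad 2 * BundleSize - EndOfFragment = 8 bytes
//     so the fragment ends on the *next* boundary (4 + 8 + 20 == 32).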
MCFragment::FT_Align: { + const MCAlignFragment *AF = cast<MCAlignFragment>(this); + if (AF->hasEmitNops()) + OS << " (emit nops)"; + OS << "\n "; + OS << " Alignment:" << AF->getAlignment() + << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize() + << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; + break; + } + case MCFragment::FT_Data: { + const MCDataFragment *DF = cast<MCDataFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = DF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + + if (DF->fixup_begin() != DF->fixup_end()) { + OS << ",\n "; + OS << " Fixups:["; + for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(), + ie = DF->fixup_end(); it != ie; ++it) { + if (it != DF->fixup_begin()) OS << ",\n "; + OS << *it; + } + OS << "]"; + } + break; + } + case MCFragment::FT_CompactEncodedInst: { + const MCCompactEncodedInstFragment *CEIF = + cast<MCCompactEncodedInstFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = CEIF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + break; + } + case MCFragment::FT_Fill: { + const MCFillFragment *FF = cast<MCFillFragment>(this); + OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize() + << " Size:" << FF->getSize(); + break; + } + case MCFragment::FT_Relaxable: { + const MCRelaxableFragment *F = cast<MCRelaxableFragment>(this); + OS << "\n "; + OS << " Inst:"; + F->getInst().dump_pretty(OS); + break; + } + case MCFragment::FT_Org: { + const MCOrgFragment *OF = cast<MCOrgFragment>(this); + OS << "\n "; + OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue(); + break; + } + case MCFragment::FT_Dwarf: { + const MCDwarfLineAddrFragment *OF = cast<MCDwarfLineAddrFragment>(this); + OS << "\n "; + OS << " AddrDelta:" << OF->getAddrDelta() + << " LineDelta:" << OF->getLineDelta(); + break; + } + case MCFragment::FT_DwarfFrame: { + const MCDwarfCallFrameFragment *CF = cast<MCDwarfCallFrameFragment>(this); + OS << "\n "; + OS << " AddrDelta:" << CF->getAddrDelta(); + break; + } + case MCFragment::FT_LEB: { + const MCLEBFragment *LF = cast<MCLEBFragment>(this); + OS << "\n "; + OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned(); + break; + } + case MCFragment::FT_SafeSEH: { + const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this); + OS << "\n "; + OS << " Sym:" << F->getSymbol(); + break; + } + case MCFragment::FT_Dummy: + break; + } + OS << ">"; +} + +void MCAssembler::dump() { + raw_ostream &OS = llvm::errs(); + + OS << "<MCAssembler\n"; + OS << " Sections:[\n "; + for (iterator it = begin(), ie = end(); it != ie; ++it) { + if (it != begin()) OS << ",\n "; + it->dump(); + } + OS << "],\n"; + OS << " Symbols:["; + + for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { + if (it != symbol_begin()) OS << ",\n "; + OS << "("; + it->dump(); + OS << ", Index:" << it->getIndex() << ", "; + OS << ")"; + } + OS << "]>\n"; +} +#endif diff --git a/contrib/llvm/lib/MC/MCInst.cpp b/contrib/llvm/lib/MC/MCInst.cpp index 7ef69be..5f829ae 100644 --- a/contrib/llvm/lib/MC/MCInst.cpp +++ b/contrib/llvm/lib/MC/MCInst.cpp @@ -23,6 +23,8 @@ void 
MCOperand::print(raw_ostream &OS) const { OS << "Reg:" << getReg(); else if (isImm()) OS << "Imm:" << getImm(); + else if (isFPImm()) + OS << "FPImm:" << getFPImm(); else if (isExpr()) { OS << "Expr:(" << *getExpr() << ")"; } else if (isInst()) { diff --git a/contrib/llvm/lib/MC/MCInstrDesc.cpp b/contrib/llvm/lib/MC/MCInstrDesc.cpp index 5be2fa1..ee55f3e 100644 --- a/contrib/llvm/lib/MC/MCInstrDesc.cpp +++ b/contrib/llvm/lib/MC/MCInstrDesc.cpp @@ -53,7 +53,7 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI, bool MCInstrDesc::hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI) const { - if (const uint16_t *ImpDefs = ImplicitDefs) + if (const MCPhysReg *ImpDefs = ImplicitDefs) for (; *ImpDefs; ++ImpDefs) if (*ImpDefs == Reg || (MRI && MRI->isSubRegister(Reg, *ImpDefs))) return true; diff --git a/contrib/llvm/lib/MC/MCMachOStreamer.cpp b/contrib/llvm/lib/MC/MCMachOStreamer.cpp index 53cd131..21f7571 100644 --- a/contrib/llvm/lib/MC/MCMachOStreamer.cpp +++ b/contrib/llvm/lib/MC/MCMachOStreamer.cpp @@ -60,6 +60,7 @@ public: /// state management void reset() override { + CreatedADWARFSection = false; HasSectionLabel.clear(); MCObjectStreamer::reset(); } @@ -180,8 +181,6 @@ void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - // isSymbolLinkerVisible uses the section. - AssignSection(Symbol, getCurrentSection().first); // We have to create a new fragment if this is an atom defining symbol, // fragments cannot span atoms. if (getAssembler().isSymbolLinkerVisible(*Symbol)) @@ -384,8 +383,6 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself. assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - AssignSection(Symbol, nullptr); - getAssembler().registerSymbol(*Symbol); Symbol->setExternal(true); Symbol->setCommon(Size, ByteAlignment); @@ -417,8 +414,6 @@ void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, if (ByteAlignment != 1) new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, Section); - AssignSection(Symbol, Section); - MCFragment *F = new MCFillFragment(0, 0, Size, Section); Symbol->setFragment(F); @@ -443,12 +438,11 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // Add the fixups and data. - for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { - Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); - DF->getFixups().push_back(Fixups[i]); + for (MCFixup &Fixup : Fixups) { + Fixup.setOffset(Fixup.getOffset() + DF->getContents().size()); + DF->getFixups().push_back(Fixup); } DF->getContents().append(Code.begin(), Code.end()); } @@ -463,7 +457,8 @@ void MCMachOStreamer::FinishImpl() { // defining symbols. DenseMap<const MCFragment *, const MCSymbol *> DefiningSymbolMap; for (const MCSymbol &Symbol : getAssembler().symbols()) { - if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.getFragment()) { + if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.isInSection() && + !Symbol.isVariable()) { // An atom defining symbol should never be internal to a fragment. 
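// Illustrative sketch, not part of the patch: the ImplicitDefs array scanned
// by MCInstrDesc::hasImplicitDefOfPhysReg above is null-terminated, which is
// why its loop tests "*ImpDefs". Standalone model (sub-register handling
// elided, names hypothetical):
#include <cstdint>
using MCPhysRegSketch = uint16_t;
static bool hasImplicitDefSketch(const MCPhysRegSketch *ImpDefs,
                                 MCPhysRegSketch Reg) {
  if (!ImpDefs)
    return false;
  for (; *ImpDefs; ++ImpDefs)   // register number 0 terminates the list
    if (*ImpDefs == Reg)
      return true;
  return false;
}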
assert(Symbol.getOffset() == 0 && "Invalid offset in atom defining symbol!"); @@ -473,14 +468,12 @@ void MCMachOStreamer::FinishImpl() { // Set the fragment atom associations by tracking the last seen atom defining // symbol. - for (MCAssembler::iterator it = getAssembler().begin(), - ie = getAssembler().end(); it != ie; ++it) { + for (MCSection &Sec : getAssembler()) { const MCSymbol *CurrentAtom = nullptr; - for (MCSection::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; - ++it2) { - if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(it2)) + for (MCFragment &Frag : Sec) { + if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) CurrentAtom = Symbol; - it2->setAtom(CurrentAtom); + Frag.setAtom(CurrentAtom); } } @@ -493,6 +486,26 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, MCAsmBackend &MAB, bool LabelSections) { MCMachOStreamer *S = new MCMachOStreamer(Context, MAB, OS, CE, DWARFMustBeAtTheEnd, LabelSections); + const Triple &TT = Context.getObjectFileInfo()->getTargetTriple(); + if (TT.isOSDarwin()) { + unsigned Major, Minor, Update; + TT.getOSVersion(Major, Minor, Update); + // If there is a version specified, Major will be non-zero. + if (Major) { + MCVersionMinType VersionType; + if (TT.isWatchOS()) + VersionType = MCVM_WatchOSVersionMin; + else if (TT.isTvOS()) + VersionType = MCVM_TvOSVersionMin; + else if (TT.isMacOSX()) + VersionType = MCVM_OSXVersionMin; + else { + assert(TT.isiOS() && "Must only be iOS platform left"); + VersionType = MCVM_IOSVersionMin; + } + S->EmitVersionMin(VersionType, Major, Minor, Update); + } + } if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp index 576827a..f86f7e4 100644 --- a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp @@ -1,4 +1,4 @@ -//===-- MObjectFileInfo.cpp - Object File Information ---------------------===// +//===-- MCObjectFileInfo.cpp - Object File Information --------------------===// // // The LLVM Compiler Infrastructure // @@ -16,6 +16,8 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/COFF.h" + using namespace llvm; static bool useCompactUnwind(const Triple &T) { @@ -27,6 +29,10 @@ static bool useCompactUnwind(const Triple &T) { if (T.getArch() == Triple::aarch64) return true; + // armv7k always has it. + if (T.isWatchOS()) + return true; + // Use it on newer version of OS X. 
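// Illustrative note, not part of the patch: createMachOStreamer above now
// emits a version-min load command whenever the triple carries an OS version
// (Major != 0). The dispatch order matters because watchOS and tvOS triples
// also satisfy isOSDarwin():
//
//   watchOS -> MCVM_WatchOSVersionMin
//   tvOS    -> MCVM_TvOSVersionMin
//   macOS   -> MCVM_OSXVersionMin
//   iOS     -> MCVM_IOSVersionMin   (asserted as the only remaining case)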
if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) return true; @@ -43,9 +49,18 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { // MachO SupportsWeakOmittedEHFrame = false; + EHFrameSection = Ctx->getMachOSection( + "__TEXT", "__eh_frame", + MachO::S_COALESCED | MachO::S_ATTR_NO_TOC | + MachO::S_ATTR_STRIP_STATIC_SYMS | MachO::S_ATTR_LIVE_SUPPORT, + SectionKind::getReadOnly()); + if (T.isOSDarwin() && T.getArch() == Triple::aarch64) SupportsCompactUnwindWithoutEHFrame = true; + if (T.isWatchOS()) + OmitDwarfIfHaveCompactUnwind = true; + PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; LSDAEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel; @@ -61,16 +76,15 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { MachO::S_ATTR_PURE_INSTRUCTIONS, SectionKind::getText()); DataSection // .data - = Ctx->getMachOSection("__DATA", "__data", 0, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__data", 0, SectionKind::getData()); // BSSSection might not be expected initialized on msvc. BSSSection = nullptr; TLSDataSection // .tdata - = Ctx->getMachOSection("__DATA", "__thread_data", - MachO::S_THREAD_LOCAL_REGULAR, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__thread_data", + MachO::S_THREAD_LOCAL_REGULAR, + SectionKind::getData()); TLSBSSSection // .tbss = Ctx->getMachOSection("__DATA", "__thread_bss", MachO::S_THREAD_LOCAL_ZEROFILL, @@ -78,14 +92,13 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { // TODO: Verify datarel below. TLSTLVSection // .tlv - = Ctx->getMachOSection("__DATA", "__thread_vars", - MachO::S_THREAD_LOCAL_VARIABLES, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__thread_vars", + MachO::S_THREAD_LOCAL_VARIABLES, + SectionKind::getData()); - TLSThreadInitSection - = Ctx->getMachOSection("__DATA", "__thread_init", - MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, - SectionKind::getDataRel()); + TLSThreadInitSection = Ctx->getMachOSection( + "__DATA", "__thread_init", MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, + SectionKind::getData()); CStringSection // .cstring = Ctx->getMachOSection("__TEXT", "__cstring", @@ -112,22 +125,35 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { = Ctx->getMachOSection("__TEXT", "__const", 0, SectionKind::getReadOnly()); - TextCoalSection - = Ctx->getMachOSection("__TEXT", "__textcoal_nt", - MachO::S_COALESCED | - MachO::S_ATTR_PURE_INSTRUCTIONS, - SectionKind::getText()); - ConstTextCoalSection - = Ctx->getMachOSection("__TEXT", "__const_coal", - MachO::S_COALESCED, - SectionKind::getReadOnly()); + // If the target is not powerpc, map the coal sections to the non-coal + // sections. 
+ // + // "__TEXT/__textcoal_nt" => section "__TEXT/__text" + // "__TEXT/__const_coal" => section "__TEXT/__const" + // "__DATA/__datacoal_nt" => section "__DATA/__data" + Triple::ArchType ArchTy = T.getArch(); + + if (ArchTy == Triple::ppc || ArchTy == Triple::ppc64) { + TextCoalSection + = Ctx->getMachOSection("__TEXT", "__textcoal_nt", + MachO::S_COALESCED | + MachO::S_ATTR_PURE_INSTRUCTIONS, + SectionKind::getText()); + ConstTextCoalSection + = Ctx->getMachOSection("__TEXT", "__const_coal", + MachO::S_COALESCED, + SectionKind::getReadOnly()); + DataCoalSection = Ctx->getMachOSection( + "__DATA", "__datacoal_nt", MachO::S_COALESCED, SectionKind::getData()); + } else { + TextCoalSection = TextSection; + ConstTextCoalSection = ReadOnlySection; + DataCoalSection = DataSection; + } + ConstDataSection // .const_data = Ctx->getMachOSection("__DATA", "__const", 0, SectionKind::getReadOnlyWithRel()); - DataCoalSection - = Ctx->getMachOSection("__DATA","__datacoal_nt", - MachO::S_COALESCED, - SectionKind::getDataRel()); DataCommonSection = Ctx->getMachOSection("__DATA","__common", MachO::S_ZEROFILL, @@ -147,21 +173,17 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { SectionKind::getMetadata()); if (RelocM == Reloc::Static) { - StaticCtorSection - = Ctx->getMachOSection("__TEXT", "__constructor", 0, - SectionKind::getDataRel()); - StaticDtorSection - = Ctx->getMachOSection("__TEXT", "__destructor", 0, - SectionKind::getDataRel()); + StaticCtorSection = Ctx->getMachOSection("__TEXT", "__constructor", 0, + SectionKind::getData()); + StaticDtorSection = Ctx->getMachOSection("__TEXT", "__destructor", 0, + SectionKind::getData()); } else { - StaticCtorSection - = Ctx->getMachOSection("__DATA", "__mod_init_func", - MachO::S_MOD_INIT_FUNC_POINTERS, - SectionKind::getDataRel()); - StaticDtorSection - = Ctx->getMachOSection("__DATA", "__mod_term_func", - MachO::S_MOD_TERM_FUNC_POINTERS, - SectionKind::getDataRel()); + StaticCtorSection = Ctx->getMachOSection("__DATA", "__mod_init_func", + MachO::S_MOD_INIT_FUNC_POINTERS, + SectionKind::getData()); + StaticDtorSection = Ctx->getMachOSection("__DATA", "__mod_term_func", + MachO::S_MOD_TERM_FUNC_POINTERS, + SectionKind::getData()); } // Exception Handling. @@ -176,9 +198,11 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { SectionKind::getReadOnly()); if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) - CompactUnwindDwarfEHFrameOnly = 0x04000000; + CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_X86_64_MODE_DWARF else if (T.getArch() == Triple::aarch64) - CompactUnwindDwarfEHFrameOnly = 0x03000000; + CompactUnwindDwarfEHFrameOnly = 0x03000000; // UNWIND_ARM64_MODE_DWARF + else if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb) + CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_ARM_MODE_DWARF } // Debug Information. 
@@ -232,9 +256,18 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfRangesSection = Ctx->getMachOSection("__DWARF", "__debug_ranges", MachO::S_ATTR_DEBUG, SectionKind::getMetadata(), "debug_range"); + DwarfMacinfoSection = + Ctx->getMachOSection("__DWARF", "__debug_macinfo", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); DwarfDebugInlineSection = Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfCUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); + DwarfTUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_tu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -258,7 +291,6 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { FDECFIEncoding = dwarf::DW_EH_PE_pcrel | ((CMModel == CodeModel::Large) ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4); - break; default: FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; @@ -391,17 +423,15 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { break; } + unsigned EHSectionType = T.getArch() == Triple::x86_64 + ? ELF::SHT_X86_64_UNWIND + : ELF::SHT_PROGBITS; + // Solaris requires different flags for .eh_frame to seemingly every other // platform. - EHSectionType = ELF::SHT_PROGBITS; - EHSectionFlags = ELF::SHF_ALLOC; - if (T.isOSSolaris()) { - if (T.getArch() == Triple::x86_64) - EHSectionType = ELF::SHT_X86_64_UNWIND; - else - EHSectionFlags |= ELF::SHF_WRITE; - } - + unsigned EHSectionFlags = ELF::SHF_ALLOC; + if (T.isOSSolaris() && T.getArch() != Triple::x86_64) + EHSectionFlags |= ELF::SHF_WRITE; // ELF BSSSection = Ctx->getELFSection(".bss", ELF::SHT_NOBITS, @@ -423,18 +453,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { TLSBSSSection = Ctx->getELFSection( ".tbss", ELF::SHT_NOBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE); - DataRelSection = Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE); - - DataRelLocalSection = Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE); - DataRelROSection = Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); - DataRelROLocalSection = Ctx->getELFSection( - ".data.rel.ro.local", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); - MergeableConst4Section = Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4, ""); @@ -487,6 +508,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0); DwarfRangesSection = Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range"); + DwarfMacinfoSection = + Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0); // DWARF5 Experimental Debug Info @@ -519,14 +542,28 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { DwarfAddrSection = Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec"); + // DWP Sections + DwarfCUIndexSection = + Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + DwarfTUIndexSection = + Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0); + StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); FaultMapSection = Ctx->getELFSection(".llvm_faultmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + + EHFrameSection = + 
Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); } void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { + EHFrameSection = Ctx->getCOFFSection( + ".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getData()); + bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb; CommDirectiveSupportsAlignment = true; @@ -545,7 +582,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { DataSection = Ctx->getCOFFSection( ".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); ReadOnlySection = Ctx->getCOFFSection( ".rdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); @@ -563,21 +600,20 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { StaticCtorSection = Ctx->getCOFFSection( ".ctors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); StaticDtorSection = Ctx->getCOFFSection( ".dtors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); } // FIXME: We're emitting LSDA info into a readonly section on COFF, even // though it contains relocatable pointers. In PIC mode, this is probably a // big runtime hit for C++ apps. Either the contents of the LSDA need to be // adjusted or this should be a data section. - assert(T.isOSWindows() && "Windows is the only supported COFF target"); if (T.getArch() == Triple::x86_64) { // On Windows 64 with SEH, the LSDA is emitted into the .xdata section - LSDASection = 0; + LSDASection = nullptr; } else { LSDASection = Ctx->getCOFFSection(".gcc_except_table", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -653,6 +689,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "debug_range"); + DwarfMacinfoSection = Ctx->getCOFFSection( + ".debug_macinfo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfInfoDWOSection = Ctx->getCOFFSection( ".debug_info.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -693,6 +734,16 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "addr_sec"); + DwarfCUIndexSection = Ctx->getCOFFSection( + ".debug_cu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); + DwarfTUIndexSection = Ctx->getCOFFSection( + ".debug_tu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -720,11 +771,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { PDataSection = Ctx->getCOFFSection( ".pdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + SectionKind::getData()); XDataSection = Ctx->getCOFFSection( ".xdata", 
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + SectionKind::getData()); SXDataSection = Ctx->getCOFFSection(".sxdata", COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()); @@ -732,12 +783,12 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { TLSDataSection = Ctx->getCOFFSection( ".tls$", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); - + SectionKind::getData()); + StackMapSection = Ctx->getCOFFSection(".llvm_stackmaps", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getReadOnly()); + SectionKind::getReadOnly()); } void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, @@ -752,6 +803,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, CommDirectiveSupportsAlignment = true; SupportsWeakOmittedEHFrame = true; SupportsCompactUnwindWithoutEHFrame = false; + OmitDwarfIfHaveCompactUnwind = false; PersonalityEncoding = LSDAEncoding = FDECFIEncoding = TTypeEncoding = dwarf::DW_EH_PE_absptr; @@ -767,25 +819,26 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, TT = TheTriple; - Triple::ArchType Arch = TT.getArch(); - // FIXME: Checking for Arch here to filter out bogus triples such as - // cellspu-apple-darwin. Perhaps we should fix in Triple? - if ((Arch == Triple::x86 || Arch == Triple::x86_64 || - Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::aarch64 || - Arch == Triple::ppc || Arch == Triple::ppc64 || - Arch == Triple::UnknownArch) && - TT.isOSBinFormatMachO()) { + switch (TT.getObjectFormat()) { + case Triple::MachO: Env = IsMachO; initMachOMCObjectFileInfo(TT); - } else if ((Arch == Triple::x86 || Arch == Triple::x86_64 || - Arch == Triple::arm || Arch == Triple::thumb) && - (TT.isOSWindows() && TT.getObjectFormat() == Triple::COFF)) { + break; + case Triple::COFF: + if (!TT.isOSWindows()) + report_fatal_error( + "Cannot initialize MC for non-Windows COFF object files."); + Env = IsCOFF; initCOFFMCObjectFileInfo(TT); - } else { + break; + case Triple::ELF: Env = IsELF; initELFMCObjectFileInfo(TT); + break; + case Triple::UnknownObjectFormat: + report_fatal_error("Cannot initialize MC for unknown object file format."); + break; } } @@ -799,24 +852,3 @@ MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const { return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP, 0, utostr(Hash)); } - -void MCObjectFileInfo::InitEHFrameSection() { - if (Env == IsMachO) - EHFrameSection = - Ctx->getMachOSection("__TEXT", "__eh_frame", - MachO::S_COALESCED | - MachO::S_ATTR_NO_TOC | - MachO::S_ATTR_STRIP_STATIC_SYMS | - MachO::S_ATTR_LIVE_SUPPORT, - SectionKind::getReadOnly()); - else if (Env == IsELF) - EHFrameSection = - Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); - else - EHFrameSection = - Ctx->getCOFFSection(".eh_frame", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ | - COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); -} diff --git a/contrib/llvm/lib/MC/MCObjectStreamer.cpp b/contrib/llvm/lib/MC/MCObjectStreamer.cpp index 0a63777..972610a 100644 --- a/contrib/llvm/lib/MC/MCObjectStreamer.cpp +++ b/contrib/llvm/lib/MC/MCObjectStreamer.cpp @@ -28,7 +28,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, MCCodeEmitter *Emitter_) : MCStreamer(Context), Assembler(new MCAssembler(Context, TAB, *Emitter_, - *TAB.createObjectWriter(OS), OS)), + 
*TAB.createObjectWriter(OS))), EmitEHFrame(true), EmitDebugFrame(false) {} MCObjectStreamer::~MCObjectStreamer() { @@ -39,32 +39,31 @@ MCObjectStreamer::~MCObjectStreamer() { } void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) { - if (PendingLabels.size()) { - if (!F) { - F = new MCDataFragment(); - MCSection *CurSection = getCurrentSectionOnly(); - CurSection->getFragmentList().insert(CurInsertionPoint, F); - F->setParent(CurSection); - } - for (MCSymbol *Sym : PendingLabels) { - Sym->setFragment(F); - Sym->setOffset(FOffset); - } - PendingLabels.clear(); + if (PendingLabels.empty()) + return; + if (!F) { + F = new MCDataFragment(); + MCSection *CurSection = getCurrentSectionOnly(); + CurSection->getFragmentList().insert(CurInsertionPoint, F); + F->setParent(CurSection); + } + for (MCSymbol *Sym : PendingLabels) { + Sym->setFragment(F); + Sym->setOffset(FOffset); } + PendingLabels.clear(); } void MCObjectStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo, unsigned Size) { // If not assigned to the same (valid) fragment, fallback. - if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment()) { + if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment() || + Hi->isVariable() || Lo->isVariable()) { MCStreamer::emitAbsoluteSymbolDiff(Hi, Lo, Size); return; } - assert(Hi->getOffset() >= Lo->getOffset() && - "Expected Hi to be greater than Lo"); EmitIntValue(Hi->getOffset() - Lo->getOffset(), Size); } @@ -93,7 +92,7 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const { assert(getCurrentSectionOnly() && "No current section!"); if (CurInsertionPoint != getCurrentSectionOnly()->getFragmentList().begin()) - return std::prev(CurInsertionPoint); + return &*std::prev(CurInsertionPoint); return nullptr; } @@ -121,7 +120,7 @@ void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) { } void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { MCStreamer::EmitValueImpl(Value, Size, Loc); MCDataFragment *DF = getOrCreateDataFragment(); flushPendingLabels(DF, DF->getContents().size()); @@ -155,7 +154,6 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { MCStreamer::EmitLabel(Symbol); getAssembler().registerSymbol(*Symbol); - assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!"); // If there is a current fragment, mark the symbol as pointing into it. // Otherwise queue the label and set its fragment pointer when we emit the @@ -276,7 +274,6 @@ void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst, raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, IF->getFixups(), STI); - VecOS.flush(); IF->getContents().append(Code.begin(), Code.end()); } @@ -321,8 +318,10 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer &OS, const MCSymbol *A, return AddrDelta; } -static void emitDwarfSetLineAddr(MCObjectStreamer &OS, int64_t LineDelta, - const MCSymbol *Label, int PointerSize) { +static void emitDwarfSetLineAddr(MCObjectStreamer &OS, + MCDwarfLineTableParams Params, + int64_t LineDelta, const MCSymbol *Label, + int PointerSize) { // emit the sequence to set the address OS.EmitIntValue(dwarf::DW_LNS_extended_op, 1); OS.EmitULEB128IntValue(PointerSize + 1); @@ -330,7 +329,7 @@ static void emitDwarfSetLineAddr(MCObjectStreamer &OS, int64_t LineDelta, OS.EmitSymbolValue(Label, PointerSize); // emit the sequence for the LineDelta (from 1) and a zero address delta. 
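// Illustrative sketch, not part of the patch: the flushPendingLabels rewrite
// above is a pure guard-clause flattening; behavior is unchanged. Reduced
// model (the lazy creation of the fragment is elided):
#include <vector>
struct SymSketch { const void *Frag = nullptr; unsigned Off = 0; };
static void flushPendingSketch(std::vector<SymSketch *> &Pending,
                               const void *F, unsigned FOffset) {
  if (Pending.empty())          // early return replaces the enclosing if
    return;
  for (SymSketch *S : Pending) {
    S->Frag = F;                // bind each deferred label to the fragment
    S->Off = FOffset;
  }
  Pending.clear();
}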
- MCDwarfLineAddr::Emit(&OS, LineDelta, 0); + MCDwarfLineAddr::Emit(&OS, Params, LineDelta, 0); } void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, @@ -338,13 +337,15 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *Label, unsigned PointerSize) { if (!LastLabel) { - emitDwarfSetLineAddr(*this, LineDelta, Label, PointerSize); + emitDwarfSetLineAddr(*this, Assembler->getDWARFLinetableParams(), LineDelta, + Label, PointerSize); return; } const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel); int64_t Res; if (AddrDelta->evaluateAsAbsolute(Res, getAssembler())) { - MCDwarfLineAddr::Emit(this, LineDelta, Res); + MCDwarfLineAddr::Emit(this, Assembler->getDWARFLinetableParams(), LineDelta, + Res); return; } insert(new MCDwarfLineAddrFragment(LineDelta, *AddrDelta)); @@ -388,26 +389,9 @@ void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment, cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true); } -bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset, +void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) { - int64_t Res; - if (Offset->evaluateAsAbsolute(Res, getAssembler())) { - insert(new MCOrgFragment(*Offset, Value)); - return false; - } - - MCSymbol *CurrentPos = getContext().createTempSymbol(); - EmitLabel(CurrentPos); - MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - const MCExpr *Ref = - MCSymbolRefExpr::create(CurrentPos, Variant, getContext()); - const MCExpr *Delta = - MCBinaryExpr::create(MCBinaryExpr::Sub, Offset, Ref, getContext()); - - if (!Delta->evaluateAsAbsolute(Res, getAssembler())) - return true; - EmitFill(Res, Value); - return false; + insert(new MCOrgFragment(*Offset, Value)); } // Associate GPRel32 fixup with data and resize data area @@ -430,19 +414,31 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) { DF->getContents().resize(DF->getContents().size() + 8, 0); } -void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { - // FIXME: A MCFillFragment would be more memory efficient but MCExpr has - // problems evaluating expressions across multiple fragments. +bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) { + int64_t OffsetValue; + if (!Offset.evaluateAsAbsolute(OffsetValue)) + llvm_unreachable("Offset is not absolute"); + MCDataFragment *DF = getOrCreateDataFragment(); flushPendingLabels(DF, DF->getContents().size()); - DF->getContents().append(NumBytes, FillValue); + + MCFixupKind Kind; + if (!Assembler->getBackend().getFixupKind(Name, Kind)) + return true; + + if (Expr == nullptr) + Expr = + MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); + DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc)); + return false; } -void MCObjectStreamer::EmitZeros(uint64_t NumBytes) { +void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { const MCSection *Sec = getCurrentSection().first; assert(Sec && "need a section"); unsigned ItemSize = Sec->isVirtualSection() ? 0 : 1; - insert(new MCFillFragment(0, ItemSize, NumBytes)); + insert(new MCFillFragment(FillValue, ItemSize, NumBytes)); } void MCObjectStreamer::FinishImpl() { @@ -451,7 +447,7 @@ void MCObjectStreamer::FinishImpl() { MCGenDwarfInfo::Emit(this); // Dump out the dwarf file & directory tables and line tables. 
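// Illustrative note, not part of the patch: EmitRelocDirective above returns
// true when the backend does not recognize the fixup name and false on
// success, and when ".reloc" carries no third operand it synthesizes a
// reference to a fresh temporary symbol. Hypothetical uses (relocation names
// are backend-specific, resolved via MCAsmBackend::getFixupKind):
//
//   .reloc 8, R_EXAMPLE          // no expression: temp-symbol ref created
//   .reloc 8, R_EXAMPLE, foo+4   // explicit target expression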
- MCDwarfLineTable::Emit(this); + MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams()); flushPendingLabels(nullptr); getAssembler().Finish(); diff --git a/contrib/llvm/lib/MC/MCObjectWriter.cpp b/contrib/llvm/lib/MC/MCObjectWriter.cpp index 3479034..e84f74a 100644 --- a/contrib/llvm/lib/MC/MCObjectWriter.cpp +++ b/contrib/llvm/lib/MC/MCObjectWriter.cpp @@ -33,8 +33,14 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolved( if (!SA.getFragment() || !SB.getFragment()) return false; - return isSymbolRefDifferenceFullyResolvedImpl(Asm, SA, *SB.getFragment(), - InSet, false); + return isSymbolRefDifferenceFullyResolvedImpl(Asm, SA, SB, InSet); +} + +bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( + const MCAssembler &Asm, const MCSymbol &A, const MCSymbol &B, + bool InSet) const { + return isSymbolRefDifferenceFullyResolvedImpl(Asm, A, *B.getFragment(), InSet, + false); } bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( diff --git a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp index b983d99..36c1920 100644 --- a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -436,7 +436,8 @@ StringRef AsmLexer::LexUntilEndOfLine() { return StringRef(TokStart, CurPtr-TokStart); } -const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { +size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, + bool ShouldSkipSpace) { const char *SavedTokStart = TokStart; const char *SavedCurPtr = CurPtr; bool SavedAtStartOfLine = isAtStartOfLine; @@ -446,7 +447,16 @@ const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { SMLoc SavedErrLoc = getErrLoc(); SkipSpace = ShouldSkipSpace; - AsmToken Token = LexToken(); + + size_t ReadCount; + for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { + AsmToken Token = LexToken(); + + Buf[ReadCount] = Token; + + if (Token.is(AsmToken::Eof)) + break; + } SetError(SavedErrLoc, SavedErr); @@ -455,7 +465,7 @@ const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { CurPtr = SavedCurPtr; TokStart = SavedTokStart; - return Token; + return ReadCount; } bool AsmLexer::isAtStartOfComment(const char *Ptr) { diff --git a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp index 04d1413..646cbb4 100644 --- a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -251,14 +252,14 @@ private: bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI); void eatToEndOfLine(); - bool parseCppHashLineFilenameComment(const SMLoc &L); + bool parseCppHashLineFilenameComment(SMLoc L); void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters); bool expandMacro(raw_svector_ostream &OS, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters, ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable, - const SMLoc &L); + SMLoc L); /// \brief Are macros enabled in the parser? 
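// Illustrative sketch, not part of the patch: AsmLexer::peekTokens above
// generalizes the old single-token peekTok to N-token lookahead by saving the
// lexer cursor, lexing ahead, and restoring. Reduced standalone model over a
// character stream:
#include <cstddef>
struct MiniLexer {
  const char *Cur;
  char lex() { return *Cur ? *Cur++ : '\0'; }   // '\0' stands in for Eof
  size_t peek(char *Buf, size_t N) {            // fill Buf without consuming
    const char *Saved = Cur;                    // cf. SavedCurPtr
    size_t Read = 0;
    for (; Read < N; ++Read) {
      Buf[Read] = lex();
      if (Buf[Read] == '\0')
        break;                                  // stop at Eof, like peekTokens
    }
    Cur = Saved;                                // restore the lexer state
    return Read;
  }
};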
bool areMacrosEnabled() {return MacrosEnabledFlag;} @@ -342,6 +343,7 @@ private: enum DirectiveKind { DK_NO_DIRECTIVE, // Placeholder DK_SET, DK_EQU, DK_EQUIV, DK_ASCII, DK_ASCIZ, DK_STRING, DK_BYTE, DK_SHORT, + DK_RELOC, DK_VALUE, DK_2BYTE, DK_LONG, DK_INT, DK_4BYTE, DK_QUAD, DK_8BYTE, DK_OCTA, DK_SINGLE, DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW, DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR, @@ -374,6 +376,7 @@ private: // ".ascii", ".asciz", ".string" bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); + bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc" bool parseDirectiveValue(unsigned Size); // ".byte", ".long", ... bool parseDirectiveOctaValue(); // ".octa" bool parseDirectiveRealValue(const fltSemantics &); // ".single", ... @@ -553,6 +556,8 @@ void AsmParser::Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) { } bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) { + if(getTargetParser().getTargetOptions().MCNoWarn) + return false; if (getTargetParser().getTargetOptions().MCFatalWarnings) return Error(L, Msg, Ranges); printMessage(L, SourceMgr::DK_Warning, Msg, Ranges); @@ -679,11 +684,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // so conservatively exclude them. Only do this if we're finalizing, though, // as otherwise we won't necessarilly have seen everything yet. if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) { - const MCContext::SymbolTable &Symbols = getContext().getSymbols(); - for (MCContext::SymbolTable::const_iterator i = Symbols.begin(), - e = Symbols.end(); - i != e; ++i) { - MCSymbol *Sym = i->getValue(); + for (const auto &TableEntry : getContext().getSymbols()) { + MCSymbol *Sym = TableEntry.getValue(); // Variable symbols may not be marked as defined, so check those // explicitly. If we know it's a variable, we have a definition for // the purposes of this check. @@ -691,9 +693,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // FIXME: We would really like to refer back to where the symbol was // first referenced for a source location. We need to add something // to track that. Currently, we just point to the end of the file. - printMessage( - getLexer().getLoc(), SourceMgr::DK_Error, - "assembler local symbol '" + Sym->getName() + "' not defined"); + return Error(getLexer().getLoc(), "assembler local symbol '" + + Sym->getName() + "' not defined"); } } @@ -702,7 +703,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { if (!HadError && !NoFinalize) Out.Finish(); - return HadError; + return HadError || getContext().hadError(); } void AsmParser::checkForValidSection() { @@ -865,11 +866,12 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. 
- if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) { + if (Sym->isVariable() && + isa<MCConstantExpr>(Sym->getVariableValue(/*SetUsed*/ false))) { if (Variant) return Error(EndLoc, "unexpected modifier on variable reference"); - Res = Sym->getVariableValue(); + Res = Sym->getVariableValue(/*SetUsed*/ false); return false; } @@ -1102,8 +1104,9 @@ bool AsmParser::parseAbsoluteExpression(int64_t &Res) { return false; } -unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, - MCBinaryExpr::Opcode &Kind) { +static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind, + bool ShouldUseLogicalShr) { switch (K) { default: return 0; // not a binop. @@ -1155,7 +1158,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, Kind = MCBinaryExpr::Shl; return 4; case AsmToken::GreaterGreater: - Kind = MAI.shouldUseLogicalShr() ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; + Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; return 4; // High Intermediate Precedence: +, - @@ -1179,6 +1182,89 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, } } +static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind, + bool ShouldUseLogicalShr) { + switch (K) { + default: + return 0; // not a binop. + + // Lowest Precedence: &&, || + case AsmToken::AmpAmp: + Kind = MCBinaryExpr::LAnd; + return 2; + case AsmToken::PipePipe: + Kind = MCBinaryExpr::LOr; + return 1; + + // Low Precedence: ==, !=, <>, <, <=, >, >= + case AsmToken::EqualEqual: + Kind = MCBinaryExpr::EQ; + return 3; + case AsmToken::ExclaimEqual: + case AsmToken::LessGreater: + Kind = MCBinaryExpr::NE; + return 3; + case AsmToken::Less: + Kind = MCBinaryExpr::LT; + return 3; + case AsmToken::LessEqual: + Kind = MCBinaryExpr::LTE; + return 3; + case AsmToken::Greater: + Kind = MCBinaryExpr::GT; + return 3; + case AsmToken::GreaterEqual: + Kind = MCBinaryExpr::GTE; + return 3; + + // Low Intermediate Precedence: +, - + case AsmToken::Plus: + Kind = MCBinaryExpr::Add; + return 4; + case AsmToken::Minus: + Kind = MCBinaryExpr::Sub; + return 4; + + // High Intermediate Precedence: |, &, ^ + // + // FIXME: gas seems to support '!' as an infix operator? + case AsmToken::Pipe: + Kind = MCBinaryExpr::Or; + return 5; + case AsmToken::Caret: + Kind = MCBinaryExpr::Xor; + return 5; + case AsmToken::Amp: + Kind = MCBinaryExpr::And; + return 5; + + // Highest Precedence: *, /, %, <<, >> + case AsmToken::Star: + Kind = MCBinaryExpr::Mul; + return 6; + case AsmToken::Slash: + Kind = MCBinaryExpr::Div; + return 6; + case AsmToken::Percent: + Kind = MCBinaryExpr::Mod; + return 6; + case AsmToken::LessLess: + Kind = MCBinaryExpr::Shl; + return 6; + case AsmToken::GreaterGreater: + Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; + return 6; + } +} + +unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind) { + bool ShouldUseLogicalShr = MAI.shouldUseLogicalShr(); + return IsDarwin ? getDarwinBinOpPrecedence(K, Kind, ShouldUseLogicalShr) + : getGNUBinOpPrecedence(K, Kind, ShouldUseLogicalShr); +} + /// \brief Parse all binary operators with precedence >= 'Precedence'. /// Res contains the LHS of the expression on input. bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, @@ -1251,6 +1337,15 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // Treat '.' as a valid identifier in this context. 
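// Worked example, not part of the patch: in getGNUBinOpPrecedence above a
// larger return value binds tighter, and |, & and ^ (5) sit above + and -
// (4), matching gas rather than C. Consequences:
//
//   a + b & c    parses as  a + (b & c)    // & (5) binds before + (4)
//   a << 1 + 2   parses as  (a << 1) + 2   // << (6) binds before + (4)
//
// A C-style table would instead group (a + b) & c and a << (1 + 2).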
Lex(); IDVal = "."; + } else if (Lexer.is(AsmToken::LCurly)) { + // Treat '{' as a valid identifier in this context. + Lex(); + IDVal = "{"; + + } else if (Lexer.is(AsmToken::RCurly)) { + // Treat '}' as a valid identifier in this context. + Lex(); + IDVal = "}"; } else if (parseIdentifier(IDVal)) { if (!TheCondState.Ignore) return TokError("unexpected token at start of statement"); @@ -1313,6 +1408,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // See what kind of statement we have. switch (Lexer.getKind()) { case AsmToken::Colon: { + if (!getTargetParser().isLabel(ID)) + break; checkForValidSection(); // identifier ':' -> Label. @@ -1334,8 +1431,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true); assert(RewrittenLabel.size() && "We should have an internal name here."); - Info.AsmRewrites->push_back(AsmRewrite(AOK_Label, IDLoc, - IDVal.size(), RewrittenLabel)); + Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(), + RewrittenLabel); IDVal = RewrittenLabel; } Sym = getContext().getOrCreateSymbol(IDVal); @@ -1371,6 +1468,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, } case AsmToken::Equal: + if (!getTargetParser().equalIsAsmAssignment()) + break; // identifier '=' ... -> assignment statement Lex(); @@ -1599,6 +1698,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, return parseDirectiveError(IDLoc, true); case DK_WARNING: return parseDirectiveWarning(IDLoc); + case DK_RELOC: + return parseDirectiveReloc(IDLoc); } return Error(IDLoc, "unknown directive"); @@ -1613,12 +1714,14 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN")) return parseDirectiveMSAlign(IDLoc, Info); + if (ParsingInlineAsm && (IDVal == "even")) + Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4); checkForValidSection(); // Canonicalize the opcode to lower case. std::string OpcodeStr = IDVal.lower(); ParseInstructionInfo IInfo(Info.AsmRewrites); - bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, IDLoc, + bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID, Info.ParsedOperands); Info.ParseError = HadError; @@ -1703,7 +1806,7 @@ void AsmParser::eatToEndOfLine() { /// parseCppHashLineFilenameComment as this: /// ::= # number "filename" /// or just as a full line comment if it doesn't have a number and a string. -bool AsmParser::parseCppHashLineFilenameComment(const SMLoc &L) { +bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) { Lex(); // Eat the hash token. if (getLexer().isNot(AsmToken::Integer)) { @@ -1743,7 +1846,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { raw_ostream &OS = errs(); const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr(); - const SMLoc &DiagLoc = Diag.getLoc(); + SMLoc DiagLoc = Diag.getLoc(); unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); unsigned CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc); @@ -1802,7 +1905,7 @@ static bool isIdentifierChar(char c) { bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters, ArrayRef<MCAsmMacroArgument> A, - bool EnableAtPseudoVariable, const SMLoc &L) { + bool EnableAtPseudoVariable, SMLoc L) { unsigned NParameters = Parameters.size(); bool HasVararg = NParameters ? 
Parameters.back().Vararg : false; if ((!IsDarwin || NParameters != 0) && NParameters != A.size()) @@ -1858,10 +1961,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, break; // Otherwise substitute with the token values, with spaces eliminated. - for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), - ie = A[Index].end(); - it != ie; ++it) - OS << it->getString(); + for (const AsmToken &Token : A[Index]) + OS << Token.getString(); break; } } @@ -1897,15 +1998,13 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, } } else { bool VarargParameter = HasVararg && Index == (NParameters - 1); - for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), - ie = A[Index].end(); - it != ie; ++it) + for (const AsmToken &Token : A[Index]) // We expect no quotes around the string's contents when // parsing for varargs. - if (it->getKind() != AsmToken::String || VarargParameter) - OS << it->getString(); + if (Token.getKind() != AsmToken::String || VarargParameter) + OS << Token.getString(); else - OS << it->getStringContents(); + OS << Token.getStringContents(); Pos += 1 + Argument.size(); } @@ -2371,6 +2470,51 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { return false; } +/// parseDirectiveReloc +/// ::= .reloc expression , identifier [ , expression ] +bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { + const MCExpr *Offset; + const MCExpr *Expr = nullptr; + + SMLoc OffsetLoc = Lexer.getTok().getLoc(); + if (parseExpression(Offset)) + return true; + + // We can only deal with constant expressions at the moment. + int64_t OffsetValue; + if (!Offset->evaluateAsAbsolute(OffsetValue)) + return Error(OffsetLoc, "expression is not a constant value"); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma"); + Lexer.Lex(); + + if (Lexer.isNot(AsmToken::Identifier)) + return TokError("expected relocation name"); + SMLoc NameLoc = Lexer.getTok().getLoc(); + StringRef Name = Lexer.getTok().getIdentifier(); + Lexer.Lex(); + + if (Lexer.is(AsmToken::Comma)) { + Lexer.Lex(); + SMLoc ExprLoc = Lexer.getLoc(); + if (parseExpression(Expr)) + return true; + + MCValue Value; + if (!Expr->evaluateAsRelocatable(Value, nullptr, nullptr)) + return Error(ExprLoc, "expression must be relocatable"); + } + + if (Lexer.isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in .reloc directive"); + + if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc)) + return Error(NameLoc, "unknown relocation name"); + + return false; +} + /// parseDirectiveValue /// ::= (.byte | .short | ... ) [ expression (, expression)* ] bool AsmParser::parseDirectiveValue(unsigned Size) { @@ -2617,7 +2761,6 @@ bool AsmParser::parseDirectiveOrg() { checkForValidSection(); const MCExpr *Offset; - SMLoc Loc = getTok().getLoc(); if (parseExpression(Offset)) return true; @@ -2636,13 +2779,7 @@ bool AsmParser::parseDirectiveOrg() { } Lex(); - - // Only limited forms of relocatable expressions are accepted here, it - // has to be relative to the current section. The streamer will return - // 'true' if the expression wasn't evaluatable. 
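// Illustrative note, not part of the patch: the checks in parseDirectiveReloc
// above fire in source order (placeholder relocation names used here):
//
//   .reloc sym, R_EXAMPLE       // "expression is not a constant value"
//                               //   (the offset must fold to an absolute)
//   .reloc 4 R_EXAMPLE          // "expected comma"
//   .reloc 4, 8                 // "expected relocation name"
//   .reloc 4, R_EXAMPLE, x-y-z  // "expression must be relocatable" if the
//                               //   third operand cannot be encoded
//   .reloc 4, R_BOGUS           // "unknown relocation name" once the
//                               //   streamer asks the backend for the fixup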
- if (getStreamer().EmitValueToOffset(Offset, FillExpr)) - return Error(Loc, "expected assembly-time absolute expression"); - + getStreamer().emitValueToOffset(Offset, FillExpr); return false; } @@ -2703,7 +2840,11 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) { Alignment = 1ULL << Alignment; } else { - // Reject alignments that aren't a power of two, for gas compatibility. + // Reject alignments that aren't either a power of two or zero, + // for gas compatibility. Alignment of zero is silently rounded + // up to one. + if (Alignment == 0) + Alignment = 1; if (!isPowerOf2_64(Alignment)) Error(AlignmentLoc, "alignment must be a power of 2"); } @@ -4269,6 +4410,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".err"] = DK_ERR; DirectiveKindMap[".error"] = DK_ERROR; DirectiveKindMap[".warning"] = DK_WARNING; + DirectiveKindMap[".reloc"] = DK_RELOC; } MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { @@ -4405,10 +4547,10 @@ bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) { SmallString<256> Buf; raw_svector_ostream OS(Buf); - for (MCAsmMacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) { + for (const MCAsmMacroArgument &Arg : A) { // Note that the AtPseudoVariable is enabled for instantiations of .irp. // This is undocumented, but GAS seems to support it. - if (expandMacro(OS, M->Body, Parameter, *i, true, getTok().getLoc())) + if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc())) return true; } @@ -4488,10 +4630,10 @@ bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info, if (!MCE) return Error(ExprLoc, "unexpected expression in _emit"); uint64_t IntValue = MCE->getValue(); - if (!isUIntN(8, IntValue) && !isIntN(8, IntValue)) + if (!isUInt<8>(IntValue) && !isInt<8>(IntValue)) return Error(ExprLoc, "literal value out of range for directive"); - Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, Len)); + Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len); return false; } @@ -4507,8 +4649,7 @@ bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) { if (!isPowerOf2_64(IntValue)) return Error(ExprLoc, "literal value not a power of two greater then zero"); - Info.AsmRewrites->push_back( - AsmRewrite(AOK_Align, IDLoc, 5, Log2_64(IntValue))); + Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(IntValue)); return false; } @@ -4604,18 +4745,18 @@ bool AsmParser::parseMSInlineAsm( OutputDecls.push_back(OpDecl); OutputDeclsAddressOf.push_back(Operand.needAddressOf()); OutputConstraints.push_back(("=" + Operand.getConstraint()).str()); - AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Start, SymName.size())); + AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size()); } else { InputDecls.push_back(OpDecl); InputDeclsAddressOf.push_back(Operand.needAddressOf()); InputConstraints.push_back(Operand.getConstraint().str()); - AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Start, SymName.size())); + AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size()); } } // Consider implicit defs to be clobbers. Think of cpuid and push. 
- ArrayRef<uint16_t> ImpDefs(Desc.getImplicitDefs(), - Desc.getNumImplicitDefs()); + ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(), + Desc.getNumImplicitDefs()); ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end()); } @@ -4710,14 +4851,23 @@ bool AsmParser::parseMSInlineAsm( OS << ".byte"; break; case AOK_Align: { - unsigned Val = AR.Val; - OS << ".align " << Val; + // MS alignment directives are measured in bytes. If the native assembler + // measures alignment in bytes, we can pass it straight through. + OS << ".align"; + if (getContext().getAsmInfo()->getAlignmentIsInBytes()) + break; - // Skip the original immediate. + // Alignment is in log2 form, so print that instead and skip the original + // immediate. + unsigned Val = AR.Val; + OS << ' ' << Val; assert(Val < 10 && "Expected alignment less then 2^10."); AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4; break; } + case AOK_EVEN: + OS << ".even"; + break; case AOK_DotOperator: // Insert the dot if the user omitted it. OS.flush(); @@ -4803,7 +4953,8 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, // FIXME: Diagnose assignment to protected identifier (e.g., register name). if (isSymbolUsedInExpression(Sym, Value)) return Parser.Error(EqualLoc, "Recursive use of '" + Name + "'"); - else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable()) + else if (Sym->isUndefined(/*SetUsed*/ false) && !Sym->isUsed() && + !Sym->isVariable()) ; // Allow redefinitions of undefined symbols only used in directives. else if (Sym->isVariable() && !Sym->isUsed() && allow_redef) ; // Allow redefinitions of variables that haven't yet been used. @@ -4815,15 +4966,8 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, return Parser.Error(EqualLoc, "invalid reassignment of non-absolute variable '" + Name + "'"); - - // Don't count these checks as uses. 
- Sym->setUsed(false); } else if (Name == ".") { - if (Parser.getStreamer().EmitValueToOffset(Value, 0)) { - Parser.Error(EqualLoc, "expected absolute expression"); - Parser.eatToEndOfStatement(); - return true; - } + Parser.getStreamer().emitValueToOffset(Value, 0); return false; } else Sym = Parser.getContext().getOrCreateSymbol(Name); diff --git a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp index f09bce0..a4b2b19 100644 --- a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -98,11 +98,10 @@ class COFFAsmParser : public MCAsmParserExtension { SectionKind::getText()); } bool ParseSectionDirectiveData(StringRef, SMLoc) { - return ParseSectionSwitch(".data", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA - | COFF::IMAGE_SCN_MEM_READ - | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + return ParseSectionSwitch(".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getData()); } bool ParseSectionDirectiveBSS(StringRef, SMLoc) { return ParseSectionSwitch(".bss", @@ -153,7 +152,7 @@ static SectionKind computeSectionKind(unsigned Flags) { if (Flags & COFF::IMAGE_SCN_MEM_READ && (Flags & COFF::IMAGE_SCN_MEM_WRITE) == 0) return SectionKind::getReadOnly(); - return SectionKind::getDataRel(); + return SectionKind::getData(); } bool COFFAsmParser::ParseSectionFlags(StringRef FlagsString, unsigned* Flags) { diff --git a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp index dc664e8..73e068a 100644 --- a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp @@ -8,10 +8,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCSectionMachO.h" @@ -38,6 +41,8 @@ class DarwinAsmParser : public MCAsmParserExtension { unsigned TAA = 0, unsigned ImplicitAlign = 0, unsigned StubSize = 0); + SMLoc LastVersionMinDirective; + public: DarwinAsmParser() {} @@ -164,9 +169,14 @@ public: addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveTLV>(".tlv"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveIdent>(".ident"); + addDirectiveHandler<&DarwinAsmParser::parseVersionMin>( + ".watchos_version_min"); + addDirectiveHandler<&DarwinAsmParser::parseVersionMin>(".tvos_version_min"); addDirectiveHandler<&DarwinAsmParser::parseVersionMin>(".ios_version_min"); addDirectiveHandler<&DarwinAsmParser::parseVersionMin>( ".macosx_version_min"); + + LastVersionMinDirective = SMLoc(); } bool parseDirectiveDesc(StringRef, SMLoc); @@ -381,9 +391,8 @@ bool DarwinAsmParser::parseSectionSwitch(const char *Segment, // FIXME: Arch specific. bool isText = TAA & MachO::S_ATTR_PURE_INSTRUCTIONS; getStreamer().SwitchSection(getContext().getMachOSection( - Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); + Segment, Section, TAA, StubSize, + isText ? SectionKind::getText() : SectionKind::getData())); // Set the implicit alignment, if any. 
// @@ -579,12 +588,34 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) { if (!ErrorStr.empty()) return Error(Loc, ErrorStr.c_str()); + // Issue a warning if the target is not powerpc and Section is a *coal* section. + Triple TT = getParser().getContext().getObjectFileInfo()->getTargetTriple(); + Triple::ArchType ArchTy = TT.getArch(); + + if (ArchTy != Triple::ppc && ArchTy != Triple::ppc64) { + StringRef NonCoalSection = StringSwitch<StringRef>(Section) + .Case("__textcoal_nt", "__text") + .Case("__const_coal", "__const") + .Case("__datacoal_nt", "__data") + .Default(Section); + + if (!Section.equals(NonCoalSection)) { + StringRef SectionVal(Loc.getPointer()); + size_t B = SectionVal.find(',') + 1, E = SectionVal.find(',', B); + SMLoc BLoc = SMLoc::getFromPointer(SectionVal.data() + B); + SMLoc ELoc = SMLoc::getFromPointer(SectionVal.data() + E); + getParser().Warning(Loc, "section \"" + Section + "\" is deprecated", + SMRange(BLoc, ELoc)); + getParser().Note(Loc, "change section name to \"" + NonCoalSection + + "\"", SMRange(BLoc, ELoc)); + } + } + // FIXME: Arch specific. bool isText = Segment == "__TEXT"; // FIXME: Hack. getStreamer().SwitchSection(getContext().getMachOSection( - Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); + Segment, Section, TAA, StubSize, + isText ? SectionKind::getText() : SectionKind::getData())); return false; } @@ -636,17 +667,16 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { "environment variable unset."); // Open the secure log file if we haven't already. - raw_ostream *OS = getContext().getSecureLog(); + raw_fd_ostream *OS = getContext().getSecureLog(); if (!OS) { std::error_code EC; - OS = new raw_fd_ostream(SecureLogFile, EC, - sys::fs::F_Append | sys::fs::F_Text); - if (EC) { - delete OS; + auto NewOS = llvm::make_unique<raw_fd_ostream>( + SecureLogFile, EC, sys::fs::F_Append | sys::fs::F_Text); + if (EC) return Error(IDLoc, Twine("can't open secure log file: ") + SecureLogFile + " (" + EC.message() + ")"); - } - getContext().setSecureLog(OS); + OS = NewOS.get(); + getContext().setSecureLog(std::move(NewOS)); } // Write the message. @@ -867,9 +897,11 @@ bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) { /// parseVersionMin /// ::= .ios_version_min major,minor[,update] /// ::= .macosx_version_min major,minor[,update] -bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc) { +bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc) { int64_t Major = 0, Minor = 0, Update = 0; int Kind = StringSwitch<int>(Directive) + .Case(".watchos_version_min", MCVM_WatchOSVersionMin) + .Case(".tvos_version_min", MCVM_TvOSVersionMin) .Case(".ios_version_min", MCVM_IOSVersionMin) .Case(".macosx_version_min", MCVM_OSXVersionMin); // Get the major version number. 
@@ -902,6 +934,24 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc) { Lex(); } + const Triple &T = getContext().getObjectFileInfo()->getTargetTriple(); + Triple::OSType ExpectedOS = Triple::UnknownOS; + switch ((MCVersionMinType)Kind) { + case MCVM_WatchOSVersionMin: ExpectedOS = Triple::WatchOS; break; + case MCVM_TvOSVersionMin: ExpectedOS = Triple::TvOS; break; + case MCVM_IOSVersionMin: ExpectedOS = Triple::IOS; break; + case MCVM_OSXVersionMin: ExpectedOS = Triple::MacOSX; break; + } + if (T.getOS() != ExpectedOS) + Warning(Loc, Directive + " should only be used for " + + Triple::getOSTypeName(ExpectedOS) + " targets"); + + if (LastVersionMinDirective.isValid()) { + Warning(Loc, "overriding previous version_min directive"); + Note(LastVersionMinDirective, "previous definition is here"); + } + LastVersionMinDirective = Loc; + // We've parsed a correct version specifier, so send it to the streamer. getStreamer().EmitVersionMin((MCVersionMinType)Kind, Major, Minor, Update); diff --git a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 5f8a603..6cbcdec 100644 --- a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -52,8 +52,6 @@ public: addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); addDirectiveHandler< - &ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local"); - addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); addDirectiveHandler< @@ -81,8 +79,8 @@ public: // the best way for us to get access to it? bool ParseSectionDirectiveData(StringRef, SMLoc) { return ParseSectionSwitch(".data", ELF::SHT_PROGBITS, - ELF::SHF_WRITE |ELF::SHF_ALLOC, - SectionKind::getDataRel()); + ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getData()); } bool ParseSectionDirectiveText(StringRef, SMLoc) { return ParseSectionSwitch(".text", ELF::SHT_PROGBITS, @@ -113,9 +111,8 @@ public: } bool ParseSectionDirectiveDataRel(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getDataRel()); + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getData()); } bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel.ro", ELF::SHT_PROGBITS, @@ -123,17 +120,10 @@ public: ELF::SHF_WRITE, SectionKind::getReadOnlyWithRel()); } - bool ParseSectionDirectiveDataRelRoLocal(StringRef, SMLoc) { - return ParseSectionSwitch(".data.rel.ro.local", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getReadOnlyWithRelLocal()); - } bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) { return ParseSectionSwitch(".eh_frame", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getDataRel()); + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getData()); } bool ParseDirectivePushSection(StringRef, SMLoc); bool ParseDirectivePopSection(StringRef, SMLoc); diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp index 795cc85..e891bd2 100644 --- a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp +++ b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp @@ -12,8 +12,8 @@ using namespace llvm; -MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), - TokStart(nullptr), SkipSpace(true) { +MCAsmLexer::MCAsmLexer() : TokStart(nullptr), SkipSpace(true) { + 
CurTok.emplace_back(AsmToken::Error, StringRef()); } MCAsmLexer::~MCAsmLexer() { diff --git a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp index 60a3a3b..4e4b478 100644 --- a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -7,13 +7,26 @@ // //===----------------------------------------------------------------------===// +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCTargetAsmParser.h" using namespace llvm; -MCTargetAsmParser::MCTargetAsmParser() - : AvailableFeatures(0), ParsingInlineAsm(false) +MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions, + const MCSubtargetInfo &STI) + : AvailableFeatures(0), ParsingInlineAsm(false), MCOptions(MCOptions), + STI(&STI) { } MCTargetAsmParser::~MCTargetAsmParser() { } + +MCSubtargetInfo &MCTargetAsmParser::copySTI() { + MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI()); + STI = &STICopy; + return STICopy; +} + +const MCSubtargetInfo &MCTargetAsmParser::getSTI() const { + return *STI; +} diff --git a/contrib/llvm/lib/MC/MCSection.cpp b/contrib/llvm/lib/MC/MCSection.cpp index 9152f2b..dbd544a 100644 --- a/contrib/llvm/lib/MC/MCSection.cpp +++ b/contrib/llvm/lib/MC/MCSection.cpp @@ -21,7 +21,7 @@ using namespace llvm; MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin) : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false), - IsRegistered(false), Variant(V), Kind(K) {} + IsRegistered(false), DummyFragment(this), Variant(V), Kind(K) {} MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { if (!End) @@ -72,7 +72,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) { if (MI == SubsectionFragmentMap.end()) IP = end(); else - IP = MI->second; + IP = MI->second->getIterator(); if (!ExactMatch && Subsection != 0) { // The GNU as documentation claims that subsections have an alignment of 4, // although this appears not to be the case. diff --git a/contrib/llvm/lib/MC/MCSectionCOFF.cpp b/contrib/llvm/lib/MC/MCSectionCOFF.cpp index ce0b4f5..b8373f4 100644 --- a/contrib/llvm/lib/MC/MCSectionCOFF.cpp +++ b/contrib/llvm/lib/MC/MCSectionCOFF.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/contrib/llvm/lib/MC/MCSectionELF.cpp b/contrib/llvm/lib/MC/MCSectionELF.cpp index b4448d7..5a0bb7f 100644 --- a/contrib/llvm/lib/MC/MCSectionELF.cpp +++ b/contrib/llvm/lib/MC/MCSectionELF.cpp @@ -27,12 +27,7 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, if (isUnique()) return false; - // FIXME: Does .section .bss/.data/.text work everywhere?? 
- if (Name == ".text" || Name == ".data" || - (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS())) - return true; - - return false; + return MAI.shouldOmitSectionDirective(Name); } static void printName(raw_ostream &OS, StringRef Name) { @@ -138,6 +133,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << "note"; else if (Type == ELF::SHT_PROGBITS) OS << "progbits"; + else if (Type == ELF::SHT_X86_64_UNWIND) + OS << "unwind"; if (EntrySize) { assert(Flags & ELF::SHF_MERGE); diff --git a/contrib/llvm/lib/MC/MCSectionMachO.cpp b/contrib/llvm/lib/MC/MCSectionMachO.cpp index c9f1591..879c6e5 100644 --- a/contrib/llvm/lib/MC/MCSectionMachO.cpp +++ b/contrib/llvm/lib/MC/MCSectionMachO.cpp @@ -177,7 +177,7 @@ std::string MCSectionMachO::ParseSectionSpecifier(StringRef Spec, // In. TAAParsed = false; SmallVector<StringRef, 5> SplitSpec; - Spec.split(SplitSpec, ","); + Spec.split(SplitSpec, ','); // Remove leading and trailing whitespace. auto GetEmptyOrTrim = [&SplitSpec](size_t Idx) -> StringRef { return SplitSpec.size() > Idx ? SplitSpec[Idx].trim() : StringRef(); @@ -235,7 +235,7 @@ std::string MCSectionMachO::ParseSectionSpecifier(StringRef Spec, // In. // The attribute list is a '+' separated list of attributes. SmallVector<StringRef, 1> SectionAttrs; - Attrs.split(SectionAttrs, "+", /*MaxSplit=*/-1, /*KeepEmpty=*/false); + Attrs.split(SectionAttrs, '+', /*MaxSplit=*/-1, /*KeepEmpty=*/false); for (StringRef &SectionAttr : SectionAttrs) { auto AttrDescriptorI = std::find_if( diff --git a/contrib/llvm/lib/MC/MCStreamer.cpp b/contrib/llvm/lib/MC/MCStreamer.cpp index 7fbbbd9..836b405 100644 --- a/contrib/llvm/lib/MC/MCStreamer.cpp +++ b/contrib/llvm/lib/MC/MCStreamer.cpp @@ -107,8 +107,7 @@ void MCStreamer::EmitSLEB128IntValue(int64_t Value) { EmitBytes(OSE.str()); } -void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { +void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, SMLoc Loc) { EmitValueImpl(Value, Size, Loc); } @@ -189,11 +188,9 @@ void MCStreamer::InitSections(bool NoExecStack) { SwitchSection(getContext().getObjectFileInfo()->getTextSection()); } -void MCStreamer::AssignSection(MCSymbol *Symbol, MCSection *Section) { - if (Section) - Symbol->setSection(*Section); - else - Symbol->setUndefined(); +void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) { + assert(Fragment); + Symbol->setFragment(Fragment); // As we emit symbols into a section, track the order so that they can // be sorted upon later. Zero is reserved to mean 'unemitted'. 
@@ -203,7 +200,8 @@ void MCStreamer::AssignSection(MCSymbol *Symbol, MCSection *Section) { void MCStreamer::EmitLabel(MCSymbol *Symbol) { assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); assert(getCurrentSection().first && "Cannot emit before setting section!"); - AssignSection(Symbol, getCurrentSection().first); + assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!"); + Symbol->setFragment(&getCurrentSectionOnly()->getDummyFragment()); MCTargetStreamer *TS = getTargetStreamer(); if (TS) @@ -361,6 +359,14 @@ void MCStreamer::EmitCFIEscape(StringRef Values) { CurFrame->Instructions.push_back(Instruction); } +void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) { + MCSymbol *Label = EmitCFICommon(); + MCCFIInstruction Instruction = + MCCFIInstruction::createGnuArgsSize(Label, Size); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); + CurFrame->Instructions.push_back(Instruction); +} + void MCStreamer::EmitCFISignalFrame() { EnsureValidDwarfFrame(); MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); @@ -467,6 +473,8 @@ void MCStreamer::EmitWinEHHandlerData() { report_fatal_error("Chained unwind areas can't have handlers!"); } +void MCStreamer::EmitSyntaxDirective() {} + void MCStreamer::EmitWinCFIPushReg(unsigned Register) { EnsureValidWinFrameInfo(); @@ -679,8 +687,7 @@ void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, void MCStreamer::ChangeSection(MCSection *, const MCExpr *) {} void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {} void MCStreamer::EmitBytes(StringRef Data) {} -void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { +void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { visitUsedExpr(*Value); } void MCStreamer::EmitULEB128Value(const MCExpr *Value) {} @@ -690,9 +697,7 @@ void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, unsigned MaxBytesToEmit) {} void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit) {} -bool MCStreamer::EmitValueToOffset(const MCExpr *Offset, unsigned char Value) { - return false; -} +void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) {} void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} void MCStreamer::EmitBundleLock(bool AlignToEnd) {} void MCStreamer::FinishImpl() {} diff --git a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp index 9210cf5..1b59250 100644 --- a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp +++ b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp @@ -32,8 +32,8 @@ void MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) { CPUSchedModel = &MCSchedModel::GetDefaultSchedModel(); } -void MCSubtargetInfo::setDefaultFeatures(StringRef CPU) { - FeatureBits = getFeatures(CPU, "", ProcDesc, ProcFeatures); +void MCSubtargetInfo::setDefaultFeatures(StringRef CPU, StringRef FS) { + FeatureBits = getFeatures(CPU, FS, ProcDesc, ProcFeatures); } MCSubtargetInfo::MCSubtargetInfo( @@ -63,32 +63,30 @@ FeatureBitset MCSubtargetInfo::ToggleFeature(const FeatureBitset &FB) { /// ToggleFeature - Toggle a feature and returns the re-computed feature /// bits. This version will also change all implied bits. 
FeatureBitset MCSubtargetInfo::ToggleFeature(StringRef FS) { - SubtargetFeatures Features; - FeatureBits = Features.ToggleFeature(FeatureBits, FS, ProcFeatures); + SubtargetFeatures::ToggleFeature(FeatureBits, FS, ProcFeatures); return FeatureBits; } FeatureBitset MCSubtargetInfo::ApplyFeatureFlag(StringRef FS) { - SubtargetFeatures Features; - FeatureBits = Features.ApplyFeatureFlag(FeatureBits, FS, ProcFeatures); + SubtargetFeatures::ApplyFeatureFlag(FeatureBits, FS, ProcFeatures); return FeatureBits; } const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const { assert(ProcSchedModels && "Processor machine model not available!"); - unsigned NumProcs = ProcDesc.size(); -#ifndef NDEBUG - for (size_t i = 1; i < NumProcs; i++) { - assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 && - "Processor machine model table is not sorted"); - } -#endif + ArrayRef<SubtargetInfoKV> SchedModels(ProcSchedModels, ProcDesc.size()); + + assert(std::is_sorted(SchedModels.begin(), SchedModels.end(), + [](const SubtargetInfoKV &LHS, const SubtargetInfoKV &RHS) { + return strcmp(LHS.Key, RHS.Key) < 0; + }) && + "Processor machine model table is not sorted"); // Find entry - const SubtargetInfoKV *Found = - std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU); - if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) { + auto Found = + std::lower_bound(SchedModels.begin(), SchedModels.end(), CPU); + if (Found == SchedModels.end() || StringRef(Found->Key) != CPU) { if (CPU != "help") // Don't error if the user asked for help. errs() << "'" << CPU << "' is not a recognized processor for this target" diff --git a/contrib/llvm/lib/MC/MCSymbol.cpp b/contrib/llvm/lib/MC/MCSymbol.cpp index 125380a..ab3b8eb 100644 --- a/contrib/llvm/lib/MC/MCSymbol.cpp +++ b/contrib/llvm/lib/MC/MCSymbol.cpp @@ -16,8 +16,11 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// Sentinel value for the absolute pseudo section. -MCSection *MCSymbol::AbsolutePseudoSection = reinterpret_cast<MCSection *>(1); +// Only the address of this fragment is ever actually used. +static MCDummyFragment SentinelFragment(nullptr); + +// Sentinel value for the absolute pseudo fragment. 
+MCFragment *MCSymbol::AbsolutePseudoFragment = &SentinelFragment; void *MCSymbol::operator new(size_t s, const StringMapEntry<bool> *Name, MCContext &Ctx) { diff --git a/contrib/llvm/lib/MC/MCTargetOptions.cpp b/contrib/llvm/lib/MC/MCTargetOptions.cpp index 1258d9e..4656227 100644 --- a/contrib/llvm/lib/MC/MCTargetOptions.cpp +++ b/contrib/llvm/lib/MC/MCTargetOptions.cpp @@ -14,9 +14,10 @@ namespace llvm { MCTargetOptions::MCTargetOptions() : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false), - MCFatalWarnings(false), MCSaveTempLabels(false), - MCUseDwarfDirectory(false), ShowMCEncoding(false), ShowMCInst(false), - AsmVerbose(false), DwarfVersion(0), ABIName() {} + MCFatalWarnings(false), MCNoWarn(false), MCSaveTempLabels(false), + MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false), + ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + DwarfVersion(0), ABIName() {} StringRef MCTargetOptions::getABIName() const { return ABIName; diff --git a/contrib/llvm/lib/MC/MCWinEH.cpp b/contrib/llvm/lib/MC/MCWinEH.cpp index d5d9ead..83af203 100644 --- a/contrib/llvm/lib/MC/MCWinEH.cpp +++ b/contrib/llvm/lib/MC/MCWinEH.cpp @@ -49,10 +49,10 @@ static MCSection *getUnwindInfoSection(StringRef SecName, if (CodeSecName.startswith(".text$")) CodeSecName = CodeSecName.substr(6); - return Context.getCOFFSection( - (SecName + Twine('$') + CodeSecName).str(), - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + return Context.getCOFFSection((SecName + Twine('$') + CodeSecName).str(), + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getData()); } } diff --git a/contrib/llvm/lib/MC/MachObjectWriter.cpp b/contrib/llvm/lib/MC/MachObjectWriter.cpp index 8ce6127..324385f 100644 --- a/contrib/llvm/lib/MC/MachObjectWriter.cpp +++ b/contrib/llvm/lib/MC/MachObjectWriter.cpp @@ -78,7 +78,6 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, dyn_cast<const MCConstantExpr>(S.getVariableValue())) return C->getValue(); - MCValue Target; if (!S.getVariableValue()->evaluateAsRelocatable(Target, &Layout, nullptr)) report_fatal_error("unable to evaluate offset for variable '" + @@ -117,7 +116,8 @@ uint64_t MachObjectWriter::getPaddingSize(const MCSection *Sec, return OffsetToAlignment(EndAddr, NextSec.getAlignment()); } -void MachObjectWriter::writeHeader(unsigned NumLoadCommands, +void MachObjectWriter::writeHeader(MachO::HeaderFileType Type, + unsigned NumLoadCommands, unsigned LoadCommandsSize, bool SubsectionsViaSymbols) { uint32_t Flags = 0; @@ -128,7 +128,7 @@ void MachObjectWriter::writeHeader(unsigned NumLoadCommands, // struct mach_header (28 bytes) or // struct mach_header_64 (32 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC); @@ -136,29 +136,30 @@ void MachObjectWriter::writeHeader(unsigned NumLoadCommands, write32(TargetObjectWriter->getCPUType()); write32(TargetObjectWriter->getCPUSubtype()); - write32(MachO::MH_OBJECT); + write32(Type); write32(NumLoadCommands); write32(LoadCommandsSize); write32(Flags); if (is64Bit()) write32(0); // reserved - assert(OS.tell() - Start == - (is64Bit()?sizeof(MachO::mach_header_64): sizeof(MachO::mach_header))); + assert( + getStream().tell() - Start == + (is64Bit() ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header))); } /// writeSegmentLoadCommand - Write a segment load command. 
/// /// \param NumSections The number of sections in this segment. /// \param SectionDataSize The total size of the sections. -void MachObjectWriter::writeSegmentLoadCommand(unsigned NumSections, - uint64_t VMSize, - uint64_t SectionDataStartOffset, - uint64_t SectionDataSize) { +void MachObjectWriter::writeSegmentLoadCommand( + StringRef Name, unsigned NumSections, uint64_t VMAddr, uint64_t VMSize, + uint64_t SectionDataStartOffset, uint64_t SectionDataSize, uint32_t MaxProt, + uint32_t InitProt) { // struct segment_command (56 bytes) or // struct segment_command_64 (72 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; unsigned SegmentLoadCommandSize = @@ -169,31 +170,32 @@ void MachObjectWriter::writeSegmentLoadCommand(unsigned NumSections, NumSections * (is64Bit() ? sizeof(MachO::section_64) : sizeof(MachO::section))); - writeBytes("", 16); + assert(Name.size() <= 16); + writeBytes(Name, 16); if (is64Bit()) { - write64(0); // vmaddr + write64(VMAddr); // vmaddr write64(VMSize); // vmsize write64(SectionDataStartOffset); // file offset write64(SectionDataSize); // file size } else { - write32(0); // vmaddr + write32(VMAddr); // vmaddr write32(VMSize); // vmsize write32(SectionDataStartOffset); // file offset write32(SectionDataSize); // file size } // maxprot - write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); + write32(MaxProt); // initprot - write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); + write32(InitProt); write32(NumSections); write32(0); // flags - assert(OS.tell() - Start == SegmentLoadCommandSize); + assert(getStream().tell() - Start == SegmentLoadCommandSize); } -void MachObjectWriter::writeSection(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCSection &Sec, uint64_t FileOffset, +void MachObjectWriter::writeSection(const MCAsmLayout &Layout, + const MCSection &Sec, uint64_t VMAddr, + uint64_t FileOffset, unsigned Flags, uint64_t RelocationsStart, unsigned NumRelocations) { uint64_t SectionSize = Layout.getSectionAddressSize(&Sec); @@ -208,24 +210,20 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, // struct section (68 bytes) or // struct section_64 (80 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; writeBytes(Section.getSectionName(), 16); writeBytes(Section.getSegmentName(), 16); if (is64Bit()) { - write64(getSectionAddress(&Sec)); // address + write64(VMAddr); // address write64(SectionSize); // size } else { - write32(getSectionAddress(&Sec)); // address + write32(VMAddr); // address write32(SectionSize); // size } write32(FileOffset); - unsigned Flags = Section.getTypeAndAttributes(); - if (Section.hasInstructions()) - Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; - assert(isPowerOf2_32(Section.getAlignment()) && "Invalid alignment!"); write32(Log2_32(Section.getAlignment())); write32(NumRelocations ? RelocationsStart : 0); @@ -236,8 +234,8 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, if (is64Bit()) write32(0); // reserved3 - assert(OS.tell() - Start == (is64Bit() ? sizeof(MachO::section_64) : - sizeof(MachO::section))); + assert(getStream().tell() - Start == + (is64Bit() ? 
sizeof(MachO::section_64) : sizeof(MachO::section))); } void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, @@ -246,7 +244,7 @@ void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, uint32_t StringTableSize) { // struct symtab_command (24 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_SYMTAB); @@ -256,7 +254,7 @@ void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, write32(StringTableOffset); write32(StringTableSize); - assert(OS.tell() - Start == sizeof(MachO::symtab_command)); + assert(getStream().tell() - Start == sizeof(MachO::symtab_command)); } void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, @@ -269,7 +267,7 @@ void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, uint32_t NumIndirectSymbols) { // struct dysymtab_command (80 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_DYSYMTAB); @@ -293,7 +291,7 @@ void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, write32(0); // locreloff write32(0); // nlocrel - assert(OS.tell() - Start == sizeof(MachO::dysymtab_command)); + assert(getStream().tell() - Start == sizeof(MachO::dysymtab_command)); } MachObjectWriter::MachSymbolData * @@ -389,7 +387,7 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD, void MachObjectWriter::writeLinkeditLoadCommand(uint32_t Type, uint32_t DataOffset, uint32_t DataSize) { - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(Type); @@ -397,7 +395,7 @@ void MachObjectWriter::writeLinkeditLoadCommand(uint32_t Type, write32(DataOffset); write32(DataSize); - assert(OS.tell() - Start == sizeof(MachO::linkedit_data_command)); + assert(getStream().tell() - Start == sizeof(MachO::linkedit_data_command)); } static unsigned ComputeLinkerOptionsLoadCommandSize( @@ -413,7 +411,7 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( const std::vector<std::string> &Options) { unsigned Size = ComputeLinkerOptionsLoadCommandSize(Options, is64Bit()); - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_LINKER_OPTION); @@ -429,7 +427,7 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( // Pad to a multiple of the pointer size. writeBytes("", OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4)); - assert(OS.tell() - Start == Size); + assert(getStream().tell() - Start == Size); } void MachObjectWriter::recordRelocation(MCAssembler &Asm, @@ -458,9 +456,9 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) { - MCSymbol &Symbol = *it->Symbol; - report_fatal_error("indirect symbol '" + Symbol.getName() + - "' not in a symbol pointer or stub section"); + MCSymbol &Symbol = *it->Symbol; + report_fatal_error("indirect symbol '" + Symbol.getName() + + "' not in a symbol pointer or stub section"); } } @@ -522,7 +520,7 @@ void MachObjectWriter::computeSymbolTable( StringTable.add(Symbol.getName()); } - StringTable.finalize(StringTableBuilder::MachO); + StringTable.finalize(); // Build the symbol arrays but only for non-local symbols. 
// @@ -628,6 +626,18 @@ void MachObjectWriter::executePostLayoutBinding(MCAssembler &Asm, } bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( + const MCAssembler &Asm, const MCSymbol &A, const MCSymbol &B, + bool InSet) const { + // FIXME: We don't handle things like + // foo = . + // creating atoms. + if (A.isVariable() || B.isVariable()) + return false; + return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, A, B, + InSet); +} + +bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( const MCAssembler &Asm, const MCSymbol &SymA, const MCFragment &FB, bool InSet, bool IsPCRel) const { if (InSet) @@ -746,7 +756,7 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, ++NumLoadCommands; LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(Option, is64Bit()); } - + // Compute the total size of the section data, as well as its file size and vm // size. uint64_t SectionDataStart = (is64Bit() ? sizeof(MachO::mach_header_64) : @@ -776,18 +786,25 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, SectionDataFileSize += SectionDataPadding; // Write the prolog, starting with the header and load command... - writeHeader(NumLoadCommands, LoadCommandsSize, + writeHeader(MachO::MH_OBJECT, NumLoadCommands, LoadCommandsSize, Asm.getSubsectionsViaSymbols()); - writeSegmentLoadCommand(NumSections, VMSize, - SectionDataStart, SectionDataSize); + uint32_t Prot = + MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE; + writeSegmentLoadCommand("", NumSections, 0, VMSize, SectionDataStart, + SectionDataSize, Prot, Prot); // ... and then the section headers. uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize; - for (const MCSection &Sec : Asm) { + for (const MCSection &Section : Asm) { + const auto &Sec = cast<MCSectionMachO>(Section); std::vector<RelAndSymbol> &Relocs = Relocations[&Sec]; unsigned NumRelocs = Relocs.size(); uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec); - writeSection(Asm, Layout, Sec, SectionStart, RelocTableEnd, NumRelocs); + unsigned Flags = Sec.getTypeAndAttributes(); + if (Sec.hasInstructions()) + Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; + writeSection(Layout, Sec, getSectionAddress(&Sec), SectionStart, Flags, + RelocTableEnd, NumRelocs); RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info); } @@ -798,8 +815,22 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, assert(VersionInfo.Major < 65536 && "unencodable major target version"); uint32_t EncodedVersion = VersionInfo.Update | (VersionInfo.Minor << 8) | (VersionInfo.Major << 16); - write32(VersionInfo.Kind == MCVM_OSXVersionMin ? MachO::LC_VERSION_MIN_MACOSX : - MachO::LC_VERSION_MIN_IPHONEOS); + MachO::LoadCommandType LCType; + switch (VersionInfo.Kind) { + case MCVM_OSXVersionMin: + LCType = MachO::LC_VERSION_MIN_MACOSX; + break; + case MCVM_IOSVersionMin: + LCType = MachO::LC_VERSION_MIN_IPHONEOS; + break; + case MCVM_TvOSVersionMin: + LCType = MachO::LC_VERSION_MIN_TVOS; + break; + case MCVM_WatchOSVersionMin: + LCType = MachO::LC_VERSION_MIN_WATCHOS; + break; + } + write32(LCType); write32(sizeof(MachO::version_min_command)); write32(EncodedVersion); write32(0); // reserved. @@ -901,12 +932,12 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, // Write out the loh commands, if there is one. if (LOHSize) { #ifndef NDEBUG - unsigned Start = OS.tell(); + unsigned Start = getStream().tell(); #endif Asm.getLOHContainer().emit(*this, Layout); // Pad to a multiple of the pointer size. 
writeBytes("", OffsetToAlignment(LOHRawSize, is64Bit() ? 8 : 4)); - assert(OS.tell() - Start == LOHSize); + assert(getStream().tell() - Start == LOHSize); } // Write the symbol table data, if used. @@ -942,7 +973,7 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, writeNlist(Entry, Layout); // Write the string table. - OS << StringTable.data(); + getStream() << StringTable.data(); } } diff --git a/contrib/llvm/lib/MC/StringTableBuilder.cpp b/contrib/llvm/lib/MC/StringTableBuilder.cpp index 9de9363..80e5522 100644 --- a/contrib/llvm/lib/MC/StringTableBuilder.cpp +++ b/contrib/llvm/lib/MC/StringTableBuilder.cpp @@ -8,35 +8,71 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/StringTableBuilder.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Endian.h" +#include <vector> + using namespace llvm; -static bool compareBySuffix(StringRef a, StringRef b) { - size_t sizeA = a.size(); - size_t sizeB = b.size(); - size_t len = std::min(sizeA, sizeB); - for (size_t i = 0; i < len; ++i) { - char ca = a[sizeA - i - 1]; - char cb = b[sizeB - i - 1]; - if (ca != cb) - return ca > cb; +StringTableBuilder::StringTableBuilder(Kind K) : K(K) {} + +typedef std::pair<StringRef, size_t> StringPair; + +// Returns the character at Pos from end of a string. +static int charTailAt(StringPair *P, size_t Pos) { + StringRef S = P->first; + if (Pos >= S.size()) + return -1; + return (unsigned char)S[S.size() - Pos - 1]; +} + +// Three-way radix quicksort. This is much faster than std::sort with strcmp +// because it does not compare characters that we already know the same. +static void multikey_qsort(StringPair **Begin, StringPair **End, int Pos) { +tailcall: + if (End - Begin <= 1) + return; + + // Partition items. Items in [Begin, P) are greater than the pivot, + // [P, Q) are the same as the pivot, and [Q, End) are less than the pivot. + int Pivot = charTailAt(*Begin, Pos); + StringPair **P = Begin; + StringPair **Q = End; + for (StringPair **R = Begin + 1; R < Q;) { + int C = charTailAt(*R, Pos); + if (C > Pivot) + std::swap(*P++, *R++); + else if (C < Pivot) + std::swap(*--Q, *R); + else + R++; + } + + multikey_qsort(Begin, P, Pos); + multikey_qsort(Q, End, Pos); + if (Pivot != -1) { + // qsort(P, Q, Pos + 1), but with tail call optimization. + Begin = P; + End = Q; + ++Pos; + goto tailcall; } - return sizeA > sizeB; } -void StringTableBuilder::finalize(Kind kind) { - SmallVector<StringRef, 8> Strings; +void StringTableBuilder::finalize() { + std::vector<std::pair<StringRef, size_t> *> Strings; Strings.reserve(StringIndexMap.size()); + for (std::pair<StringRef, size_t> &P : StringIndexMap) + Strings.push_back(&P); - for (auto i = StringIndexMap.begin(), e = StringIndexMap.end(); i != e; ++i) - Strings.push_back(i->getKey()); - - std::sort(Strings.begin(), Strings.end(), compareBySuffix); + if (!Strings.empty()) + multikey_qsort(&Strings[0], &Strings[0] + Strings.size(), 0); - switch (kind) { + switch (K) { + case RAW: + break; case ELF: case MachO: // Start the table with a NUL byte. 
@@ -49,22 +85,25 @@ void StringTableBuilder::finalize(Kind kind) { } StringRef Previous; - for (StringRef s : Strings) { - if (kind == WinCOFF) - assert(s.size() > COFF::NameSize && "Short string in COFF string table!"); + for (std::pair<StringRef, size_t> *P : Strings) { + StringRef S = P->first; + if (K == WinCOFF) + assert(S.size() > COFF::NameSize && "Short string in COFF string table!"); - if (Previous.endswith(s)) { - StringIndexMap[s] = StringTable.size() - 1 - s.size(); + if (Previous.endswith(S)) { + P->second = StringTable.size() - S.size() - (K != RAW); continue; } - StringIndexMap[s] = StringTable.size(); - StringTable += s; - StringTable += '\x00'; - Previous = s; + P->second = StringTable.size(); + StringTable += S; + if (K != RAW) + StringTable += '\x00'; + Previous = S; } - switch (kind) { + switch (K) { + case RAW: case ELF: break; case MachO: @@ -75,14 +114,31 @@ void StringTableBuilder::finalize(Kind kind) { case WinCOFF: // Write the table size in the first word. assert(StringTable.size() <= std::numeric_limits<uint32_t>::max()); - uint32_t size = static_cast<uint32_t>(StringTable.size()); + uint32_t Size = static_cast<uint32_t>(StringTable.size()); support::endian::write<uint32_t, support::little, support::unaligned>( - StringTable.data(), size); + StringTable.data(), Size); break; } + + Size = StringTable.size(); } void StringTableBuilder::clear() { StringTable.clear(); StringIndexMap.clear(); } + +size_t StringTableBuilder::getOffset(StringRef S) const { + assert(isFinalized()); + auto I = StringIndexMap.find(S); + assert(I != StringIndexMap.end() && "String is not in table!"); + return I->second; +} + +size_t StringTableBuilder::add(StringRef S) { + assert(!isFinalized()); + auto P = StringIndexMap.insert(std::make_pair(S, Size)); + if (P.second) + Size += S.size() + (K != RAW); + return P.first->second; +} diff --git a/contrib/llvm/lib/MC/SubtargetFeature.cpp b/contrib/llvm/lib/MC/SubtargetFeature.cpp index 76574e9..7cce0fe 100644 --- a/contrib/llvm/lib/MC/SubtargetFeature.cpp +++ b/contrib/llvm/lib/MC/SubtargetFeature.cpp @@ -56,7 +56,7 @@ static inline bool isEnabled(StringRef Feature) { /// static void Split(std::vector<std::string> &V, StringRef S) { SmallVector<StringRef, 3> Tmp; - S.split(Tmp, ",", -1, false /* KeepEmpty */); + S.split(Tmp, ',', -1, false /* KeepEmpty */); V.assign(Tmp.begin(), Tmp.end()); } @@ -160,10 +160,9 @@ void ClearImpliedBits(FeatureBitset &Bits, } } -/// ToggleFeature - Toggle a feature and returns the newly updated feature -/// bits. -FeatureBitset -SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature, +/// ToggleFeature - Toggle a feature and update the feature bits. +void +SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature, ArrayRef<SubtargetFeatureKV> FeatureTable) { // Find feature in table. 
@@ -186,12 +185,9 @@ SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature, << "' is not a recognized feature for this target" << " (ignoring feature)\n"; } - - return Bits; } -FeatureBitset -SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, +void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature, ArrayRef<SubtargetFeatureKV> FeatureTable) { assert(hasFlag(Feature)); @@ -203,7 +199,7 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, if (FeatureEntry) { // Enable/disable feature in bits if (isEnabled(Feature)) { - Bits |= FeatureEntry->Value; + Bits |= FeatureEntry->Value; // For each feature that this implies, set it. SetImpliedBits(Bits, FeatureEntry, FeatureTable); @@ -218,8 +214,6 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, << "' is not a recognized feature for this target" << " (ignoring feature)\n"; } - - return Bits; } @@ -234,14 +228,10 @@ SubtargetFeatures::getFeatureBits(StringRef CPU, return FeatureBitset(); #ifndef NDEBUG - for (size_t i = 1, e = CPUTable.size(); i != e; ++i) { - assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 && - "CPU table is not sorted"); - } - for (size_t i = 1, e = FeatureTable.size(); i != e; ++i) { - assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 && - "CPU features table is not sorted"); - } + assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) && + "CPU table is not sorted"); + assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) && + "CPU features table is not sorted"); #endif // Resulting bits FeatureBitset Bits; @@ -277,7 +267,7 @@ SubtargetFeatures::getFeatureBits(StringRef CPU, if (Feature == "+help") Help(CPUTable, FeatureTable); - Bits = ApplyFeatureFlag(Bits, Feature, FeatureTable); + ApplyFeatureFlag(Bits, Feature, FeatureTable); } return Bits; diff --git a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp index 56ef1c7..a76cbdb 100644 --- a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/Config/config.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -32,8 +33,10 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/JamCRC.h" #include "llvm/Support/TimeValue.h" #include <cstdio> +#include <ctime> using namespace llvm; @@ -76,8 +79,6 @@ public: COFFSymbol(StringRef name); void set_name_offset(uint32_t Offset); - bool should_keep() const; - int64_t getIndex() const { return Index; } void setIndex(int Value) { Index = Value; @@ -125,7 +126,7 @@ public: COFF::header Header; sections Sections; symbols Symbols; - StringTableBuilder Strings; + StringTableBuilder Strings{StringTableBuilder::WinCOFF}; // Maps used during object file creation. section_map SectionMap; @@ -160,8 +161,6 @@ public: void SetSymbolName(COFFSymbol &S); void SetSectionName(COFFSection &S); - bool ExportSymbol(const MCSymbol &Symbol, MCAssembler &Asm); - bool IsPhysicalSection(COFFSection *S); // Entity writing methods. 
@@ -215,38 +214,6 @@ void COFFSymbol::set_name_offset(uint32_t Offset) { write_uint32_le(Data.Name + 4, Offset); } -/// logic to decide if the symbol should be reported in the symbol table -bool COFFSymbol::should_keep() const { - // no section means its external, keep it - if (!Section) - return true; - - // if it has relocations pointing at it, keep it - if (Relocations > 0) { - assert(Section->Number != -1 && "Sections with relocations must be real!"); - return true; - } - - // if this is a safeseh handler, keep it - if (MC && (cast<MCSymbolCOFF>(MC)->isSafeSEH())) - return true; - - // if the section its in is being droped, drop it - if (Section->Number == -1) - return false; - - // if it is the section symbol, keep it - if (Section->Symbol == this) - return true; - - // if its temporary, drop it - if (MC && MC->isTemporary()) - return false; - - // otherwise, keep it - return true; -} - //------------------------------------------------------------------------------ // Section class implementation @@ -392,7 +359,6 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler, const MCAsmLayout &Layout) { COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol); - SymbolMap[&Symbol] = coff_symbol; if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) { coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; @@ -515,25 +481,6 @@ void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) { std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size()); } -bool WinCOFFObjectWriter::ExportSymbol(const MCSymbol &Symbol, - MCAssembler &Asm) { - // This doesn't seem to be right. Strings referred to from the .data section - // need symbols so they can be linked to code in the .text section right? - - // return Asm.isSymbolLinkerVisible(Symbol); - - // Non-temporary labels should always be visible to the linker. - if (!Symbol.isTemporary()) - return true; - - // Temporary variable symbols are invisible. - if (Symbol.isVariable()) - return false; - - // Absolute temporary labels are never visible. - return !Symbol.isAbsolute(); -} - bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) { return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0; @@ -663,7 +610,7 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, defineSection(static_cast<const MCSectionCOFF &>(Section)); for (const MCSymbol &Symbol : Asm.symbols()) - if (ExportSymbol(Symbol, Asm)) + if (!Symbol.isTemporary()) DefineSymbol(Symbol, Asm, Layout); } @@ -674,7 +621,8 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( // thunk to implement their /INCREMENTAL feature. Make sure we don't optimize // away any relocations to functions. 
uint16_t Type = cast<MCSymbolCOFF>(SymA).getType(); - if ((Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION) + if (Asm.isIncrementalLinkerCompatible() && + (Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION) return false; return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB, InSet, IsPCRel); @@ -702,41 +650,49 @@ void WinCOFFObjectWriter::recordRelocation( const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) { assert(Target.getSymA() && "Relocation must reference a symbol!"); - const MCSymbol &Symbol = Target.getSymA()->getSymbol(); - const MCSymbol &A = Symbol; - if (!A.isRegistered()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + const MCSymbol &A = Target.getSymA()->getSymbol(); + if (!A.isRegistered()) { + Asm.getContext().reportError(Fixup.getLoc(), Twine("symbol '") + A.getName() + "' can not be undefined"); + return; + } + if (A.isTemporary() && A.isUndefined()) { + Asm.getContext().reportError(Fixup.getLoc(), + Twine("assembler label '") + A.getName() + + "' can not be undefined"); + return; + } MCSection *Section = Fragment->getParent(); // Mark this symbol as requiring an entry in the symbol table. assert(SectionMap.find(Section) != SectionMap.end() && "Section must already have been defined in executePostLayoutBinding!"); - assert(SymbolMap.find(&A) != SymbolMap.end() && - "Symbol must already have been defined in executePostLayoutBinding!"); COFFSection *coff_section = SectionMap[Section]; - COFFSymbol *coff_symbol = SymbolMap[&A]; const MCSymbolRefExpr *SymB = Target.getSymB(); bool CrossSection = false; if (SymB) { const MCSymbol *B = &SymB->getSymbol(); - if (!B->getFragment()) - Asm.getContext().reportFatalError( + if (!B->getFragment()) { + Asm.getContext().reportError( Fixup.getLoc(), Twine("symbol '") + B->getName() + "' can not be undefined in a subtraction expression"); + return; + } - if (!A.getFragment()) - Asm.getContext().reportFatalError( + if (!A.getFragment()) { + Asm.getContext().reportError( Fixup.getLoc(), - Twine("symbol '") + Symbol.getName() + + Twine("symbol '") + A.getName() + "' can not be undefined in a subtraction expression"); + return; + } - CrossSection = &Symbol.getSection() != &B->getSection(); + CrossSection = &A.getSection() != &B->getSection(); // Offset of the symbol in the section int64_t OffsetOfB = Layout.getSymbolOffset(*B); @@ -765,12 +721,19 @@ void WinCOFFObjectWriter::recordRelocation( Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment); // Turn relocations for temporary symbols into section relocations. - if (coff_symbol->MC->isTemporary() || CrossSection) { - Reloc.Symb = coff_symbol->Section->Symbol; - FixedValue += Layout.getFragmentOffset(coff_symbol->MC->getFragment()) + - coff_symbol->MC->getOffset(); - } else - Reloc.Symb = coff_symbol; + if (A.isTemporary() || CrossSection) { + MCSection *TargetSection = &A.getSection(); + assert( + SectionMap.find(TargetSection) != SectionMap.end() && + "Section must already have been defined in executePostLayoutBinding!"); + Reloc.Symb = SectionMap[TargetSection]->Symbol; + FixedValue += Layout.getSymbolOffset(A); + } else { + assert( + SymbolMap.find(&A) != SymbolMap.end() && + "Symbol must already have been defined in executePostLayoutBinding!"); + Reloc.Symb = SymbolMap[&A]; + } ++Reloc.Symb->Relocations; @@ -884,14 +847,10 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, // Update section number & offset for symbols that have them. 
if (Symbol->Section) Symbol->Data.SectionNumber = Symbol->Section->Number; - if (Symbol->should_keep()) { - Symbol->setIndex(Header.NumberOfSymbols++); - // Update auxiliary symbol info. - Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size(); - Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols; - } else { - Symbol->setIndex(-1); - } + Symbol->setIndex(Header.NumberOfSymbols++); + // Update auxiliary symbol info. + Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size(); + Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols; } // Build string table. @@ -899,16 +858,15 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, if (S->Name.size() > COFF::NameSize) Strings.add(S->Name); for (const auto &S : Symbols) - if (S->should_keep() && S->Name.size() > COFF::NameSize) + if (S->Name.size() > COFF::NameSize) Strings.add(S->Name); - Strings.finalize(StringTableBuilder::WinCOFF); + Strings.finalize(); // Set names. for (const auto &S : Sections) SetSectionName(*S); for (auto &S : Symbols) - if (S->should_keep()) - SetSymbolName(*S); + SetSymbolName(*S); // Fixup weak external references. for (auto &Symbol : Symbols) { @@ -948,7 +906,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, // Assign file offsets to COFF object file structures. - unsigned offset = 0; + unsigned offset = getInitialOffset(); if (UseBigObj) offset += COFF::Header32Size; @@ -1011,8 +969,17 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, Header.PointerToSymbolTable = offset; - // We want a deterministic output. It looks like GNU as also writes 0 in here. - Header.TimeDateStamp = 0; + // MS LINK expects to be able to use this timestamp to implement their + // /INCREMENTAL feature. + if (Asm.isIncrementalLinkerCompatible()) { + std::time_t Now = time(nullptr); + if (Now < 0 || !isUInt<32>(Now)) + Now = UINT32_MAX; + Header.TimeDateStamp = Now; + } else { + // Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU. + Header.TimeDateStamp = 0; + } // Write it all to disk... WriteFileHeader(Header); @@ -1029,6 +996,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, } } + SmallVector<char, 128> SectionContents; for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(), je = Asm.end(); (i != ie) && (j != je); ++i, ++j) { @@ -1037,20 +1005,47 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, continue; if ((*i)->Header.PointerToRawData != 0) { - assert(OS.tell() <= (*i)->Header.PointerToRawData && + assert(getStream().tell() <= (*i)->Header.PointerToRawData && "Section::PointerToRawData is insane!"); - unsigned SectionDataPadding = (*i)->Header.PointerToRawData - OS.tell(); + unsigned SectionDataPadding = + (*i)->Header.PointerToRawData - getStream().tell(); assert(SectionDataPadding < 4 && "Should only need at most three bytes of padding!"); WriteZeros(SectionDataPadding); + // Save the contents of the section to a temporary buffer, we need this + // to CRC the data before we dump it into the object file. + SectionContents.clear(); + raw_svector_ostream VecOS(SectionContents); + raw_pwrite_stream &OldStream = getStream(); + // Redirect the output stream to our buffer. + setStream(VecOS); + // Fill our buffer with the section data. Asm.writeSectionData(&*j, Layout); + // Reset the stream back to what it was before. + setStream(OldStream); + + // Calculate our CRC with an initial value of '0', this is not how + // JamCRC is specified but it aligns with the expected output. 
+ JamCRC JC(/*Init=*/0x00000000U); + JC.update(SectionContents); + + // Write the section contents to the object file. + getStream() << SectionContents; + + // Update the section definition auxiliary symbol to record the CRC. + COFFSection *Sec = SectionMap[&*j]; + COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux; + assert(AuxSyms.size() == 1 && + AuxSyms[0].AuxType == ATSectionDefinition); + AuxSymbol &SecDef = AuxSyms[0]; + SecDef.Aux.SectionDefinition.CheckSum = JC.getCRC(); } if ((*i)->Relocations.size() > 0) { - assert(OS.tell() == (*i)->Header.PointerToRelocations && + assert(getStream().tell() == (*i)->Header.PointerToRelocations && "Section::PointerToRelocations is insane!"); if ((*i)->Relocations.size() >= 0xffff) { @@ -1071,14 +1066,14 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, } } - assert(OS.tell() == Header.PointerToSymbolTable && + assert(getStream().tell() == Header.PointerToSymbolTable && "Header::PointerToSymbolTable is insane!"); for (auto &Symbol : Symbols) if (Symbol->getIndex() != -1) WriteSymbol(*Symbol); - OS.write(Strings.data().data(), Strings.data().size()); + getStream().write(Strings.data().data(), Strings.data().size()); } MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) diff --git a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp index 36dd691..a38b1a4 100644 --- a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp @@ -49,7 +49,6 @@ void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // Add the fixups and data. for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { @@ -123,29 +122,37 @@ void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { "Got non-COFF section in the COFF backend!"); if (CurSymbol) - FatalError("starting a new symbol definition without completing the " - "previous one"); + Error("starting a new symbol definition without completing the " + "previous one"); CurSymbol = Symbol; } void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { - if (!CurSymbol) - FatalError("storage class specified outside of symbol definition"); + if (!CurSymbol) { + Error("storage class specified outside of symbol definition"); + return; + } - if (StorageClass & ~COFF::SSC_Invalid) - FatalError("storage class value '" + Twine(StorageClass) + + if (StorageClass & ~COFF::SSC_Invalid) { + Error("storage class value '" + Twine(StorageClass) + "' out of range"); + return; + } getAssembler().registerSymbol(*CurSymbol); cast<MCSymbolCOFF>(CurSymbol)->setClass((uint16_t)StorageClass); } void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { - if (!CurSymbol) - FatalError("symbol type specified outside of a symbol definition"); + if (!CurSymbol) { + Error("symbol type specified outside of a symbol definition"); + return; + } - if (Type & ~0xffff) - FatalError("type value '" + Twine(Type) + "' out of range"); + if (Type & ~0xffff) { + Error("type value '" + Twine(Type) + "' out of range"); + return; + } getAssembler().registerSymbol(*CurSymbol); cast<MCSymbolCOFF>(CurSymbol)->setType((uint16_t)Type); @@ -153,7 +160,7 @@ void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { void MCWinCOFFStreamer::EndCOFFSymbolDef() { if (!CurSymbol) - FatalError("ending symbol definition without starting one"); + Error("ending symbol definition without starting one"); CurSymbol = nullptr; } @@ 
-215,8 +222,6 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Size = std::max(Size, static_cast<uint64_t>(ByteAlignment)); } - AssignSection(Symbol, nullptr); - getAssembler().registerSymbol(*Symbol); Symbol->setExternal(true); Symbol->setCommon(Size, ByteAlignment); @@ -228,7 +233,6 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, OS << " -aligncomm:\"" << Symbol->getName() << "\"," << Log2_32_Ceil(ByteAlignment); - OS.flush(); PushSection(); SwitchSection(MFI->getDrectveSection()); @@ -249,8 +253,6 @@ void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, getAssembler().registerSymbol(*Symbol); Symbol->setExternal(false); - AssignSection(Symbol, Section); - if (ByteAlignment != 1) new MCAlignFragment(ByteAlignment, /*Value=*/0, /*ValueSize=*/0, ByteAlignment, Section); @@ -287,9 +289,8 @@ void MCWinCOFFStreamer::FinishImpl() { MCObjectStreamer::FinishImpl(); } -LLVM_ATTRIBUTE_NORETURN -void MCWinCOFFStreamer::FatalError(const Twine &Msg) const { - getContext().reportFatalError(SMLoc(), Msg); +void MCWinCOFFStreamer::Error(const Twine &Msg) const { + getContext().reportError(SMLoc(), Msg); } } diff --git a/contrib/llvm/lib/Object/Archive.cpp b/contrib/llvm/lib/Object/Archive.cpp index d482119..99b0650 100644 --- a/contrib/llvm/lib/Object/Archive.cpp +++ b/contrib/llvm/lib/Object/Archive.cpp @@ -43,10 +43,10 @@ StringRef ArchiveMemberHeader::getName() const { return llvm::StringRef(Name, end); } -uint32_t ArchiveMemberHeader::getSize() const { +ErrorOr<uint32_t> ArchiveMemberHeader::getSize() const { uint32_t Ret; if (llvm::StringRef(Size, sizeof(Size)).rtrim(" ").getAsInteger(10, Ret)) - llvm_unreachable("Size is not a decimal number."); + return object_error::parse_failed; // Size is not a decimal number. return Ret; } @@ -82,22 +82,30 @@ unsigned ArchiveMemberHeader::getGID() const { return Ret; } -Archive::Child::Child(const Archive *Parent, const char *Start) +Archive::Child::Child(const Archive *Parent, StringRef Data, + uint16_t StartOfFile) + : Parent(Parent), Data(Data), StartOfFile(StartOfFile) {} + +Archive::Child::Child(const Archive *Parent, const char *Start, + std::error_code *EC) : Parent(Parent) { if (!Start) return; - const ArchiveMemberHeader *Header = - reinterpret_cast<const ArchiveMemberHeader *>(Start); uint64_t Size = sizeof(ArchiveMemberHeader); - if (!Parent->IsThin || Header->getName() == "/" || Header->getName() == "//") - Size += Header->getSize(); Data = StringRef(Start, Size); + if (!isThinMember()) { + ErrorOr<uint64_t> MemberSize = getRawSize(); + if ((*EC = MemberSize.getError())) + return; + Size += MemberSize.get(); + Data = StringRef(Start, Size); + } // Setup StartOfFile and PaddingBytes. StartOfFile = sizeof(ArchiveMemberHeader); // Don't include attached name. 
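
The parse that follows handles the BSD convention where a header name of the form "#1/<len>" means the real name is stored inline, directly after the header. Just the length extraction, as a hedged standalone sketch (helper name illustrative):

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Extracts <len> from a "#1/<len>" member name; the digits may be padded
// with trailing spaces inside the fixed-width header field.
static bool parseBSDInlineNameLength(llvm::StringRef Name, uint64_t &Len) {
  if (!Name.startswith("#1/"))
    return false;
  // StringRef::getAsInteger returns true on failure.
  return !Name.substr(3).rtrim(" ").getAsInteger(10, Len);
}
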
- StringRef Name = Header->getName(); + StringRef Name = getRawName(); if (Name.startswith("#1/")) { uint64_t NameSize; if (Name.substr(3).rtrim(" ").getAsInteger(10, NameSize)) @@ -106,25 +114,40 @@ Archive::Child::Child(const Archive *Parent, const char *Start) } } -uint64_t Archive::Child::getSize() const { - if (Parent->IsThin) - return getHeader()->getSize(); +ErrorOr<uint64_t> Archive::Child::getSize() const { + if (Parent->IsThin) { + ErrorOr<uint32_t> Size = getHeader()->getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return Size.get(); + } return Data.size() - StartOfFile; } -uint64_t Archive::Child::getRawSize() const { - return getHeader()->getSize(); +ErrorOr<uint64_t> Archive::Child::getRawSize() const { + ErrorOr<uint32_t> Size = getHeader()->getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return Size.get(); +} + +bool Archive::Child::isThinMember() const { + StringRef Name = getHeader()->getName(); + return Parent->IsThin && Name != "/" && Name != "//"; } ErrorOr<StringRef> Archive::Child::getBuffer() const { - if (!Parent->IsThin) - return StringRef(Data.data() + StartOfFile, getSize()); + if (!isThinMember()) { + ErrorOr<uint32_t> Size = getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return StringRef(Data.data() + StartOfFile, Size.get()); + } ErrorOr<StringRef> Name = getName(); if (std::error_code EC = Name.getError()) return EC; - SmallString<128> FullName = - Parent->getMemoryBufferRef().getBufferIdentifier(); - sys::path::remove_filename(FullName); + SmallString<128> FullName = sys::path::parent_path( + Parent->getMemoryBufferRef().getBufferIdentifier()); sys::path::append(FullName, *Name); ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(FullName); if (std::error_code EC = Buf.getError()) @@ -133,7 +156,7 @@ ErrorOr<StringRef> Archive::Child::getBuffer() const { return Parent->ThinBuffers.back()->getBuffer(); } -Archive::Child Archive::Child::getNext() const { +ErrorOr<Archive::Child> Archive::Child::getNext() const { size_t SpaceToSkip = Data.size(); // If it's odd, add 1 to make it even. if (SpaceToSkip & 1) @@ -141,11 +164,19 @@ Archive::Child Archive::Child::getNext() const { const char *NextLoc = Data.data() + SpaceToSkip; + // Check to see if this is at the end of the archive. + if (NextLoc == Parent->Data.getBufferEnd()) + return Child(Parent, nullptr, nullptr); + // Check to see if this is past the end of the archive. - if (NextLoc >= Parent->Data.getBufferEnd()) - return Child(Parent, nullptr); + if (NextLoc > Parent->Data.getBufferEnd()) + return object_error::parse_failed; - return Child(Parent, NextLoc); + std::error_code EC; + Child Ret(Parent, NextLoc, &EC); + if (EC) + return EC; + return Ret; } uint64_t Archive::Child::getChildOffset() const { @@ -168,17 +199,11 @@ ErrorOr<StringRef> Archive::Child::getName() const { std::size_t offset; if (name.substr(1).rtrim(" ").getAsInteger(10, offset)) llvm_unreachable("Long name offset is not an integer"); - const char *addr = Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader) - + offset; + // Verify it. 
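
For GNU-style archives, a member name of the form "/<decimal>" is an offset into the "//" string table, and the verification introduced here is a simple bounds check against that table. A standalone sketch of the lookup, using the "/\n" terminator the surrounding code relies on (names illustrative):

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Resolves a "/<offset>" long name against the "//" string table.
static bool lookupLongName(llvm::StringRef StrTab, uint64_t Offset,
                           llvm::StringRef &Name) {
  if (Offset >= StrTab.size())
    return false; // The same out-of-range check added above.
  size_t End = StrTab.find("/\n", Offset);
  if (End == llvm::StringRef::npos)
    return false; // Unterminated entry: treat as a parse failure.
  Name = StrTab.slice(Offset, End);
  return true;
}
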
- if (Parent->StringTable == Parent->child_end() - || addr < (Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader)) - || addr > (Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader) - + Parent->StringTable->getSize())) + if (offset >= Parent->StringTable.size()) return object_error::parse_failed; + const char *addr = Parent->StringTable.begin() + offset; // GNU long file names end with a "/\n". if (Parent->kind() == K_GNU || Parent->kind() == K_MIPS64) { @@ -227,9 +252,13 @@ ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) { return std::move(Ret); } +void Archive::setFirstRegular(const Child &C) { + FirstRegularData = C.Data; + FirstRegularStartOfFile = C.StartOfFile; +} + Archive::Archive(MemoryBufferRef Source, std::error_code &ec) - : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()), - StringTable(child_end()), FirstRegular(child_end()) { + : Binary(Binary::ID_Archive, Source) { StringRef Buffer = Data.getBuffer(); // Check for sufficient magic. if (Buffer.startswith(ThinMagic)) { @@ -242,15 +271,26 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) } // Get the special members. - child_iterator i = child_begin(false); - child_iterator e = child_end(); + child_iterator I = child_begin(false); + if ((ec = I->getError())) + return; + child_iterator E = child_end(); - if (i == e) { + if (I == E) { ec = std::error_code(); return; } + const Child *C = &**I; - StringRef Name = i->getRawName(); + auto Increment = [&]() { + ++I; + if ((ec = I->getError())) + return true; + C = &**I; + return false; + }; + + StringRef Name = C->getRawName(); // Below is the pattern that is used to figure out the archive format // GNU archive format @@ -273,9 +313,13 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) if (Name == "__.SYMDEF") { Format = K_BSD; - SymbolTable = i; - ++i; - FirstRegular = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); + if (Increment()) + return; + setFirstRegular(*C); + ec = std::error_code(); return; } @@ -283,16 +327,19 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) if (Name.startswith("#1/")) { Format = K_BSD; // We know this is BSD, so getName will work since there is no string table. - ErrorOr<StringRef> NameOrErr = i->getName(); + ErrorOr<StringRef> NameOrErr = C->getName(); ec = NameOrErr.getError(); if (ec) return; Name = NameOrErr.get(); if (Name == "__.SYMDEF SORTED" || Name == "__.SYMDEF") { - SymbolTable = i; - ++i; + // We know that the symbol table is not an external file, so we just + // assert there is no error. + SymbolTable = *C->getBuffer(); + if (Increment()) + return; } - FirstRegular = i; + setFirstRegular(*C); return; } @@ -303,30 +350,36 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) bool has64SymTable = false; if (Name == "/" || Name == "/SYM64/") { - SymbolTable = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); if (Name == "/SYM64/") has64SymTable = true; - ++i; - if (i == e) { + if (Increment()) + return; + if (I == E) { ec = std::error_code(); return; } - Name = i->getRawName(); + Name = C->getRawName(); } if (Name == "//") { Format = has64SymTable ? K_MIPS64 : K_GNU; - StringTable = i; - ++i; - FirstRegular = i; + // The string table is never an external member, so we just assert on the + // ErrorOr. 
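
This constructor's detection cascade keys entirely off the raw names of the first one or two members. For orientation, the naming conventions it matches, restated as a descriptive sketch rather than the parser itself:

#include "llvm/ADT/StringRef.h"

// Maps the raw name of a leading archive member to its conventional role.
static const char *describeSpecialMember(llvm::StringRef RawName) {
  if (RawName == "__.SYMDEF" || RawName == "__.SYMDEF SORTED")
    return "BSD symbol table";
  if (RawName == "/SYM64/")
    return "64-bit (MIPS64) symbol table";
  if (RawName == "/")
    return "GNU or COFF symbol table";
  if (RawName == "//")
    return "GNU string table";
  return "regular member";
}
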
+ StringTable = *C->getBuffer(); + if (Increment()) + return; + setFirstRegular(*C); ec = std::error_code(); return; } if (Name[0] != '/') { Format = has64SymTable ? K_MIPS64 : K_GNU; - FirstRegular = i; + setFirstRegular(*C); ec = std::error_code(); return; } @@ -337,23 +390,30 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) } Format = K_COFF; - SymbolTable = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); - ++i; - if (i == e) { - FirstRegular = i; + if (Increment()) + return; + + if (I == E) { + setFirstRegular(*C); ec = std::error_code(); return; } - Name = i->getRawName(); + Name = C->getRawName(); if (Name == "//") { - StringTable = i; - ++i; + // The string table is never an external member, so we just assert on the + // ErrorOr. + StringTable = *C->getBuffer(); + if (Increment()) + return; } - FirstRegular = i; + setFirstRegular(*C); ec = std::error_code(); } @@ -362,22 +422,25 @@ Archive::child_iterator Archive::child_begin(bool SkipInternal) const { return child_end(); if (SkipInternal) - return FirstRegular; + return Child(this, FirstRegularData, FirstRegularStartOfFile); const char *Loc = Data.getBufferStart() + strlen(Magic); - Child c(this, Loc); - return c; + std::error_code EC; + Child c(this, Loc, &EC); + if (EC) + return child_iterator(EC); + return child_iterator(c); } Archive::child_iterator Archive::child_end() const { - return Child(this, nullptr); + return Child(this, nullptr, nullptr); } StringRef Archive::Symbol::getName() const { return Parent->getSymbolTable().begin() + StringIndex; } -ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const { +ErrorOr<Archive::Child> Archive::Symbol::getMember() const { const char *Buf = Parent->getSymbolTable().begin(); const char *Offsets = Buf; if (Parent->kind() == K_MIPS64) @@ -422,8 +485,11 @@ ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const { } const char *Loc = Parent->getData().begin() + Offset; - child_iterator Iter(Child(Parent, Loc)); - return Iter; + std::error_code EC; + Child C(Parent, Loc, &EC); + if (EC) + return EC; + return C; } Archive::Symbol Archive::Symbol::getNext() const { @@ -506,12 +572,12 @@ Archive::symbol_iterator Archive::symbol_begin() const { } Archive::symbol_iterator Archive::symbol_end() const { - if (!hasSymbolTable()) - return symbol_iterator(Symbol(this, 0, 0)); return symbol_iterator(Symbol(this, getNumberOfSymbols(), 0)); } uint32_t Archive::getNumberOfSymbols() const { + if (!hasSymbolTable()) + return 0; const char *buf = getSymbolTable().begin(); if (kind() == K_GNU) return read32be(buf); @@ -542,6 +608,4 @@ Archive::child_iterator Archive::findSym(StringRef name) const { return child_end(); } -bool Archive::hasSymbolTable() const { - return SymbolTable != child_end(); -} +bool Archive::hasSymbolTable() const { return !SymbolTable.empty(); } diff --git a/contrib/llvm/lib/Object/ArchiveWriter.cpp b/contrib/llvm/lib/Object/ArchiveWriter.cpp index a40901c..c7343fd 100644 --- a/contrib/llvm/lib/Object/ArchiveWriter.cpp +++ b/contrib/llvm/lib/Object/ArchiveWriter.cpp @@ -34,32 +34,32 @@ using namespace llvm; -NewArchiveIterator::NewArchiveIterator(object::Archive::child_iterator I, +NewArchiveIterator::NewArchiveIterator(const object::Archive::Child &OldMember, StringRef Name) - : IsNewMember(false), Name(Name), OldI(I) {} + : IsNewMember(false), Name(Name), OldMember(OldMember) {} -NewArchiveIterator::NewArchiveIterator(StringRef NewFilename, StringRef 
Name) - : IsNewMember(true), Name(Name), NewFilename(NewFilename) {} +NewArchiveIterator::NewArchiveIterator(StringRef FileName) + : IsNewMember(true), Name(FileName), OldMember(nullptr, nullptr, nullptr) {} StringRef NewArchiveIterator::getName() const { return Name; } bool NewArchiveIterator::isNewMember() const { return IsNewMember; } -object::Archive::child_iterator NewArchiveIterator::getOld() const { +const object::Archive::Child &NewArchiveIterator::getOld() const { assert(!IsNewMember); - return OldI; + return OldMember; } StringRef NewArchiveIterator::getNew() const { assert(IsNewMember); - return NewFilename; + return Name; } llvm::ErrorOr<int> NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const { assert(IsNewMember); int NewFD; - if (auto EC = sys::fs::openFileForRead(NewFilename, NewFD)) + if (auto EC = sys::fs::openFileForRead(Name, NewFD)) return EC; assert(NewFD != -1); @@ -77,7 +77,7 @@ NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const { template <typename T> static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size, - bool MayTruncate = false) { + bool MayTruncate = false) { uint64_t OldPos = OS.tell(); OS << Data; unsigned SizeSoFar = OS.tell() - OldPos; @@ -135,30 +135,56 @@ static void printBSDMemberHeader(raw_fd_ostream &Out, StringRef Name, Out.write(uint8_t(0)); } +static bool useStringTable(bool Thin, StringRef Name) { + return Thin || Name.size() >= 16; +} + static void -printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, +printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, bool Thin, StringRef Name, std::vector<unsigned>::iterator &StringMapIndexIter, const sys::TimeValue &ModTime, unsigned UID, unsigned GID, unsigned Perms, unsigned Size) { if (Kind == object::Archive::K_BSD) return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size); - if (Name.size() < 16) + if (!useStringTable(Thin, Name)) return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size); Out << '/'; printWithSpacePadding(Out, *StringMapIndexIter++, 15); printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, Size); } -static void writeStringTable(raw_fd_ostream &Out, +// Compute the relative path from From to To. 
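
Illustrative expectations for the helper defined next, which lets thin archives record member paths relative to the archive itself (the paths are hypothetical, and the asserts assume the helper were visible to a test):

#include <cassert>
#include <string>

void computeRelativePathExamples() {
  // Sibling directories: climb out of "lib", then descend into "src".
  assert(computeRelativePath("lib/libfoo.a", "src/bar.o") == "../src/bar.o");
  // Absolute paths are passed through unchanged.
  assert(computeRelativePath("/tmp/libfoo.a", "src/bar.o") == "src/bar.o");
}
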
+static std::string computeRelativePath(StringRef From, StringRef To) { + if (sys::path::is_absolute(From) || sys::path::is_absolute(To)) + return To; + + StringRef DirFrom = sys::path::parent_path(From); + auto FromI = sys::path::begin(DirFrom); + auto ToI = sys::path::begin(To); + while (*FromI == *ToI) { + ++FromI; + ++ToI; + } + + SmallString<128> Relative; + for (auto FromE = sys::path::end(DirFrom); FromI != FromE; ++FromI) + sys::path::append(Relative, ".."); + + for (auto ToE = sys::path::end(To); ToI != ToE; ++ToI) + sys::path::append(Relative, *ToI); + + return Relative.str(); +} + +static void writeStringTable(raw_fd_ostream &Out, StringRef ArcName, ArrayRef<NewArchiveIterator> Members, - std::vector<unsigned> &StringMapIndexes) { + std::vector<unsigned> &StringMapIndexes, + bool Thin) { unsigned StartOffset = 0; - for (ArrayRef<NewArchiveIterator>::iterator I = Members.begin(), - E = Members.end(); - I != E; ++I) { - StringRef Name = I->getName(); - if (Name.size() < 16) + for (const NewArchiveIterator &I : Members) { + StringRef Name = sys::path::filename(I.getName()); + if (!useStringTable(Thin, Name)) continue; if (StartOffset == 0) { printWithSpacePadding(Out, "//", 58); @@ -166,7 +192,13 @@ static void writeStringTable(raw_fd_ostream &Out, StartOffset = Out.tell(); } StringMapIndexes.push_back(Out.tell() - StartOffset); - Out << Name << "/\n"; + + if (Thin) + Out << computeRelativePath(ArcName, I.getName()); + else + Out << Name; + + Out << "/\n"; } if (StartOffset == 0) return; @@ -268,9 +300,11 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind, return BodyStartOffset + 4; } -std::pair<StringRef, std::error_code> llvm::writeArchive( - StringRef ArcName, std::vector<NewArchiveIterator> &NewMembers, - bool WriteSymtab, object::Archive::Kind Kind, bool Deterministic) { +std::pair<StringRef, std::error_code> +llvm::writeArchive(StringRef ArcName, + std::vector<NewArchiveIterator> &NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin) { SmallString<128> TmpArchive; int TmpArchiveFD; if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a", @@ -279,7 +313,10 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( tool_output_file Output(TmpArchive, TmpArchiveFD); raw_fd_ostream &Out = Output.os(); - Out << "!<arch>\n"; + if (Thin) + Out << "!<thin>\n"; + else + Out << "!<arch>\n"; std::vector<unsigned> MemberOffsetRefs; @@ -309,9 +346,11 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( Buffers.push_back(std::move(MemberBufferOrErr.get())); MemberRef = Buffers.back()->getMemBufferRef(); } else { - object::Archive::child_iterator OldMember = Member.getOld(); + const object::Archive::Child &OldMember = Member.getOld(); + assert((!Thin || OldMember.getParent()->isThin()) && + "Thin archives cannot refer to members of other archives"); ErrorOr<MemoryBufferRef> MemberBufferOrErr = - OldMember->getMemoryBufferRef(); + OldMember.getMemoryBufferRef(); if (auto EC = MemberBufferOrErr.getError()) return std::make_pair("", EC); MemberRef = MemberBufferOrErr.get(); @@ -330,7 +369,7 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( std::vector<unsigned> StringMapIndexes; if (Kind != object::Archive::K_BSD) - writeStringTable(Out, NewMembers, StringMapIndexes); + writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin); unsigned MemberNum = 0; unsigned NewMemberNum = 0; @@ -358,26 +397,32 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( GID = Status.getGroup(); Perms = 
Status.permissions(); } else { - object::Archive::child_iterator OldMember = I.getOld(); - ModTime = OldMember->getLastModified(); - UID = OldMember->getUID(); - GID = OldMember->getGID(); - Perms = OldMember->getAccessMode(); + const object::Archive::Child &OldMember = I.getOld(); + ModTime = OldMember.getLastModified(); + UID = OldMember.getUID(); + GID = OldMember.getGID(); + Perms = OldMember.getAccessMode(); } if (I.isNewMember()) { StringRef FileName = I.getNew(); const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum++]; - printMemberHeader(Out, Kind, sys::path::filename(FileName), + printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName), StringMapIndexIter, ModTime, UID, GID, Perms, Status.getSize()); } else { - object::Archive::child_iterator OldMember = I.getOld(); - printMemberHeader(Out, Kind, I.getName(), StringMapIndexIter, ModTime, - UID, GID, Perms, OldMember->getSize()); + const object::Archive::Child &OldMember = I.getOld(); + ErrorOr<uint32_t> Size = OldMember.getSize(); + if (std::error_code EC = Size.getError()) + return std::make_pair("", EC); + StringRef FileName = I.getName(); + printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName), + StringMapIndexIter, ModTime, UID, GID, Perms, + Size.get()); } - Out << File.getBuffer(); + if (!Thin) + Out << File.getBuffer(); if (Out.tell() % 2) Out << '\n'; diff --git a/contrib/llvm/lib/Object/COFFObjectFile.cpp b/contrib/llvm/lib/Object/COFFObjectFile.cpp index bcca983..4cd6aff 100644 --- a/contrib/llvm/lib/Object/COFFObjectFile.cpp +++ b/contrib/llvm/lib/Object/COFFObjectFile.cpp @@ -171,6 +171,11 @@ ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const { if (std::error_code EC = getSection(SectionNumber, Section)) return EC; Result += Section->VirtualAddress; + + // The section VirtualAddress does not include ImageBase, and we want to + // return virtual addresses. 
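
COFF section VirtualAddress fields are RVAs, so both address getters in this file now add the image base back before returning. The relationship reduces to a one-line conversion; the example base below is the common PE32+ default and purely illustrative:

#include <cassert>
#include <cstdint>

// What the getters now do once the base is known: VA = ImageBase + RVA.
static uint64_t rvaToVA(uint64_t RVA, uint64_t ImageBase) {
  return ImageBase + RVA;
}

void imageBaseExample() {
  const uint64_t Base = 0x140000000ULL; // Typical PE32+ default image base.
  assert(rvaToVA(0x1000, Base) == 0x140001000ULL);
  // Plain object files carry no optional header, so the base is 0 and
  // VA == RVA; that is the "comes up in practice" case handled below.
}
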
+ Result += getImageBase(); + return Result; } @@ -178,10 +183,10 @@ SymbolRef::Type COFFObjectFile::getSymbolType(DataRefImpl Ref) const { COFFSymbolRef Symb = getCOFFSymbol(Ref); int32_t SectionNumber = Symb.getSectionNumber(); + if (Symb.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) + return SymbolRef::ST_Function; if (Symb.isAnyUndefined()) return SymbolRef::ST_Unknown; - if (Symb.isFunctionDefinition()) - return SymbolRef::ST_Function; if (Symb.isCommon()) return SymbolRef::ST_Data; if (Symb.isFileRecord()) @@ -230,21 +235,17 @@ uint64_t COFFObjectFile::getCommonSymbolSizeImpl(DataRefImpl Ref) const { return Symb.getValue(); } -std::error_code -COFFObjectFile::getSymbolSection(DataRefImpl Ref, - section_iterator &Result) const { +ErrorOr<section_iterator> +COFFObjectFile::getSymbolSection(DataRefImpl Ref) const { COFFSymbolRef Symb = getCOFFSymbol(Ref); - if (COFF::isReservedSectionNumber(Symb.getSectionNumber())) { - Result = section_end(); - } else { - const coff_section *Sec = nullptr; - if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec)) - return EC; - DataRefImpl Ref; - Ref.p = reinterpret_cast<uintptr_t>(Sec); - Result = section_iterator(SectionRef(Ref, this)); - } - return std::error_code(); + if (COFF::isReservedSectionNumber(Symb.getSectionNumber())) + return section_end(); + const coff_section *Sec = nullptr; + if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec)) + return EC; + DataRefImpl Ret; + Ret.p = reinterpret_cast<uintptr_t>(Sec); + return section_iterator(SectionRef(Ret, this)); } unsigned COFFObjectFile::getSymbolSectionID(SymbolRef Sym) const { @@ -266,7 +267,12 @@ std::error_code COFFObjectFile::getSectionName(DataRefImpl Ref, uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const { const coff_section *Sec = toSec(Ref); - return Sec->VirtualAddress; + uint64_t Result = Sec->VirtualAddress; + + // The section VirtualAddress does not include ImageBase, and we want to + // return virtual addresses. + Result += getImageBase(); + return Result; } uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const { @@ -412,10 +418,18 @@ std::error_code COFFObjectFile::initSymbolTablePtr() { return std::error_code(); } +uint64_t COFFObjectFile::getImageBase() const { + if (PE32Header) + return PE32Header->ImageBase; + else if (PE32PlusHeader) + return PE32PlusHeader->ImageBase; + // This actually comes up in practice. + return 0; +} + // Returns the file offset for the given VA. std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { - uint64_t ImageBase = PE32Header ? 
(uint64_t)PE32Header->ImageBase - : (uint64_t)PE32PlusHeader->ImageBase; + uint64_t ImageBase = getImageBase(); uint64_t Rva = Addr - ImageBase; assert(Rva <= UINT32_MAX); return getRvaPtr((uint32_t)Rva, Res); @@ -744,6 +758,8 @@ StringRef COFFObjectFile::getFileFormatName() const { return "COFF-x86-64"; case COFF::IMAGE_FILE_MACHINE_ARMNT: return "COFF-ARM"; + case COFF::IMAGE_FILE_MACHINE_ARM64: + return "COFF-ARM64"; default: return "COFF-<unknown arch>"; } @@ -757,6 +773,8 @@ unsigned COFFObjectFile::getArch() const { return Triple::x86_64; case COFF::IMAGE_FILE_MACHINE_ARMNT: return Triple::thumb; + case COFF::IMAGE_FILE_MACHINE_ARM64: + return Triple::aarch64; default: return Triple::UnknownArch; } @@ -1318,6 +1336,30 @@ ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { return std::error_code(); } +std::error_code ExportDirectoryEntryRef::isForwarder(bool &Result) const { + const data_directory *DataEntry; + if (auto EC = OwningObject->getDataDirectory(COFF::EXPORT_TABLE, DataEntry)) + return EC; + uint32_t RVA; + if (auto EC = getExportRVA(RVA)) + return EC; + uint32_t Begin = DataEntry->RelativeVirtualAddress; + uint32_t End = DataEntry->RelativeVirtualAddress + DataEntry->Size; + Result = (Begin <= RVA && RVA < End); + return std::error_code(); +} + +std::error_code ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const { + uint32_t RVA; + if (auto EC = getExportRVA(RVA)) + return EC; + uintptr_t IntPtr = 0; + if (auto EC = OwningObject->getRvaPtr(RVA, IntPtr)) + return EC; + Result = StringRef(reinterpret_cast<const char *>(IntPtr)); + return std::error_code(); +} + bool ImportedSymbolRef:: operator==(const ImportedSymbolRef &Other) const { return Entry32 == Other.Entry32 && Entry64 == Other.Entry64 diff --git a/contrib/llvm/lib/Object/COFFYAML.cpp b/contrib/llvm/lib/Object/COFFYAML.cpp index 9a24b53..4c1fca1 100644 --- a/contrib/llvm/lib/Object/COFFYAML.cpp +++ b/contrib/llvm/lib/Object/COFFYAML.cpp @@ -56,6 +56,7 @@ void ScalarEnumerationTraits<COFF::MachineTypes>::enumeration( ECase(IMAGE_FILE_MACHINE_AMD64); ECase(IMAGE_FILE_MACHINE_ARM); ECase(IMAGE_FILE_MACHINE_ARMNT); + ECase(IMAGE_FILE_MACHINE_ARM64); ECase(IMAGE_FILE_MACHINE_EBC); ECase(IMAGE_FILE_MACHINE_I386); ECase(IMAGE_FILE_MACHINE_IA64); @@ -210,6 +211,7 @@ void ScalarBitSetTraits<COFF::Characteristics>::bitset( void ScalarBitSetTraits<COFF::SectionCharacteristics>::bitset( IO &IO, COFF::SectionCharacteristics &Value) { + BCase(IMAGE_SCN_TYPE_NOLOAD); BCase(IMAGE_SCN_TYPE_NO_PAD); BCase(IMAGE_SCN_CNT_CODE); BCase(IMAGE_SCN_CNT_INITIALIZED_DATA); diff --git a/contrib/llvm/lib/Object/ELF.cpp b/contrib/llvm/lib/Object/ELF.cpp index 398e9e4..12b772d 100644 --- a/contrib/llvm/lib/Object/ELF.cpp +++ b/contrib/llvm/lib/Object/ELF.cpp @@ -26,6 +26,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { } break; case ELF::EM_386: + case ELF::EM_IAMCU: switch (Type) { #include "llvm/Support/ELFRelocs/i386.def" default: @@ -90,6 +91,13 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { break; } break; + case ELF::EM_WEBASSEMBLY: + switch (Type) { +#include "llvm/Support/ELFRelocs/WebAssembly.def" + default: + break; + } + break; default: break; } diff --git a/contrib/llvm/lib/Object/ELFYAML.cpp b/contrib/llvm/lib/Object/ELFYAML.cpp index 72c232c..4a4b227 100644 --- a/contrib/llvm/lib/Object/ELFYAML.cpp +++ b/contrib/llvm/lib/Object/ELFYAML.cpp @@ -193,6 +193,7 @@ ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO, ECase(EM_VIDEOCORE5) ECase(EM_78KOR) 
ECase(EM_56800EX) + ECase(EM_AMDGPU) #undef ECase } @@ -316,6 +317,25 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCase(EF_HEXAGON_ISA_V4) BCase(EF_HEXAGON_ISA_V5) break; + case ELF::EM_AVR: + BCase(EF_AVR_ARCH_AVR1) + BCase(EF_AVR_ARCH_AVR2) + BCase(EF_AVR_ARCH_AVR25) + BCase(EF_AVR_ARCH_AVR3) + BCase(EF_AVR_ARCH_AVR31) + BCase(EF_AVR_ARCH_AVR35) + BCase(EF_AVR_ARCH_AVR4) + BCase(EF_AVR_ARCH_AVR51) + BCase(EF_AVR_ARCH_AVR6) + BCase(EF_AVR_ARCH_AVRTINY) + BCase(EF_AVR_ARCH_XMEGA1) + BCase(EF_AVR_ARCH_XMEGA2) + BCase(EF_AVR_ARCH_XMEGA3) + BCase(EF_AVR_ARCH_XMEGA4) + BCase(EF_AVR_ARCH_XMEGA5) + BCase(EF_AVR_ARCH_XMEGA6) + BCase(EF_AVR_ARCH_XMEGA7) + break; default: llvm_unreachable("Unsupported architecture"); } @@ -382,6 +402,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, ELFYAML::ELF_SHF &Value) { + const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext()); #define BCase(X) IO.bitSetCase(Value, #X, ELF::X); BCase(SHF_WRITE) BCase(SHF_ALLOC) @@ -394,6 +415,17 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, BCase(SHF_OS_NONCONFORMING) BCase(SHF_GROUP) BCase(SHF_TLS) + switch(Object->Header.Machine) { + case ELF::EM_AMDGPU: + BCase(SHF_AMDGPU_HSA_GLOBAL) + BCase(SHF_AMDGPU_HSA_READONLY) + BCase(SHF_AMDGPU_HSA_CODE) + BCase(SHF_AMDGPU_HSA_AGENT) + break; + default: + // Nothing to do. + break; + } #undef BCase } @@ -466,6 +498,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration( #include "llvm/Support/ELFRelocs/Hexagon.def" break; case ELF::EM_386: + case ELF::EM_IAMCU: #include "llvm/Support/ELFRelocs/i386.def" break; case ELF::EM_AARCH64: diff --git a/contrib/llvm/lib/Object/Error.cpp b/contrib/llvm/lib/Object/Error.cpp index 7ca2f12..7ecc3a1 100644 --- a/contrib/llvm/lib/Object/Error.cpp +++ b/contrib/llvm/lib/Object/Error.cpp @@ -47,6 +47,8 @@ std::string _object_error_category::message(int EV) const { return "Invalid section index"; case object_error::bitcode_section_not_found: return "Bitcode section not found in object file"; + case object_error::elf_invalid_dynamic_table_size: + return "Invalid dynamic table size"; case object_error::macho_small_load_command: return "Mach-O load command with size < 8 bytes"; case object_error::macho_load_segment_too_many_sections: diff --git a/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp b/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp new file mode 100644 index 0000000..fe111de --- /dev/null +++ b/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp @@ -0,0 +1,143 @@ +//===- FunctionIndexObjectFile.cpp - Function index file implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Part of the FunctionIndexObjectFile class implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/FunctionIndexObjectFile.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/FunctionInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; +using namespace object; + +FunctionIndexObjectFile::FunctionIndexObjectFile( + MemoryBufferRef Object, std::unique_ptr<FunctionInfoIndex> I) + : SymbolicFile(Binary::ID_FunctionIndex, Object), Index(std::move(I)) {} + +FunctionIndexObjectFile::~FunctionIndexObjectFile() {} + +std::unique_ptr<FunctionInfoIndex> FunctionIndexObjectFile::takeIndex() { + return std::move(Index); +} + +ErrorOr<MemoryBufferRef> +FunctionIndexObjectFile::findBitcodeInObject(const ObjectFile &Obj) { + for (const SectionRef &Sec : Obj.sections()) { + StringRef SecName; + if (std::error_code EC = Sec.getName(SecName)) + return EC; + if (SecName == ".llvmbc") { + StringRef SecContents; + if (std::error_code EC = Sec.getContents(SecContents)) + return EC; + return MemoryBufferRef(SecContents, Obj.getFileName()); + } + } + + return object_error::bitcode_section_not_found; +} + +ErrorOr<MemoryBufferRef> +FunctionIndexObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) { + sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer()); + switch (Type) { + case sys::fs::file_magic::bitcode: + return Object; + case sys::fs::file_magic::elf_relocatable: + case sys::fs::file_magic::macho_object: + case sys::fs::file_magic::coff_object: { + ErrorOr<std::unique_ptr<ObjectFile>> ObjFile = + ObjectFile::createObjectFile(Object, Type); + if (!ObjFile) + return ObjFile.getError(); + return findBitcodeInObject(*ObjFile->get()); + } + default: + return object_error::invalid_file_type; + } +} + +// Looks for a function index in the given memory buffer. +// Returns true if found, else false. +bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer( + MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler) { + ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object); + if (!BCOrErr) + return false; + + return hasFunctionSummary(BCOrErr.get(), DiagnosticHandler); +} + +// Parse the function index in the given memory buffer. +// Return a new FunctionIndexObjectFile instance containing the parsed +// function summary/index. +ErrorOr<std::unique_ptr<FunctionIndexObjectFile>> +FunctionIndexObjectFile::create(MemoryBufferRef Object, + DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy) { + std::unique_ptr<FunctionInfoIndex> Index; + + ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object); + if (!BCOrErr) + return BCOrErr.getError(); + + ErrorOr<std::unique_ptr<FunctionInfoIndex>> IOrErr = getFunctionInfoIndex( + BCOrErr.get(), DiagnosticHandler, IsLazy); + + if (std::error_code EC = IOrErr.getError()) + return EC; + + Index = std::move(IOrErr.get()); + + return llvm::make_unique<FunctionIndexObjectFile>(Object, std::move(Index)); +} + +// Parse the function summary information for the function with the +// given name out of the given buffer. Parsed information is +// stored on the index object saved in this object. 
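
A usage-level sketch of the entry point this file ultimately provides, getFunctionIndexForFile (defined at the bottom of the file); the diagnostic handler and the wrapper function are illustrative:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Object/FunctionIndexObjectFile.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

// A trivial diagnostic handler; a real tool would print or collect these.
static void ignoreDiagnostics(const llvm::DiagnosticInfo &DI) {}

bool hasUsableIndex(llvm::StringRef Path) {
  llvm::ErrorOr<std::unique_ptr<llvm::FunctionInfoIndex>> IndexOrErr =
      llvm::getFunctionIndexForFile(Path, ignoreDiagnostics);
  if (std::error_code EC = IndexOrErr.getError()) {
    llvm::errs() << Path << ": no function index (" << EC.message() << ")\n";
    return false;
  }
  return *IndexOrErr != nullptr;
}
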
+std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer( + MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, + StringRef FunctionName) { + sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer()); + switch (Type) { + case sys::fs::file_magic::bitcode: { + return readFunctionSummary(Object, DiagnosticHandler, FunctionName, + std::move(Index)); + } + default: + return object_error::invalid_file_type; + } +} + +// Parse the function index out of an IR file and return the function +// index object if found, or nullptr if not. +ErrorOr<std::unique_ptr<FunctionInfoIndex>> +llvm::getFunctionIndexForFile(StringRef Path, + DiagnosticHandlerFunction DiagnosticHandler) { + ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = + MemoryBuffer::getFileOrSTDIN(Path); + std::error_code EC = FileOrErr.getError(); + if (EC) + return EC; + MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); + ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr = + object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler); + EC = ObjOrErr.getError(); + if (EC) + return EC; + + object::FunctionIndexObjectFile &Obj = **ObjOrErr; + return Obj.takeIndex(); +} diff --git a/contrib/llvm/lib/Object/IRObjectFile.cpp b/contrib/llvm/lib/Object/IRObjectFile.cpp index 9f5132e..c35c413 100644 --- a/contrib/llvm/lib/Object/IRObjectFile.cpp +++ b/contrib/llvm/lib/Object/IRObjectFile.cpp @@ -219,6 +219,12 @@ uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const { uint32_t Res = BasicSymbolRef::SF_None; if (GV->isDeclarationForLinker()) Res |= BasicSymbolRef::SF_Undefined; + else if (GV->hasHiddenVisibility() && !GV->hasLocalLinkage()) + Res |= BasicSymbolRef::SF_Hidden; + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) { + if (GVar->isConstant()) + Res |= BasicSymbolRef::SF_Const; + } if (GV->hasPrivateLinkage()) Res |= BasicSymbolRef::SF_FormatSpecific; if (!GV->hasLocalLinkage()) @@ -303,7 +309,7 @@ llvm::object::IRObjectFile::create(MemoryBufferRef Object, MemoryBuffer::getMemBuffer(BCOrErr.get(), false)); ErrorOr<std::unique_ptr<Module>> MOrErr = - getLazyBitcodeModule(std::move(Buff), Context, nullptr, + getLazyBitcodeModule(std::move(Buff), Context, /*ShouldLazyLoadMetadata*/ true); if (std::error_code EC = MOrErr.getError()) return EC; diff --git a/contrib/llvm/lib/Object/MachOObjectFile.cpp b/contrib/llvm/lib/Object/MachOObjectFile.cpp index 0590063..d1f79b2 100644 --- a/contrib/llvm/lib/Object/MachOObjectFile.cpp +++ b/contrib/llvm/lib/Object/MachOObjectFile.cpp @@ -278,7 +278,7 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, return; } LinkOptHintsLoadCmd = Load.Ptr; - } else if (Load.C.cmd == MachO::LC_DYLD_INFO || + } else if (Load.C.cmd == MachO::LC_DYLD_INFO || Load.C.cmd == MachO::LC_DYLD_INFO_ONLY) { // Multiple dyldinfo load commands if (DyldInfoLoadCmd) { @@ -401,6 +401,9 @@ SymbolRef::Type MachOObjectFile::getSymbolType(DataRefImpl Symb) const { case MachO::N_UNDF : return SymbolRef::ST_Unknown; case MachO::N_SECT : + section_iterator Sec = *getSymbolSection(Symb); + if (Sec->isData() || Sec->isBSS()) + return SymbolRef::ST_Data; return SymbolRef::ST_Function; } return SymbolRef::ST_Other; @@ -445,22 +448,18 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { return Result; } -std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const { +ErrorOr<section_iterator> +MachOObjectFile::getSymbolSection(DataRefImpl Symb) const { 
MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t index = Entry.n_sect; - if (index == 0) { - Res = section_end(); - } else { - DataRefImpl DRI; - DRI.d.a = index - 1; - if (DRI.d.a >= Sections.size()) - report_fatal_error("getSymbolSection: Invalid section index."); - Res = section_iterator(SectionRef(DRI, this)); - } - - return std::error_code(); + if (index == 0) + return section_end(); + DataRefImpl DRI; + DRI.d.a = index - 1; + if (DRI.d.a >= Sections.size()) + report_fatal_error("getSymbolSection: Invalid section index."); + return section_iterator(SectionRef(DRI, this)); } unsigned MachOObjectFile::getSymbolSectionID(SymbolRef Sym) const { @@ -487,9 +486,32 @@ uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const { } uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { - if (is64Bit()) - return getSection64(Sec).size; - return getSection(Sec).size; + // In the case of a malformed Mach-O file where the section offset is past + // the end of the file, or some part of the section size is past the end of + // the file, return a size of zero or a size that covers the rest of the file + // but does not extend past the end of the file. + uint32_t SectOffset, SectType; + uint64_t SectSize; + + if (is64Bit()) { + MachO::section_64 Sect = getSection64(Sec); + SectOffset = Sect.offset; + SectSize = Sect.size; + SectType = Sect.flags & MachO::SECTION_TYPE; + } else { + MachO::section Sect = getSection(Sec); + SectOffset = Sect.offset; + SectSize = Sect.size; + SectType = Sect.flags & MachO::SECTION_TYPE; + } + if (SectType == MachO::S_ZEROFILL || SectType == MachO::S_GB_ZEROFILL) + return SectSize; + uint64_t FileSize = getData().size(); + if (SectOffset > FileSize) + return 0; + if (FileSize - SectOffset < SectSize) + return FileSize - SectOffset; + return SectSize; } std::error_code MachOObjectFile::getSectionContents(DataRefImpl Sec, @@ -1136,8 +1158,7 @@ Triple MachOObjectFile::getThumbArch(uint32_t CPUType, uint32_t CPUSubType, } Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType, - const char **McpuDefault, - Triple *ThumbTriple) { + const char **McpuDefault, Triple *ThumbTriple) { Triple T = MachOObjectFile::getArch(CPUType, CPUSubType, McpuDefault); *ThumbTriple = MachOObjectFile::getThumbArch(CPUType, CPUSubType, McpuDefault); @@ -1212,8 +1233,8 @@ dice_iterator MachOObjectFile::end_dices() const { return dice_iterator(DiceRef(DRI, this)); } -ExportEntry::ExportEntry(ArrayRef<uint8_t> T) - : Trie(T), Malformed(false), Done(false) { } +ExportEntry::ExportEntry(ArrayRef<uint8_t> T) + : Trie(T), Malformed(false), Done(false) {} void ExportEntry::moveToFirst() { pushNode(0); @@ -1226,7 +1247,7 @@ void ExportEntry::moveToEnd() { } bool ExportEntry::operator==(const ExportEntry &Other) const { - // Common case, one at end, other iterating from begin. + // Common case, one at end, other iterating from begin. if (Done || Other.Done) return (Done == Other.Done); // Not equal if different stack sizes. 
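
The getSectionSize change above defends against malformed section headers by clamping to the bytes the file can actually back; its core reduces to the following sketch (names illustrative):

#include <cstdint>

// Zerofill sections occupy no file bytes; a section starting past EOF
// contributes nothing; one running past EOF is truncated at EOF.
static uint64_t clampedSectionSize(uint64_t SectOffset, uint64_t SectSize,
                                   uint64_t FileSize, bool IsZeroFill) {
  if (IsZeroFill)
    return SectSize;
  if (SectOffset > FileSize)
    return 0;
  if (FileSize - SectOffset < SectSize)
    return FileSize - SectOffset;
  return SectSize;
}
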
@@ -1240,7 +1261,7 @@ bool ExportEntry::operator==(const ExportEntry &Other) const { if (Stack[i].Start != Other.Stack[i].Start) return false; } - return true; + return true; } uint64_t ExportEntry::readULEB128(const uint8_t *&Ptr) { @@ -1281,11 +1302,10 @@ uint32_t ExportEntry::nodeOffset() const { return Stack.back().Start - Trie.begin(); } -ExportEntry::NodeState::NodeState(const uint8_t *Ptr) - : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), - ImportName(nullptr), ChildCount(0), NextChildIndex(0), - ParentStringLength(0), IsExportNode(false) { -} +ExportEntry::NodeState::NodeState(const uint8_t *Ptr) + : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), + ImportName(nullptr), ChildCount(0), NextChildIndex(0), + ParentStringLength(0), IsExportNode(false) {} void ExportEntry::pushNode(uint64_t offset) { const uint8_t *Ptr = Trie.begin() + offset; @@ -1302,7 +1322,7 @@ void ExportEntry::pushNode(uint64_t offset) { } else { State.Address = readULEB128(State.Current); if (State.Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) - State.Other = readULEB128(State.Current); + State.Other = readULEB128(State.Current); } } State.ChildCount = *Children; @@ -1339,7 +1359,7 @@ void ExportEntry::pushDownUntilBottom() { // // There is one "export" node for each exported symbol. But because some // symbols may be a prefix of another symbol (e.g. _dup and _dup2), an export -// node may have child nodes too. +// node may have child nodes too. // // The algorithm for moveNext() is to keep moving down the leftmost unvisited // child until hitting a node with no children (which is an export node or @@ -1372,7 +1392,7 @@ void ExportEntry::moveNext() { Done = true; } -iterator_range<export_iterator> +iterator_range<export_iterator> MachOObjectFile::exports(ArrayRef<uint8_t> Trie) { ExportEntry Start(Trie); if (Trie.size() == 0) @@ -1383,15 +1403,13 @@ MachOObjectFile::exports(ArrayRef<uint8_t> Trie) { ExportEntry Finish(Trie); Finish.moveToEnd(); - return iterator_range<export_iterator>(export_iterator(Start), - export_iterator(Finish)); + return make_range(export_iterator(Start), export_iterator(Finish)); } iterator_range<export_iterator> MachOObjectFile::exports() const { return exports(getDyldInfoExportsTrie()); } - MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit) : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0), RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0), @@ -1555,17 +1573,14 @@ MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) { MachORebaseEntry Finish(Opcodes, is64); Finish.moveToEnd(); - return iterator_range<rebase_iterator>(rebase_iterator(Start), - rebase_iterator(Finish)); + return make_range(rebase_iterator(Start), rebase_iterator(Finish)); } iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const { return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit()); } - -MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, - Kind BK) +MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK) : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0), Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0), BindType(0), PointerSize(is64Bit ? 
8 : 4), @@ -1769,7 +1784,6 @@ int64_t MachOBindEntry::readSLEB128() { return Result; } - uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; } uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; } @@ -1810,8 +1824,7 @@ MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64, MachOBindEntry Finish(Opcodes, is64, BKind); Finish.moveToEnd(); - return iterator_range<bind_iterator>(bind_iterator(Start), - bind_iterator(Finish)); + return make_range(bind_iterator(Start), bind_iterator(Finish)); } iterator_range<bind_iterator> MachOObjectFile::bindTable() const { @@ -1841,8 +1854,7 @@ MachOObjectFile::end_load_commands() const { iterator_range<MachOObjectFile::load_command_iterator> MachOObjectFile::load_commands() const { - return iterator_range<load_command_iterator>(begin_load_commands(), - end_load_commands()); + return make_range(begin_load_commands(), end_load_commands()); } StringRef @@ -2207,66 +2219,66 @@ MachOObjectFile::getLinkOptHintsLoadCommand() const { } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoRebaseOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.rebase_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.rebase_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.rebase_off)); + return makeArrayRef(Ptr, DyldInfo.rebase_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.bind_off)); + return makeArrayRef(Ptr, DyldInfo.bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoWeakBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.weak_bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.weak_bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.weak_bind_off)); + return makeArrayRef(Ptr, DyldInfo.weak_bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoLazyBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.lazy_bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.lazy_bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + 
reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.lazy_bind_off)); + return makeArrayRef(Ptr, DyldInfo.lazy_bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoExportsTrie() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.export_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.export_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.export_off)); + return makeArrayRef(Ptr, DyldInfo.export_size); } ArrayRef<uint8_t> MachOObjectFile::getUuid() const { if (!UuidLoadCmd) - return ArrayRef<uint8_t>(); + return None; // Returning a pointer is fine as uuid doesn't need endian swapping. const char *Ptr = UuidLoadCmd + offsetof(MachO::uuid_command, uuid); - return ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Ptr), 16); + return makeArrayRef(reinterpret_cast<const uint8_t *>(Ptr), 16); } StringRef MachOObjectFile::getStringTableData() const { @@ -2315,4 +2327,3 @@ ObjectFile::createMachOObjectFile(MemoryBufferRef Buffer) { return EC; return std::move(Ret); } - diff --git a/contrib/llvm/lib/Object/MachOUniversal.cpp b/contrib/llvm/lib/Object/MachOUniversal.cpp index 1d0e69e..a1c83b9 100644 --- a/contrib/llvm/lib/Object/MachOUniversal.cpp +++ b/contrib/llvm/lib/Object/MachOUniversal.cpp @@ -69,14 +69,14 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch( ErrorOr<std::unique_ptr<MachOObjectFile>> MachOUniversalBinary::ObjectForArch::getAsObjectFile() const { - if (Parent) { - StringRef ParentData = Parent->getData(); - StringRef ObjectData = ParentData.substr(Header.offset, Header.size); - StringRef ObjectName = Parent->getFileName(); - MemoryBufferRef ObjBuffer(ObjectData, ObjectName); - return ObjectFile::createMachOObjectFile(ObjBuffer); - } - return object_error::parse_failed; + if (!Parent) + return object_error::parse_failed; + + StringRef ParentData = Parent->getData(); + StringRef ObjectData = ParentData.substr(Header.offset, Header.size); + StringRef ObjectName = Parent->getFileName(); + MemoryBufferRef ObjBuffer(ObjectData, ObjectName); + return ObjectFile::createMachOObjectFile(ObjBuffer); } ErrorOr<std::unique_ptr<Archive>> diff --git a/contrib/llvm/lib/Object/Object.cpp b/contrib/llvm/lib/Object/Object.cpp index 5c4b7a6..b44c1a1 100644 --- a/contrib/llvm/lib/Object/Object.cpp +++ b/contrib/llvm/lib/Object/Object.cpp @@ -98,8 +98,10 @@ void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) { void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect, LLVMSymbolIteratorRef Sym) { - if (std::error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect))) + ErrorOr<section_iterator> SecOrErr = (*unwrap(Sym))->getSection(); + if (std::error_code ec = SecOrErr.getError()) report_fatal_error(ec.message()); + *unwrap(Sect) = *SecOrErr; } // ObjectFile Symbol iterators diff --git a/contrib/llvm/lib/Object/ObjectFile.cpp b/contrib/llvm/lib/Object/ObjectFile.cpp index f82edae..d12dc41 100644 --- a/contrib/llvm/lib/Object/ObjectFile.cpp +++ b/contrib/llvm/lib/Object/ObjectFile.cpp @@ -29,10 +29,10 @@ ObjectFile::ObjectFile(unsigned int Type, MemoryBufferRef Source) : SymbolicFile(Type, Source) {} bool SectionRef::containsSymbol(SymbolRef S) const { - section_iterator SymSec = getObject()->section_end(); - if 
(S.getSection(SymSec)) + ErrorOr<section_iterator> SymSec = S.getSection(); + if (!SymSec) return false; - return *this == *SymSec; + return *this == **SymSec; } uint64_t ObjectFile::getSymbolValue(DataRefImpl Ref) const { diff --git a/contrib/llvm/lib/Object/SymbolicFile.cpp b/contrib/llvm/lib/Object/SymbolicFile.cpp index 854e68e..bf79dfb 100644 --- a/contrib/llvm/lib/Object/SymbolicFile.cpp +++ b/contrib/llvm/lib/Object/SymbolicFile.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Object/COFF.h" +#include "llvm/Object/COFFImportFile.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" @@ -54,9 +56,10 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile( case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub: case sys::fs::file_magic::macho_dsym_companion: case sys::fs::file_magic::macho_kext_bundle: - case sys::fs::file_magic::coff_import_library: case sys::fs::file_magic::pecoff_executable: return ObjectFile::createObjectFile(Object, Type); + case sys::fs::file_magic::coff_import_library: + return std::unique_ptr<SymbolicFile>(new COFFImportFile(Object)); case sys::fs::file_magic::elf_relocatable: case sys::fs::file_magic::macho_object: case sys::fs::file_magic::coff_object: { diff --git a/contrib/llvm/lib/Option/Arg.cpp b/contrib/llvm/lib/Option/Arg.cpp index ac00073..c3de2d1 100644 --- a/contrib/llvm/lib/Option/Arg.cpp +++ b/contrib/llvm/lib/Option/Arg.cpp @@ -13,6 +13,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" using namespace llvm; using namespace llvm::opt; @@ -43,23 +44,25 @@ Arg::~Arg() { } } -void Arg::dump() const { - llvm::errs() << "<"; +void Arg::print(raw_ostream& O) const { + O << "<"; - llvm::errs() << " Opt:"; - Opt.dump(); + O << " Opt:"; + Opt.print(O); - llvm::errs() << " Index:" << Index; + O << " Index:" << Index; - llvm::errs() << " Values: ["; + O << " Values: ["; for (unsigned i = 0, e = Values.size(); i != e; ++i) { - if (i) llvm::errs() << ", "; - llvm::errs() << "'" << Values[i] << "'"; + if (i) O << ", "; + O << "'" << Values[i] << "'"; } - llvm::errs() << "]>\n"; + O << "]>\n"; } +LLVM_DUMP_METHOD void Arg::dump() const { print(dbgs()); } + std::string Arg::getAsString(const ArgList &Args) const { SmallString<256> Res; llvm::raw_svector_ostream OS(Res); diff --git a/contrib/llvm/lib/Option/ArgList.cpp b/contrib/llvm/lib/Option/ArgList.cpp index a74ead6..0826ef8 100644 --- a/contrib/llvm/lib/Option/ArgList.cpp +++ b/contrib/llvm/lib/Option/ArgList.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Option/Arg.h" #include "llvm/Option/Option.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -258,6 +259,21 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0, } } +void ArgList::AddAllArgs(ArgStringList &Output, + ArrayRef<OptSpecifier> Ids) const { + for (const Arg *Arg : Args) { + for (OptSpecifier Id : Ids) { + if (Arg->getOption().matches(Id)) { + Arg->claim(); + Arg->render(*this, Output); + break; + } + } + } +} + +/// This 3-opt variant of AddAllArgs could be eliminated in favor of one +/// that accepts a single specifier, given that the variant above accepts any +/// number of specifiers. 
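
A usage sketch for the ArrayRef overload of AddAllArgs added above; the option IDs are hypothetical stand-ins for values a tool's generated option table would define:

#include "llvm/Option/ArgList.h"
using namespace llvm::opt;

// Hypothetical option IDs; real ones come from a tool's Options.inc.
enum HypotheticalOptID : unsigned { OPT_I = 1, OPT_isystem, OPT_F };

static void renderIncludeArgs(const ArgList &Args, ArgStringList &Out) {
  // Claims and renders every -I/-isystem/-F argument, preserving the
  // relative command-line order across all three options.
  Args.AddAllArgs(Out, {OPT_I, OPT_isystem, OPT_F});
}
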
void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { for (auto Arg: filtered(Id0, Id1, Id2)) { @@ -313,6 +329,15 @@ const char *ArgList::GetOrMakeJoinedArgString(unsigned Index, return MakeArgString(LHS + RHS); } +void ArgList::print(raw_ostream &O) const { + for (Arg *A : *this) { + O << "* "; + A->print(O); + } +} + +LLVM_DUMP_METHOD void ArgList::dump() const { print(dbgs()); } + // void InputArgList::releaseMemory() { diff --git a/contrib/llvm/lib/Option/OptTable.cpp b/contrib/llvm/lib/Option/OptTable.cpp index e83536f..09d4ceb 100644 --- a/contrib/llvm/lib/Option/OptTable.cpp +++ b/contrib/llvm/lib/Option/OptTable.cpp @@ -84,11 +84,9 @@ static inline bool operator<(const OptTable::Info &I, const char *Name) { OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {} -OptTable::OptTable(const Info *OptionInfos, unsigned NumOptionInfos, - bool IgnoreCase) - : OptionInfos(OptionInfos), NumOptionInfos(NumOptionInfos), - IgnoreCase(IgnoreCase), TheInputOptionID(0), TheUnknownOptionID(0), - FirstSearchableIndex(0) { +OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase) + : OptionInfos(OptionInfos), IgnoreCase(IgnoreCase), TheInputOptionID(0), + TheUnknownOptionID(0), FirstSearchableIndex(0) { // Explicitly zero initialize the error to work around a bug in array // value-initialization on MinGW with gcc 4.3.5. @@ -199,8 +197,8 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index, if (isInput(PrefixesUnion, Str)) return new Arg(getOption(TheInputOptionID), Str, Index++, Str); - const Info *Start = OptionInfos + FirstSearchableIndex; - const Info *End = OptionInfos + getNumOptions(); + const Info *Start = OptionInfos.begin() + FirstSearchableIndex; + const Info *End = OptionInfos.end(); StringRef Name = StringRef(Str).ltrim(PrefixChars); // Search for the first next option which could be a prefix. diff --git a/contrib/llvm/lib/Option/Option.cpp b/contrib/llvm/lib/Option/Option.cpp index 221414d..ebf05aa 100644 --- a/contrib/llvm/lib/Option/Option.cpp +++ b/contrib/llvm/lib/Option/Option.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -35,10 +36,10 @@ Option::Option(const OptTable::Info *info, const OptTable *owner) } } -void Option::dump() const { - llvm::errs() << "<"; +void Option::print(raw_ostream &O) const { + O << "<"; switch (getKind()) { -#define P(N) case N: llvm::errs() << #N; break +#define P(N) case N: O << #N; break P(GroupClass); P(InputClass); P(UnknownClass); @@ -54,33 +55,35 @@ void Option::dump() const { } if (Info->Prefixes) { - llvm::errs() << " Prefixes:["; - for (const char * const *Pre = Info->Prefixes; *Pre != nullptr; ++Pre) { - llvm::errs() << '"' << *Pre << (*(Pre + 1) == nullptr ? "\"" : "\", "); + O << " Prefixes:["; + for (const char *const *Pre = Info->Prefixes; *Pre != nullptr; ++Pre) { + O << '"' << *Pre << (*(Pre + 1) == nullptr ? 
"\"" : "\", "); } - llvm::errs() << ']'; + O << ']'; } - llvm::errs() << " Name:\"" << getName() << '"'; + O << " Name:\"" << getName() << '"'; const Option Group = getGroup(); if (Group.isValid()) { - llvm::errs() << " Group:"; - Group.dump(); + O << " Group:"; + Group.print(O); } const Option Alias = getAlias(); if (Alias.isValid()) { - llvm::errs() << " Alias:"; - Alias.dump(); + O << " Alias:"; + Alias.print(O); } if (getKind() == MultiArgClass) - llvm::errs() << " NumArgs:" << getNumArgs(); + O << " NumArgs:" << getNumArgs(); - llvm::errs() << ">\n"; + O << ">\n"; } +void Option::dump() const { print(dbgs()); } + bool Option::matches(OptSpecifier Opt) const { // Aliases are never considered in matching, look through them. const Option Alias = getAlias(); diff --git a/contrib/llvm/lib/Passes/PassBuilder.cpp b/contrib/llvm/lib/Passes/PassBuilder.cpp index ba71320..8ba81f7 100644 --- a/contrib/llvm/lib/Passes/PassBuilder.cpp +++ b/contrib/llvm/lib/Passes/PassBuilder.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" @@ -28,9 +29,14 @@ #include "llvm/IR/Verifier.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" using namespace llvm; diff --git a/contrib/llvm/lib/Passes/PassRegistry.def b/contrib/llvm/lib/Passes/PassRegistry.def index d768a3a..241a789 100644 --- a/contrib/llvm/lib/Passes/PassRegistry.def +++ b/contrib/llvm/lib/Passes/PassRegistry.def @@ -27,10 +27,13 @@ MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) +MODULE_PASS("inferattrs", InferFunctionAttrsPass()) MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("print", PrintModulePass(dbgs())) MODULE_PASS("print-cg", LazyCallGraphPrinterPass(dbgs())) +MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) MODULE_PASS("verify", VerifierPass()) #undef MODULE_PASS @@ -54,6 +57,7 @@ FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis()) +FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) FUNCTION_ANALYSIS("targetir", TM ? 
TM->getTargetIRAnalysis() : TargetIRAnalysis()) @@ -62,6 +66,7 @@ FUNCTION_ANALYSIS("targetir", #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("early-cse", EarlyCSEPass()) FUNCTION_PASS("instcombine", InstCombinePass()) FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass()) @@ -71,7 +76,9 @@ FUNCTION_PASS("print", PrintFunctionPass(dbgs())) FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs())) +FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs())) FUNCTION_PASS("simplify-cfg", SimplifyCFGPass()) +FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass()) #undef FUNCTION_PASS diff --git a/contrib/llvm/lib/ProfileData/CoverageMapping.cpp b/contrib/llvm/lib/ProfileData/CoverageMapping.cpp index cf04fea..f5d477b 100644 --- a/contrib/llvm/lib/ProfileData/CoverageMapping.cpp +++ b/contrib/llvm/lib/ProfileData/CoverageMapping.cpp @@ -181,18 +181,6 @@ void FunctionRecordIterator::skipOtherFiles() { *this = FunctionRecordIterator(); } -/// Get the function name from the record, removing the filename prefix if -/// necessary. -static StringRef getFuncNameWithoutPrefix(const CoverageMappingRecord &Record) { - StringRef FunctionName = Record.FunctionName; - if (Record.Filenames.empty()) - return FunctionName; - StringRef Filename = sys::path::filename(Record.Filenames[0]); - if (FunctionName.startswith(Filename)) - FunctionName = FunctionName.drop_front(Filename.size() + 1); - return FunctionName; -} - ErrorOr<std::unique_ptr<CoverageMapping>> CoverageMapping::load(CoverageMappingReader &CoverageReader, IndexedInstrProfReader &ProfileReader) { @@ -216,7 +204,11 @@ CoverageMapping::load(CoverageMappingReader &CoverageReader, assert(!Record.MappingRegions.empty() && "Function has no regions"); - FunctionRecord Function(getFuncNameWithoutPrefix(Record), Record.Filenames); + StringRef OrigFuncName = Record.FunctionName; + if (!Record.Filenames.empty()) + OrigFuncName = + getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]); + FunctionRecord Function(OrigFuncName, Record.Filenames); for (const auto &Region : Record.MappingRegions) { ErrorOr<int64_t> ExecutionCount = Ctx.evaluate(Region.Count); if (!ExecutionCount) @@ -525,6 +517,6 @@ class CoverageMappingErrorCategoryType : public std::error_category { static ManagedStatic<CoverageMappingErrorCategoryType> ErrorCategory; -const std::error_category &llvm::coveragemap_category() { +const std::error_category &llvm::coverage::coveragemap_category() { return *ErrorCategory; } diff --git a/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp b/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp index 334a3f5..89e1cf4 100644 --- a/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp +++ b/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp @@ -290,36 +290,25 @@ std::error_code RawCoverageMappingReader::read() { return std::error_code(); } -namespace { - -/// \brief A helper structure to access the data from a section -/// in an object file. 
-struct SectionData { - StringRef Data; - uint64_t Address; - - std::error_code load(SectionRef &Section) { - if (auto Err = Section.getContents(Data)) - return Err; - Address = Section.getAddress(); - return std::error_code(); - } +std::error_code InstrProfSymtab::create(SectionRef &Section) { + if (auto Err = Section.getContents(Data)) + return Err; + Address = Section.getAddress(); + return std::error_code(); +} - std::error_code get(uint64_t Pointer, size_t Size, StringRef &Result) { - if (Pointer < Address) - return coveragemap_error::malformed; - auto Offset = Pointer - Address; - if (Offset + Size > Data.size()) - return coveragemap_error::malformed; - Result = Data.substr(Pointer - Address, Size); - return std::error_code(); - } -}; +StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) { + if (Pointer < Address) + return StringRef(); + auto Offset = Pointer - Address; + if (Offset + Size > Data.size()) + return StringRef(); + return Data.substr(Pointer - Address, Size); } template <typename T, support::endianness Endian> -std::error_code readCoverageMappingData( - SectionData &ProfileNames, StringRef Data, +static std::error_code readCoverageMappingData( + InstrProfSymtab &ProfileNames, StringRef Data, std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records, std::vector<StringRef> &Filenames) { using namespace support; @@ -327,23 +316,21 @@ std::error_code readCoverageMappingData( // Read the records in the coverage data section. for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) { - if (Buf + 4 * sizeof(uint32_t) > End) + if (Buf + sizeof(CovMapHeader) > End) return coveragemap_error::malformed; - uint32_t NRecords = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t FilenamesSize = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t CoverageSize = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t Version = endian::readNext<uint32_t, Endian, unaligned>(Buf); - - switch (Version) { - case CoverageMappingVersion1: - break; - default: + auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf); + uint32_t NRecords = CovHeader->getNRecords<Endian>(); + uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>(); + uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>(); + uint32_t Version = CovHeader->getVersion<Endian>(); + Buf = reinterpret_cast<const char *>(++CovHeader); + + if (Version > coverage::CoverageMappingCurrentVersion) return coveragemap_error::unsupported_version; - } // Skip past the function records, saving the start and end for later. const char *FunBuf = Buf; - Buf += NRecords * (sizeof(T) + 2 * sizeof(uint32_t) + sizeof(uint64_t)); + Buf += NRecords * sizeof(coverage::CovMapFunctionRecord<T>); const char *FunEnd = Buf; // Get the filenames. @@ -366,12 +353,12 @@ std::error_code readCoverageMappingData( // before reading the next map. 
Buf += alignmentAdjustment(Buf, 8); - while (FunBuf < FunEnd) { + auto CFR = + reinterpret_cast<const coverage::CovMapFunctionRecord<T> *>(FunBuf); + while ((const char *)CFR < FunEnd) { // Read the function information - T NamePtr = endian::readNext<T, Endian, unaligned>(FunBuf); - uint32_t NameSize = endian::readNext<uint32_t, Endian, unaligned>(FunBuf); - uint32_t DataSize = endian::readNext<uint32_t, Endian, unaligned>(FunBuf); - uint64_t FuncHash = endian::readNext<uint64_t, Endian, unaligned>(FunBuf); + uint32_t DataSize = CFR->template getDataSize<Endian>(); + uint64_t FuncHash = CFR->template getFuncHash<Endian>(); // Now use that to read the coverage data. if (CovBuf + DataSize > CovEnd) @@ -382,16 +369,18 @@ std::error_code readCoverageMappingData( // Ignore this record if we already have a record that points to the same // function name. This is useful to ignore the redundant records for the // functions with ODR linkage. - if (!UniqueFunctionMappingData.insert(NamePtr).second) + T NameRef = CFR->template getFuncNameRef<Endian>(); + if (!UniqueFunctionMappingData.insert(NameRef).second) continue; - // Finally, grab the name and create a record. StringRef FuncName; - if (std::error_code EC = ProfileNames.get(NamePtr, NameSize, FuncName)) + if (std::error_code EC = + CFR->template getFuncName<Endian>(ProfileNames, FuncName)) return EC; Records.push_back(BinaryCoverageReader::ProfileMappingRecord( CoverageMappingVersion(Version), FuncName, FuncHash, Mapping, FilenamesBegin, Filenames.size() - FilenamesBegin)); + CFR++; } } @@ -401,7 +390,7 @@ std::error_code readCoverageMappingData( static const char *TestingFormatMagic = "llvmcovmtestdata"; static std::error_code loadTestingFormat(StringRef Data, - SectionData &ProfileNames, + InstrProfSymtab &ProfileNames, StringRef &CoverageMapping, uint8_t &BytesInAddress, support::endianness &Endian) { @@ -420,14 +409,14 @@ static std::error_code loadTestingFormat(StringRef Data, if (Data.size() < 1) return coveragemap_error::truncated; N = 0; - ProfileNames.Address = + uint64_t Address = decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N); if (N > Data.size()) return coveragemap_error::malformed; Data = Data.substr(N); if (Data.size() < ProfileNamesSize) return coveragemap_error::malformed; - ProfileNames.Data = Data.substr(0, ProfileNamesSize); + ProfileNames.create(Data.substr(0, ProfileNamesSize), Address); CoverageMapping = Data.substr(ProfileNamesSize); return std::error_code(); } @@ -443,12 +432,10 @@ static ErrorOr<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) { return coveragemap_error::no_data_found; } -static std::error_code loadBinaryFormat(MemoryBufferRef ObjectBuffer, - SectionData &ProfileNames, - StringRef &CoverageMapping, - uint8_t &BytesInAddress, - support::endianness &Endian, - StringRef Arch) { +static std::error_code +loadBinaryFormat(MemoryBufferRef ObjectBuffer, InstrProfSymtab &ProfileNames, + StringRef &CoverageMapping, uint8_t &BytesInAddress, + support::endianness &Endian, StringRef Arch) { auto BinOrErr = object::createBinary(ObjectBuffer); if (std::error_code EC = BinOrErr.getError()) return EC; @@ -477,17 +464,18 @@ static std::error_code loadBinaryFormat(MemoryBufferRef ObjectBuffer, : support::endianness::big; // Look for the sections that we are interested in. 
- auto NamesSection = lookupSection(*OF, "__llvm_prf_names"); + auto NamesSection = lookupSection(*OF, getInstrProfNameSectionName(false)); if (auto EC = NamesSection.getError()) return EC; - auto CoverageSection = lookupSection(*OF, "__llvm_covmap"); + auto CoverageSection = + lookupSection(*OF, getInstrProfCoverageSectionName(false)); if (auto EC = CoverageSection.getError()) return EC; // Get the contents of the given sections. if (std::error_code EC = CoverageSection->getContents(CoverageMapping)) return EC; - if (std::error_code EC = ProfileNames.load(*NamesSection)) + if (std::error_code EC = ProfileNames.create(*NamesSection)) return EC; return std::error_code(); @@ -498,33 +486,33 @@ BinaryCoverageReader::create(std::unique_ptr<MemoryBuffer> &ObjectBuffer, StringRef Arch) { std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader()); - SectionData Profile; + InstrProfSymtab ProfileNames; StringRef Coverage; uint8_t BytesInAddress; support::endianness Endian; std::error_code EC; if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic)) // This is a special format used for testing. - EC = loadTestingFormat(ObjectBuffer->getBuffer(), Profile, Coverage, + EC = loadTestingFormat(ObjectBuffer->getBuffer(), ProfileNames, Coverage, BytesInAddress, Endian); else - EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), Profile, Coverage, - BytesInAddress, Endian, Arch); + EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), ProfileNames, + Coverage, BytesInAddress, Endian, Arch); if (EC) return EC; if (BytesInAddress == 4 && Endian == support::endianness::little) EC = readCoverageMappingData<uint32_t, support::endianness::little>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 4 && Endian == support::endianness::big) EC = readCoverageMappingData<uint32_t, support::endianness::big>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 8 && Endian == support::endianness::little) EC = readCoverageMappingData<uint64_t, support::endianness::little>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 8 && Endian == support::endianness::big) EC = readCoverageMappingData<uint64_t, support::endianness::big>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else return coveragemap_error::malformed; if (EC) diff --git a/contrib/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm/lib/ProfileData/InstrProf.cpp index 92822a7..d677763 100644 --- a/contrib/llvm/lib/ProfileData/InstrProf.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProf.cpp @@ -13,7 +13,14 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -28,28 +35,32 @@ class InstrProfErrorCategoryType : public std::error_category { return "Success"; case instrprof_error::eof: return "End of File"; + case 
instrprof_error::unrecognized_format: + return "Unrecognized instrumentation profile encoding format"; case instrprof_error::bad_magic: - return "Invalid profile data (bad magic)"; + return "Invalid instrumentation profile data (bad magic)"; case instrprof_error::bad_header: - return "Invalid profile data (file header is corrupt)"; + return "Invalid instrumentation profile data (file header is corrupt)"; case instrprof_error::unsupported_version: - return "Unsupported profiling format version"; + return "Unsupported instrumentation profile format version"; case instrprof_error::unsupported_hash_type: - return "Unsupported profiling hash"; + return "Unsupported instrumentation profile hash type"; case instrprof_error::too_large: return "Too much profile data"; case instrprof_error::truncated: return "Truncated profile data"; case instrprof_error::malformed: - return "Malformed profile data"; + return "Malformed instrumentation profile data"; case instrprof_error::unknown_function: return "No profile data available for function"; case instrprof_error::hash_mismatch: - return "Function hash mismatch"; + return "Function control flow change detected (hash mismatch)"; case instrprof_error::count_mismatch: - return "Function count mismatch"; + return "Function basic block count change detected (counter mismatch)"; case instrprof_error::counter_overflow: return "Counter overflow"; + case instrprof_error::value_site_count_mismatch: + return "Function value site count change detected (counter mismatch)"; } llvm_unreachable("A value of instrprof_error has no message."); } @@ -61,3 +72,531 @@ static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory; const std::error_category &llvm::instrprof_category() { return *ErrorCategory; } + +namespace llvm { + +std::string getPGOFuncName(StringRef RawFuncName, + GlobalValue::LinkageTypes Linkage, + StringRef FileName, + uint64_t Version LLVM_ATTRIBUTE_UNUSED) { + + // Function names may be prefixed with a binary '1' to indicate + // that the backend should not modify the symbols due to any platform + // naming convention. Do not include that '1' in the PGO profile name. + if (RawFuncName[0] == '\1') + RawFuncName = RawFuncName.substr(1); + + std::string FuncName = RawFuncName; + if (llvm::GlobalValue::isLocalLinkage(Linkage)) { + // For local symbols, prepend the main file name to distinguish them. + // Do not include the full path in the file name since there's no guarantee + // that it will stay the same, e.g., if the files are checked out from + // version control in different locations. + if (FileName.empty()) + FuncName = FuncName.insert(0, "<unknown>:"); + else + FuncName = FuncName.insert(0, FileName.str() + ":"); + } + return FuncName; +} + +std::string getPGOFuncName(const Function &F, uint64_t Version) { + return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName(), + Version); +} + +StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName) { + if (FileName.empty()) + return PGOFuncName; + // Drop the file name including ':'. See also getPGOFuncName. + if (PGOFuncName.startswith(FileName)) + PGOFuncName = PGOFuncName.drop_front(FileName.size() + 1); + return PGOFuncName; +} + +// \p FuncName is the string used as profile lookup key for the function. A +// symbol is created to hold the name. Return the legalized symbol name. 
+static std::string getPGOFuncNameVarName(StringRef FuncName, + GlobalValue::LinkageTypes Linkage) { + std::string VarName = getInstrProfNameVarPrefix(); + VarName += FuncName; + + if (!GlobalValue::isLocalLinkage(Linkage)) + return VarName; + + // Now fix up illegal chars in local VarName that may upset the assembler. + const char *InvalidChars = "-:<>\"'"; + size_t found = VarName.find_first_of(InvalidChars); + while (found != std::string::npos) { + VarName[found] = '_'; + found = VarName.find_first_of(InvalidChars, found + 1); + } + return VarName; +} + +GlobalVariable *createPGOFuncNameVar(Module &M, + GlobalValue::LinkageTypes Linkage, + StringRef FuncName) { + + // We generally want to match the function's linkage, but available_externally + // and extern_weak both have the wrong semantics, and anything that doesn't + // need to link across compilation units doesn't need to be visible at all. + if (Linkage == GlobalValue::ExternalWeakLinkage) + Linkage = GlobalValue::LinkOnceAnyLinkage; + else if (Linkage == GlobalValue::AvailableExternallyLinkage) + Linkage = GlobalValue::LinkOnceODRLinkage; + else if (Linkage == GlobalValue::InternalLinkage || + Linkage == GlobalValue::ExternalLinkage) + Linkage = GlobalValue::PrivateLinkage; + + auto *Value = ConstantDataArray::getString(M.getContext(), FuncName, false); + auto FuncNameVar = + new GlobalVariable(M, Value->getType(), true, Linkage, Value, + getPGOFuncNameVarName(FuncName, Linkage)); + + // Hide the symbol so that we correctly get a copy for each executable. + if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); + + return FuncNameVar; +} + +GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) { + return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName); +} + +int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs, + bool doCompression, std::string &Result) { + uint8_t Header[16], *P = Header; + std::string UncompressedNameStrings = + join(NameStrs.begin(), NameStrs.end(), StringRef(" ")); + + unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P); + P += EncLen; + + auto WriteStringToResult = [&](size_t CompressedLen, + const std::string &InputStr) { + EncLen = encodeULEB128(CompressedLen, P); + P += EncLen; + char *HeaderStr = reinterpret_cast<char *>(&Header[0]); + unsigned HeaderLen = P - &Header[0]; + Result.append(HeaderStr, HeaderLen); + Result += InputStr; + return 0; + }; + + if (!doCompression) + return WriteStringToResult(0, UncompressedNameStrings); + + SmallVector<char, 128> CompressedNameStrings; + zlib::Status Success = + zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings, + zlib::BestSizeCompression); + + if (Success != zlib::StatusOK) + return 1; + + return WriteStringToResult( + CompressedNameStrings.size(), + std::string(CompressedNameStrings.data(), CompressedNameStrings.size())); +} + +StringRef getPGOFuncNameInitializer(GlobalVariable *NameVar) { + auto *Arr = cast<ConstantDataArray>(NameVar->getInitializer()); + StringRef NameStr = + Arr->isCString() ? 
Arr->getAsCString() : Arr->getAsString(); + return NameStr; +} + +int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars, + std::string &Result) { + std::vector<std::string> NameStrs; + for (auto *NameVar : NameVars) { + NameStrs.push_back(getPGOFuncNameInitializer(NameVar)); + } + return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result); +} + +int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { + const uint8_t *P = reinterpret_cast<const uint8_t *>(NameStrings.data()); + const uint8_t *EndP = reinterpret_cast<const uint8_t *>(NameStrings.data() + + NameStrings.size()); + while (P < EndP) { + uint32_t N; + uint64_t UncompressedSize = decodeULEB128(P, &N); + P += N; + uint64_t CompressedSize = decodeULEB128(P, &N); + P += N; + bool isCompressed = (CompressedSize != 0); + SmallString<128> UncompressedNameStrings; + StringRef NameStrings; + if (isCompressed) { + StringRef CompressedNameStrings(reinterpret_cast<const char *>(P), + CompressedSize); + if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings, + UncompressedSize) != zlib::StatusOK) + return 1; + P += CompressedSize; + NameStrings = StringRef(UncompressedNameStrings.data(), + UncompressedNameStrings.size()); + } else { + NameStrings = + StringRef(reinterpret_cast<const char *>(P), UncompressedSize); + P += UncompressedSize; + } + // Now parse the name strings. + SmallVector<StringRef, 0> Names; + NameStrings.split(Names, ' '); + for (StringRef &Name : Names) + Symtab.addFuncName(Name); + + while (P < EndP && *P == 0) + P++; + } + Symtab.finalizeSymtab(); + return 0; +} + +instrprof_error InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, + uint64_t Weight) { + this->sortByTargetValues(); + Input.sortByTargetValues(); + auto I = ValueData.begin(); + auto IE = ValueData.end(); + instrprof_error Result = instrprof_error::success; + for (auto J = Input.ValueData.begin(), JE = Input.ValueData.end(); J != JE; + ++J) { + while (I != IE && I->Value < J->Value) + ++I; + if (I != IE && I->Value == J->Value) { + bool Overflowed; + I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + ++I; + continue; + } + ValueData.insert(I, *J); + } + return Result; +} + +instrprof_error InstrProfValueSiteRecord::scale(uint64_t Weight) { + instrprof_error Result = instrprof_error::success; + for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { + bool Overflowed; + I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + } + return Result; +} + +// Merge Value Profile data from Src record to this record for ValueKind. +// Scale merged value counts by \p Weight. 
+instrprof_error InstrProfRecord::mergeValueProfData(uint32_t ValueKind, + InstrProfRecord &Src, + uint64_t Weight) { + uint32_t ThisNumValueSites = getNumValueSites(ValueKind); + uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind); + if (ThisNumValueSites != OtherNumValueSites) + return instrprof_error::value_site_count_mismatch; + std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = + getValueSitesForKind(ValueKind); + std::vector<InstrProfValueSiteRecord> &OtherSiteRecords = + Src.getValueSitesForKind(ValueKind); + instrprof_error Result = instrprof_error::success; + for (uint32_t I = 0; I < ThisNumValueSites; I++) + MergeResult(Result, ThisSiteRecords[I].merge(OtherSiteRecords[I], Weight)); + return Result; +} + +instrprof_error InstrProfRecord::merge(InstrProfRecord &Other, + uint64_t Weight) { + // If the number of counters doesn't match we either have bad data + // or a hash collision. + if (Counts.size() != Other.Counts.size()) + return instrprof_error::count_mismatch; + + instrprof_error Result = instrprof_error::success; + + for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { + bool Overflowed; + Counts[I] = + SaturatingMultiplyAdd(Other.Counts[I], Weight, Counts[I], &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + } + + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + MergeResult(Result, mergeValueProfData(Kind, Other, Weight)); + + return Result; +} + +instrprof_error InstrProfRecord::scaleValueProfData(uint32_t ValueKind, + uint64_t Weight) { + uint32_t ThisNumValueSites = getNumValueSites(ValueKind); + std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = + getValueSitesForKind(ValueKind); + instrprof_error Result = instrprof_error::success; + for (uint32_t I = 0; I < ThisNumValueSites; I++) + MergeResult(Result, ThisSiteRecords[I].scale(Weight)); + return Result; +} + +instrprof_error InstrProfRecord::scale(uint64_t Weight) { + instrprof_error Result = instrprof_error::success; + for (auto &Count : this->Counts) { + bool Overflowed; + Count = SaturatingMultiply(Count, Weight, &Overflowed); + if (Overflowed && Result == instrprof_error::success) { + Result = instrprof_error::counter_overflow; + } + } + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + MergeResult(Result, scaleValueProfData(Kind, Weight)); + + return Result; +} + +// Map indirect call target name hash to name string. +uint64_t InstrProfRecord::remapValue(uint64_t Value, uint32_t ValueKind, + ValueMapType *ValueMap) { + if (!ValueMap) + return Value; + switch (ValueKind) { + case IPVK_IndirectCallTarget: { + auto Result = + std::lower_bound(ValueMap->begin(), ValueMap->end(), Value, + [](const std::pair<uint64_t, uint64_t> &LHS, + uint64_t RHS) { return LHS.first < RHS; }); + if (Result != ValueMap->end()) + Value = (uint64_t)Result->second; + break; + } + } + return Value; +} + +void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site, + InstrProfValueData *VData, uint32_t N, + ValueMapType *ValueMap) { + for (uint32_t I = 0; I < N; I++) { + VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap); + } + std::vector<InstrProfValueSiteRecord> &ValueSites = + getValueSitesForKind(ValueKind); + if (N == 0) + ValueSites.push_back(InstrProfValueSiteRecord()); + else + ValueSites.emplace_back(VData, VData + N); +} + +#define INSTR_PROF_COMMON_API_IMPL +#include "llvm/ProfileData/InstrProfData.inc" + +/*! + * \brief ValueProfRecordClosure Interface implementation for InstrProfRecord + * class. 
These C wrappers are used as adaptors so that C++ code can be + * invoked as callbacks. + */ +uint32_t getNumValueKindsInstrProf(const void *Record) { + return reinterpret_cast<const InstrProfRecord *>(Record)->getNumValueKinds(); +} + +uint32_t getNumValueSitesInstrProf(const void *Record, uint32_t VKind) { + return reinterpret_cast<const InstrProfRecord *>(Record) + ->getNumValueSites(VKind); +} + +uint32_t getNumValueDataInstrProf(const void *Record, uint32_t VKind) { + return reinterpret_cast<const InstrProfRecord *>(Record) + ->getNumValueData(VKind); +} + +uint32_t getNumValueDataForSiteInstrProf(const void *R, uint32_t VK, + uint32_t S) { + return reinterpret_cast<const InstrProfRecord *>(R) + ->getNumValueDataForSite(VK, S); +} + +void getValueForSiteInstrProf(const void *R, InstrProfValueData *Dst, + uint32_t K, uint32_t S, + uint64_t (*Mapper)(uint32_t, uint64_t)) { + return reinterpret_cast<const InstrProfRecord *>(R)->getValueForSite( + Dst, K, S, Mapper); +} + +ValueProfData *allocValueProfDataInstrProf(size_t TotalSizeInBytes) { + ValueProfData *VD = + (ValueProfData *)(new (::operator new(TotalSizeInBytes)) ValueProfData()); + memset(VD, 0, TotalSizeInBytes); + return VD; +} + +static ValueProfRecordClosure InstrProfRecordClosure = { + 0, + getNumValueKindsInstrProf, + getNumValueSitesInstrProf, + getNumValueDataInstrProf, + getNumValueDataForSiteInstrProf, + 0, + getValueForSiteInstrProf, + allocValueProfDataInstrProf}; + +// Wrapper implementation using the closure mechanism. +uint32_t ValueProfData::getSize(const InstrProfRecord &Record) { + InstrProfRecordClosure.Record = &Record; + return getValueProfDataSize(&InstrProfRecordClosure); +} + +// Wrapper implementation using the closure mechanism. +std::unique_ptr<ValueProfData> +ValueProfData::serializeFrom(const InstrProfRecord &Record) { + InstrProfRecordClosure.Record = &Record; + + std::unique_ptr<ValueProfData> VPD( + serializeValueProfDataFrom(&InstrProfRecordClosure, nullptr)); + return VPD; +} + +void ValueProfRecord::deserializeTo(InstrProfRecord &Record, + InstrProfRecord::ValueMapType *VMap) { + Record.reserveSites(Kind, NumValueSites); + + InstrProfValueData *ValueData = getValueProfRecordValueData(this); + for (uint64_t VSite = 0; VSite < NumValueSites; ++VSite) { + uint8_t ValueDataCount = this->SiteCountArray[VSite]; + Record.addValueData(Kind, VSite, ValueData, ValueDataCount, VMap); + ValueData += ValueDataCount; + } +} + +// For writing/serializing, Old is the host endianness, and New is +// byte order intended on disk. For Reading/deserialization, Old +// is the on-disk source endianness, and New is the host endianness. +void ValueProfRecord::swapBytes(support::endianness Old, + support::endianness New) { + using namespace support; + if (Old == New) + return; + + if (getHostEndianness() != Old) { + sys::swapByteOrder<uint32_t>(NumValueSites); + sys::swapByteOrder<uint32_t>(Kind); + } + uint32_t ND = getValueProfRecordNumValueData(this); + InstrProfValueData *VD = getValueProfRecordValueData(this); + + // No need to swap byte array: SiteCountArrray. 
+ for (uint32_t I = 0; I < ND; I++) { + sys::swapByteOrder<uint64_t>(VD[I].Value); + sys::swapByteOrder<uint64_t>(VD[I].Count); + } + if (getHostEndianness() == Old) { + sys::swapByteOrder<uint32_t>(NumValueSites); + sys::swapByteOrder<uint32_t>(Kind); + } +} + +void ValueProfData::deserializeTo(InstrProfRecord &Record, + InstrProfRecord::ValueMapType *VMap) { + if (NumValueKinds == 0) + return; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + VR->deserializeTo(Record, VMap); + VR = getValueProfRecordNext(VR); + } +} + +template <class T> +static T swapToHostOrder(const unsigned char *&D, support::endianness Orig) { + using namespace support; + if (Orig == little) + return endian::readNext<T, little, unaligned>(D); + else + return endian::readNext<T, big, unaligned>(D); +} + +static std::unique_ptr<ValueProfData> allocValueProfData(uint32_t TotalSize) { + return std::unique_ptr<ValueProfData>(new (::operator new(TotalSize)) + ValueProfData()); +} + +instrprof_error ValueProfData::checkIntegrity() { + if (NumValueKinds > IPVK_Last + 1) + return instrprof_error::malformed; + // Total size needs to be mulltiple of quadword size. + if (TotalSize % sizeof(uint64_t)) + return instrprof_error::malformed; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < this->NumValueKinds; K++) { + if (VR->Kind > IPVK_Last) + return instrprof_error::malformed; + VR = getValueProfRecordNext(VR); + if ((char *)VR - (char *)this > (ptrdiff_t)TotalSize) + return instrprof_error::malformed; + } + return instrprof_error::success; +} + +ErrorOr<std::unique_ptr<ValueProfData>> +ValueProfData::getValueProfData(const unsigned char *D, + const unsigned char *const BufferEnd, + support::endianness Endianness) { + using namespace support; + if (D + sizeof(ValueProfData) > BufferEnd) + return instrprof_error::truncated; + + const unsigned char *Header = D; + uint32_t TotalSize = swapToHostOrder<uint32_t>(Header, Endianness); + if (D + TotalSize > BufferEnd) + return instrprof_error::too_large; + + std::unique_ptr<ValueProfData> VPD = allocValueProfData(TotalSize); + memcpy(VPD.get(), D, TotalSize); + // Byte swap. 
+ VPD->swapBytesToHost(Endianness); + + instrprof_error EC = VPD->checkIntegrity(); + if (EC != instrprof_error::success) + return EC; + + return std::move(VPD); +} + +void ValueProfData::swapBytesToHost(support::endianness Endianness) { + using namespace support; + if (Endianness == getHostEndianness()) + return; + + sys::swapByteOrder<uint32_t>(TotalSize); + sys::swapByteOrder<uint32_t>(NumValueKinds); + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + VR->swapBytes(Endianness, getHostEndianness()); + VR = getValueProfRecordNext(VR); + } +} + +void ValueProfData::swapBytesFromHost(support::endianness Endianness) { + using namespace support; + if (Endianness == getHostEndianness()) + return; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + ValueProfRecord *NVR = getValueProfRecordNext(VR); + VR->swapBytes(getHostEndianness(), Endianness); + VR = NVR; + } + sys::swapByteOrder<uint32_t>(TotalSize); + sys::swapByteOrder<uint32_t>(NumValueKinds); +} + +} diff --git a/contrib/llvm/lib/ProfileData/InstrProfIndexed.h b/contrib/llvm/lib/ProfileData/InstrProfIndexed.h deleted file mode 100644 index ebca7b2..0000000 --- a/contrib/llvm/lib/ProfileData/InstrProfIndexed.h +++ /dev/null @@ -1,56 +0,0 @@ -//=-- InstrProfIndexed.h - Indexed profiling format support -------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Shared header for the instrumented profile data reader and writer. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H -#define LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H - -#include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MD5.h" - -namespace llvm { - -namespace IndexedInstrProf { -enum class HashT : uint32_t { - MD5, - - Last = MD5 -}; - -static inline uint64_t MD5Hash(StringRef Str) { - MD5 Hash; - Hash.update(Str); - llvm::MD5::MD5Result Result; - Hash.final(Result); - // Return the least significant 8 bytes. Our MD5 implementation returns the - // result in little endian, so we may need to swap bytes. 
- using namespace llvm::support; - return endian::read<uint64_t, little, unaligned>(Result); -} - -static inline uint64_t ComputeHash(HashT Type, StringRef K) { - switch (Type) { - case HashT::MD5: - return IndexedInstrProf::MD5Hash(K); - } - llvm_unreachable("Unhandled hash type"); -} - -const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81" -const uint64_t Version = 2; -const HashT HashType = HashT::MD5; -} - -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp index 8a529a0..5e83456 100644 --- a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfReader.h" -#include "InstrProfIndexed.h" #include "llvm/ADT/STLExtras.h" #include <cassert> @@ -55,8 +54,10 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { Result.reset(new RawInstrProfReader64(std::move(Buffer))); else if (RawInstrProfReader32::hasFormat(*Buffer)) Result.reset(new RawInstrProfReader32(std::move(Buffer))); - else + else if (TextInstrProfReader::hasFormat(*Buffer)) Result.reset(new TextInstrProfReader(std::move(Buffer))); + else + return instrprof_error::unrecognized_format; // Initialize the reader and return the result. if (std::error_code EC = initializeReader(*Result)) @@ -98,16 +99,98 @@ void InstrProfIterator::Increment() { *this = InstrProfIterator(); } +bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) { + // Verify that this really looks like plain ASCII text by checking a + // 'reasonable' number of characters (up to profile magic size). + size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t)); + StringRef buffer = Buffer.getBufferStart(); + return count == 0 || + std::all_of(buffer.begin(), buffer.begin() + count, + [](char c) { return ::isprint(c) || ::isspace(c); }); +} + +std::error_code TextInstrProfReader::readHeader() { + Symtab.reset(new InstrProfSymtab()); + return success(); +} + +std::error_code +TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { + +#define CHECK_LINE_END(Line) \ + if (Line.is_at_end()) \ + return error(instrprof_error::truncated); +#define READ_NUM(Str, Dst) \ + if ((Str).getAsInteger(10, (Dst))) \ + return error(instrprof_error::malformed); +#define VP_READ_ADVANCE(Val) \ + CHECK_LINE_END(Line); \ + uint32_t Val; \ + READ_NUM((*Line), (Val)); \ + Line++; + + if (Line.is_at_end()) + return success(); + + uint32_t NumValueKinds; + if (Line->getAsInteger(10, NumValueKinds)) { + // No value profile data + return success(); + } + if (NumValueKinds == 0 || NumValueKinds > IPVK_Last + 1) + return error(instrprof_error::malformed); + Line++; + + for (uint32_t VK = 0; VK < NumValueKinds; VK++) { + VP_READ_ADVANCE(ValueKind); + if (ValueKind > IPVK_Last) + return error(instrprof_error::malformed); + VP_READ_ADVANCE(NumValueSites); + if (!NumValueSites) + continue; + + Record.reserveSites(VK, NumValueSites); + for (uint32_t S = 0; S < NumValueSites; S++) { + VP_READ_ADVANCE(NumValueData); + + std::vector<InstrProfValueData> CurrentValues; + for (uint32_t V = 0; V < NumValueData; V++) { + CHECK_LINE_END(Line); + std::pair<StringRef, StringRef> VD = Line->split(':'); + uint64_t TakenCount, Value; + if (VK == IPVK_IndirectCallTarget) { + Symtab->addFuncName(VD.first); + Value = IndexedInstrProf::ComputeHash(VD.first); + } else { + READ_NUM(VD.first, Value); 
+ } + READ_NUM(VD.second, TakenCount); + CurrentValues.push_back({Value, TakenCount}); + Line++; + } + Record.addValueData(VK, S, CurrentValues.data(), NumValueData, nullptr); + } + } + return success(); + +#undef CHECK_LINE_END +#undef READ_NUM +#undef VP_READ_ADVANCE +} + std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { // Skip empty lines and comments. while (!Line.is_at_end() && (Line->empty() || Line->startswith("#"))) ++Line; // If we hit EOF while looking for a name, we're done. - if (Line.is_at_end()) + if (Line.is_at_end()) { + Symtab->finalizeSymtab(); return error(instrprof_error::eof); + } // Read the function name. Record.Name = *Line++; + Symtab->addFuncName(Record.Name); // Read the function hash. if (Line.is_at_end()) @@ -136,36 +219,14 @@ std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { Record.Counts.push_back(Count); } - return success(); -} + // Check if value profile data exists and read it if so. + if (std::error_code EC = readValueProfileData(Record)) + return EC; -template <class IntPtrT> -static uint64_t getRawMagic(); - -template <> -uint64_t getRawMagic<uint64_t>() { - return - uint64_t(255) << 56 | - uint64_t('l') << 48 | - uint64_t('p') << 40 | - uint64_t('r') << 32 | - uint64_t('o') << 24 | - uint64_t('f') << 16 | - uint64_t('r') << 8 | - uint64_t(129); -} - -template <> -uint64_t getRawMagic<uint32_t>() { - return - uint64_t(255) << 56 | - uint64_t('l') << 48 | - uint64_t('p') << 40 | - uint64_t('r') << 32 | - uint64_t('o') << 24 | - uint64_t('f') << 16 | - uint64_t('R') << 8 | - uint64_t(129); + // This is needed to avoid two pass parsing because llvm-profdata + // does dumping while reading. + Symtab->finalizeSymtab(); + return success(); } template <class IntPtrT> @@ -174,19 +235,19 @@ bool RawInstrProfReader<IntPtrT>::hasFormat(const MemoryBuffer &DataBuffer) { return false; uint64_t Magic = *reinterpret_cast<const uint64_t *>(DataBuffer.getBufferStart()); - return getRawMagic<IntPtrT>() == Magic || - sys::getSwappedBytes(getRawMagic<IntPtrT>()) == Magic; + return RawInstrProf::getMagic<IntPtrT>() == Magic || + sys::getSwappedBytes(RawInstrProf::getMagic<IntPtrT>()) == Magic; } template <class IntPtrT> std::error_code RawInstrProfReader<IntPtrT>::readHeader() { if (!hasFormat(*DataBuffer)) return error(instrprof_error::bad_magic); - if (DataBuffer->getBufferSize() < sizeof(RawHeader)) + if (DataBuffer->getBufferSize() < sizeof(RawInstrProf::Header)) return error(instrprof_error::bad_header); - auto *Header = - reinterpret_cast<const RawHeader *>(DataBuffer->getBufferStart()); - ShouldSwapBytes = Header->Magic != getRawMagic<IntPtrT>(); + auto *Header = reinterpret_cast<const RawInstrProf::Header *>( + DataBuffer->getBufferStart()); + ShouldSwapBytes = Header->Magic != RawInstrProf::getMagic<IntPtrT>(); return readHeader(*Header); } @@ -202,29 +263,38 @@ RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) { return instrprof_error::eof; // If there isn't enough space for another header, this is probably just // garbage at the end of the file. - if (CurrentPos + sizeof(RawHeader) > End) + if (CurrentPos + sizeof(RawInstrProf::Header) > End) return instrprof_error::malformed; // The writer ensures each profile is padded to start at an aligned address. if (reinterpret_cast<size_t>(CurrentPos) % alignOf<uint64_t>()) return instrprof_error::malformed; // The magic should have the same byte order as in the previous header. 
uint64_t Magic = *reinterpret_cast<const uint64_t *>(CurrentPos); - if (Magic != swap(getRawMagic<IntPtrT>())) + if (Magic != swap(RawInstrProf::getMagic<IntPtrT>())) return instrprof_error::bad_magic; // There's another profile to read, so we need to process the header. - auto *Header = reinterpret_cast<const RawHeader *>(CurrentPos); + auto *Header = reinterpret_cast<const RawInstrProf::Header *>(CurrentPos); return readHeader(*Header); } -static uint64_t getRawVersion() { - return 1; +template <class IntPtrT> +void RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) { + for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) { + StringRef FunctionName(getName(I->NamePtr), swap(I->NameSize)); + Symtab.addFuncName(FunctionName); + const IntPtrT FPtr = swap(I->FunctionPointer); + if (!FPtr) + continue; + Symtab.mapAddress(FPtr, IndexedInstrProf::ComputeHash(FunctionName)); + } + Symtab.finalizeSymtab(); } template <class IntPtrT> std::error_code -RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) { - if (swap(Header.Version) != getRawVersion()) +RawInstrProfReader<IntPtrT>::readHeader(const RawInstrProf::Header &Header) { + if (swap(Header.Version) != RawInstrProf::Version) return error(instrprof_error::unsupported_version); CountersDelta = swap(Header.CountersDelta); @@ -232,50 +302,69 @@ RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) { auto DataSize = swap(Header.DataSize); auto CountersSize = swap(Header.CountersSize); auto NamesSize = swap(Header.NamesSize); + auto ValueDataSize = swap(Header.ValueDataSize); + ValueKindLast = swap(Header.ValueKindLast); + + auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>); + auto PaddingSize = getNumPaddingBytes(NamesSize); - ptrdiff_t DataOffset = sizeof(RawHeader); - ptrdiff_t CountersOffset = DataOffset + sizeof(ProfileData) * DataSize; + ptrdiff_t DataOffset = sizeof(RawInstrProf::Header); + ptrdiff_t CountersOffset = DataOffset + DataSizeInBytes; ptrdiff_t NamesOffset = CountersOffset + sizeof(uint64_t) * CountersSize; - size_t ProfileSize = NamesOffset + sizeof(char) * NamesSize; + ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; + size_t ProfileSize = ValueDataOffset + ValueDataSize; auto *Start = reinterpret_cast<const char *>(&Header); if (Start + ProfileSize > DataBuffer->getBufferEnd()) return error(instrprof_error::bad_header); - Data = reinterpret_cast<const ProfileData *>(Start + DataOffset); + Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( + Start + DataOffset); DataEnd = Data + DataSize; CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset); NamesStart = Start + NamesOffset; + ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset); ProfileEnd = Start + ProfileSize; + std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>(); + createSymtab(*NewSymtab.get()); + Symtab = std::move(NewSymtab); return success(); } template <class IntPtrT> -std::error_code -RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { - if (Data == DataEnd) - if (std::error_code EC = readNextHeader(ProfileEnd)) - return EC; +std::error_code RawInstrProfReader<IntPtrT>::readName(InstrProfRecord &Record) { + Record.Name = StringRef(getName(Data->NamePtr), swap(Data->NameSize)); + if (Record.Name.data() < NamesStart || + Record.Name.data() + Record.Name.size() > + reinterpret_cast<const char *>(ValueDataStart)) + return error(instrprof_error::malformed); + 
return success(); +} + +template <class IntPtrT> +std::error_code RawInstrProfReader<IntPtrT>::readFuncHash( + InstrProfRecord &Record) { + Record.Hash = swap(Data->FuncHash); + return success(); +} - // Get the raw data. - StringRef RawName(getName(Data->NamePtr), swap(Data->NameSize)); +template <class IntPtrT> +std::error_code RawInstrProfReader<IntPtrT>::readRawCounts( + InstrProfRecord &Record) { uint32_t NumCounters = swap(Data->NumCounters); + IntPtrT CounterPtr = Data->CounterPtr; if (NumCounters == 0) return error(instrprof_error::malformed); - auto RawCounts = makeArrayRef(getCounter(Data->CounterPtr), NumCounters); - // Check bounds. + auto RawCounts = makeArrayRef(getCounter(CounterPtr), NumCounters); auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); - if (RawName.data() < NamesStart || - RawName.data() + RawName.size() > DataBuffer->getBufferEnd() || - RawCounts.data() < CountersStart || + + // Check bounds. + if (RawCounts.data() < CountersStart || RawCounts.data() + RawCounts.size() > NamesStartAsCounter) return error(instrprof_error::malformed); - // Store the data in Record, byte-swapping as necessary. - Record.Hash = swap(Data->FuncHash); - Record.Name = RawName; if (ShouldSwapBytes) { Record.Counts.clear(); Record.Counts.reserve(RawCounts.size()); @@ -284,8 +373,61 @@ RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { } else Record.Counts = RawCounts; + return success(); +} + +template <class IntPtrT> +std::error_code +RawInstrProfReader<IntPtrT>::readValueProfilingData(InstrProfRecord &Record) { + + Record.clearValueData(); + CurValueDataSize = 0; + // Need to match the logic in value profile dumper code in compiler-rt: + uint32_t NumValueKinds = 0; + for (uint32_t I = 0; I < IPVK_Last + 1; I++) + NumValueKinds += (Data->NumValueSites[I] != 0); + + if (!NumValueKinds) + return success(); + + ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr = + ValueProfData::getValueProfData(ValueDataStart, + (const unsigned char *)ProfileEnd, + getDataEndianness()); + + if (VDataPtrOrErr.getError()) + return VDataPtrOrErr.getError(); + + VDataPtrOrErr.get()->deserializeTo(Record, &Symtab->getAddrHashMap()); + CurValueDataSize = VDataPtrOrErr.get()->getSize(); + return success(); +} + +template <class IntPtrT> +std::error_code +RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { + if (atEnd()) + if (std::error_code EC = readNextHeader(ProfileEnd)) + return EC; + + // Read name ad set it in Record. + if (std::error_code EC = readName(Record)) + return EC; + + // Read FuncHash and set it in Record. + if (std::error_code EC = readFuncHash(Record)) + return EC; + + // Read raw counts and set Record. + if (std::error_code EC = readRawCounts(Record)) + return EC; + + // Read value data and set Record. + if (std::error_code EC = readValueProfilingData(Record)) + return EC; + // Iterate. 
- ++Data; + advanceData(); return success(); } @@ -302,52 +444,112 @@ InstrProfLookupTrait::ComputeHash(StringRef K) { typedef InstrProfLookupTrait::data_type data_type; typedef InstrProfLookupTrait::offset_type offset_type; +bool InstrProfLookupTrait::readValueProfilingData( + const unsigned char *&D, const unsigned char *const End) { + ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr = + ValueProfData::getValueProfData(D, End, ValueProfDataEndianness); + + if (VDataPtrOrErr.getError()) + return false; + + VDataPtrOrErr.get()->deserializeTo(DataBuffer.back(), nullptr); + D += VDataPtrOrErr.get()->TotalSize; + + return true; +} + data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, offset_type N) { - // Check if the data is corrupt. If so, don't try to read it. if (N % sizeof(uint64_t)) return data_type(); DataBuffer.clear(); - uint64_t NumCounts; - uint64_t NumEntries = N / sizeof(uint64_t); std::vector<uint64_t> CounterBuffer; - for (uint64_t I = 0; I < NumEntries; I += NumCounts) { - using namespace support; - // The function hash comes first. - uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D); - if (++I >= NumEntries) + using namespace support; + const unsigned char *End = D + N; + while (D < End) { + // Read hash. + if (D + sizeof(uint64_t) >= End) return data_type(); + uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D); - // In v1, we have at least one count. - // Later, we have the number of counts. - NumCounts = (1 == FormatVersion) - ? NumEntries - I - : endian::readNext<uint64_t, little, unaligned>(D); - if (1 != FormatVersion) - ++I; - - // If we have more counts than data, this is bogus. - if (I + NumCounts > NumEntries) + // Initialize number of counters for FormatVersion == 1. + uint64_t CountsSize = N / sizeof(uint64_t) - 1; + // If format version is different then read the number of counters. + if (FormatVersion != 1) { + if (D + sizeof(uint64_t) > End) + return data_type(); + CountsSize = endian::readNext<uint64_t, little, unaligned>(D); + } + // Read counter values. + if (D + CountsSize * sizeof(uint64_t) > End) return data_type(); CounterBuffer.clear(); - for (unsigned J = 0; J < NumCounts; ++J) + CounterBuffer.reserve(CountsSize); + for (uint64_t J = 0; J < CountsSize; ++J) CounterBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D)); - DataBuffer.push_back(InstrProfRecord(K, Hash, std::move(CounterBuffer))); + DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer)); + + // Read value profiling data. 
+ if (FormatVersion > 2 && !readValueProfilingData(D, End)) { + DataBuffer.clear(); + return data_type(); + } } return DataBuffer; } +template <typename HashTableImpl> +std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords( + StringRef FuncName, ArrayRef<InstrProfRecord> &Data) { + auto Iter = HashTable->find(FuncName); + if (Iter == HashTable->end()) + return instrprof_error::unknown_function; + + Data = (*Iter); + if (Data.empty()) + return instrprof_error::malformed; + + return instrprof_error::success; +} + +template <typename HashTableImpl> +std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords( + ArrayRef<InstrProfRecord> &Data) { + if (atEnd()) + return instrprof_error::eof; + + Data = *RecordIterator; + + if (Data.empty()) + return instrprof_error::malformed; + + return instrprof_error::success; +} + +template <typename HashTableImpl> +InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex( + const unsigned char *Buckets, const unsigned char *const Payload, + const unsigned char *const Base, IndexedInstrProf::HashT HashType, + uint64_t Version) { + FormatVersion = Version; + HashTable.reset(HashTableImpl::Create( + Buckets, Payload, Base, + typename HashTableImpl::InfoType(HashType, Version))); + RecordIterator = HashTable->data_begin(); +} + bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < 8) return false; using namespace support; uint64_t Magic = endian::read<uint64_t, little, aligned>(DataBuffer.getBufferStart()); + // Verify that it's magical. return Magic == IndexedInstrProf::Magic; } @@ -360,71 +562,91 @@ std::error_code IndexedInstrProfReader::readHeader() { using namespace support; + auto *Header = reinterpret_cast<const IndexedInstrProf::Header *>(Cur); + Cur += sizeof(IndexedInstrProf::Header); + // Check the magic number. - uint64_t Magic = endian::readNext<uint64_t, little, unaligned>(Cur); + uint64_t Magic = endian::byte_swap<uint64_t, little>(Header->Magic); if (Magic != IndexedInstrProf::Magic) return error(instrprof_error::bad_magic); // Read the version. - FormatVersion = endian::readNext<uint64_t, little, unaligned>(Cur); + uint64_t FormatVersion = endian::byte_swap<uint64_t, little>(Header->Version); if (FormatVersion > IndexedInstrProf::Version) return error(instrprof_error::unsupported_version); // Read the maximal function count. - MaxFunctionCount = endian::readNext<uint64_t, little, unaligned>(Cur); + MaxFunctionCount = + endian::byte_swap<uint64_t, little>(Header->MaxFunctionCount); // Read the hash type and start offset. IndexedInstrProf::HashT HashType = static_cast<IndexedInstrProf::HashT>( - endian::readNext<uint64_t, little, unaligned>(Cur)); + endian::byte_swap<uint64_t, little>(Header->HashType)); if (HashType > IndexedInstrProf::HashT::Last) return error(instrprof_error::unsupported_hash_type); - uint64_t HashOffset = endian::readNext<uint64_t, little, unaligned>(Cur); - // The rest of the file is an on disk hash table. - Index.reset(InstrProfReaderIndex::Create( - Start + HashOffset, Cur, Start, - InstrProfLookupTrait(HashType, FormatVersion))); - // Set up our iterator for readNextRecord. - RecordIterator = Index->data_begin(); + uint64_t HashOffset = endian::byte_swap<uint64_t, little>(Header->HashOffset); + // The rest of the file is an on disk hash table. 
+ InstrProfReaderIndexBase *IndexPtr = nullptr; + IndexPtr = new InstrProfReaderIndex<OnDiskHashTableImplV3>( + Start + HashOffset, Cur, Start, HashType, FormatVersion); + Index.reset(IndexPtr); return success(); } -std::error_code IndexedInstrProfReader::getFunctionCounts( - StringRef FuncName, uint64_t FuncHash, std::vector<uint64_t> &Counts) { - auto Iter = Index->find(FuncName); - if (Iter == Index->end()) - return error(instrprof_error::unknown_function); +InstrProfSymtab &IndexedInstrProfReader::getSymtab() { + if (Symtab.get()) + return *Symtab.get(); - // Found it. Look for counters with the right hash. - ArrayRef<InstrProfRecord> Data = (*Iter); - if (Data.empty()) - return error(instrprof_error::malformed); + std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>(); + Index->populateSymtab(*NewSymtab.get()); + + Symtab = std::move(NewSymtab); + return *Symtab.get(); +} +ErrorOr<InstrProfRecord> +IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, + uint64_t FuncHash) { + ArrayRef<InstrProfRecord> Data; + std::error_code EC = Index->getRecords(FuncName, Data); + if (EC != instrprof_error::success) + return EC; + // Found it. Look for counters with the right hash. for (unsigned I = 0, E = Data.size(); I < E; ++I) { // Check for a match and fill the vector if there is one. if (Data[I].Hash == FuncHash) { - Counts = Data[I].Counts; - return success(); + return std::move(Data[I]); } } return error(instrprof_error::hash_mismatch); } std::error_code -IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) { - // Are we out of records? - if (RecordIterator == Index->data_end()) - return error(instrprof_error::eof); +IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, + std::vector<uint64_t> &Counts) { + ErrorOr<InstrProfRecord> Record = getInstrProfRecord(FuncName, FuncHash); + if (std::error_code EC = Record.getError()) + return EC; - if ((*RecordIterator).empty()) - return error(instrprof_error::malformed); + Counts = Record.get().Counts; + return success(); +} +std::error_code IndexedInstrProfReader::readNextRecord( + InstrProfRecord &Record) { static unsigned RecordIndex = 0; - ArrayRef<InstrProfRecord> Data = (*RecordIterator); + + ArrayRef<InstrProfRecord> Data; + + std::error_code EC = Index->getRecords(Data); + if (EC != instrprof_error::success) + return error(EC); + Record = Data[RecordIndex++]; if (RecordIndex >= Data.size()) { - ++RecordIterator; + Index->advanceToNextKey(); RecordIndex = 0; } return success(); diff --git a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp index 2188543..f522724 100644 --- a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -13,27 +13,29 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfWriter.h" -#include "InstrProfIndexed.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/OnDiskHashTable.h" +#include <tuple> using namespace llvm; namespace { +static support::endianness ValueProfDataEndianness = support::little; + class InstrProfRecordTrait { public: typedef StringRef key_type; typedef StringRef key_type_ref; - typedef const InstrProfWriter::CounterData *const data_type; - typedef const InstrProfWriter::CounterData *const data_type_ref; + typedef const InstrProfWriter::ProfilingData *const data_type; + typedef const InstrProfWriter::ProfilingData *const 
data_type_ref; typedef uint64_t hash_value_type; typedef uint64_t offset_type; static hash_value_type ComputeHash(key_type_ref K) { - return IndexedInstrProf::ComputeHash(IndexedInstrProf::HashType, K); + return IndexedInstrProf::ComputeHash(K); } static std::pair<offset_type, offset_type> @@ -45,8 +47,15 @@ public: LE.write<offset_type>(N); offset_type M = 0; - for (const auto &Counts : *V) - M += (2 + Counts.second.size()) * sizeof(uint64_t); + for (const auto &ProfileData : *V) { + const InstrProfRecord &ProfRecord = ProfileData.second; + M += sizeof(uint64_t); // The function hash + M += sizeof(uint64_t); // The size of the Counts vector + M += ProfRecord.Counts.size() * sizeof(uint64_t); + + // Value data + M += ValueProfData::getSize(ProfileData.second); + } LE.write<offset_type>(M); return std::make_pair(N, M); @@ -60,50 +69,62 @@ public: offset_type) { using namespace llvm::support; endian::Writer<little> LE(Out); + for (const auto &ProfileData : *V) { + const InstrProfRecord &ProfRecord = ProfileData.second; - for (const auto &Counts : *V) { - LE.write<uint64_t>(Counts.first); - LE.write<uint64_t>(Counts.second.size()); - for (uint64_t I : Counts.second) + LE.write<uint64_t>(ProfileData.first); // Function hash + LE.write<uint64_t>(ProfRecord.Counts.size()); + for (uint64_t I : ProfRecord.Counts) LE.write<uint64_t>(I); + + // Write value data + std::unique_ptr<ValueProfData> VDataPtr = + ValueProfData::serializeFrom(ProfileData.second); + uint32_t S = VDataPtr->getSize(); + VDataPtr->swapBytesFromHost(ValueProfDataEndianness); + Out.write((const char *)VDataPtr.get(), S); } } }; } -std::error_code -InstrProfWriter::addFunctionCounts(StringRef FunctionName, - uint64_t FunctionHash, - ArrayRef<uint64_t> Counters) { - auto &CounterData = FunctionData[FunctionName]; +// Internal interface for testing purpose only. +void InstrProfWriter::setValueProfDataEndianness( + support::endianness Endianness) { + ValueProfDataEndianness = Endianness; +} + +std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I, + uint64_t Weight) { + auto &ProfileDataMap = FunctionData[I.Name]; + + bool NewFunc; + ProfilingData::iterator Where; + std::tie(Where, NewFunc) = + ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord())); + InstrProfRecord &Dest = Where->second; - auto Where = CounterData.find(FunctionHash); - if (Where == CounterData.end()) { + instrprof_error Result = instrprof_error::success; + if (NewFunc) { // We've never seen a function with this name and hash, add it. - CounterData[FunctionHash] = Counters; - // We keep track of the max function count as we go for simplicity. - if (Counters[0] > MaxFunctionCount) - MaxFunctionCount = Counters[0]; - return instrprof_error::success; + Dest = std::move(I); + // Fix up the name to avoid dangling reference. + Dest.Name = FunctionData.find(Dest.Name)->getKey(); + if (Weight > 1) + Result = Dest.scale(Weight); + } else { + // We're updating a function we've seen before. + Result = Dest.merge(I, Weight); } - // We're updating a function we've seen before. - auto &FoundCounters = Where->second; - // If the number of counters doesn't match we either have bad data or a hash - // collision. 
- if (FoundCounters.size() != Counters.size()) - return instrprof_error::count_mismatch; - - for (size_t I = 0, E = Counters.size(); I < E; ++I) { - if (FoundCounters[I] + Counters[I] < FoundCounters[I]) - return instrprof_error::counter_overflow; - FoundCounters[I] += Counters[I]; - } + Dest.sortValueData(); + // We keep track of the max function count as we go for simplicity. - if (FoundCounters[0] > MaxFunctionCount) - MaxFunctionCount = FoundCounters[0]; + // Update this statistic no matter the result of the merge. + if (Dest.Counts[0] > MaxFunctionCount) + MaxFunctionCount = Dest.Counts[0]; - return instrprof_error::success; + return Result; } std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) { @@ -117,13 +138,23 @@ std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) { endian::Writer<little> LE(OS); // Write the header. - LE.write<uint64_t>(IndexedInstrProf::Magic); - LE.write<uint64_t>(IndexedInstrProf::Version); - LE.write<uint64_t>(MaxFunctionCount); - LE.write<uint64_t>(static_cast<uint64_t>(IndexedInstrProf::HashType)); + IndexedInstrProf::Header Header; + Header.Magic = IndexedInstrProf::Magic; + Header.Version = IndexedInstrProf::Version; + Header.MaxFunctionCount = MaxFunctionCount; + Header.HashType = static_cast<uint64_t>(IndexedInstrProf::HashType); + Header.HashOffset = 0; + int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); + + // Only write out all the fields except 'HashOffset'. We need + // to remember the offset of that field to allow back patching + // later. + for (int I = 0; I < N - 1; I++) + LE.write<uint64_t>(reinterpret_cast<uint64_t *>(&Header)[I]); // Save a space to write the hash table start location. uint64_t HashTableStartLoc = OS.tell(); + // Reserve the space for HashOffset field. LE.write<uint64_t>(0); // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS); @@ -138,9 +169,65 @@ void InstrProfWriter::write(raw_fd_ostream &OS) { // Go back and fill in the hash table start. using namespace support; OS.seek(TableStart.first); + // Now patch the HashOffset field previously reserved. 
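Reserving a fixed-width slot and patching it once the payload has been emitted is the pattern at work here; a minimal sketch on a seekable stream (emitWithPatchedOffset is illustrative, and unlike the code above it restores the write position afterwards):

  #include "llvm/Support/EndianStream.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cstdint>

  using namespace llvm;
  using namespace llvm::support;

  // Assumes a seekable stream, e.g. a raw_fd_ostream over a regular file.
  static void emitWithPatchedOffset(raw_fd_ostream &OS) {
    endian::Writer<little> LE(OS);
    uint64_t SlotPos = OS.tell();
    LE.write<uint64_t>(0);            // placeholder, like the reserved HashOffset
    uint64_t PayloadStart = OS.tell();
    // ... emit the variable-sized payload (the on-disk hash table) here ...
    uint64_t EndPos = OS.tell();
    OS.seek(SlotPos);
    LE.write<uint64_t>(PayloadStart); // back-patch with the real offset
    OS.seek(EndPos);                  // restore position for further writes
  }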
endian::Writer<little>(OS).write<uint64_t>(TableStart.second); } +static const char *ValueProfKindStr[] = { +#define VALUE_PROF_KIND(Enumerator, Value) #Enumerator, +#include "llvm/ProfileData/InstrProfData.inc" +}; + +void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func, + InstrProfSymtab &Symtab, + raw_fd_ostream &OS) { + OS << Func.Name << "\n"; + OS << "# Func Hash:\n" << Func.Hash << "\n"; + OS << "# Num Counters:\n" << Func.Counts.size() << "\n"; + OS << "# Counter Values:\n"; + for (uint64_t Count : Func.Counts) + OS << Count << "\n"; + + uint32_t NumValueKinds = Func.getNumValueKinds(); + if (!NumValueKinds) { + OS << "\n"; + return; + } + + OS << "# Num Value Kinds:\n" << Func.getNumValueKinds() << "\n"; + for (uint32_t VK = 0; VK < IPVK_Last + 1; VK++) { + uint32_t NS = Func.getNumValueSites(VK); + if (!NS) + continue; + OS << "# ValueKind = " << ValueProfKindStr[VK] << ":\n" << VK << "\n"; + OS << "# NumValueSites:\n" << NS << "\n"; + for (uint32_t S = 0; S < NS; S++) { + uint32_t ND = Func.getNumValueDataForSite(VK, S); + OS << ND << "\n"; + std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S); + for (uint32_t I = 0; I < ND; I++) { + if (VK == IPVK_IndirectCallTarget) + OS << Symtab.getFuncName(VD[I].Value) << ":" << VD[I].Count << "\n"; + else + OS << VD[I].Value << ":" << VD[I].Count << "\n"; + } + } + } + + OS << "\n"; +} + +void InstrProfWriter::writeText(raw_fd_ostream &OS) { + InstrProfSymtab Symtab; + for (const auto &I : FunctionData) + Symtab.addFuncName(I.getKey()); + Symtab.finalizeSymtab(); + + for (const auto &I : FunctionData) + for (const auto &Func : I.getValue()) + writeRecordInText(Func.second, Symtab, OS); +} + std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() { std::string Data; llvm::raw_string_ostream OS(Data); diff --git a/contrib/llvm/lib/ProfileData/SampleProf.cpp b/contrib/llvm/lib/ProfileData/SampleProf.cpp index 920c48a..9ded757 100644 --- a/contrib/llvm/lib/ProfileData/SampleProf.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProf.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" +using namespace llvm::sampleprof; using namespace llvm; namespace { @@ -27,17 +28,25 @@ class SampleProfErrorCategoryType : public std::error_category { case sampleprof_error::success: return "Success"; case sampleprof_error::bad_magic: - return "Invalid file format (bad magic)"; + return "Invalid sample profile data (bad magic)"; case sampleprof_error::unsupported_version: - return "Unsupported format version"; + return "Unsupported sample profile format version"; case sampleprof_error::too_large: return "Too much profile data"; case sampleprof_error::truncated: return "Truncated profile data"; case sampleprof_error::malformed: - return "Malformed profile data"; + return "Malformed sample profile data"; case sampleprof_error::unrecognized_format: - return "Unrecognized profile encoding format"; + return "Unrecognized sample profile encoding format"; + case sampleprof_error::unsupported_writing_format: + return "Profile encoding format unsupported for writing operations"; + case sampleprof_error::truncated_name_table: + return "Truncated function name table"; + case sampleprof_error::not_implemented: + return "Unimplemented feature"; + case sampleprof_error::counter_overflow: + return "Counter overflow"; } llvm_unreachable("A value of sampleprof_error has no message."); } @@ -49,3 +58,92 @@ static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory; const std::error_category 
&llvm::sampleprof_category() { return *ErrorCategory; } + +void LineLocation::print(raw_ostream &OS) const { + OS << LineOffset; + if (Discriminator > 0) + OS << "." << Discriminator; +} + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const LineLocation &Loc) { + Loc.print(OS); + return OS; +} + +void LineLocation::dump() const { print(dbgs()); } + +void CallsiteLocation::print(raw_ostream &OS) const { + LineLocation::print(OS); + OS << ": inlined callee: " << CalleeName; +} + +void CallsiteLocation::dump() const { print(dbgs()); } + +inline raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const CallsiteLocation &Loc) { + Loc.print(OS); + return OS; +} + +/// \brief Print the sample record to the stream \p OS indented by \p Indent. +void SampleRecord::print(raw_ostream &OS, unsigned Indent) const { + OS << NumSamples; + if (hasCalls()) { + OS << ", calls:"; + for (const auto &I : getCallTargets()) + OS << " " << I.first() << ":" << I.second; + } + OS << "\n"; +} + +void SampleRecord::dump() const { print(dbgs(), 0); } + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const SampleRecord &Sample) { + Sample.print(OS, 0); + return OS; +} + +/// \brief Print the samples collected for a function on stream \p OS. +void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { + OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() + << " sampled lines\n"; + + OS.indent(Indent); + if (BodySamples.size() > 0) { + OS << "Samples collected in the function's body {\n"; + SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples); + for (const auto &SI : SortedBodySamples.get()) { + OS.indent(Indent + 2); + OS << SI->first << ": " << SI->second; + } + OS.indent(Indent); + OS << "}\n"; + } else { + OS << "No samples collected in the function's body\n"; + } + + OS.indent(Indent); + if (CallsiteSamples.size() > 0) { + OS << "Samples collected in inlined callsites {\n"; + SampleSorter<CallsiteLocation, FunctionSamples> SortedCallsiteSamples( + CallsiteSamples); + for (const auto &CS : SortedCallsiteSamples.get()) { + OS.indent(Indent + 2); + OS << CS->first << ": "; + CS->second.print(OS, Indent + 4); + } + OS << "}\n"; + } else { + OS << "No inlined callsites in this function\n"; + } +} + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const FunctionSamples &FS) { + FS.print(OS); + return OS; +} + +void FunctionSamples::dump(void) const { print(dbgs(), 0); } diff --git a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp index b39bfd6..93cd87b 100644 --- a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp @@ -8,133 +8,37 @@ //===----------------------------------------------------------------------===// // // This file implements the class that reads LLVM sample profiles. It -// supports two file formats: text and binary. The textual representation -// is useful for debugging and testing purposes. The binary representation -// is more compact, resulting in smaller file sizes. However, they can -// both be used interchangeably. +// supports three file formats: text, binary and gcov. // -// NOTE: If you are making changes to the file format, please remember -// to document them in the Clang documentation at -// tools/clang/docs/UsersManual.rst. +// The textual representation is useful for debugging and testing purposes. The +// binary representation is more compact, resulting in smaller file sizes. 
// -// Text format -// ----------- +// The gcov encoding is the one generated by GCC's AutoFDO profile creation +// tool (https://github.com/google/autofdo) // -// Sample profiles are written as ASCII text. The file is divided into -// sections, which correspond to each of the functions executed at runtime. -// Each section has the following format -// -// function1:total_samples:total_head_samples -// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ] -// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ] -// ... -// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ] -// -// The file may contain blank lines between sections and within a -// section. However, the spacing within a single line is fixed. Additional -// spaces will result in an error while reading the file. -// -// Function names must be mangled in order for the profile loader to -// match them in the current translation unit. The two numbers in the -// function header specify how many total samples were accumulated in the -// function (first number), and the total number of samples accumulated -// in the prologue of the function (second number). This head sample -// count provides an indicator of how frequently the function is invoked. -// -// Each sampled line may contain several items. Some are optional (marked -// below): -// -// a. Source line offset. This number represents the line number -// in the function where the sample was collected. The line number is -// always relative to the line where the symbol of the function is -// defined. So, if the function has its header at line 280, the offset -// 13 is at line 293 in the file. -// -// Note that this offset should never be a negative number. This could -// happen in cases like macros. The debug machinery will register the -// line number at the point of macro expansion. So, if the macro was -// expanded in a line before the start of the function, the profile -// converter should emit a 0 as the offset (this means that the optimizers -// will not be able to associate a meaningful weight to the instructions -// in the macro). -// -// b. [OPTIONAL] Discriminator. This is used if the sampled program -// was compiled with DWARF discriminator support -// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators). -// DWARF discriminators are unsigned integer values that allow the -// compiler to distinguish between multiple execution paths on the -// same source line location. -// -// For example, consider the line of code ``if (cond) foo(); else bar();``. -// If the predicate ``cond`` is true 80% of the time, then the edge -// into function ``foo`` should be considered to be taken most of the -// time. But both calls to ``foo`` and ``bar`` are at the same source -// line, so a sample count at that line is not sufficient. The -// compiler needs to know which part of that line is taken more -// frequently. -// -// This is what discriminators provide. In this case, the calls to -// ``foo`` and ``bar`` will be at the same line, but will have -// different discriminator values. This allows the compiler to correctly -// set edge weights into ``foo`` and ``bar``. -// -// c. Number of samples. This is an integer quantity representing the -// number of samples collected by the profiler at this source -// location. -// -// d. [OPTIONAL] Potential call targets and samples. If present, this -// line contains a call instruction. This models both direct and -// indirect calls. Each call target is listed together with its -// number of samples. 
For example, -// -// 130: 7 foo:3 bar:2 baz:7 -// -// The above means that at relative line offset 130 there is a call -// instruction that calls one of ``foo()``, ``bar()`` and ``baz()``, -// with ``baz()`` being the relatively more frequently called target. +// All three encodings can be used interchangeably as an input sample profile. // //===----------------------------------------------------------------------===// #include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" using namespace llvm::sampleprof; using namespace llvm; -/// \brief Print the samples collected for a function on stream \p OS. -/// -/// \param OS Stream to emit the output to. -void FunctionSamples::print(raw_ostream &OS) { - OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() - << " sampled lines\n"; - for (const auto &SI : BodySamples) { - LineLocation Loc = SI.first; - const SampleRecord &Sample = SI.second; - OS << "\tline offset: " << Loc.LineOffset - << ", discriminator: " << Loc.Discriminator - << ", number of samples: " << Sample.getSamples(); - if (Sample.hasCalls()) { - OS << ", calls:"; - for (const auto &I : Sample.getCallTargets()) - OS << " " << I.first() << ":" << I.second; - } - OS << "\n"; - } - OS << "\n"; -} - /// \brief Dump the function profile for \p FName. /// /// \param FName Name of the function to print. /// \param OS Stream to emit the output to. void SampleProfileReader::dumpFunctionProfile(StringRef FName, raw_ostream &OS) { - OS << "Function: " << FName << ": "; - Profiles[FName].print(OS); + OS << "Function: " << FName << ": " << Profiles[FName]; } /// \brief Dump all the function profiles found on stream \p OS. @@ -143,6 +47,102 @@ void SampleProfileReader::dump(raw_ostream &OS) { dumpFunctionProfile(I.getKey(), OS); } +/// \brief Parse \p Input as function head. +/// +/// Parse one line of \p Input, and update function name in \p FName, +/// function's total sample count in \p NumSamples, function's entry +/// count in \p NumHeadSamples. +/// +/// \returns true if parsing is successful. +static bool ParseHead(const StringRef &Input, StringRef &FName, + uint64_t &NumSamples, uint64_t &NumHeadSamples) { + if (Input[0] == ' ') + return false; + size_t n2 = Input.rfind(':'); + size_t n1 = Input.rfind(':', n2 - 1); + FName = Input.substr(0, n1); + if (Input.substr(n1 + 1, n2 - n1 - 1).getAsInteger(10, NumSamples)) + return false; + if (Input.substr(n2 + 1).getAsInteger(10, NumHeadSamples)) + return false; + return true; +} + + +/// \brief Returns true if line offset \p L is legal (only has 16 bits). +static bool isOffsetLegal(unsigned L) { + return (L & 0xffff) == L; +} + +/// \brief Parse \p Input as line sample. +/// +/// \param Input input line. +/// \param IsCallsite true if the line represents an inlined callsite. +/// \param Depth the depth of the inline stack. +/// \param NumSamples total samples of the line/inlined callsite. +/// \param LineOffset line offset to the start of the function. +/// \param Discriminator discriminator of the line. +/// \param TargetCountMap map from indirect call target to count. +/// +/// returns true if parsing is successful. 
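Concretely, the two helpers split an illustrative profile (function names invented) like this:

  main:184019:0
   4: 534
   6: 2080 _Z3fooi:1471 _Z3bari:609
   9: inline1:1000
    1: 1000

ParseHead on the unindented header yields FName = "main", NumSamples = 184019, NumHeadSamples = 0. ParseLine on " 6: 2080 _Z3fooi:1471 _Z3bari:609" yields Depth = 1, LineOffset = 6, Discriminator = 0, NumSamples = 2080 and TargetCountMap = {_Z3fooi: 1471, _Z3bari: 609}; on " 9: inline1:1000" it sets IsCallsite = true with CalleeName = "inline1", and the doubly indented "  1: 1000" (Depth = 2) then lands in the inlined callee's body samples.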
+static bool ParseLine(const StringRef &Input, bool &IsCallsite, uint32_t &Depth, + uint64_t &NumSamples, uint32_t &LineOffset, + uint32_t &Discriminator, StringRef &CalleeName, + DenseMap<StringRef, uint64_t> &TargetCountMap) { + for (Depth = 0; Input[Depth] == ' '; Depth++) + ; + if (Depth == 0) + return false; + + size_t n1 = Input.find(':'); + StringRef Loc = Input.substr(Depth, n1 - Depth); + size_t n2 = Loc.find('.'); + if (n2 == StringRef::npos) { + if (Loc.getAsInteger(10, LineOffset) || !isOffsetLegal(LineOffset)) + return false; + Discriminator = 0; + } else { + if (Loc.substr(0, n2).getAsInteger(10, LineOffset)) + return false; + if (Loc.substr(n2 + 1).getAsInteger(10, Discriminator)) + return false; + } + + StringRef Rest = Input.substr(n1 + 2); + if (Rest[0] >= '0' && Rest[0] <= '9') { + IsCallsite = false; + size_t n3 = Rest.find(' '); + if (n3 == StringRef::npos) { + if (Rest.getAsInteger(10, NumSamples)) + return false; + } else { + if (Rest.substr(0, n3).getAsInteger(10, NumSamples)) + return false; + } + while (n3 != StringRef::npos) { + n3 += Rest.substr(n3).find_first_not_of(' '); + Rest = Rest.substr(n3); + n3 = Rest.find(' '); + StringRef pair = Rest; + if (n3 != StringRef::npos) { + pair = Rest.substr(0, n3); + } + size_t n4 = pair.find(':'); + uint64_t count; + if (pair.substr(n4 + 1).getAsInteger(10, count)) + return false; + TargetCountMap[pair.substr(0, n4)] = count; + } + } else { + IsCallsite = true; + size_t n3 = Rest.find_last_of(':'); + CalleeName = Rest.substr(0, n3); + if (Rest.substr(n3 + 1).getAsInteger(10, NumSamples)) + return false; + } + return true; +} + /// \brief Load samples from a text file. /// /// See the documentation at the top of the file for an explanation of @@ -151,14 +151,13 @@ void SampleProfileReader::dump(raw_ostream &OS) { /// \returns true if the file was loaded successfully, false otherwise. std::error_code SampleProfileReaderText::read() { line_iterator LineIt(*Buffer, /*SkipBlanks=*/true, '#'); + sampleprof_error Result = sampleprof_error::success; + + InlineCallStack InlineStack; - // Read the profile of each function. Since each function may be - // mentioned more than once, and we are collecting flat profiles, - // accumulate samples as we parse them. - Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$"); - Regex LineSampleRE("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$"); - Regex CallSampleRE(" +([^0-9 ][^ ]*):([0-9]+)"); - while (!LineIt.is_at_eof()) { + for (; !LineIt.is_at_eof(); ++LineIt) { + if ((*LineIt)[(*LineIt).find_first_not_of(' ')] == '#') + continue; // Read the header of each function. // // Note that for function identifiers we are actually expecting @@ -171,63 +170,74 @@ std::error_code SampleProfileReaderText::read() { // // The only requirement we place on the identifier, then, is that it // should not begin with a number. - SmallVector<StringRef, 4> Matches; - if (!HeadRE.match(*LineIt, &Matches)) { - reportParseError(LineIt.line_number(), - "Expected 'mangled_name:NUM:NUM', found " + *LineIt); - return sampleprof_error::malformed; - } - assert(Matches.size() == 4); - StringRef FName = Matches[1]; - unsigned NumSamples, NumHeadSamples; - Matches[2].getAsInteger(10, NumSamples); - Matches[3].getAsInteger(10, NumHeadSamples); - Profiles[FName] = FunctionSamples(); - FunctionSamples &FProfile = Profiles[FName]; - FProfile.addTotalSamples(NumSamples); - FProfile.addHeadSamples(NumHeadSamples); - ++LineIt; - - // Now read the body. 
The body of the function ends when we reach - // EOF or when we see the start of the next function. - while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) { - if (!LineSampleRE.match(*LineIt, &Matches)) { - reportParseError( - LineIt.line_number(), - "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); + if ((*LineIt)[0] != ' ') { + uint64_t NumSamples, NumHeadSamples; + StringRef FName; + if (!ParseHead(*LineIt, FName, NumSamples, NumHeadSamples)) { + reportError(LineIt.line_number(), + "Expected 'mangled_name:NUM:NUM', found " + *LineIt); + return sampleprof_error::malformed; + } + Profiles[FName] = FunctionSamples(); + FunctionSamples &FProfile = Profiles[FName]; + MergeResult(Result, FProfile.addTotalSamples(NumSamples)); + MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples)); + InlineStack.clear(); + InlineStack.push_back(&FProfile); + } else { + uint64_t NumSamples; + StringRef FName; + DenseMap<StringRef, uint64_t> TargetCountMap; + bool IsCallsite; + uint32_t Depth, LineOffset, Discriminator; + if (!ParseLine(*LineIt, IsCallsite, Depth, NumSamples, LineOffset, + Discriminator, FName, TargetCountMap)) { + reportError(LineIt.line_number(), + "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + + *LineIt); return sampleprof_error::malformed; } - assert(Matches.size() == 5); - unsigned LineOffset, NumSamples, Discriminator = 0; - Matches[1].getAsInteger(10, LineOffset); - if (Matches[2] != "") - Matches[2].getAsInteger(10, Discriminator); - Matches[3].getAsInteger(10, NumSamples); - - // If there are function calls in this line, generate a call sample - // entry for each call. - std::string CallsLine(Matches[4]); - while (CallsLine != "") { - SmallVector<StringRef, 3> CallSample; - if (!CallSampleRE.match(CallsLine, &CallSample)) { - reportParseError(LineIt.line_number(), - "Expected 'mangled_name:NUM', found " + CallsLine); - return sampleprof_error::malformed; + if (IsCallsite) { + while (InlineStack.size() > Depth) { + InlineStack.pop_back(); } - StringRef CalledFunction = CallSample[1]; - unsigned CalledFunctionSamples; - CallSample[2].getAsInteger(10, CalledFunctionSamples); - FProfile.addCalledTargetSamples(LineOffset, Discriminator, - CalledFunction, CalledFunctionSamples); - CallsLine = CallSampleRE.sub("", CallsLine); + FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt( + CallsiteLocation(LineOffset, Discriminator, FName)); + MergeResult(Result, FSamples.addTotalSamples(NumSamples)); + InlineStack.push_back(&FSamples); + } else { + while (InlineStack.size() > Depth) { + InlineStack.pop_back(); + } + FunctionSamples &FProfile = *InlineStack.back(); + for (const auto &name_count : TargetCountMap) { + MergeResult(Result, FProfile.addCalledTargetSamples( + LineOffset, Discriminator, name_count.first, + name_count.second)); + } + MergeResult(Result, FProfile.addBodySamples(LineOffset, Discriminator, + NumSamples)); } + } + } - FProfile.addBodySamples(LineOffset, Discriminator, NumSamples); - ++LineIt; + return Result; +} + +bool SampleProfileReaderText::hasFormat(const MemoryBuffer &Buffer) { + bool result = false; + + // Check that the first non-comment line is a valid function header. 
+ line_iterator LineIt(Buffer, /*SkipBlanks=*/true, '#'); + if (!LineIt.is_at_eof()) { + if ((*LineIt)[0] != ' ') { + uint64_t NumSamples, NumHeadSamples; + StringRef FName; + result = ParseHead(*LineIt, FName, NumSamples, NumHeadSamples); } } - return sampleprof_error::success; + return result; } template <typename T> ErrorOr<T> SampleProfileReaderBinary::readNumber() { @@ -243,7 +253,7 @@ template <typename T> ErrorOr<T> SampleProfileReaderBinary::readNumber() { EC = sampleprof_error::success; if (EC) { - reportParseError(0, EC.message()); + reportError(0, EC.message()); return EC; } @@ -256,7 +266,7 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readString() { StringRef Str(reinterpret_cast<const char *>(Data)); if (Data + Str.size() + 1 > End) { EC = sampleprof_error::truncated; - reportParseError(0, EC.message()); + reportError(0, EC.message()); return EC; } @@ -264,62 +274,109 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readString() { return Str; } -std::error_code SampleProfileReaderBinary::read() { - while (!at_eof()) { - auto FName(readString()); - if (std::error_code EC = FName.getError()) +ErrorOr<StringRef> SampleProfileReaderBinary::readStringFromTable() { + std::error_code EC; + auto Idx = readNumber<uint32_t>(); + if (std::error_code EC = Idx.getError()) + return EC; + if (*Idx >= NameTable.size()) + return sampleprof_error::truncated_name_table; + return NameTable[*Idx]; +} + +std::error_code +SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) { + auto NumSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumSamples.getError()) + return EC; + FProfile.addTotalSamples(*NumSamples); + + // Read the samples in the body. + auto NumRecords = readNumber<uint32_t>(); + if (std::error_code EC = NumRecords.getError()) + return EC; + + for (uint32_t I = 0; I < *NumRecords; ++I) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) return EC; - Profiles[*FName] = FunctionSamples(); - FunctionSamples &FProfile = Profiles[*FName]; + if (!isOffsetLegal(*LineOffset)) { + return std::error_code(); + } - auto Val = readNumber<unsigned>(); - if (std::error_code EC = Val.getError()) + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) return EC; - FProfile.addTotalSamples(*Val); - Val = readNumber<unsigned>(); - if (std::error_code EC = Val.getError()) + auto NumSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumSamples.getError()) return EC; - FProfile.addHeadSamples(*Val); - // Read the samples in the body. 
- auto NumRecords = readNumber<unsigned>(); - if (std::error_code EC = NumRecords.getError()) + auto NumCalls = readNumber<uint32_t>(); + if (std::error_code EC = NumCalls.getError()) return EC; - for (unsigned I = 0; I < *NumRecords; ++I) { - auto LineOffset = readNumber<uint64_t>(); - if (std::error_code EC = LineOffset.getError()) - return EC; - auto Discriminator = readNumber<uint64_t>(); - if (std::error_code EC = Discriminator.getError()) + for (uint32_t J = 0; J < *NumCalls; ++J) { + auto CalledFunction(readStringFromTable()); + if (std::error_code EC = CalledFunction.getError()) return EC; - auto NumSamples = readNumber<uint64_t>(); - if (std::error_code EC = NumSamples.getError()) + auto CalledFunctionSamples = readNumber<uint64_t>(); + if (std::error_code EC = CalledFunctionSamples.getError()) return EC; - auto NumCalls = readNumber<unsigned>(); - if (std::error_code EC = NumCalls.getError()) - return EC; + FProfile.addCalledTargetSamples(*LineOffset, *Discriminator, + *CalledFunction, *CalledFunctionSamples); + } + + FProfile.addBodySamples(*LineOffset, *Discriminator, *NumSamples); + } - for (unsigned J = 0; J < *NumCalls; ++J) { - auto CalledFunction(readString()); - if (std::error_code EC = CalledFunction.getError()) - return EC; + // Read all the samples for inlined function calls. + auto NumCallsites = readNumber<uint32_t>(); + if (std::error_code EC = NumCallsites.getError()) + return EC; - auto CalledFunctionSamples = readNumber<uint64_t>(); - if (std::error_code EC = CalledFunctionSamples.getError()) - return EC; + for (uint32_t J = 0; J < *NumCallsites; ++J) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) + return EC; - FProfile.addCalledTargetSamples(*LineOffset, *Discriminator, - *CalledFunction, - *CalledFunctionSamples); - } + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) + return EC; - FProfile.addBodySamples(*LineOffset, *Discriminator, *NumSamples); - } + auto FName(readStringFromTable()); + if (std::error_code EC = FName.getError()) + return EC; + + FunctionSamples &CalleeProfile = FProfile.functionSamplesAt( + CallsiteLocation(*LineOffset, *Discriminator, *FName)); + if (std::error_code EC = readProfile(CalleeProfile)) + return EC; + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderBinary::read() { + while (!at_eof()) { + auto NumHeadSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumHeadSamples.getError()) + return EC; + + auto FName(readStringFromTable()); + if (std::error_code EC = FName.getError()) + return EC; + + Profiles[*FName] = FunctionSamples(); + FunctionSamples &FProfile = Profiles[*FName]; + + FProfile.addHeadSamples(*NumHeadSamples); + + if (std::error_code EC = readProfile(FProfile)) + return EC; } return sampleprof_error::success; @@ -343,6 +400,18 @@ std::error_code SampleProfileReaderBinary::readHeader() { else if (*Version != SPVersion()) return sampleprof_error::unsupported_version; + // Read the name table. 
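The name table the comment announces exists to cut string duplication: every function and call-target name is stored once, NUL-terminated, and referenced everywhere else by a ULEB128 index. A sketch of both sides of that contract (helper names illustrative; the writer half mirrors SampleProfileWriterBinary later in this diff):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/LEB128.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Writer side: count, then each name followed by a NUL (a ULEB128 zero).
  static void writeNameTable(raw_ostream &OS, ArrayRef<StringRef> Names) {
    encodeULEB128(Names.size(), OS);
    for (StringRef N : Names) {
      OS << N;
      encodeULEB128(0, OS);
    }
  }

  // Reader side: later name references are just indices into the table;
  // an out-of-range index is diagnosed as a truncated name table.
  static bool resolveNameIdx(ArrayRef<StringRef> Table, uint64_t Idx,
                             StringRef &Out) {
    if (Idx >= Table.size())
      return false;
    Out = Table[Idx];
    return true;
  }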
+ auto Size = readNumber<uint32_t>(); + if (std::error_code EC = Size.getError()) + return EC; + NameTable.reserve(*Size); + for (uint32_t I = 0; I < *Size; ++I) { + auto Name(readString()); + if (std::error_code EC = Name.getError()) + return EC; + NameTable.push_back(*Name); + } + return sampleprof_error::success; } @@ -353,6 +422,249 @@ bool SampleProfileReaderBinary::hasFormat(const MemoryBuffer &Buffer) { return Magic == SPMagic(); } +std::error_code SampleProfileReaderGCC::skipNextWord() { + uint32_t dummy; + if (!GcovBuffer.readInt(dummy)) + return sampleprof_error::truncated; + return sampleprof_error::success; +} + +template <typename T> ErrorOr<T> SampleProfileReaderGCC::readNumber() { + if (sizeof(T) <= sizeof(uint32_t)) { + uint32_t Val; + if (GcovBuffer.readInt(Val) && Val <= std::numeric_limits<T>::max()) + return static_cast<T>(Val); + } else if (sizeof(T) <= sizeof(uint64_t)) { + uint64_t Val; + if (GcovBuffer.readInt64(Val) && Val <= std::numeric_limits<T>::max()) + return static_cast<T>(Val); + } + + std::error_code EC = sampleprof_error::malformed; + reportError(0, EC.message()); + return EC; +} + +ErrorOr<StringRef> SampleProfileReaderGCC::readString() { + StringRef Str; + if (!GcovBuffer.readString(Str)) + return sampleprof_error::truncated; + return Str; +} + +std::error_code SampleProfileReaderGCC::readHeader() { + // Read the magic identifier. + if (!GcovBuffer.readGCDAFormat()) + return sampleprof_error::unrecognized_format; + + // Read the version number. Note - the GCC reader does not validate this + // version, but the profile creator generates v704. + GCOV::GCOVVersion version; + if (!GcovBuffer.readGCOVVersion(version)) + return sampleprof_error::unrecognized_format; + + if (version != GCOV::V704) + return sampleprof_error::unsupported_version; + + // Skip the empty integer. 
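This header is also what routes a buffer to the GCC reader in the first place: hasFormat(), further down, compares the opening bytes against the GCDA magic concatenated with the version word. Roughly (the explicit size guard is an addition in this sketch; the in-tree check leans on MemoryBuffer's NUL terminator):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/MemoryBuffer.h"

  using namespace llvm;

  // "adcg" is the GCDA container magic, "*704" the version word; together
  // they form an eight-byte signature for AutoFDO/GCC sample profiles.
  static bool looksLikeAutoFDO(const MemoryBuffer &B) {
    return B.getBufferSize() >= 8 &&
           StringRef(B.getBufferStart(), 8) == "adcg*704";
  }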
+ if (std::error_code EC = skipNextWord()) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readSectionTag(uint32_t Expected) { + uint32_t Tag; + if (!GcovBuffer.readInt(Tag)) + return sampleprof_error::truncated; + + if (Tag != Expected) + return sampleprof_error::malformed; + + if (std::error_code EC = skipNextWord()) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readNameTable() { + if (std::error_code EC = readSectionTag(GCOVTagAFDOFileNames)) + return EC; + + uint32_t Size; + if (!GcovBuffer.readInt(Size)) + return sampleprof_error::truncated; + + for (uint32_t I = 0; I < Size; ++I) { + StringRef Str; + if (!GcovBuffer.readString(Str)) + return sampleprof_error::truncated; + Names.push_back(Str); + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readFunctionProfiles() { + if (std::error_code EC = readSectionTag(GCOVTagAFDOFunction)) + return EC; + + uint32_t NumFunctions; + if (!GcovBuffer.readInt(NumFunctions)) + return sampleprof_error::truncated; + + InlineCallStack Stack; + for (uint32_t I = 0; I < NumFunctions; ++I) + if (std::error_code EC = readOneFunctionProfile(Stack, true, 0)) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readOneFunctionProfile( + const InlineCallStack &InlineStack, bool Update, uint32_t Offset) { + uint64_t HeadCount = 0; + if (InlineStack.size() == 0) + if (!GcovBuffer.readInt64(HeadCount)) + return sampleprof_error::truncated; + + uint32_t NameIdx; + if (!GcovBuffer.readInt(NameIdx)) + return sampleprof_error::truncated; + + StringRef Name(Names[NameIdx]); + + uint32_t NumPosCounts; + if (!GcovBuffer.readInt(NumPosCounts)) + return sampleprof_error::truncated; + + uint32_t NumCallsites; + if (!GcovBuffer.readInt(NumCallsites)) + return sampleprof_error::truncated; + + FunctionSamples *FProfile = nullptr; + if (InlineStack.size() == 0) { + // If this is a top function that we have already processed, do not + // update its profile again. This happens in the presence of + // function aliases. Since these aliases share the same function + // body, there will be identical replicated profiles for the + // original function. In this case, we simply do not bother updating + // the profile of the original function. + FProfile = &Profiles[Name]; + FProfile->addHeadSamples(HeadCount); + if (FProfile->getTotalSamples() > 0) + Update = false; + } else { + // Otherwise, we are reading an inlined instance. The top of the + // inline stack contains the profile of the caller. Insert this + // callee in the caller's CallsiteMap. + FunctionSamples *CallerProfile = InlineStack.front(); + uint32_t LineOffset = Offset >> 16; + uint32_t Discriminator = Offset & 0xffff; + FProfile = &CallerProfile->functionSamplesAt( + CallsiteLocation(LineOffset, Discriminator, Name)); + } + + for (uint32_t I = 0; I < NumPosCounts; ++I) { + uint32_t Offset; + if (!GcovBuffer.readInt(Offset)) + return sampleprof_error::truncated; + + uint32_t NumTargets; + if (!GcovBuffer.readInt(NumTargets)) + return sampleprof_error::truncated; + + uint64_t Count; + if (!GcovBuffer.readInt64(Count)) + return sampleprof_error::truncated; + + // The line location is encoded in the offset as: + // high 16 bits: line offset to the start of the function. + // low 16 bits: discriminator. 
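That packed encoding recurs at every callsite record, so it is worth seeing it round-trip; a worked example with illustrative helper names:

  #include <cstdint>

  // One 32-bit word carries a source position: line offset from the
  // function start in the high half, DWARF discriminator in the low half.
  static inline uint32_t packLocation(uint32_t LineOffset, uint32_t Disc) {
    return (LineOffset << 16) | (Disc & 0xffff);
  }
  static inline void unpackLocation(uint32_t Offset, uint32_t &LineOffset,
                                    uint32_t &Disc) {
    LineOffset = Offset >> 16;
    Disc = Offset & 0xffff;
  }
  // e.g. packLocation(9, 2) == 0x00090002, and unpacking restores (9, 2).

The 16-bit budget for the line offset is also why the text reader's isOffsetLegal() rejects anything wider.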
+ uint32_t LineOffset = Offset >> 16; + uint32_t Discriminator = Offset & 0xffff; + + InlineCallStack NewStack; + NewStack.push_back(FProfile); + NewStack.insert(NewStack.end(), InlineStack.begin(), InlineStack.end()); + if (Update) { + // Walk up the inline stack, adding the samples on this line to + // the total sample count of the callers in the chain. + for (auto CallerProfile : NewStack) + CallerProfile->addTotalSamples(Count); + + // Update the body samples for the current profile. + FProfile->addBodySamples(LineOffset, Discriminator, Count); + } + + // Process the list of functions called at an indirect call site. + // These are all the targets that a function pointer (or virtual + // function) resolved at runtime. + for (uint32_t J = 0; J < NumTargets; J++) { + uint32_t HistVal; + if (!GcovBuffer.readInt(HistVal)) + return sampleprof_error::truncated; + + if (HistVal != HIST_TYPE_INDIR_CALL_TOPN) + return sampleprof_error::malformed; + + uint64_t TargetIdx; + if (!GcovBuffer.readInt64(TargetIdx)) + return sampleprof_error::truncated; + StringRef TargetName(Names[TargetIdx]); + + uint64_t TargetCount; + if (!GcovBuffer.readInt64(TargetCount)) + return sampleprof_error::truncated; + + if (Update) { + FunctionSamples &TargetProfile = Profiles[TargetName]; + TargetProfile.addCalledTargetSamples(LineOffset, Discriminator, + TargetName, TargetCount); + } + } + } + + // Process all the inlined callers into the current function. These + // are all the callsites that were inlined into this function. + for (uint32_t I = 0; I < NumCallsites; I++) { + // The offset is encoded as: + // high 16 bits: line offset to the start of the function. + // low 16 bits: discriminator. + uint32_t Offset; + if (!GcovBuffer.readInt(Offset)) + return sampleprof_error::truncated; + InlineCallStack NewStack; + NewStack.push_back(FProfile); + NewStack.insert(NewStack.end(), InlineStack.begin(), InlineStack.end()); + if (std::error_code EC = readOneFunctionProfile(NewStack, Update, Offset)) + return EC; + } + + return sampleprof_error::success; +} + +/// \brief Read a GCC AutoFDO profile. +/// +/// This format is generated by the Linux Perf conversion tool at +/// https://github.com/google/autofdo. +std::error_code SampleProfileReaderGCC::read() { + // Read the string table. + if (std::error_code EC = readNameTable()) + return EC; + + // Read the source profile. + if (std::error_code EC = readFunctionProfiles()) + return EC; + + return sampleprof_error::success; +} + +bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) { + StringRef Magic(reinterpret_cast<const char *>(Buffer.getBufferStart())); + return Magic == "adcg*704"; +} + /// \brief Prepare a memory buffer for the contents of \p Filename. /// /// \returns an error code indicating the status of the buffer. @@ -364,7 +676,7 @@ setupMemoryBuffer(std::string Filename) { auto Buffer = std::move(BufferOrErr.get()); // Sanity check the file. - if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max()) + if (Buffer->getBufferSize() > std::numeric_limits<uint32_t>::max()) return sampleprof_error::too_large; return std::move(Buffer); @@ -384,13 +696,29 @@ SampleProfileReader::create(StringRef Filename, LLVMContext &C) { auto BufferOrError = setupMemoryBuffer(Filename); if (std::error_code EC = BufferOrError.getError()) return EC; + return create(BufferOrError.get(), C); +} - auto Buffer = std::move(BufferOrError.get()); +/// \brief Create a sample profile reader based on the format of the input data. 
+/// +/// \param B The memory buffer to create the reader from (assumes ownership). +/// +/// \param Reader The reader to instantiate according to \p B's format. +/// +/// \param C The LLVM context to use to emit diagnostics. +/// +/// \returns an error code indicating the status of the created reader. +ErrorOr<std::unique_ptr<SampleProfileReader>> +SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C) { std::unique_ptr<SampleProfileReader> Reader; - if (SampleProfileReaderBinary::hasFormat(*Buffer)) - Reader.reset(new SampleProfileReaderBinary(std::move(Buffer), C)); + if (SampleProfileReaderBinary::hasFormat(*B)) + Reader.reset(new SampleProfileReaderBinary(std::move(B), C)); + else if (SampleProfileReaderGCC::hasFormat(*B)) + Reader.reset(new SampleProfileReaderGCC(std::move(B), C)); + else if (SampleProfileReaderText::hasFormat(*B)) + Reader.reset(new SampleProfileReaderText(std::move(B), C)); else - Reader.reset(new SampleProfileReaderText(std::move(Buffer), C)); + return sampleprof_error::unrecognized_format; if (std::error_code EC = Reader->readHeader()) return EC; diff --git a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp index c95267a..51feee5 100644 --- a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -30,16 +30,27 @@ using namespace llvm::sampleprof; using namespace llvm; /// \brief Write samples to a text file. -bool SampleProfileWriterText::write(StringRef FName, const FunctionSamples &S) { - if (S.empty()) - return true; - - OS << FName << ":" << S.getTotalSamples() << ":" << S.getHeadSamples() - << "\n"; - - for (const auto &I : S.getBodySamples()) { - LineLocation Loc = I.first; - const SampleRecord &Sample = I.second; +/// +/// Note: it may be tempting to implement this in terms of +/// FunctionSamples::print(). Please don't. The dump functionality is intended +/// for debugging and has no specified form. +/// +/// The format used here is more structured and deliberate because +/// it needs to be parsed by the SampleProfileReaderText class. +std::error_code SampleProfileWriterText::write(StringRef FName, + const FunctionSamples &S) { + auto &OS = *OutputStream; + + OS << FName << ":" << S.getTotalSamples(); + if (Indent == 0) + OS << ":" << S.getHeadSamples(); + OS << "\n"; + + SampleSorter<LineLocation, SampleRecord> SortedSamples(S.getBodySamples()); + for (const auto &I : SortedSamples.get()) { + LineLocation Loc = I->first; + const SampleRecord &Sample = I->second; + OS.indent(Indent + 1); if (Loc.Discriminator == 0) OS << Loc.LineOffset << ": "; else OS << Loc.LineOffset << "." 
<< Loc.Discriminator << ": "; + if (std::error_code EC = write(Loc.CalleeName, CalleeSamples)) + return EC; + } + Indent -= 1; + + return sampleprof_error::success; } -SampleProfileWriterBinary::SampleProfileWriterBinary(StringRef F, - std::error_code &EC) - : SampleProfileWriter(F, EC, sys::fs::F_None) { - if (EC) - return; +std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) { + const auto &ret = NameTable.find(FName); + if (ret == NameTable.end()) + return sampleprof_error::truncated_name_table; + encodeULEB128(ret->second, *OutputStream); + return sampleprof_error::success; +} - // Write the file header. +void SampleProfileWriterBinary::addName(StringRef FName) { + auto NextIdx = NameTable.size(); + NameTable.insert(std::make_pair(FName, NextIdx)); +} + +void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { + // Add all the names in indirect call targets. + for (const auto &I : S.getBodySamples()) { + const SampleRecord &Sample = I.second; + for (const auto &J : Sample.getCallTargets()) + addName(J.first()); + } + + // Recursively add all the names for inlined callsites. + for (const auto &J : S.getCallsiteSamples()) { + CallsiteLocation Loc = J.first; + const FunctionSamples &CalleeSamples = J.second; + addName(Loc.CalleeName); + addNames(CalleeSamples); + } +} + +std::error_code SampleProfileWriterBinary::writeHeader( + const StringMap<FunctionSamples> &ProfileMap) { + auto &OS = *OutputStream; + + // Write file magic identifier. encodeULEB128(SPMagic(), OS); encodeULEB128(SPVersion(), OS); + + // Generate the name table for all the functions referenced in the profile. + for (const auto &I : ProfileMap) { + addName(I.first()); + addNames(I.second); + } + + // Write out the name table. + encodeULEB128(NameTable.size(), OS); + for (auto N : NameTable) { + OS << N.first; + encodeULEB128(0, OS); + } + + return sampleprof_error::success; } -/// \brief Write samples to a binary file. -/// -/// \returns true if the samples were written successfully, false otherwise. -bool SampleProfileWriterBinary::write(StringRef FName, - const FunctionSamples &S) { - if (S.empty()) - return true; +std::error_code SampleProfileWriterBinary::writeBody(StringRef FName, + const FunctionSamples &S) { + auto &OS = *OutputStream; + + if (std::error_code EC = writeNameIdx(FName)) + return EC; - OS << FName; - encodeULEB128(0, OS); encodeULEB128(S.getTotalSamples(), OS); - encodeULEB128(S.getHeadSamples(), OS); + + // Emit all the body samples. encodeULEB128(S.getBodySamples().size(), OS); for (const auto &I : S.getBodySamples()) { LineLocation Loc = I.first; @@ -87,18 +155,38 @@ bool SampleProfileWriterBinary::write(StringRef FName, encodeULEB128(Sample.getSamples(), OS); encodeULEB128(Sample.getCallTargets().size(), OS); for (const auto &J : Sample.getCallTargets()) { - std::string Callee = J.first(); - unsigned CalleeSamples = J.second; - OS << Callee; - encodeULEB128(0, OS); + StringRef Callee = J.first(); + uint64_t CalleeSamples = J.second; + if (std::error_code EC = writeNameIdx(Callee)) + return EC; encodeULEB128(CalleeSamples, OS); } } - return true; + // Recursively emit all the callsite samples. 
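Paired with the reader's readProfile() earlier in this diff, the writer fixes a recursive on-disk grammar; reconstructed from the two routines, with every scalar ULEB128-encoded (field names here are descriptive only):

  top_level_function := head_samples function_body
  function_body :=
    name_idx               ; index into the name table
    total_samples
    num_body_records
    num_body_records x { line_offset discriminator num_samples
                         num_call_targets x { target_name_idx target_count } }
    num_callsites
    num_callsites x { line_offset discriminator function_body }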
+ encodeULEB128(S.getCallsiteSamples().size(), OS); + for (const auto &J : S.getCallsiteSamples()) { + CallsiteLocation Loc = J.first; + const FunctionSamples &CalleeSamples = J.second; + encodeULEB128(Loc.LineOffset, OS); + encodeULEB128(Loc.Discriminator, OS); + if (std::error_code EC = writeBody(Loc.CalleeName, CalleeSamples)) + return EC; + } + + return sampleprof_error::success; } -/// \brief Create a sample profile writer based on the specified format. +/// \brief Write samples of a top-level function to a binary file. +/// +/// \returns an error code indicating the status of the write. +std::error_code SampleProfileWriterBinary::write(StringRef FName, + const FunctionSamples &S) { + encodeULEB128(S.getHeadSamples(), *OutputStream); + return writeBody(FName, S); +} + +/// \brief Create a sample profile file writer based on the specified format. /// /// \param Filename The file to create. /// @@ -110,12 +198,38 @@ bool SampleProfileWriterBinary::write(StringRef FName, const FunctionSamples &S) { ErrorOr<std::unique_ptr<SampleProfileWriter>> SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) { std::error_code EC; + std::unique_ptr<raw_ostream> OS; + if (Format == SPF_Binary) + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_None)); + else + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_Text)); + if (EC) + return EC; + + return create(OS, Format); +} + +/// \brief Create a sample profile stream writer based on the specified format. +/// +/// \param OS The output stream to store the profile data to. +/// +/// \param Writer The writer to instantiate according to the specified format. +/// +/// \param Format Encoding format for the profile file. +/// +/// \returns an error code indicating the status of the created writer. +ErrorOr<std::unique_ptr<SampleProfileWriter>> +SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS, + SampleProfileFormat Format) { + std::error_code EC; std::unique_ptr<SampleProfileWriter> Writer; if (Format == SPF_Binary) - Writer.reset(new SampleProfileWriterBinary(Filename, EC)); + Writer.reset(new SampleProfileWriterBinary(OS)); else if (Format == SPF_Text) - Writer.reset(new SampleProfileWriterText(Filename, EC)); + Writer.reset(new SampleProfileWriterText(OS)); + else if (Format == SPF_GCC) + EC = sampleprof_error::unsupported_writing_format; else EC = sampleprof_error::unrecognized_format; diff --git a/contrib/llvm/lib/Support/APFloat.cpp b/contrib/llvm/lib/Support/APFloat.cpp index 5d31225..19b8221 100644 --- a/contrib/llvm/lib/Support/APFloat.cpp +++ b/contrib/llvm/lib/Support/APFloat.cpp @@ -768,6 +768,15 @@ APFloat::isLargest() const { } bool +APFloat::isInteger() const { + // This could be made more efficient; I'm going for obviously correct. 
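A few concrete cases of the trunc-and-compare test that follows; a minimal usage sketch assuming the 3.8-era API, where IEEEdouble is a static fltSemantics member:

  #include "llvm/ADT/APFloat.h"

  using namespace llvm;

  static void isIntegerExamples() {
    APFloat A(3.0), B(3.5);
    APFloat Inf = APFloat::getInf(APFloat::IEEEdouble);
    bool T = A.isInteger();   // true: truncating 3.0 leaves it unchanged
    bool F = B.isInteger();   // false: 3.5 truncates to 3.0, compares unequal
    bool N = Inf.isInteger(); // false: non-finite values bail out early
    (void)T; (void)F; (void)N;
  }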
+ if (!isFinite()) return false; + APFloat truncated = *this; + truncated.roundToIntegral(rmTowardZero); + return compare(truncated) == cmpEqual; +} + +bool APFloat::bitwiseIsEqual(const APFloat &rhs) const { if (this == &rhs) return true; @@ -777,18 +786,12 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const { return false; if (category==fcZero || category==fcInfinity) return true; - else if (isFiniteNonZero() && exponent!=rhs.exponent) + + if (isFiniteNonZero() && exponent != rhs.exponent) return false; - else { - int i= partCount(); - const integerPart* p=significandParts(); - const integerPart* q=rhs.significandParts(); - for (; i>0; i--, p++, q++) { - if (*p != *q) - return false; - } - return true; - } + + return std::equal(significandParts(), significandParts() + partCount(), + rhs.significandParts()); } APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value) { @@ -847,6 +850,21 @@ APFloat::semanticsPrecision(const fltSemantics &semantics) { return semantics.precision; } +APFloat::ExponentType +APFloat::semanticsMaxExponent(const fltSemantics &semantics) +{ + return semantics.maxExponent; +} +APFloat::ExponentType +APFloat::semanticsMinExponent(const fltSemantics &semantics) +{ + return semantics.minExponent; +} +unsigned int +APFloat::semanticsSizeInBits(const fltSemantics &semantics) +{ + return semantics.sizeInBits; +} const integerPart * APFloat::significandParts() const @@ -1762,7 +1780,7 @@ APFloat::remainder(const APFloat &rhs) /* Normalized llvm frem (C fmod). This is not currently correct in all cases. */ APFloat::opStatus -APFloat::mod(const APFloat &rhs, roundingMode rounding_mode) +APFloat::mod(const APFloat &rhs) { opStatus fs; fs = modSpecials(rhs); @@ -1787,10 +1805,10 @@ APFloat::mod(const APFloat &rhs, roundingMode rounding_mode) rmNearestTiesToEven); assert(fs==opOK); // should always work - fs = V.multiply(rhs, rounding_mode); + fs = V.multiply(rhs, rmNearestTiesToEven); assert(fs==opOK || fs==opInexact); // should not overflow or underflow - fs = subtract(V, rounding_mode); + fs = subtract(V, rmNearestTiesToEven); assert(fs==opOK || fs==opInexact); // likewise if (isZero()) diff --git a/contrib/llvm/lib/Support/BlockFrequency.cpp b/contrib/llvm/lib/Support/BlockFrequency.cpp index 6f7e341..e7f3e17 100644 --- a/contrib/llvm/lib/Support/BlockFrequency.cpp +++ b/contrib/llvm/lib/Support/BlockFrequency.cpp @@ -11,37 +11,35 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/BranchProbability.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/raw_ostream.h" #include <cassert> using namespace llvm; -BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) { +BlockFrequency &BlockFrequency::operator*=(BranchProbability Prob) { Frequency = Prob.scale(Frequency); return *this; } -const BlockFrequency -BlockFrequency::operator*(const BranchProbability &Prob) const { +BlockFrequency BlockFrequency::operator*(BranchProbability Prob) const { BlockFrequency Freq(Frequency); Freq *= Prob; return Freq; } -BlockFrequency &BlockFrequency::operator/=(const BranchProbability &Prob) { +BlockFrequency &BlockFrequency::operator/=(BranchProbability Prob) { Frequency = Prob.scaleByInverse(Frequency); return *this; } -BlockFrequency BlockFrequency::operator/(const BranchProbability &Prob) const { +BlockFrequency BlockFrequency::operator/(BranchProbability Prob) const { BlockFrequency Freq(Frequency); Freq /= Prob; return Freq; } -BlockFrequency &BlockFrequency::operator+=(const 
BlockFrequency &Freq) { +BlockFrequency &BlockFrequency::operator+=(BlockFrequency Freq) { uint64_t Before = Freq.Frequency; Frequency += Freq.Frequency; @@ -52,11 +50,25 @@ BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) { return *this; } -const BlockFrequency -BlockFrequency::operator+(const BlockFrequency &Prob) const { - BlockFrequency Freq(Frequency); - Freq += Prob; - return Freq; +BlockFrequency BlockFrequency::operator+(BlockFrequency Freq) const { + BlockFrequency NewFreq(Frequency); + NewFreq += Freq; + return NewFreq; +} + +BlockFrequency &BlockFrequency::operator-=(BlockFrequency Freq) { + // If underflow, set frequency to 0. + if (Frequency <= Freq.Frequency) + Frequency = 0; + else + Frequency -= Freq.Frequency; + return *this; +} + +BlockFrequency BlockFrequency::operator-(BlockFrequency Freq) const { + BlockFrequency NewFreq(Frequency); + NewFreq -= Freq; + return NewFreq; } BlockFrequency &BlockFrequency::operator>>=(const unsigned count) { diff --git a/contrib/llvm/lib/Support/BranchProbability.cpp b/contrib/llvm/lib/Support/BranchProbability.cpp index 65878d6..771d02c 100644 --- a/contrib/llvm/lib/Support/BranchProbability.cpp +++ b/contrib/llvm/lib/Support/BranchProbability.cpp @@ -15,17 +15,58 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> using namespace llvm; +const uint32_t BranchProbability::D; + raw_ostream &BranchProbability::print(raw_ostream &OS) const { - return OS << N << " / " << D << " = " - << format("%g%%", ((double)N / D) * 100.0); + if (isUnknown()) + return OS << "?%"; + + // Get a percentage rounded to two decimal digits. This avoids + // implementation-defined rounding inside printf. + double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } +BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { + assert(Denominator > 0 && "Denominator cannot be 0!"); + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + if (Denominator == D) + N = Numerator; + else { + uint64_t Prob64 = + (Numerator * static_cast<uint64_t>(D) + Denominator / 2) / Denominator; + N = static_cast<uint32_t>(Prob64); + } +} + +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + +// If ConstD is not zero, then replace D by ConstD so that division and modulo +// operations by D can be optimized, in case this function is not inlined by the +// compiler. +template <uint32_t ConstD> static uint64_t scale(uint64_t Num, uint32_t N, uint32_t D) { + if (ConstD > 0) + D = ConstD; + assert(D && "divide by 0"); // Fast path for multiplying by 1.0. 
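With the denominator a class-wide constant (1u << 31 in this rewrite; the value itself is not visible in these hunks, so treat it as an assumption), a probability is a 32-bit fixed-point fraction and the constructor rescales with rounding. A worked construction and scale:

  #include "llvm/Support/BranchProbability.h"
  #include <cstdint>

  using namespace llvm;

  static void branchProbabilityExample() {
    // N = (1 * 0x80000000 + 3 / 2) / 3 = 0x2AAAAAAB, i.e. ~33.33%.
    BranchProbability OneThird(1, 3);
    // scale() multiplies by N/D: 900 * 0x2AAAAAAB / 0x80000000 == 300.
    uint64_t Scaled = OneThird.scale(900);
    (void)Scaled;
  }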
@@ -65,9 +106,9 @@ static uint64_t scale(uint64_t Num, uint32_t N, uint32_t D) { } uint64_t BranchProbability::scale(uint64_t Num) const { - return ::scale(Num, N, D); + return ::scale<D>(Num, N, D); } uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { - return ::scale(Num, D, N); + return ::scale<0>(Num, D, N); } diff --git a/contrib/llvm/lib/Support/CommandLine.cpp b/contrib/llvm/lib/Support/CommandLine.cpp index 17fba95..fdcdb03 100644 --- a/contrib/llvm/lib/Support/CommandLine.cpp +++ b/contrib/llvm/lib/Support/CommandLine.cpp @@ -120,7 +120,7 @@ public: void addOption(Option *O) { bool HadErrors = false; - if (O->ArgStr[0]) { + if (O->hasArgStr()) { // Add argument to the argument map! if (!OptionsMap.insert(std::make_pair(O->ArgStr, O)).second) { errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr @@ -151,12 +151,12 @@ public: } void removeOption(Option *O) { - SmallVector<const char *, 16> OptionNames; + SmallVector<StringRef, 16> OptionNames; O->getExtraOptionNames(OptionNames); - if (O->ArgStr[0]) + if (O->hasArgStr()) OptionNames.push_back(O->ArgStr); for (auto Name : OptionNames) - OptionsMap.erase(StringRef(Name)); + OptionsMap.erase(Name); if (O->getFormattingFlag() == cl::Positional) for (auto Opt = PositionalOpts.begin(); Opt != PositionalOpts.end(); @@ -182,13 +182,13 @@ public: nullptr != ConsumeAfterOpt); } - void updateArgStr(Option *O, const char *NewName) { + void updateArgStr(Option *O, StringRef NewName) { if (!OptionsMap.insert(std::make_pair(NewName, O)).second) { errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr << "' registered more than once!\n"; report_fatal_error("inconsistency in registered CommandLine options"); } - OptionsMap.erase(StringRef(O->ArgStr)); + OptionsMap.erase(O->ArgStr); } void printOptionValues(); @@ -227,7 +227,7 @@ void Option::addArgument() { void Option::removeArgument() { GlobalParser->removeOption(this); } -void Option::setArgStr(const char *S) { +void Option::setArgStr(StringRef S) { if (FullyInitialized) GlobalParser->updateArgStr(this, S); ArgStr = S; @@ -296,24 +296,23 @@ static Option *LookupNearestOption(StringRef Arg, ie = OptionsMap.end(); it != ie; ++it) { Option *O = it->second; - SmallVector<const char *, 16> OptionNames; + SmallVector<StringRef, 16> OptionNames; O->getExtraOptionNames(OptionNames); - if (O->ArgStr[0]) + if (O->hasArgStr()) OptionNames.push_back(O->ArgStr); bool PermitValue = O->getValueExpectedFlag() != cl::ValueDisallowed; StringRef Flag = PermitValue ? LHS : Arg; - for (size_t i = 0, e = OptionNames.size(); i != e; ++i) { - StringRef Name = OptionNames[i]; + for (auto Name : OptionNames) { unsigned Distance = StringRef(Name).edit_distance( Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance); if (!Best || Distance < BestDistance) { Best = O; BestDistance = Distance; if (RHS.empty() || !PermitValue) - NearestString = OptionNames[i]; + NearestString = Name; else - NearestString = (Twine(OptionNames[i]) + "=" + RHS).str(); + NearestString = (Twine(Name) + "=" + RHS).str(); } } } @@ -346,10 +345,7 @@ static bool CommaSeparateAndAddOccurrence(Option *Handler, unsigned pos, Value = Val; } - if (Handler->addOccurrence(pos, ArgName, Value, MultiArg)) - return true; - - return false; + return Handler->addOccurrence(pos, ArgName, Value, MultiArg); } /// ProvideOption - For Value, this differentiates between an empty value ("") @@ -799,7 +795,7 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar, // telling us. 
SmallVector<const char *, 20> newArgv; BumpPtrAllocator A; - BumpPtrStringSaver Saver(A); + StringSaver Saver(A); newArgv.push_back(Saver.save(progName)); // Parse the value of the environment variable into a "command line" @@ -822,7 +818,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc, // Expand response files. SmallVector<const char *, 20> newArgv(argv, argv + argc); BumpPtrAllocator A; - BumpPtrStringSaver Saver(A); + StringSaver Saver(A); ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv); argv = &newArgv[0]; argc = static_cast<int>(newArgv.size()); @@ -859,7 +855,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc, "error - this positional option will never be matched, " "because it does not Require a value, and a " "cl::ConsumeAfter option is active!"); - } else if (UnboundedFound && !Opt->ArgStr[0]) { + } else if (UnboundedFound && !Opt->hasArgStr()) { // This option does not "require" a value... Make sure this option is // not specified after an option that eats all extra arguments, or this // one will never get any! @@ -1144,8 +1140,8 @@ bool Option::addOccurrence(unsigned pos, StringRef ArgName, StringRef Value, // getValueStr - Get the value description string, using "DefaultMsg" if nothing // has been specified yet. // -static const char *getValueStr(const Option &O, const char *DefaultMsg) { - if (O.ValueStr[0] == 0) +static StringRef getValueStr(const Option &O, StringRef DefaultMsg) { + if (O.ValueStr.empty()) return DefaultMsg; return O.ValueStr; } @@ -1155,7 +1151,7 @@ static const char *getValueStr(const Option &O, const char *DefaultMsg) { // // Return the width of the option tag for printing... -size_t alias::getOptionWidth() const { return std::strlen(ArgStr) + 6; } +size_t alias::getOptionWidth() const { return ArgStr.size() + 6; } static void printHelpStr(StringRef HelpStr, size_t Indent, size_t FirstLineIndentedBy) { @@ -1170,7 +1166,7 @@ static void printHelpStr(StringRef HelpStr, size_t Indent, // Print out the option for the alias. void alias::printOptionInfo(size_t GlobalWidth) const { outs() << " -" << ArgStr; - printHelpStr(HelpStr, GlobalWidth, std::strlen(ArgStr) + 6); + printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6); } //===----------------------------------------------------------------------===// @@ -1182,9 +1178,9 @@ void alias::printOptionInfo(size_t GlobalWidth) const { // Return the width of the option tag for printing... size_t basic_parser_impl::getOptionWidth(const Option &O) const { - size_t Len = std::strlen(O.ArgStr); + size_t Len = O.ArgStr.size(); if (const char *ValName = getValueName()) - Len += std::strlen(getValueStr(O, ValName)) + 3; + Len += getValueStr(O, ValName).size() + 3; return Len + 6; } @@ -1205,7 +1201,7 @@ void basic_parser_impl::printOptionInfo(const Option &O, void basic_parser_impl::printOptionName(const Option &O, size_t GlobalWidth) const { outs() << " -" << O.ArgStr; - outs().indent(GlobalWidth - std::strlen(O.ArgStr)); + outs().indent(GlobalWidth - O.ArgStr.size()); } // parser<bool> implementation @@ -1319,7 +1315,7 @@ unsigned generic_parser_base::findOption(const char *Name) { // Return the width of the option tag for printing... 
size_t generic_parser_base::getOptionWidth(const Option &O) const { if (O.hasArgStr()) { - size_t Size = std::strlen(O.ArgStr) + 6; + size_t Size = O.ArgStr.size() + 6; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) Size = std::max(Size, std::strlen(getOption(i)) + 8); return Size; @@ -1338,7 +1334,7 @@ void generic_parser_base::printOptionInfo(const Option &O, size_t GlobalWidth) const { if (O.hasArgStr()) { outs() << " -" << O.ArgStr; - printHelpStr(O.HelpStr, GlobalWidth, std::strlen(O.ArgStr) + 6); + printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6); for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { size_t NumSpaces = GlobalWidth - strlen(getOption(i)) - 8; @@ -1346,7 +1342,7 @@ void generic_parser_base::printOptionInfo(const Option &O, outs().indent(NumSpaces) << " - " << getDescription(i) << '\n'; } } else { - if (O.HelpStr[0]) + if (!O.HelpStr.empty()) outs() << " " << O.HelpStr << '\n'; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { const char *Option = getOption(i); @@ -1365,7 +1361,7 @@ void generic_parser_base::printGenericOptionDiff( const Option &O, const GenericOptionValue &Value, const GenericOptionValue &Default, size_t GlobalWidth) const { outs() << " -" << O.ArgStr; - outs().indent(GlobalWidth - std::strlen(O.ArgStr)); + outs().indent(GlobalWidth - O.ArgStr.size()); unsigned NumOpts = getNumOptions(); for (unsigned i = 0; i != NumOpts; ++i) { @@ -1508,7 +1504,7 @@ public: outs() << "USAGE: " << GlobalParser->ProgramName << " [options]"; for (auto Opt : GlobalParser->PositionalOpts) { - if (Opt->ArgStr[0]) + if (Opt->hasArgStr()) outs() << " --" << Opt->ArgStr; outs() << " " << Opt->HelpStr; } diff --git a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp index aba0f1d..3f4ef9d 100644 --- a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp @@ -24,6 +24,12 @@ static ManagedStatic< sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext; struct CrashRecoveryContextImpl { + // When threads are disabled, this links up all active + // CrashRecoveryContextImpls. When threads are enabled there's one thread + // per CrashRecoveryContext and CurrentContext is a thread-local, so only one + // CrashRecoveryContextImpl is active per thread and this is always null. + const CrashRecoveryContextImpl *Next; + CrashRecoveryContext *CRC; std::string Backtrace; ::jmp_buf JumpBuffer; @@ -34,21 +40,26 @@ public: CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC), Failed(false), SwitchedThread(false) { + Next = CurrentContext->get(); CurrentContext->set(this); } ~CrashRecoveryContextImpl() { if (!SwitchedThread) - CurrentContext->erase(); + CurrentContext->set(Next); } /// \brief Called when the separate crash-recovery thread was finished, to /// indicate that we don't need to clear the thread-local CurrentContext. - void setSwitchedThread() { SwitchedThread = true; } + void setSwitchedThread() { +#if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0 + SwitchedThread = true; +#endif + } void HandleCrash() { // Eliminate the current context entry, to avoid re-entering in case the // cleanup code crashes. 
- CurrentContext->erase(); + CurrentContext->set(Next); assert(!Failed && "Crash recovery context already failed!"); Failed = true; @@ -65,7 +76,7 @@ public: static ManagedStatic<sys::Mutex> gCrashRecoveryContextMutex; static bool gCrashRecoveryEnabled = false; -static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextCleanup> > +static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContext>> tlIsRecoveringFromCrash; CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} @@ -73,7 +84,8 @@ CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} CrashRecoveryContext::~CrashRecoveryContext() { // Reclaim registered resources. CrashRecoveryContextCleanup *i = head; - tlIsRecoveringFromCrash->set(head); + const CrashRecoveryContext *PC = tlIsRecoveringFromCrash->get(); + tlIsRecoveringFromCrash->set(this); while (i) { CrashRecoveryContextCleanup *tmp = i; i = tmp->next; @@ -81,7 +93,7 @@ CrashRecoveryContext::~CrashRecoveryContext() { tmp->recoverResources(); delete tmp; } - tlIsRecoveringFromCrash->erase(); + tlIsRecoveringFromCrash->set(PC); CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl; delete CRCI; @@ -232,7 +244,7 @@ void CrashRecoveryContext::Disable() { static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; -static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]); +static const unsigned NumSignals = array_lengthof(Signals); static struct sigaction PrevActions[NumSignals]; static void CrashRecoverySignalHandler(int Signal) { diff --git a/contrib/llvm/lib/Support/Debug.cpp b/contrib/llvm/lib/Support/Debug.cpp index 47751fc..323d532 100644 --- a/contrib/llvm/lib/Support/Debug.cpp +++ b/contrib/llvm/lib/Support/Debug.cpp @@ -95,7 +95,10 @@ struct DebugOnlyOpt { if (Val.empty()) return; DebugFlag = true; - CurrentDebugType->push_back(Val); + SmallVector<StringRef,8> dbgTypes; + StringRef(Val).split(dbgTypes, ',', -1, false); + for (auto dbgType : dbgTypes) + CurrentDebugType->push_back(dbgType); } }; @@ -104,10 +107,9 @@ struct DebugOnlyOpt { static DebugOnlyOpt DebugOnlyOptLoc; static cl::opt<DebugOnlyOpt, true, cl::parser<std::string> > -DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"), +DebugOnly("debug-only", cl::desc("Enable a specific type of debug output (comma separated list of types)"), cl::Hidden, cl::ZeroOrMore, cl::value_desc("debug string"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); - // Signal handlers - dump debug output on termination. static void debug_user_sig_handler(void *Cookie) { // This is a bit sneaky. 
Since this is under #ifndef NDEBUG, we diff --git a/contrib/llvm/lib/Support/Dwarf.cpp b/contrib/llvm/lib/Support/Dwarf.cpp index 13a4155..7d72256 100644 --- a/contrib/llvm/lib/Support/Dwarf.cpp +++ b/contrib/llvm/lib/Support/Dwarf.cpp @@ -177,6 +177,23 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_MIPS_assumed_size: return "DW_AT_MIPS_assumed_size"; case DW_AT_lo_user: return "DW_AT_lo_user"; case DW_AT_hi_user: return "DW_AT_hi_user"; + case DW_AT_BORLAND_property_read: return "DW_AT_BORLAND_property_read"; + case DW_AT_BORLAND_property_write: return "DW_AT_BORLAND_property_write"; + case DW_AT_BORLAND_property_implements: return "DW_AT_BORLAND_property_implements"; + case DW_AT_BORLAND_property_index: return "DW_AT_BORLAND_property_index"; + case DW_AT_BORLAND_property_default: return "DW_AT_BORLAND_property_default"; + case DW_AT_BORLAND_Delphi_unit: return "DW_AT_BORLAND_Delphi_unit"; + case DW_AT_BORLAND_Delphi_class: return "DW_AT_BORLAND_Delphi_class"; + case DW_AT_BORLAND_Delphi_record: return "DW_AT_BORLAND_Delphi_record"; + case DW_AT_BORLAND_Delphi_metaclass: return "DW_AT_BORLAND_Delphi_metaclass"; + case DW_AT_BORLAND_Delphi_constructor: return "DW_AT_BORLAND_Delphi_constructor"; + case DW_AT_BORLAND_Delphi_destructor: return "DW_AT_BORLAND_Delphi_destructor"; + case DW_AT_BORLAND_Delphi_anonymous_method: return "DW_AT_BORLAND_Delphi_anonymous_method"; + case DW_AT_BORLAND_Delphi_interface: return "DW_AT_BORLAND_Delphi_interface"; + case DW_AT_BORLAND_Delphi_ABI: return "DW_AT_BORLAND_Delphi_ABI"; + case DW_AT_BORLAND_Delphi_return: return "DW_AT_BORLAND_Delphi_return"; + case DW_AT_BORLAND_Delphi_frameptr: return "DW_AT_BORLAND_Delphi_frameptr"; + case DW_AT_BORLAND_closure: return "DW_AT_BORLAND_closure"; case DW_AT_APPLE_optimized: return "DW_AT_APPLE_optimized"; case DW_AT_APPLE_flags: return "DW_AT_APPLE_flags"; case DW_AT_APPLE_isa: return "DW_AT_APPLE_isa"; @@ -201,6 +218,7 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_GNU_addr_base: return "DW_AT_GNU_addr_base"; case DW_AT_GNU_pubnames: return "DW_AT_GNU_pubnames"; case DW_AT_GNU_pubtypes: return "DW_AT_GNU_pubtypes"; + case DW_AT_GNU_discriminator: return "DW_AT_GNU_discriminator"; } return nullptr; } @@ -373,6 +391,14 @@ const char *llvm::dwarf::ConventionString(unsigned Convention) { case DW_CC_nocall: return "DW_CC_nocall"; case DW_CC_lo_user: return "DW_CC_lo_user"; case DW_CC_hi_user: return "DW_CC_hi_user"; + case DW_CC_GNU_borland_fastcall_i386: return "DW_CC_GNU_borland_fastcall_i386"; + case DW_CC_BORLAND_safecall: return "DW_CC_BORLAND_safecall"; + case DW_CC_BORLAND_stdcall: return "DW_CC_BORLAND_stdcall"; + case DW_CC_BORLAND_pascal: return "DW_CC_BORLAND_pascal"; + case DW_CC_BORLAND_msfastcall: return "DW_CC_BORLAND_msfastcall"; + case DW_CC_BORLAND_msreturn: return "DW_CC_BORLAND_msreturn"; + case DW_CC_BORLAND_thiscall: return "DW_CC_BORLAND_thiscall"; + case DW_CC_BORLAND_fastcall: return "DW_CC_BORLAND_fastcall"; } return nullptr; } @@ -442,10 +468,21 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { case DW_MACINFO_start_file: return "DW_MACINFO_start_file"; case DW_MACINFO_end_file: return "DW_MACINFO_end_file"; case DW_MACINFO_vendor_ext: return "DW_MACINFO_vendor_ext"; + case DW_MACINFO_invalid: return "DW_MACINFO_invalid"; } return nullptr; } +unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) { + return StringSwitch<unsigned>(MacinfoString) + .Case("DW_MACINFO_define", DW_MACINFO_define) + 
.Case("DW_MACINFO_undef", DW_MACINFO_undef) + .Case("DW_MACINFO_start_file", DW_MACINFO_start_file) + .Case("DW_MACINFO_end_file", DW_MACINFO_end_file) + .Case("DW_MACINFO_vendor_ext", DW_MACINFO_vendor_ext) + .Default(DW_MACINFO_invalid); +} + const char *llvm::dwarf::CallFrameString(unsigned Encoding) { switch (Encoding) { case DW_CFA_nop: return "DW_CFA_nop"; diff --git a/contrib/llvm/lib/Support/ErrorHandling.cpp b/contrib/llvm/lib/Support/ErrorHandling.cpp index a25e21a..2808bd3 100644 --- a/contrib/llvm/lib/Support/ErrorHandling.cpp +++ b/contrib/llvm/lib/Support/ErrorHandling.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ErrorHandling.h" -#include "llvm-c/Core.h" +#include "llvm-c/ErrorHandling.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" diff --git a/contrib/llvm/lib/Support/FileOutputBuffer.cpp b/contrib/llvm/lib/Support/FileOutputBuffer.cpp index 307ff09..651e679 100644 --- a/contrib/llvm/lib/Support/FileOutputBuffer.cpp +++ b/contrib/llvm/lib/Support/FileOutputBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/Signals.h" #include <system_error> #if !defined(_MSC_VER) && !defined(__MINGW32__) @@ -34,10 +35,8 @@ FileOutputBuffer::~FileOutputBuffer() { sys::fs::remove(Twine(TempPath)); } -std::error_code -FileOutputBuffer::create(StringRef FilePath, size_t Size, - std::unique_ptr<FileOutputBuffer> &Result, - unsigned Flags) { +ErrorOr<std::unique_ptr<FileOutputBuffer>> +FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { // If file already exists, it must be a regular file (to be mappable). sys::fs::file_status Stat; std::error_code EC = sys::fs::status(FilePath, Stat); @@ -76,6 +75,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, if (EC) return EC; + sys::RemoveFileOnSignal(TempFilePath); + #ifndef LLVM_ON_WIN32 // On Windows, CreateFileMapping (the mmap function on Windows) // automatically extends the underlying file. We don't need to @@ -95,10 +96,9 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, if (Ret) return std::error_code(errno, std::generic_category()); - Result.reset( + std::unique_ptr<FileOutputBuffer> Buf( new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath)); - - return std::error_code(); + return std::move(Buf); } std::error_code FileOutputBuffer::commit() { @@ -107,6 +107,8 @@ std::error_code FileOutputBuffer::commit() { // Rename file to final name. - return sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + std::error_code EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + sys::DontRemoveFileOnSignal(TempPath); + return EC; } } // namespace diff --git a/contrib/llvm/lib/Support/FoldingSet.cpp b/contrib/llvm/lib/Support/FoldingSet.cpp index b8538ff..bb0ec2d 100644 --- a/contrib/llvm/lib/Support/FoldingSet.cpp +++ b/contrib/llvm/lib/Support/FoldingSet.cpp @@ -232,9 +232,29 @@ FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) { Buckets = AllocateBuckets(NumBuckets); NumNodes = 0; } + +FoldingSetImpl::FoldingSetImpl(FoldingSetImpl &&Arg) + : Buckets(Arg.Buckets), NumBuckets(Arg.NumBuckets), NumNodes(Arg.NumNodes) { + Arg.Buckets = nullptr; + Arg.NumBuckets = 0; + Arg.NumNodes = 0; +} + +FoldingSetImpl &FoldingSetImpl::operator=(FoldingSetImpl &&RHS) { + free(Buckets); // This may be null if the set is in a moved-from state. 
+ Buckets = RHS.Buckets; + NumBuckets = RHS.NumBuckets; + NumNodes = RHS.NumNodes; + RHS.Buckets = nullptr; + RHS.NumBuckets = 0; + RHS.NumNodes = 0; + return *this; +} + FoldingSetImpl::~FoldingSetImpl() { free(Buckets); } + void FoldingSetImpl::clear() { // Set all but the last bucket to null pointers. memset(Buckets, 0, NumBuckets*sizeof(void*)); diff --git a/contrib/llvm/lib/Support/GraphWriter.cpp b/contrib/llvm/lib/Support/GraphWriter.cpp index a9b0220..d0e1d50 100644 --- a/contrib/llvm/lib/Support/GraphWriter.cpp +++ b/contrib/llvm/lib/Support/GraphWriter.cpp @@ -103,7 +103,7 @@ struct GraphSession { bool TryFindProgram(StringRef Names, std::string &ProgramPath) { raw_string_ostream Log(LogBuffer); SmallVector<StringRef, 8> parts; - Names.split(parts, "|"); + Names.split(parts, '|'); for (auto Name : parts) { if (ErrorOr<std::string> P = sys::findProgramByName(Name)) { ProgramPath = *P; @@ -189,61 +189,87 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait, return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg); } - enum PSViewerKind { PSV_None, PSV_OSXOpen, PSV_XDGOpen, PSV_Ghostview }; - PSViewerKind PSViewer = PSV_None; + enum ViewerKind { + VK_None, + VK_OSXOpen, + VK_XDGOpen, + VK_Ghostview, + VK_CmdStart + }; + ViewerKind Viewer = VK_None; #ifdef __APPLE__ - if (!PSViewer && S.TryFindProgram("open", ViewerPath)) - PSViewer = PSV_OSXOpen; + if (!Viewer && S.TryFindProgram("open", ViewerPath)) + Viewer = VK_OSXOpen; +#endif + if (!Viewer && S.TryFindProgram("gv", ViewerPath)) + Viewer = VK_Ghostview; + if (!Viewer && S.TryFindProgram("xdg-open", ViewerPath)) + Viewer = VK_XDGOpen; +#ifdef LLVM_ON_WIN32 + if (!Viewer && S.TryFindProgram("cmd", ViewerPath)) { + Viewer = VK_CmdStart; + } #endif - if (!PSViewer && S.TryFindProgram("gv", ViewerPath)) - PSViewer = PSV_Ghostview; - if (!PSViewer && S.TryFindProgram("xdg-open", ViewerPath)) - PSViewer = PSV_XDGOpen; - // PostScript graph generator + PostScript viewer + // PostScript or PDF graph generator + PostScript/PDF viewer std::string GeneratorPath; - if (PSViewer && + if (Viewer && (S.TryFindProgram(getProgramName(program), GeneratorPath) || S.TryFindProgram("dot|fdp|neato|twopi|circo", GeneratorPath))) { - std::string PSFilename = Filename + ".ps"; + std::string OutputFilename = + Filename + (Viewer == VK_CmdStart ? ".pdf" : ".ps"); std::vector<const char *> args; args.push_back(GeneratorPath.c_str()); - args.push_back("-Tps"); + if (Viewer == VK_CmdStart) + args.push_back("-Tpdf"); + else + args.push_back("-Tps"); args.push_back("-Nfontname=Courier"); args.push_back("-Gsize=7.5,10"); args.push_back(Filename.c_str()); args.push_back("-o"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); args.push_back(nullptr); errs() << "Running '" << GeneratorPath << "' program... "; - if (ExecGraphViewer(GeneratorPath, args, Filename, wait, ErrMsg)) + if (ExecGraphViewer(GeneratorPath, args, Filename, true, ErrMsg)) return true; + // The lifetime of StartArg must include the call of ExecGraphViewer + // because the args are passed as vector of char*. 
+ std::string StartArg; + args.clear(); args.push_back(ViewerPath.c_str()); - switch (PSViewer) { - case PSV_OSXOpen: + switch (Viewer) { + case VK_OSXOpen: args.push_back("-W"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); break; - case PSV_XDGOpen: + case VK_XDGOpen: wait = false; - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); break; - case PSV_Ghostview: + case VK_Ghostview: args.push_back("--spartan"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); + break; + case VK_CmdStart: + args.push_back("/S"); + args.push_back("/C"); + StartArg = + (StringRef("start ") + (wait ? "/WAIT " : "") + OutputFilename).str(); + args.push_back(StartArg.c_str()); break; - case PSV_None: + case VK_None: llvm_unreachable("Invalid viewer"); } args.push_back(nullptr); ErrMsg.clear(); - return ExecGraphViewer(ViewerPath, args, PSFilename, wait, ErrMsg); + return ExecGraphViewer(ViewerPath, args, OutputFilename, wait, ErrMsg); } // dotty diff --git a/contrib/llvm/lib/Support/Host.cpp b/contrib/llvm/lib/Support/Host.cpp index 1bd1fe2..c0f9e07 100644 --- a/contrib/llvm/lib/Support/Host.cpp +++ b/contrib/llvm/lib/Support/Host.cpp @@ -368,8 +368,14 @@ StringRef sys::getHostCPUName() { // Broadwell: case 61: + case 71: return "broadwell"; + // Skylake: + case 78: + case 94: + return "skylake"; + case 28: // Most 45 nm Intel Atom processors case 38: // 45 nm Atom Lincroft case 39: // 32 nm Atom Medfield @@ -381,6 +387,8 @@ StringRef sys::getHostCPUName() { case 55: case 74: case 77: + case 90: + case 93: return "silvermont"; default: // Unknown family 6 CPU, try to guess. @@ -689,7 +697,7 @@ StringRef sys::getHostCPUName() { if (Lines[I].startswith("features")) { size_t Pos = Lines[I].find(":"); if (Pos != StringRef::npos) { - Lines[I].drop_front(Pos + 1).split(CPUFeatures, " "); + Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' '); break; } } @@ -766,14 +774,17 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV // indicates that the AVX registers will be saved and restored on context // switch, then we have full AVX support. - bool HasAVX = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) && - !GetX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6); - Features["avx"] = HasAVX; - Features["fma"] = HasAVX && (ECX >> 12) & 1; - Features["f16c"] = HasAVX && (ECX >> 29) & 1; + bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) && + !GetX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6); + Features["avx"] = HasAVXSave; + Features["fma"] = HasAVXSave && (ECX >> 12) & 1; + Features["f16c"] = HasAVXSave && (ECX >> 29) & 1; + + // Only enable XSAVE if OS has enabled support for saving YMM state. + Features["xsave"] = HasAVXSave && (ECX >> 26) & 1; // AVX512 requires additional context to be saved by the OS. 
- bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0); + bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0); unsigned MaxExtLevel; GetX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); @@ -783,15 +794,15 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["lzcnt"] = HasExtLeaf1 && ((ECX >> 5) & 1); Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1); Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1); - Features["xop"] = HasAVX && HasExtLeaf1 && ((ECX >> 11) & 1); - Features["fma4"] = HasAVX && HasExtLeaf1 && ((ECX >> 16) & 1); + Features["xop"] = HasExtLeaf1 && ((ECX >> 11) & 1) && HasAVXSave; + Features["fma4"] = HasExtLeaf1 && ((ECX >> 16) & 1) && HasAVXSave; Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1); bool HasLeaf7 = MaxLevel >= 7 && !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); // AVX2 is only supported if we have the OS save support from AVX. - Features["avx2"] = HasAVX && HasLeaf7 && (EBX >> 5) & 1; + Features["avx2"] = HasAVXSave && HasLeaf7 && ((EBX >> 5) & 1); Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1); Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1); @@ -801,6 +812,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1); Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1); Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1); + // Enable protection keys + Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1); // AVX512 is only supported if the OS supports the context save for it. Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; @@ -811,6 +824,14 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save; Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save; + bool HasLeafD = MaxLevel >= 0xd && + !GetX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); + + // Only enable XSAVE if OS has enabled support for saving YMM state. + Features["xsaveopt"] = HasAVXSave && HasLeafD && ((EAX >> 0) & 1); + Features["xsavec"] = HasAVXSave && HasLeafD && ((EAX >> 1) & 1); + Features["xsaves"] = HasAVXSave && HasLeafD && ((EAX >> 3) & 1); + return true; } #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) @@ -832,7 +853,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { // Look for the CPU features. 
for (unsigned I = 0, E = Lines.size(); I != E; ++I) if (Lines[I].startswith("Features")) { - Lines[I].split(CPUFeatures, " "); + Lines[I].split(CPUFeatures, ' '); break; } diff --git a/contrib/llvm/lib/Support/IntEqClasses.cpp b/contrib/llvm/lib/Support/IntEqClasses.cpp index 1134495..ff21357 100644 --- a/contrib/llvm/lib/Support/IntEqClasses.cpp +++ b/contrib/llvm/lib/Support/IntEqClasses.cpp @@ -29,7 +29,7 @@ void IntEqClasses::grow(unsigned N) { EC.push_back(EC.size()); } -void IntEqClasses::join(unsigned a, unsigned b) { +unsigned IntEqClasses::join(unsigned a, unsigned b) { assert(NumClasses == 0 && "join() called after compress()."); unsigned eca = EC[a]; unsigned ecb = EC[b]; @@ -41,6 +41,8 @@ void IntEqClasses::join(unsigned a, unsigned b) { EC[b] = eca, b = ecb, ecb = EC[b]; else EC[a] = ecb, a = eca, eca = EC[a]; + + return eca; } unsigned IntEqClasses::findLeader(unsigned a) const { diff --git a/contrib/llvm/lib/Support/JamCRC.cpp b/contrib/llvm/lib/Support/JamCRC.cpp new file mode 100644 index 0000000..bc21c91 --- /dev/null +++ b/contrib/llvm/lib/Support/JamCRC.cpp @@ -0,0 +1,96 @@ +//===-- JamCRC.cpp - Cyclic Redundancy Check --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an implementation of JamCRC. +// +//===----------------------------------------------------------------------===// +// +// The implementation technique is the one mentioned in: +// D. V. Sarwate. 1988. Computation of cyclic redundancy checks via table +// look-up. Commun. ACM 31, 8 (August 1988) +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/JamCRC.h" + +using namespace llvm; + +static const uint32_t CRCTable[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +void JamCRC::update(ArrayRef<char> Data) { + for (char Byte : Data) { + int TableIdx = (CRC ^ Byte) & 0xff; + CRC = CRCTable[TableIdx] ^ (CRC >> 8); + } +} diff --git a/contrib/llvm/lib/Support/Locale.cpp b/contrib/llvm/lib/Support/Locale.cpp index d5cb72b..53bc0e3 100644 --- a/contrib/llvm/lib/Support/Locale.cpp +++ b/contrib/llvm/lib/Support/Locale.cpp @@ -1,3 +1,4 @@ +#include "llvm/Config/llvm-config.h" #include "llvm/Support/Locale.h" #include "llvm/Support/Unicode.h" diff --git a/contrib/llvm/lib/Support/ManagedStatic.cpp b/contrib/llvm/lib/Support/ManagedStatic.cpp index b8fb284..9868207 100644 --- a/contrib/llvm/lib/Support/ManagedStatic.cpp +++ b/contrib/llvm/lib/Support/ManagedStatic.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Config/config.h" #include "llvm/Support/Atomic.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/MutexGuard.h" #include <cassert> diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp index d09ef3a..faee10b 100644 --- a/contrib/llvm/lib/Support/MemoryBuffer.cpp +++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp @@ -162,13 +162,14 @@ MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { } ErrorOr<std::unique_ptr<MemoryBuffer>> -MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize) { +MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize, + bool RequiresNullTerminator) { SmallString<256> NameBuf; StringRef NameRef = Filename.toStringRef(NameBuf); if (NameRef == "-") return getSTDIN(); - return getFile(Filename, FileSize); + return getFile(Filename, FileSize, RequiresNullTerminator); } ErrorOr<std::unique_ptr<MemoryBuffer>> diff --git a/contrib/llvm/lib/Support/Path.cpp b/contrib/llvm/lib/Support/Path.cpp index cf46738..4952f59 100644 --- a/contrib/llvm/lib/Support/Path.cpp +++ b/contrib/llvm/lib/Support/Path.cpp @@ -455,17 
+455,15 @@ void append(SmallVectorImpl<char> &path, const Twine &a, if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage)); if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage)); - for (SmallVectorImpl<StringRef>::const_iterator i = components.begin(), - e = components.end(); - i != e; ++i) { + for (auto &component : components) { bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]); - bool component_has_sep = !i->empty() && is_separator((*i)[0]); - bool is_root_name = has_root_name(*i); + bool component_has_sep = !component.empty() && is_separator(component[0]); + bool is_root_name = has_root_name(component); if (path_has_sep) { // Strip separators from beginning of component. - size_t loc = i->find_first_not_of(separators); - StringRef c = i->substr(loc); + size_t loc = component.find_first_not_of(separators); + StringRef c = component.substr(loc); // Append it. path.append(c.begin(), c.end()); @@ -477,7 +475,7 @@ void append(SmallVectorImpl<char> &path, const Twine &a, path.push_back(preferred_separator); } - path.append(i->begin(), i->end()); + path.append(component.begin(), component.end()); } } @@ -661,8 +659,51 @@ bool is_absolute(const Twine &path) { return rootDir && rootName; } -bool is_relative(const Twine &path) { - return !is_absolute(path); +bool is_relative(const Twine &path) { return !is_absolute(path); } + +StringRef remove_leading_dotslash(StringRef Path) { + // Remove leading "./" (or ".//" or "././" etc.) + while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) { + Path = Path.substr(2); + while (Path.size() > 0 && is_separator(Path[0])) + Path = Path.substr(1); + } + return Path; +} + +static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) { + SmallVector<StringRef, 16> components; + + // Skip the root path, then look for traversal in the components. + StringRef rel = path::relative_path(path); + for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) { + if (C == ".") + continue; + if (remove_dot_dot) { + if (C == "..") { + if (!components.empty()) + components.pop_back(); + continue; + } + } + components.push_back(C); + } + + SmallString<256> buffer = path::root_path(path); + for (StringRef C : components) + path::append(buffer, C); + return buffer; +} + +bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) { + StringRef p(path.data(), path.size()); + + SmallString<256> result = remove_dots(p, remove_dot_dot); + if (result == path) + return false; + + path.swap(result); + return true; } } // end namespace path @@ -732,7 +773,9 @@ std::error_code createUniqueDirectory(const Twine &Prefix, true, 0, FS_Dir); } -std::error_code make_absolute(SmallVectorImpl<char> &path) { +static std::error_code make_absolute(const Twine ¤t_directory, + SmallVectorImpl<char> &path, + bool use_current_directory) { StringRef p(path.data(), path.size()); bool rootDirectory = path::has_root_directory(p), @@ -748,7 +791,9 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) { // All of the following conditions will need the current directory. SmallString<128> current_dir; - if (std::error_code ec = current_path(current_dir)) + if (use_current_directory) + current_directory.toVector(current_dir); + else if (std::error_code ec = current_path(current_dir)) return ec; // Relative path. Prepend the current directory. 
@@ -785,12 +830,22 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) { "occurred above!"); } -std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { +std::error_code make_absolute(const Twine ¤t_directory, + SmallVectorImpl<char> &path) { + return make_absolute(current_directory, path, true); +} + +std::error_code make_absolute(SmallVectorImpl<char> &path) { + return make_absolute(Twine(), path, false); +} + +std::error_code create_directories(const Twine &Path, bool IgnoreExisting, + perms Perms) { SmallString<128> PathStorage; StringRef P = Path.toStringRef(PathStorage); // Be optimistic and try to create the directory - std::error_code EC = create_directory(P, IgnoreExisting); + std::error_code EC = create_directory(P, IgnoreExisting, Perms); // If we succeeded, or had any error other than the parent not existing, just // return it. if (EC != errc::no_such_file_or_directory) @@ -802,10 +857,10 @@ std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { if (Parent.empty()) return EC; - if ((EC = create_directories(Parent))) + if ((EC = create_directories(Parent, IgnoreExisting, Perms))) return EC; - return create_directory(P, IgnoreExisting); + return create_directory(P, IgnoreExisting, Perms); } std::error_code copy_file(const Twine &From, const Twine &To) { @@ -889,8 +944,7 @@ std::error_code is_other(const Twine &Path, bool &Result) { } void directory_entry::replace_filename(const Twine &filename, file_status st) { - SmallString<128> path(Path.begin(), Path.end()); - path::remove_filename(path); + SmallString<128> path = path::parent_path(Path); path::append(path, filename); Path = path.str(); Status = st; @@ -940,7 +994,8 @@ file_magic identify_magic(StringRef Magic) { break; case '!': if (Magic.size() >= 8) - if (memcmp(Magic.data(),"!<arch>\n",8) == 0) + if (memcmp(Magic.data(), "!<arch>\n", 8) == 0 || + memcmp(Magic.data(), "!<thin>\n", 8) == 0) return file_magic::archive; break; @@ -1074,3 +1129,20 @@ std::error_code directory_entry::status(file_status &result) const { #if defined(LLVM_ON_WIN32) #include "Windows/Path.inc" #endif + +namespace llvm { +namespace sys { +namespace path { + +bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1, + const Twine &Path2, const Twine &Path3) { + if (getUserCacheDir(Result)) { + append(Result, Path1, Path2, Path3); + return true; + } + return false; +} + +} // end namespace path +} // end namsspace sys +} // end namespace llvm diff --git a/contrib/llvm/lib/Support/PrettyStackTrace.cpp b/contrib/llvm/lib/Support/PrettyStackTrace.cpp index f9f8cab..05b3e31 100644 --- a/contrib/llvm/lib/Support/PrettyStackTrace.cpp +++ b/contrib/llvm/lib/Support/PrettyStackTrace.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/PrettyStackTrace.h" -#include "llvm-c/Core.h" +#include "llvm-c/ErrorHandling.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" // Get autoconf configuration settings #include "llvm/Support/Compiler.h" @@ -154,6 +154,20 @@ void llvm::EnablePrettyStackTrace() { #endif } +const void* llvm::SavePrettyStackState() { +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) + return PrettyStackTraceHead; +#else + return nullptr; +#endif +} + +void llvm::RestorePrettyStackState(const void* Top) { +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) + PrettyStackTraceHead = (const PrettyStackTraceEntry*)Top; +#endif +} + void LLVMEnablePrettyStackTrace() { 
EnablePrettyStackTrace(); } diff --git a/contrib/llvm/lib/Support/Signals.cpp b/contrib/llvm/lib/Support/Signals.cpp index a117893..3dc6b7c 100644 --- a/contrib/llvm/lib/Support/Signals.cpp +++ b/contrib/llvm/lib/Support/Signals.cpp @@ -12,8 +12,21 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/Signals.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/raw_ostream.h" +#include <vector> namespace llvm { using namespace sys; @@ -23,6 +36,131 @@ using namespace sys; //=== independent code. //===----------------------------------------------------------------------===// +static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>> + CallBacksToRun; +void sys::RunSignalHandlers() { + if (!CallBacksToRun.isConstructed()) + return; + for (auto &I : *CallBacksToRun) + I.first(I.second); + CallBacksToRun->clear(); +} +} + +using namespace llvm; + +static bool findModulesAndOffsets(void **StackTrace, int Depth, + const char **Modules, intptr_t *Offsets, + const char *MainExecutableName, + StringSaver &StrPool); + +/// Format a pointer value as hexadecimal. Zero pad it out so its always the +/// same width. +static FormattedNumber format_ptr(void *PC) { + // Each byte is two hex digits plus 2 for the 0x prefix. + unsigned PtrWidth = 2 + 2 * sizeof(void *); + return format_hex((uint64_t)PC, PtrWidth); +} + +static bool printSymbolizedStackTrace(void **StackTrace, int Depth, + llvm::raw_ostream &OS) + LLVM_ATTRIBUTE_USED; + +/// Helper that launches llvm-symbolizer and symbolizes a backtrace. +static bool printSymbolizedStackTrace(void **StackTrace, int Depth, + llvm::raw_ostream &OS) { + // FIXME: Subtract necessary number from StackTrace entries to turn return addresses + // into actual instruction addresses. + // Use llvm-symbolizer tool to symbolize the stack traces. + ErrorOr<std::string> LLVMSymbolizerPathOrErr = + sys::findProgramByName("llvm-symbolizer"); + if (!LLVMSymbolizerPathOrErr) + return false; + const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr; + // We don't know argv0 or the address of main() at this point, but try + // to guess it anyway (it's possible on some platforms). 
+ std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr); + if (MainExecutableName.empty() || + MainExecutableName.find("llvm-symbolizer") != std::string::npos) + return false; + + BumpPtrAllocator Allocator; + StringSaver StrPool(Allocator); + std::vector<const char *> Modules(Depth, nullptr); + std::vector<intptr_t> Offsets(Depth, 0); + if (!findModulesAndOffsets(StackTrace, Depth, Modules.data(), Offsets.data(), + MainExecutableName.c_str(), StrPool)) + return false; + int InputFD; + SmallString<32> InputFile, OutputFile; + sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile); + sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile); + FileRemover InputRemover(InputFile.c_str()); + FileRemover OutputRemover(OutputFile.c_str()); + + { + raw_fd_ostream Input(InputFD, true); + for (int i = 0; i < Depth; i++) { + if (Modules[i]) + Input << Modules[i] << " " << (void*)Offsets[i] << "\n"; + } + } + + StringRef InputFileStr(InputFile); + StringRef OutputFileStr(OutputFile); + StringRef StderrFileStr; + const StringRef *Redirects[] = {&InputFileStr, &OutputFileStr, + &StderrFileStr}; + const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining", +#ifdef LLVM_ON_WIN32 + // Pass --relative-address on Windows so that we don't + // have to add ImageBase from PE file. + // FIXME: Make this the default for llvm-symbolizer. + "--relative-address", +#endif + "--demangle", nullptr}; + int RunResult = + sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects); + if (RunResult != 0) + return false; + + // This report format is based on the sanitizer stack trace printer. See + // sanitizer_stacktrace_printer.cc in compiler-rt. + auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str()); + if (!OutputBuf) + return false; + StringRef Output = OutputBuf.get()->getBuffer(); + SmallVector<StringRef, 32> Lines; + Output.split(Lines, "\n"); + auto CurLine = Lines.begin(); + int frame_no = 0; + for (int i = 0; i < Depth; i++) { + if (!Modules[i]) { + OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << '\n'; + continue; + } + // Read pairs of lines (function name and file/line info) until we + // encounter empty line. + for (;;) { + if (CurLine == Lines.end()) + return false; + StringRef FunctionName = *CurLine++; + if (FunctionName.empty()) + break; + OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << ' '; + if (!FunctionName.startswith("??")) + OS << FunctionName << ' '; + if (CurLine == Lines.end()) + return false; + StringRef FileLineInfo = *CurLine++; + if (!FileLineInfo.startswith("??")) + OS << FileLineInfo; + else + OS << "(" << Modules[i] << '+' << format_hex(Offsets[i], 0) << ")"; + OS << "\n"; + } + } + return true; } // Include the platform-specific parts of this class. diff --git a/contrib/llvm/lib/Support/Statistic.cpp b/contrib/llvm/lib/Support/Statistic.cpp index 56c3b0f..e49d1cb 100644 --- a/contrib/llvm/lib/Support/Statistic.cpp +++ b/contrib/llvm/lib/Support/Statistic.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" @@ -33,9 +34,6 @@ #include <cstring> using namespace llvm; -// CreateInfoOutputFile - Return a file stream to print our output on. 
-namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } - /// -stats - Command line option to cause transformations to emit stats about /// what they did. /// @@ -144,20 +142,18 @@ void llvm::PrintStatistics() { if (Stats.Stats.empty()) return; // Get the stream to write to. - raw_ostream &OutStream = *CreateInfoOutputFile(); - PrintStatistics(OutStream); - delete &OutStream; // Close the file. + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); + PrintStatistics(*OutStream); + #else // Check if the -stats option is set instead of checking // !Stats.Stats.empty(). In release builds, Statistics operators // do nothing, so stats are never Registered. if (Enabled) { // Get the stream to write to. - raw_ostream &OutStream = *CreateInfoOutputFile(); - OutStream << "Statistics are disabled. " - << "Build with asserts or with -DLLVM_ENABLE_STATS\n"; - OutStream.flush(); - delete &OutStream; // Close the file. + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); + (*OutStream) << "Statistics are disabled. " + << "Build with asserts or with -DLLVM_ENABLE_STATS\n"; } #endif } diff --git a/contrib/llvm/lib/Support/StringRef.cpp b/contrib/llvm/lib/Support/StringRef.cpp index ddece08..7ecff29 100644 --- a/contrib/llvm/lib/Support/StringRef.cpp +++ b/contrib/llvm/lib/Support/StringRef.cpp @@ -140,37 +140,44 @@ std::string StringRef::upper() const { /// \return - The index of the first occurrence of \arg Str, or npos if not /// found. size_t StringRef::find(StringRef Str, size_t From) const { + if (From > Length) + return npos; + + const char *Needle = Str.data(); size_t N = Str.size(); - if (N > Length) + if (N == 0) + return From; + + size_t Size = Length - From; + if (Size < N) return npos; + const char *Start = Data + From; + const char *Stop = Start + (Size - N + 1); + // For short haystacks or unsupported needles fall back to the naive algorithm - if (Length < 16 || N > 255 || N == 0) { - for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i) - if (substr(i, N).equals(Str)) - return i; + if (Size < 16 || N > 255) { + do { + if (std::memcmp(Start, Needle, N) == 0) + return Start - Data; + ++Start; + } while (Start < Stop); return npos; } - if (From >= Length) - return npos; - // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. uint8_t BadCharSkip[256]; std::memset(BadCharSkip, N, 256); for (unsigned i = 0; i != N-1; ++i) BadCharSkip[(uint8_t)Str[i]] = N-1-i; - unsigned Len = Length-From, Pos = From; - while (Len >= N) { - if (substr(Pos, N).equals(Str)) // See if this is the correct substring. - return Pos; + do { + if (std::memcmp(Start, Needle, N) == 0) + return Start - Data; // Otherwise skip the appropriate number of bytes. - uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; - Len -= Skip; - Pos += Skip; - } + Start += BadCharSkip[(uint8_t)Start[N-1]]; + } while (Start < Stop); return npos; } @@ -274,24 +281,56 @@ StringRef::size_type StringRef::find_last_not_of(StringRef Chars, } void StringRef::split(SmallVectorImpl<StringRef> &A, - StringRef Separators, int MaxSplit, + StringRef Separator, int MaxSplit, bool KeepEmpty) const { - StringRef rest = *this; - - // rest.data() is used to distinguish cases like "a," that splits into - // "a" + "" and "a" that splits into "a" + 0. 
- for (int splits = 0; - rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); - ++splits) { - std::pair<StringRef, StringRef> p = rest.split(Separators); - - if (KeepEmpty || p.first.size() != 0) - A.push_back(p.first); - rest = p.second; + StringRef S = *this; + + // Count down from MaxSplit. When MaxSplit is -1, this will just split + // "forever". This doesn't support splitting more than 2^31 times + // intentionally; if we ever want that we can make MaxSplit a 64-bit integer + // but that seems unlikely to be useful. + while (MaxSplit-- != 0) { + size_t Idx = S.find(Separator); + if (Idx == npos) + break; + + // Push this split. + if (KeepEmpty || Idx > 0) + A.push_back(S.slice(0, Idx)); + + // Jump forward. + S = S.slice(Idx + Separator.size(), npos); + } + + // Push the tail. + if (KeepEmpty || !S.empty()) + A.push_back(S); +} + +void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, + int MaxSplit, bool KeepEmpty) const { + StringRef S = *this; + + // Count down from MaxSplit. When MaxSplit is -1, this will just split + // "forever". This doesn't support splitting more than 2^31 times + // intentionally; if we ever want that we can make MaxSplit a 64-bit integer + // but that seems unlikely to be useful. + while (MaxSplit-- != 0) { + size_t Idx = S.find(Separator); + if (Idx == npos) + break; + + // Push this split. + if (KeepEmpty || Idx > 0) + A.push_back(S.slice(0, Idx)); + + // Jump forward. + S = S.slice(Idx + 1, npos); } - // If we have a tail left, add it. - if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) - A.push_back(rest); + + // Push the tail. + if (KeepEmpty || !S.empty()) + A.push_back(S); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Support/StringSaver.cpp b/contrib/llvm/lib/Support/StringSaver.cpp index d6b84e5..bbc1fd2 100644 --- a/contrib/llvm/lib/Support/StringSaver.cpp +++ b/contrib/llvm/lib/Support/StringSaver.cpp @@ -11,7 +11,7 @@ using namespace llvm; -const char *StringSaver::saveImpl(StringRef S) { +const char *StringSaver::save(StringRef S) { char *P = Alloc.Allocate<char>(S.size() + 1); memcpy(P, S.data(), S.size()); P[S.size()] = '\0'; diff --git a/contrib/llvm/lib/Support/TargetParser.cpp b/contrib/llvm/lib/Support/TargetParser.cpp index 4d4c041..337532e 100644 --- a/contrib/llvm/lib/Support/TargetParser.cpp +++ b/contrib/llvm/lib/Support/TargetParser.cpp @@ -16,9 +16,11 @@ #include "llvm/Support/TargetParser.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include <cctype> using namespace llvm; +using namespace ARM; namespace { @@ -26,36 +28,19 @@ namespace { // features they correspond to (use getFPUFeatures). // FIXME: TableGen this. 
// The entries must appear in the order listed in ARM::FPUKind for correct indexing -struct { - const char * Name; +static const struct { + const char *NameCStr; + size_t NameLength; ARM::FPUKind ID; ARM::FPUVersion FPUVersion; ARM::NeonSupportLevel NeonSupport; ARM::FPURestriction Restriction; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } FPUNames[] = { - { "invalid", ARM::FK_INVALID, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, - { "none", ARM::FK_NONE, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, - { "vfp", ARM::FK_VFP, ARM::FV_VFPV2, ARM::NS_None, ARM::FR_None}, - { "vfpv2", ARM::FK_VFPV2, ARM::FV_VFPV2, ARM::NS_None, ARM::FR_None}, - { "vfpv3", ARM::FK_VFPV3, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_None}, - { "vfpv3-fp16", ARM::FK_VFPV3_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_None}, - { "vfpv3-d16", ARM::FK_VFPV3_D16, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_D16}, - { "vfpv3-d16-fp16", ARM::FK_VFPV3_D16_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_D16}, - { "vfpv3xd", ARM::FK_VFPV3XD, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_SP_D16}, - { "vfpv3xd-fp16", ARM::FK_VFPV3XD_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_SP_D16}, - { "vfpv4", ARM::FK_VFPV4, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_None}, - { "vfpv4-d16", ARM::FK_VFPV4_D16, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_D16}, - { "fpv4-sp-d16", ARM::FK_FPV4_SP_D16, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_SP_D16}, - { "fpv5-d16", ARM::FK_FPV5_D16, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_D16}, - { "fpv5-sp-d16", ARM::FK_FPV5_SP_D16, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_SP_D16}, - { "fp-armv8", ARM::FK_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_None}, - { "neon", ARM::FK_NEON, ARM::FV_VFPV3, ARM::NS_Neon, ARM::FR_None}, - { "neon-fp16", ARM::FK_NEON_FP16, ARM::FV_VFPV3_FP16, ARM::NS_Neon, ARM::FR_None}, - { "neon-vfpv4", ARM::FK_NEON_VFPV4, ARM::FV_VFPV4, ARM::NS_Neon, ARM::FR_None}, - { "neon-fp-armv8", ARM::FK_NEON_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_Neon, ARM::FR_None}, - { "crypto-neon-fp-armv8", - ARM::FK_CRYPTO_NEON_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_Crypto, ARM::FR_None}, - { "softvfp", ARM::FK_SOFTVFP, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, +#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \ + { NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION }, +#include "llvm/Support/ARMTargetParser.def" }; // List of canonical arch names (use getArchSynonym). @@ -66,165 +51,79 @@ struct { // of the triples and are not conforming with their official names. // Check to see if the expectation should be changed. // FIXME: TableGen this. -struct { - const char *Name; +static const struct { + const char *NameCStr; + size_t NameLength; + const char *CPUAttrCStr; + size_t CPUAttrLength; + const char *SubArchCStr; + size_t SubArchLength; + unsigned DefaultFPU; + unsigned ArchBaseExtensions; ARM::ArchKind ID; - const char *CPUAttr; // CPU class in build attributes. - const char *SubArch; // Sub-Arch name. ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes. + + StringRef getName() const { return StringRef(NameCStr, NameLength); } + + // CPU class in build attributes. + StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); } + + // Sub-Arch name. 
+ StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); } } ARCHNames[] = { - { "invalid", ARM::AK_INVALID, nullptr, nullptr, ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv2", ARM::AK_ARMV2, "2", "v2", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv2a", ARM::AK_ARMV2A, "2A", "v2a", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv3", ARM::AK_ARMV3, "3", "v3", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv3m", ARM::AK_ARMV3M, "3M", "v3m", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv4", ARM::AK_ARMV4, "4", "v4", ARMBuildAttrs::CPUArch::v4 }, - { "armv4t", ARM::AK_ARMV4T, "4T", "v4t", ARMBuildAttrs::CPUArch::v4T }, - { "armv5t", ARM::AK_ARMV5T, "5T", "v5", ARMBuildAttrs::CPUArch::v5T }, - { "armv5te", ARM::AK_ARMV5TE, "5TE", "v5e", ARMBuildAttrs::CPUArch::v5TE }, - { "armv5tej", ARM::AK_ARMV5TEJ, "5TEJ", "v5e", ARMBuildAttrs::CPUArch::v5TEJ }, - { "armv6", ARM::AK_ARMV6, "6", "v6", ARMBuildAttrs::CPUArch::v6 }, - { "armv6k", ARM::AK_ARMV6K, "6K", "v6k", ARMBuildAttrs::CPUArch::v6K }, - { "armv6t2", ARM::AK_ARMV6T2, "6T2", "v6t2", ARMBuildAttrs::CPUArch::v6T2 }, - { "armv6z", ARM::AK_ARMV6Z, "6Z", "v6z", ARMBuildAttrs::CPUArch::v6KZ }, - { "armv6zk", ARM::AK_ARMV6ZK, "6ZK", "v6zk", ARMBuildAttrs::CPUArch::v6KZ }, - { "armv6-m", ARM::AK_ARMV6M, "6-M", "v6m", ARMBuildAttrs::CPUArch::v6_M }, - { "armv6s-m", ARM::AK_ARMV6SM, "6S-M", "v6sm", ARMBuildAttrs::CPUArch::v6S_M }, - { "armv7-a", ARM::AK_ARMV7A, "7-A", "v7", ARMBuildAttrs::CPUArch::v7 }, - { "armv7-r", ARM::AK_ARMV7R, "7-R", "v7r", ARMBuildAttrs::CPUArch::v7 }, - { "armv7-m", ARM::AK_ARMV7M, "7-M", "v7m", ARMBuildAttrs::CPUArch::v7 }, - { "armv7e-m", ARM::AK_ARMV7EM, "7E-M", "v7em", ARMBuildAttrs::CPUArch::v7E_M }, - { "armv8-a", ARM::AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8 }, - { "armv8.1-a", ARM::AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8 }, - // Non-standard Arch names. - { "iwmmxt", ARM::AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE }, - { "iwmmxt2", ARM::AK_IWMMXT2, "iwmmxt2", "", ARMBuildAttrs::CPUArch::v5TE }, - { "xscale", ARM::AK_XSCALE, "xscale", "", ARMBuildAttrs::CPUArch::v5TE }, - { "armv5", ARM::AK_ARMV5, "5T", "v5", ARMBuildAttrs::CPUArch::v5T }, - { "armv5e", ARM::AK_ARMV5E, "5TE", "v5e", ARMBuildAttrs::CPUArch::v5TE }, - { "armv6j", ARM::AK_ARMV6J, "6J", "v6", ARMBuildAttrs::CPUArch::v6 }, - { "armv6hl", ARM::AK_ARMV6HL, "6-M", "v6hl", ARMBuildAttrs::CPUArch::v6_M }, - { "armv7", ARM::AK_ARMV7, "7", "v7", ARMBuildAttrs::CPUArch::v7 }, - { "armv7l", ARM::AK_ARMV7L, "7-L", "v7l", ARMBuildAttrs::CPUArch::v7 }, - { "armv7hl", ARM::AK_ARMV7HL, "7-L", "v7hl", ARMBuildAttrs::CPUArch::v7 }, - { "armv7s", ARM::AK_ARMV7S, "7-S", "v7s", ARMBuildAttrs::CPUArch::v7 } +#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \ + {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \ + sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ID, ARCH_ATTR}, +#include "llvm/Support/ARMTargetParser.def" }; + // List of Arch Extension names. // FIXME: TableGen this. 
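// A miniature of the .def-file ("X macro") pattern now used for all of
// these tables (illustrative only; FOO_ENTRY and Foo.def are hypothetical
// stand-ins for the real ARM_FPU/ARM_ARCH/... macros in
// llvm/Support/ARMTargetParser.def). Storing the length next to the
// literal lets getName() build a StringRef without a strlen() call,
// because sizeof(NAME) - 1 is a compile-time constant for a literal:
//
//   // Foo.def would contain:
//   //   FOO_ENTRY("alpha", 0)
//   //   FOO_ENTRY("beta", 1)
//   //   #undef FOO_ENTRY
//
//   static const struct {
//     const char *NameCStr;
//     size_t NameLength;
//     unsigned ID;
//     StringRef getName() const { return StringRef(NameCStr, NameLength); }
//   } FooNames[] = {
//   #define FOO_ENTRY(NAME, ID) { NAME, sizeof(NAME) - 1, ID },
//   #include "Foo.def"
//   };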
-struct { - const char *Name; - ARM::ArchExtKind ID; +static const struct { + const char *NameCStr; + size_t NameLength; + unsigned ID; + const char *Feature; + const char *NegFeature; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } ARCHExtNames[] = { - { "invalid", ARM::AEK_INVALID }, - { "crc", ARM::AEK_CRC }, - { "crypto", ARM::AEK_CRYPTO }, - { "fp", ARM::AEK_FP }, - { "idiv", ARM::AEK_HWDIV }, - { "mp", ARM::AEK_MP }, - { "simd", ARM::AEK_SIMD }, - { "sec", ARM::AEK_SEC }, - { "virt", ARM::AEK_VIRT }, - { "os", ARM::AEK_OS }, - { "iwmmxt", ARM::AEK_IWMMXT }, - { "iwmmxt2", ARM::AEK_IWMMXT2 }, - { "maverick", ARM::AEK_MAVERICK }, - { "xscale", ARM::AEK_XSCALE } +#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ + { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE }, +#include "llvm/Support/ARMTargetParser.def" }; + +// List of HWDiv names (use getHWDivSynonym) and which architectural +// features they correspond to (use getHWDivFeatures). +// FIXME: TableGen this. +static const struct { + const char *NameCStr; + size_t NameLength; + unsigned ID; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +} HWDivNames[] = { +#define ARM_HW_DIV_NAME(NAME, ID) { NAME, sizeof(NAME) - 1, ID }, +#include "llvm/Support/ARMTargetParser.def" +}; + // List of CPU names and their arches. // The same CPU can have multiple arches and can be default on multiple arches. // When finding the Arch for a CPU, first-found prevails. Sort them accordingly. // When this becomes table-generated, we'd probably need two tables. // FIXME: TableGen this. -struct { - const char *Name; +static const struct { + const char *NameCStr; + size_t NameLength; ARM::ArchKind ArchID; - bool Default; + bool Default; // is $Name the default CPU for $ArchID ? 
+ unsigned DefaultExtensions; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } CPUNames[] = { - { "arm2", ARM::AK_ARMV2, true }, - { "arm3", ARM::AK_ARMV2A, true }, - { "arm6", ARM::AK_ARMV3, true }, - { "arm7m", ARM::AK_ARMV3M, true }, - { "arm8", ARM::AK_ARMV4, false }, - { "arm810", ARM::AK_ARMV4, false }, - { "strongarm", ARM::AK_ARMV4, true }, - { "strongarm110", ARM::AK_ARMV4, false }, - { "strongarm1100", ARM::AK_ARMV4, false }, - { "strongarm1110", ARM::AK_ARMV4, false }, - { "arm7tdmi", ARM::AK_ARMV4T, true }, - { "arm7tdmi-s", ARM::AK_ARMV4T, false }, - { "arm710t", ARM::AK_ARMV4T, false }, - { "arm720t", ARM::AK_ARMV4T, false }, - { "arm9", ARM::AK_ARMV4T, false }, - { "arm9tdmi", ARM::AK_ARMV4T, false }, - { "arm920", ARM::AK_ARMV4T, false }, - { "arm920t", ARM::AK_ARMV4T, false }, - { "arm922t", ARM::AK_ARMV4T, false }, - { "arm9312", ARM::AK_ARMV4T, false }, - { "arm940t", ARM::AK_ARMV4T, false }, - { "ep9312", ARM::AK_ARMV4T, false }, - { "arm10tdmi", ARM::AK_ARMV5T, true }, - { "arm1020t", ARM::AK_ARMV5T, false }, - { "arm9e", ARM::AK_ARMV5TE, false }, - { "arm946e-s", ARM::AK_ARMV5TE, false }, - { "arm966e-s", ARM::AK_ARMV5TE, false }, - { "arm968e-s", ARM::AK_ARMV5TE, false }, - { "arm10e", ARM::AK_ARMV5TE, false }, - { "arm1020e", ARM::AK_ARMV5TE, false }, - { "arm1022e", ARM::AK_ARMV5TE, true }, - { "iwmmxt", ARM::AK_ARMV5TE, false }, - { "xscale", ARM::AK_ARMV5TE, false }, - { "arm926ej-s", ARM::AK_ARMV5TEJ, true }, - { "arm1136jf-s", ARM::AK_ARMV6, true }, - { "arm1176j-s", ARM::AK_ARMV6K, false }, - { "arm1176jz-s", ARM::AK_ARMV6K, false }, - { "mpcore", ARM::AK_ARMV6K, false }, - { "mpcorenovfp", ARM::AK_ARMV6K, false }, - { "arm1176jzf-s", ARM::AK_ARMV6K, true }, - { "arm1176jzf-s", ARM::AK_ARMV6Z, true }, - { "arm1176jzf-s", ARM::AK_ARMV6ZK, true }, - { "arm1156t2-s", ARM::AK_ARMV6T2, true }, - { "arm1156t2f-s", ARM::AK_ARMV6T2, false }, - { "cortex-m0", ARM::AK_ARMV6M, true }, - { "cortex-m0plus", ARM::AK_ARMV6M, false }, - { "cortex-m1", ARM::AK_ARMV6M, false }, - { "sc000", ARM::AK_ARMV6M, false }, - { "cortex-a5", ARM::AK_ARMV7A, false }, - { "cortex-a7", ARM::AK_ARMV7A, false }, - { "cortex-a8", ARM::AK_ARMV7A, true }, - { "cortex-a9", ARM::AK_ARMV7A, false }, - { "cortex-a12", ARM::AK_ARMV7A, false }, - { "cortex-a15", ARM::AK_ARMV7A, false }, - { "cortex-a17", ARM::AK_ARMV7A, false }, - { "krait", ARM::AK_ARMV7A, false }, - { "cortex-r4", ARM::AK_ARMV7R, true }, - { "cortex-r4f", ARM::AK_ARMV7R, false }, - { "cortex-r5", ARM::AK_ARMV7R, false }, - { "cortex-r7", ARM::AK_ARMV7R, false }, - { "sc300", ARM::AK_ARMV7M, false }, - { "cortex-m3", ARM::AK_ARMV7M, true }, - { "cortex-m4", ARM::AK_ARMV7EM, true }, - { "cortex-m7", ARM::AK_ARMV7EM, false }, - { "cortex-a53", ARM::AK_ARMV8A, true }, - { "cortex-a57", ARM::AK_ARMV8A, false }, - { "cortex-a72", ARM::AK_ARMV8A, false }, - { "cyclone", ARM::AK_ARMV8A, false }, - { "generic", ARM::AK_ARMV8_1A, true }, - // Non-standard Arch names. 
- { "iwmmxt", ARM::AK_IWMMXT, true }, - { "xscale", ARM::AK_XSCALE, true }, - { "arm10tdmi", ARM::AK_ARMV5, true }, - { "arm1022e", ARM::AK_ARMV5E, true }, - { "arm1136j-s", ARM::AK_ARMV6J, true }, - { "arm1136jz-s", ARM::AK_ARMV6J, false }, - { "cortex-m0", ARM::AK_ARMV6SM, true }, - { "arm1176jzf-s", ARM::AK_ARMV6HL, true }, - { "cortex-a8", ARM::AK_ARMV7, true }, - { "cortex-a8", ARM::AK_ARMV7L, true }, - { "cortex-a8", ARM::AK_ARMV7HL, true }, - { "cortex-m4", ARM::AK_ARMV7EM, true }, - { "swift", ARM::AK_ARMV7S, true }, - // Invalid CPU - { "invalid", ARM::AK_INVALID, true } +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + { NAME, sizeof(NAME) - 1, ID, IS_DEFAULT, DEFAULT_EXT }, +#include "llvm/Support/ARMTargetParser.def" }; } // namespace @@ -233,33 +132,93 @@ struct { // Information by ID // ======================================================= // -const char *ARMTargetParser::getFPUName(unsigned FPUKind) { +StringRef llvm::ARM::getFPUName(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) - return nullptr; - return FPUNames[FPUKind].Name; + return StringRef(); + return FPUNames[FPUKind].getName(); } -unsigned ARMTargetParser::getFPUVersion(unsigned FPUKind) { +unsigned llvm::ARM::getFPUVersion(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].FPUVersion; } -unsigned ARMTargetParser::getFPUNeonSupportLevel(unsigned FPUKind) { +unsigned llvm::ARM::getFPUNeonSupportLevel(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].NeonSupport; } -unsigned ARMTargetParser::getFPURestriction(unsigned FPUKind) { +unsigned llvm::ARM::getFPURestriction(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].Restriction; } -bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, +unsigned llvm::ARM::getDefaultFPU(StringRef CPU, unsigned ArchKind) { + if (CPU == "generic") + return ARCHNames[ArchKind].DefaultFPU; + + return StringSwitch<unsigned>(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, DEFAULT_FPU) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::FK_INVALID); +} + +unsigned llvm::ARM::getDefaultExtensions(StringRef CPU, unsigned ArchKind) { + if (CPU == "generic") + return ARCHNames[ArchKind].ArchBaseExtensions; + + return StringSwitch<unsigned>(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, ARCHNames[ID].ArchBaseExtensions | DEFAULT_EXT) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::AEK_INVALID); +} + +bool llvm::ARM::getHWDivFeatures(unsigned HWDivKind, + std::vector<const char *> &Features) { + + if (HWDivKind == ARM::AEK_INVALID) + return false; + + if (HWDivKind & ARM::AEK_HWDIVARM) + Features.push_back("+hwdiv-arm"); + else + Features.push_back("-hwdiv-arm"); + + if (HWDivKind & ARM::AEK_HWDIV) + Features.push_back("+hwdiv"); + else + Features.push_back("-hwdiv"); + + return true; +} + +bool llvm::ARM::getExtensionFeatures(unsigned Extensions, std::vector<const char *> &Features) { + if (Extensions == ARM::AEK_INVALID) + return false; + + if (Extensions & ARM::AEK_CRC) + Features.push_back("+crc"); + else + Features.push_back("-crc"); + + if (Extensions & ARM::AEK_DSP) + Features.push_back("+dsp"); + else + Features.push_back("-dsp"); + + return getHWDivFeatures(Extensions, Features); +} + +bool llvm::ARM::getFPUFeatures(unsigned FPUKind, + std::vector<const char *> &Features) { + if (FPUKind >= ARM::FK_LAST || FPUKind == ARM::FK_INVALID) return false; @@ 
-323,6 +282,7 @@ bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, // crypto includes neon, so we handle this similarly to FPU version. switch (FPUNames[FPUKind].NeonSupport) { case ARM::NS_Crypto: + Features.push_back("+neon"); Features.push_back("+crypto"); break; case ARM::NS_Neon: @@ -338,88 +298,127 @@ bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, return true; } -const char *ARMTargetParser::getArchName(unsigned ArchKind) { +StringRef llvm::ARM::getArchName(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].Name; + return StringRef(); + return ARCHNames[ArchKind].getName(); } -const char *ARMTargetParser::getCPUAttr(unsigned ArchKind) { +StringRef llvm::ARM::getCPUAttr(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].CPUAttr; + return StringRef(); + return ARCHNames[ArchKind].getCPUAttr(); } -const char *ARMTargetParser::getSubArch(unsigned ArchKind) { +StringRef llvm::ARM::getSubArch(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].SubArch; + return StringRef(); + return ARCHNames[ArchKind].getSubArch(); } -unsigned ARMTargetParser::getArchAttr(unsigned ArchKind) { +unsigned llvm::ARM::getArchAttr(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) return ARMBuildAttrs::CPUArch::Pre_v4; return ARCHNames[ArchKind].ArchAttr; } -const char *ARMTargetParser::getArchExtName(unsigned ArchExtKind) { - if (ArchExtKind >= ARM::AEK_LAST) - return nullptr; - return ARCHExtNames[ArchExtKind].Name; +StringRef llvm::ARM::getArchExtName(unsigned ArchExtKind) { + for (const auto AE : ARCHExtNames) { + if (ArchExtKind == AE.ID) + return AE.getName(); + } + return StringRef(); } -const char *ARMTargetParser::getDefaultCPU(StringRef Arch) { +const char *llvm::ARM::getArchExtFeature(StringRef ArchExt) { + if (ArchExt.startswith("no")) { + StringRef ArchExtBase(ArchExt.substr(2)); + for (const auto AE : ARCHExtNames) { + if (AE.NegFeature && ArchExtBase == AE.getName()) + return AE.NegFeature; + } + } + for (const auto AE : ARCHExtNames) { + if (AE.Feature && ArchExt == AE.getName()) + return AE.Feature; + } + + return nullptr; +} + +StringRef llvm::ARM::getHWDivName(unsigned HWDivKind) { + for (const auto D : HWDivNames) { + if (HWDivKind == D.ID) + return D.getName(); + } + return StringRef(); +} + +StringRef llvm::ARM::getDefaultCPU(StringRef Arch) { unsigned AK = parseArch(Arch); if (AK == ARM::AK_INVALID) - return nullptr; + return StringRef(); // Look for multiple AKs to find the default for pair AK+Name. 
for (const auto CPU : CPUNames) { if (CPU.ArchID == AK && CPU.Default) - return CPU.Name; + return CPU.getName(); } - return nullptr; + + // If we can't find a default then target the architecture instead + return "generic"; } // ======================================================= // // Parsers // ======================================================= // -StringRef ARMTargetParser::getFPUSynonym(StringRef FPU) { +static StringRef getHWDivSynonym(StringRef HWDiv) { + return StringSwitch<StringRef>(HWDiv) + .Case("thumb,arm", "arm,thumb") + .Default(HWDiv); +} + +static StringRef getFPUSynonym(StringRef FPU) { return StringSwitch<StringRef>(FPU) - .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported - .Case("vfp2", "vfpv2") - .Case("vfp3", "vfpv3") - .Case("vfp4", "vfpv4") - .Case("vfp3-d16", "vfpv3-d16") - .Case("vfp4-d16", "vfpv4-d16") - .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") - .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") - .Case("fp5-sp-d16", "fpv5-sp-d16") - .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") - // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. - .Case("neon-vfpv3", "neon") - .Default(FPU); + .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported + .Case("vfp2", "vfpv2") + .Case("vfp3", "vfpv3") + .Case("vfp4", "vfpv4") + .Case("vfp3-d16", "vfpv3-d16") + .Case("vfp4-d16", "vfpv4-d16") + .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") + .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") + .Case("fp5-sp-d16", "fpv5-sp-d16") + .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") + // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. + .Case("neon-vfpv3", "neon") + .Default(FPU); } -StringRef ARMTargetParser::getArchSynonym(StringRef Arch) { +static StringRef getArchSynonym(StringRef Arch) { return StringSwitch<StringRef>(Arch) - .Case("v6sm", "v6s-m") - .Case("v6m", "v6-m") - .Case("v7a", "v7-a") - .Case("v7r", "v7-r") - .Case("v7m", "v7-m") - .Case("v7em", "v7e-m") - .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") - .Case("v8.1a", "v8.1-a") - .Default(Arch); + .Case("v5", "v5t") + .Case("v5e", "v5te") + .Case("v6j", "v6") + .Case("v6hl", "v6k") + .Cases("v6m", "v6sm", "v6s-m", "v6-m") + .Cases("v6z", "v6zk", "v6kz") + .Cases("v7", "v7a", "v7hl", "v7l", "v7-a") + .Case("v7r", "v7-r") + .Case("v7m", "v7-m") + .Case("v7em", "v7e-m") + .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") + .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") + .Default(Arch); } // MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but // (iwmmxt|xscale)(eb)? is also permitted. If the former, return // "v.+", if the latter, return unmodified string, minus 'eb'. // If invalid, return empty string. -StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { +StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) { size_t offset = StringRef::npos; StringRef A = Arch; StringRef Error = ""; @@ -436,7 +435,7 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { // AArch64 uses "_be", not "eb" suffix. if (A.find("eb") != StringRef::npos) return Error; - if (A.substr(offset,3) == "_be") + if (A.substr(offset, 3) == "_be") offset += 3; } @@ -456,7 +455,7 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { // Only match non-marketing names if (offset != StringRef::npos) { - // Must start with 'vN'. + // Must start with 'vN'. if (A[0] != 'v' || !std::isdigit(A[1])) return Error; // Can't have an extra 'eb'. 
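// Behavior sketch for the new lookup helpers above (illustrative, not from
// the commit; assumes the .def table maps "crc" to "+crc"/"-crc"):
//
//   ARM::getArchExtFeature("crc");   // "+crc"
//   ARM::getArchExtFeature("nocrc"); // "-crc"  (a "no" prefix negates)
//   ARM::getArchExtFeature("bogus"); // nullptr (unknown extension)
//   ARM::getDefaultCPU("armv8.1-a"); // the CPU marked default for the
//                                    // arch, otherwise "generic"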
@@ -468,56 +467,64 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { return A; } -unsigned ARMTargetParser::parseFPU(StringRef FPU) { +unsigned llvm::ARM::parseHWDiv(StringRef HWDiv) { + StringRef Syn = getHWDivSynonym(HWDiv); + for (const auto D : HWDivNames) { + if (Syn == D.getName()) + return D.ID; + } + return ARM::AEK_INVALID; +} + +unsigned llvm::ARM::parseFPU(StringRef FPU) { StringRef Syn = getFPUSynonym(FPU); for (const auto F : FPUNames) { - if (Syn == F.Name) + if (Syn == F.getName()) return F.ID; } return ARM::FK_INVALID; } // Allows partial match, ex. "v7a" matches "armv7a". -unsigned ARMTargetParser::parseArch(StringRef Arch) { +unsigned llvm::ARM::parseArch(StringRef Arch) { Arch = getCanonicalArchName(Arch); StringRef Syn = getArchSynonym(Arch); for (const auto A : ARCHNames) { - if (StringRef(A.Name).endswith(Syn)) + if (A.getName().endswith(Syn)) return A.ID; } return ARM::AK_INVALID; } -unsigned ARMTargetParser::parseArchExt(StringRef ArchExt) { +unsigned llvm::ARM::parseArchExt(StringRef ArchExt) { for (const auto A : ARCHExtNames) { - if (ArchExt == A.Name) + if (ArchExt == A.getName()) return A.ID; } return ARM::AEK_INVALID; } -unsigned ARMTargetParser::parseCPUArch(StringRef CPU) { +unsigned llvm::ARM::parseCPUArch(StringRef CPU) { for (const auto C : CPUNames) { - if (CPU == C.Name) + if (CPU == C.getName()) return C.ArchID; } return ARM::AK_INVALID; } // ARM, Thumb, AArch64 -unsigned ARMTargetParser::parseArchISA(StringRef Arch) { +unsigned llvm::ARM::parseArchISA(StringRef Arch) { return StringSwitch<unsigned>(Arch) .StartsWith("aarch64", ARM::IK_AARCH64) - .StartsWith("arm64", ARM::IK_AARCH64) - .StartsWith("thumb", ARM::IK_THUMB) - .StartsWith("arm", ARM::IK_ARM) + .StartsWith("arm64", ARM::IK_AARCH64) + .StartsWith("thumb", ARM::IK_THUMB) + .StartsWith("arm", ARM::IK_ARM) .Default(ARM::EK_INVALID); } // Little/Big endian -unsigned ARMTargetParser::parseArchEndian(StringRef Arch) { - if (Arch.startswith("armeb") || - Arch.startswith("thumbeb") || +unsigned llvm::ARM::parseArchEndian(StringRef Arch) { + if (Arch.startswith("armeb") || Arch.startswith("thumbeb") || Arch.startswith("aarch64_be")) return ARM::EK_BIG; @@ -535,29 +542,29 @@ unsigned ARMTargetParser::parseArchEndian(StringRef Arch) { } // Profile A/R/M -unsigned ARMTargetParser::parseArchProfile(StringRef Arch) { +unsigned llvm::ARM::parseArchProfile(StringRef Arch) { Arch = getCanonicalArchName(Arch); - switch(parseArch(Arch)) { + switch (parseArch(Arch)) { case ARM::AK_ARMV6M: case ARM::AK_ARMV7M: - case ARM::AK_ARMV6SM: case ARM::AK_ARMV7EM: return ARM::PK_M; case ARM::AK_ARMV7R: return ARM::PK_R; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: + case ARM::AK_ARMV7K: case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return ARM::PK_A; } return ARM::PK_INVALID; } // Version number (ex. v7 = 7). 
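// How the renamed llvm::ARM::parse* family composes (an illustrative
// sketch, not part of the diff): inputs are canonicalized first, then
// matched against the tables via synonyms:
//
//   ARM::getCanonicalArchName("armv7a");  // "v7a"
//   ARM::parseArch("armv7a");             // ARM::AK_ARMV7A (synonym "v7-a")
//   ARM::parseArchEndian("armeb");        // ARM::EK_BIG
//   ARM::parseArchProfile("armv7a");      // ARM::PK_A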
-unsigned ARMTargetParser::parseArchVersion(StringRef Arch) { +unsigned llvm::ARM::parseArchVersion(StringRef Arch) { Arch = getCanonicalArchName(Arch); - switch(parseArch(Arch)) { + switch (parseArch(Arch)) { case ARM::AK_ARMV2: case ARM::AK_ARMV2A: return 2; @@ -567,36 +574,29 @@ unsigned ARMTargetParser::parseArchVersion(StringRef Arch) { case ARM::AK_ARMV4: case ARM::AK_ARMV4T: return 4; - case ARM::AK_ARMV5: case ARM::AK_ARMV5T: case ARM::AK_ARMV5TE: case ARM::AK_IWMMXT: case ARM::AK_IWMMXT2: case ARM::AK_XSCALE: - case ARM::AK_ARMV5E: case ARM::AK_ARMV5TEJ: return 5; case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: case ARM::AK_ARMV6K: case ARM::AK_ARMV6T2: - case ARM::AK_ARMV6Z: - case ARM::AK_ARMV6ZK: + case ARM::AK_ARMV6KZ: case ARM::AK_ARMV6M: - case ARM::AK_ARMV6SM: - case ARM::AK_ARMV6HL: return 6; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: case ARM::AK_ARMV7R: case ARM::AK_ARMV7M: - case ARM::AK_ARMV7L: - case ARM::AK_ARMV7HL: case ARM::AK_ARMV7S: case ARM::AK_ARMV7EM: + case ARM::AK_ARMV7K: return 7; case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return 8; } return 0; diff --git a/contrib/llvm/lib/Support/ThreadPool.cpp b/contrib/llvm/lib/Support/ThreadPool.cpp new file mode 100644 index 0000000..d4dcb2e --- /dev/null +++ b/contrib/llvm/lib/Support/ThreadPool.cpp @@ -0,0 +1,155 @@ +//==-- llvm/Support/ThreadPool.cpp - A ThreadPool implementation -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a crude C++11 based thread pool. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ThreadPool.h" + +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#if LLVM_ENABLE_THREADS + +// Default to std::thread::hardware_concurrency +ThreadPool::ThreadPool() : ThreadPool(std::thread::hardware_concurrency()) {} + +ThreadPool::ThreadPool(unsigned ThreadCount) + : ActiveThreads(0), EnableFlag(true) { + // Create ThreadCount threads that will loop forever, wait on QueueCondition + // for tasks to be queued or the Pool to be destroyed. + Threads.reserve(ThreadCount); + for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { + Threads.emplace_back([&] { + while (true) { + PackagedTaskTy Task; + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + // Wait for tasks to be pushed in the queue + QueueCondition.wait(LockGuard, + [&] { return !EnableFlag || !Tasks.empty(); }); + // Exit condition + if (!EnableFlag && Tasks.empty()) + return; + // Yeah, we have a task, grab it and release the lock on the queue + + // We first need to signal that we are active before popping the queue + // in order for wait() to properly detect that even if the queue is + // empty, there is still a task in flight. 
+ { + ++ActiveThreads; + std::unique_lock<std::mutex> LockGuard(CompletionLock); + } + Task = std::move(Tasks.front()); + Tasks.pop(); + } + // Run the task we just grabbed +#ifndef _MSC_VER + Task(); +#else + Task(/* unused */ false); +#endif + + { + // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait() + std::unique_lock<std::mutex> LockGuard(CompletionLock); + --ActiveThreads; + } + + // Notify task completion, in case someone waits on ThreadPool::wait() + CompletionCondition.notify_all(); + } + }); + } +} + +void ThreadPool::wait() { + // Wait for all threads to complete and the queue to be empty + std::unique_lock<std::mutex> LockGuard(CompletionLock); + CompletionCondition.wait(LockGuard, + [&] { return Tasks.empty() && !ActiveThreads; }); +} + +std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) { + /// Wrap the Task in a packaged_task to return a future object. + PackagedTaskTy PackagedTask(std::move(Task)); + auto Future = PackagedTask.get_future(); + { + // Lock the queue and push the new task + std::unique_lock<std::mutex> LockGuard(QueueLock); + + // Don't allow enqueueing after disabling the pool + assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); + + Tasks.push(std::move(PackagedTask)); + } + QueueCondition.notify_one(); + return Future.share(); +} + +// The destructor joins all threads, waiting for completion. +ThreadPool::~ThreadPool() { + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + EnableFlag = false; + } + QueueCondition.notify_all(); + for (auto &Worker : Threads) + Worker.join(); +} + +#else // LLVM_ENABLE_THREADS Disabled + +ThreadPool::ThreadPool() : ThreadPool(0) {} + +// No threads are launched, issue a warning if ThreadCount is not 0 +ThreadPool::ThreadPool(unsigned ThreadCount) + : ActiveThreads(0) { + if (ThreadCount) { + errs() << "Warning: request a ThreadPool with " << ThreadCount + << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; + } +} + +void ThreadPool::wait() { + // Sequential implementation running the tasks + while (!Tasks.empty()) { + auto Task = std::move(Tasks.front()); + Tasks.pop(); +#ifndef _MSC_VER + Task(); +#else + Task(/* unused */ false); +#endif + } +} + +std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) { +#ifndef _MSC_VER + // Get a Future with launch::deferred execution using std::async + auto Future = std::async(std::launch::deferred, std::move(Task)).share(); + // Wrap the future so that both ThreadPool::wait() can operate and the + // returned future can be sync'ed on. 
+ PackagedTaskTy PackagedTask([Future]() { Future.get(); }); +#else + auto Future = std::async(std::launch::deferred, std::move(Task), false).share(); + PackagedTaskTy PackagedTask([Future](bool) -> bool { Future.get(); return false; }); +#endif + Tasks.push(std::move(PackagedTask)); + return Future; +} + +ThreadPool::~ThreadPool() { + wait(); +} + +#endif diff --git a/contrib/llvm/lib/Support/TimeValue.cpp b/contrib/llvm/lib/Support/TimeValue.cpp index 136b93e..94a4c01 100644 --- a/contrib/llvm/lib/Support/TimeValue.cpp +++ b/contrib/llvm/lib/Support/TimeValue.cpp @@ -15,6 +15,7 @@ #include "llvm/Config/config.h" namespace llvm { + using namespace sys; const TimeValue::SecondsType @@ -22,8 +23,7 @@ const TimeValue::SecondsType const TimeValue::SecondsType TimeValue::Win32ZeroTimeSeconds = -12591158400ULL; -void -TimeValue::normalize( void ) { +void TimeValue::normalize() { if ( nanos_ >= NANOSECONDS_PER_SECOND ) { do { seconds_++; @@ -45,7 +45,7 @@ TimeValue::normalize( void ) { } } -} +} // namespace llvm /// Include the platform-specific portion of TimeValue class #ifdef LLVM_ON_UNIX diff --git a/contrib/llvm/lib/Support/Timer.cpp b/contrib/llvm/lib/Support/Timer.cpp index d7b6515..414f559 100644 --- a/contrib/llvm/lib/Support/Timer.cpp +++ b/contrib/llvm/lib/Support/Timer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Timer.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -22,9 +23,6 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// CreateInfoOutputFile - Return a file stream to print our output on. -namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } - // getLibSupportInfoOutputFilename - This ugly hack is brought to you courtesy // of constructor/destructor ordering being unspecified by C++. Basically the // problem is that a Statistic object gets destroyed, which ends up calling @@ -52,28 +50,27 @@ namespace { cl::Hidden, cl::location(getLibSupportInfoOutputFilename())); } -// CreateInfoOutputFile - Return a file stream to print our output on. -raw_ostream *llvm::CreateInfoOutputFile() { +// Return a file stream to print our output on. +std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() { const std::string &OutputFilename = getLibSupportInfoOutputFilename(); if (OutputFilename.empty()) - return new raw_fd_ostream(2, false); // stderr. + return llvm::make_unique<raw_fd_ostream>(2, false); // stderr. if (OutputFilename == "-") - return new raw_fd_ostream(1, false); // stdout. - + return llvm::make_unique<raw_fd_ostream>(1, false); // stdout. + // Append mode is used because the info output file is opened and closed // each time -stats or -time-passes wants to print output to it. To // compensate for this, the test-suite Makefiles have code to delete the // info output file before running commands which write to it. std::error_code EC; - raw_ostream *Result = new raw_fd_ostream(OutputFilename, EC, - sys::fs::F_Append | sys::fs::F_Text); + auto Result = llvm::make_unique<raw_fd_ostream>( + OutputFilename, EC, sys::fs::F_Append | sys::fs::F_Text); if (!EC) return Result; - + errs() << "Error opening info-output-file '" << OutputFilename << " for appending!\n"; - delete Result; - return new raw_fd_ostream(2, false); // stderr. + return llvm::make_unique<raw_fd_ostream>(2, false); // stderr. 
} @@ -99,17 +96,13 @@ static TimerGroup *getDefaultTimerGroup() { //===----------------------------------------------------------------------===// void Timer::init(StringRef N) { - assert(!TG && "Timer already initialized"); - Name.assign(N.begin(), N.end()); - Started = false; - TG = getDefaultTimerGroup(); - TG->addTimer(*this); + init(N, *getDefaultTimerGroup()); } void Timer::init(StringRef N, TimerGroup &tg) { assert(!TG && "Timer already initialized"); Name.assign(N.begin(), N.end()); - Started = false; + Running = Triggered = false; TG = &tg; TG->addTimer(*this); } @@ -142,25 +135,22 @@ TimeRecord TimeRecord::getCurrentTime(bool Start) { return Result; } -static ManagedStatic<std::vector<Timer*> > ActiveTimers; - void Timer::startTimer() { - Started = true; - ActiveTimers->push_back(this); - Time -= TimeRecord::getCurrentTime(true); + assert(!Running && "Cannot start a running timer"); + Running = Triggered = true; + StartTime = TimeRecord::getCurrentTime(true); } void Timer::stopTimer() { + assert(Running && "Cannot stop a paused timer"); + Running = false; Time += TimeRecord::getCurrentTime(false); + Time -= StartTime; +} - if (ActiveTimers->back() == this) { - ActiveTimers->pop_back(); - } else { - std::vector<Timer*>::iterator I = - std::find(ActiveTimers->begin(), ActiveTimers->end(), this); - assert(I != ActiveTimers->end() && "stop but no startTimer?"); - ActiveTimers->erase(I); - } +void Timer::clear() { + Running = Triggered = false; + Time = StartTime = TimeRecord(); } static void printVal(double Val, double Total, raw_ostream &OS) { @@ -278,8 +268,8 @@ void TimerGroup::removeTimer(Timer &T) { sys::SmartScopedLock<true> L(*TimerLock); // If the timer was started, move its data to TimersToPrint. - if (T.Started) - TimersToPrint.push_back(std::make_pair(T.Time, T.Name)); + if (T.hasTriggered()) + TimersToPrint.emplace_back(T.Time, T.Name); T.TG = nullptr; @@ -292,10 +282,9 @@ void TimerGroup::removeTimer(Timer &T) { // them were started. if (FirstTimer || TimersToPrint.empty()) return; - - raw_ostream *OutStream = CreateInfoOutputFile(); + + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); PrintQueuedTimers(*OutStream); - delete OutStream; // Close the file. } void TimerGroup::addTimer(Timer &T) { @@ -314,8 +303,8 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) { std::sort(TimersToPrint.begin(), TimersToPrint.end()); TimeRecord Total; - for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) - Total += TimersToPrint[i].first; + for (auto &RecordNamePair : TimersToPrint) + Total += RecordNamePair.first; // Print out timing header. OS << "===" << std::string(73, '-') << "===\n"; @@ -365,12 +354,11 @@ void TimerGroup::print(raw_ostream &OS) { // See if any of our timers were started, if so add them to TimersToPrint and // reset them. for (Timer *T = FirstTimer; T; T = T->Next) { - if (!T->Started) continue; - TimersToPrint.push_back(std::make_pair(T->Time, T->Name)); + if (!T->hasTriggered()) continue; + TimersToPrint.emplace_back(T->Time, T->Name); // Clear out the time. - T->Started = 0; - T->Time = TimeRecord(); + T->clear(); } // If any timers were started, print the group. 
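// Usage sketch for the ThreadPool added in Support/ThreadPool.cpp above
// (illustrative, not part of the diff; async() is the header-side wrapper
// around asyncImpl() and returns a std::shared_future<void>, modulo the
// MSVC workaround visible above):

#include "llvm/Support/ThreadPool.h"
#include <atomic>

static void poolExample() {
  std::atomic<int> Sum(0);
  llvm::ThreadPool Pool(4); // the default ctor uses hardware_concurrency()
  for (int I = 1; I <= 10; ++I)
    Pool.async([&Sum, I] { Sum += I; });
  Pool.wait(); // blocks until the queue drains and no task is in flight
  // Sum == 55 here; the destructor would likewise join all workers.
}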
diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp index c6646fb..0e5d3ac 100644 --- a/contrib/llvm/lib/Support/Triple.cpp +++ b/contrib/llvm/lib/Support/Triple.cpp @@ -25,6 +25,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { case aarch64_be: return "aarch64_be"; case arm: return "arm"; case armeb: return "armeb"; + case avr: return "avr"; case bpfel: return "bpfel"; case bpfeb: return "bpfeb"; case hexagon: return "hexagon"; @@ -80,6 +81,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case thumb: case thumbeb: return "arm"; + case avr: return "avr"; + case ppc64: case ppc64le: case ppc: return "ppc"; @@ -124,8 +127,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case spir64: return "spir"; case kalimba: return "kalimba"; case shave: return "shave"; - case wasm32: return "wasm32"; - case wasm64: return "wasm64"; + case wasm32: + case wasm64: return "wasm"; } } @@ -144,6 +147,7 @@ const char *Triple::getVendorTypeName(VendorType Kind) { case MipsTechnologies: return "mti"; case NVIDIA: return "nvidia"; case CSR: return "csr"; + case Myriad: return "myriad"; } llvm_unreachable("Invalid VendorType!"); @@ -177,6 +181,9 @@ const char *Triple::getOSTypeName(OSType Kind) { case NVCL: return "nvcl"; case AMDHSA: return "amdhsa"; case PS4: return "ps4"; + case ELFIAMCU: return "elfiamcu"; + case TvOS: return "tvos"; + case WatchOS: return "watchos"; } llvm_unreachable("Invalid OSType"); @@ -196,6 +203,8 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { case MSVC: return "msvc"; case Itanium: return "itanium"; case Cygnus: return "cygnus"; + case AMDOpenCL: return "amdopencl"; + case CoreCLR: return "coreclr"; } llvm_unreachable("Invalid EnvironmentType!"); @@ -224,6 +233,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" .Case("arm", arm) .Case("armeb", armeb) + .Case("avr", avr) .StartsWith("bpf", BPFArch) .Case("mips", mips) .Case("mipsel", mipsel) @@ -265,8 +275,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { } static Triple::ArchType parseARMArch(StringRef ArchName) { - unsigned ISA = ARMTargetParser::parseArchISA(ArchName); - unsigned ENDIAN = ARMTargetParser::parseArchEndian(ArchName); + unsigned ISA = ARM::parseArchISA(ArchName); + unsigned ENDIAN = ARM::parseArchEndian(ArchName); Triple::ArchType arch = Triple::UnknownArch; switch (ENDIAN) { @@ -300,7 +310,7 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } } - ArchName = ARMTargetParser::getCanonicalArchName(ArchName); + ArchName = ARM::getCanonicalArchName(ArchName); if (ArchName.empty()) return Triple::UnknownArch; @@ -310,8 +320,8 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { return Triple::UnknownArch; // Thumb only for v6m - unsigned Profile = ARMTargetParser::parseArchProfile(ArchName); - unsigned Version = ARMTargetParser::parseArchVersion(ArchName); + unsigned Profile = ARM::parseArchProfile(ArchName); + unsigned Version = ARM::parseArchVersion(ArchName); if (Profile == ARM::PK_M && Version == 6) { if (ENDIAN == ARM::EK_BIG) return Triple::thumbeb; @@ -323,10 +333,7 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - Triple::ArchType ARMArch(parseARMArch(ArchName)); - Triple::ArchType BPFArch(parseBPFArch(ArchName)); - - return StringSwitch<Triple::ArchType>(ArchName) + auto AT = StringSwitch<Triple::ArchType>(ArchName) .Cases("i386", "i486", 
"i586", "i686", Triple::x86) // FIXME: Do we need to support these? .Cases("i786", "i886", "i986", Triple::x86) @@ -336,9 +343,14 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("powerpc64le", Triple::ppc64le) .Case("xscale", Triple::arm) .Case("xscaleeb", Triple::armeb) - .StartsWith("arm", ARMArch) - .StartsWith("thumb", ARMArch) - .StartsWith("aarch64", ARMArch) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("arm64", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) .Case("msp430", Triple::msp430) .Cases("mips", "mipseb", "mipsallegrex", Triple::mips) .Cases("mipsel", "mipsallegrexel", Triple::mipsel) @@ -346,7 +358,6 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("mips64el", Triple::mips64el) .Case("r600", Triple::r600) .Case("amdgcn", Triple::amdgcn) - .StartsWith("bpf", BPFArch) .Case("hexagon", Triple::hexagon) .Case("s390x", Triple::systemz) .Case("sparc", Triple::sparc) @@ -369,6 +380,18 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("wasm32", Triple::wasm32) .Case("wasm64", Triple::wasm64) .Default(Triple::UnknownArch); + + // Some architectures require special parsing logic just to compute the + // ArchType result. + if (AT == Triple::UnknownArch) { + if (ArchName.startswith("arm") || ArchName.startswith("thumb") || + ArchName.startswith("aarch64")) + return parseARMArch(ArchName); + if (ArchName.startswith("bpf")) + return parseBPFArch(ArchName); + } + + return AT; } static Triple::VendorType parseVendor(StringRef VendorName) { @@ -384,6 +407,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("mti", Triple::MipsTechnologies) .Case("nvidia", Triple::NVIDIA) .Case("csr", Triple::CSR) + .Case("myriad", Triple::Myriad) .Default(Triple::UnknownVendor); } @@ -414,6 +438,9 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("nvcl", Triple::NVCL) .StartsWith("amdhsa", Triple::AMDHSA) .StartsWith("ps4", Triple::PS4) + .StartsWith("elfiamcu", Triple::ELFIAMCU) + .StartsWith("tvos", Triple::TvOS) + .StartsWith("watchos", Triple::WatchOS) .Default(Triple::UnknownOS); } @@ -430,6 +457,8 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("msvc", Triple::MSVC) .StartsWith("itanium", Triple::Itanium) .StartsWith("cygnus", Triple::Cygnus) + .StartsWith("amdopencl", Triple::AMDOpenCL) + .StartsWith("coreclr", Triple::CoreCLR) .Default(Triple::UnknownEnvironment); } @@ -442,7 +471,7 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) { } static Triple::SubArchType parseSubArch(StringRef SubArchName) { - StringRef ARMSubArch = ARMTargetParser::getCanonicalArchName(SubArchName); + StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); // For now, this is the small part. Early return. if (ARMSubArch.empty()) @@ -453,14 +482,12 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { .Default(Triple::NoSubArch); // ARM sub arch. 
- switch(ARMTargetParser::parseArch(ARMSubArch)) { + switch(ARM::parseArch(ARMSubArch)) { case ARM::AK_ARMV4: return Triple::NoSubArch; case ARM::AK_ARMV4T: return Triple::ARMSubArch_v4t; - case ARM::AK_ARMV5: case ARM::AK_ARMV5T: - case ARM::AK_ARMV5E: return Triple::ARMSubArch_v5; case ARM::AK_ARMV5TE: case ARM::AK_IWMMXT: @@ -469,24 +496,19 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { case ARM::AK_ARMV5TEJ: return Triple::ARMSubArch_v5te; case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: - case ARM::AK_ARMV6Z: return Triple::ARMSubArch_v6; case ARM::AK_ARMV6K: - case ARM::AK_ARMV6ZK: - case ARM::AK_ARMV6HL: + case ARM::AK_ARMV6KZ: return Triple::ARMSubArch_v6k; case ARM::AK_ARMV6T2: return Triple::ARMSubArch_v6t2; case ARM::AK_ARMV6M: - case ARM::AK_ARMV6SM: return Triple::ARMSubArch_v6m; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: case ARM::AK_ARMV7R: - case ARM::AK_ARMV7L: - case ARM::AK_ARMV7HL: return Triple::ARMSubArch_v7; + case ARM::AK_ARMV7K: + return Triple::ARMSubArch_v7k; case ARM::AK_ARMV7M: return Triple::ARMSubArch_v7m; case ARM::AK_ARMV7S: @@ -497,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v8; case ARM::AK_ARMV8_1A: return Triple::ARMSubArch_v8_1a; + case ARM::AK_ARMV8_2A: + return Triple::ARMSubArch_v8_2a; default: return Triple::NoSubArch; } @@ -514,20 +538,53 @@ static const char *getObjectFormatTypeName(Triple::ObjectFormatType Kind) { static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { switch (T.getArch()) { - default: - break; + case Triple::UnknownArch: + case Triple::aarch64: + case Triple::arm: + case Triple::thumb: + case Triple::x86: + case Triple::x86_64: + if (T.isOSDarwin()) + return Triple::MachO; + else if (T.isOSWindows()) + return Triple::COFF; + return Triple::ELF; + + case Triple::aarch64_be: + case Triple::amdgcn: + case Triple::amdil: + case Triple::amdil64: + case Triple::armeb: + case Triple::avr: + case Triple::bpfeb: + case Triple::bpfel: case Triple::hexagon: + case Triple::hsail: + case Triple::hsail64: + case Triple::kalimba: + case Triple::le32: + case Triple::le64: case Triple::mips: - case Triple::mipsel: case Triple::mips64: case Triple::mips64el: + case Triple::mipsel: + case Triple::msp430: + case Triple::nvptx: + case Triple::nvptx64: + case Triple::ppc64le: case Triple::r600: - case Triple::amdgcn: + case Triple::shave: case Triple::sparc: + case Triple::sparcel: case Triple::sparcv9: + case Triple::spir: + case Triple::spir64: case Triple::systemz: + case Triple::tce: + case Triple::thumbeb: + case Triple::wasm32: + case Triple::wasm64: case Triple::xcore: - case Triple::ppc64le: return Triple::ELF; case Triple::ppc: @@ -536,12 +593,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { return Triple::MachO; return Triple::ELF; } - - if (T.isOSDarwin()) - return Triple::MachO; - else if (T.isOSWindows()) - return Triple::COFF; - return Triple::ELF; + llvm_unreachable("unknown architecture"); } /// \brief Construct a triple from the string representation provided. @@ -549,14 +601,27 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { /// This stores the string representation and parses the various pieces into /// enum members. 
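// What the hand-rolled component split in the constructor below yields
// (illustrative): with MaxSplit = 3, anything past the third '-' stays
// glued to the environment component instead of being dropped.
//
//   SmallVector<StringRef, 4> Components;
//   StringRef("armv7-none-linux-gnueabihf").split(Components, '-',
//                                                 /*MaxSplit*/ 3);
//   // Components == {"armv7", "none", "linux", "gnueabihf"}
//   // "a-b-c-d-e" would yield {"a", "b", "c", "d-e"}.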
Triple::Triple(const Twine &Str) - : Data(Str.str()), - Arch(parseArch(getArchName())), - SubArch(parseSubArch(getArchName())), - Vendor(parseVendor(getVendorName())), - OS(parseOS(getOSName())), - Environment(parseEnvironment(getEnvironmentName())), - ObjectFormat(parseFormat(getEnvironmentName())) { - if (ObjectFormat == Triple::UnknownObjectFormat) + : Data(Str.str()), Arch(UnknownArch), SubArch(NoSubArch), + Vendor(UnknownVendor), OS(UnknownOS), Environment(UnknownEnvironment), + ObjectFormat(UnknownObjectFormat) { + // Do minimal parsing by hand here. + SmallVector<StringRef, 4> Components; + StringRef(Data).split(Components, '-', /*MaxSplit*/ 3); + if (Components.size() > 0) { + Arch = parseArch(Components[0]); + SubArch = parseSubArch(Components[0]); + if (Components.size() > 1) { + Vendor = parseVendor(Components[1]); + if (Components.size() > 2) { + OS = parseOS(Components[2]); + if (Components.size() > 3) { + Environment = parseEnvironment(Components[3]); + ObjectFormat = parseFormat(Components[3]); + } + } + } + } + if (ObjectFormat == UnknownObjectFormat) ObjectFormat = getDefaultFormat(*this); } @@ -601,7 +666,7 @@ std::string Triple::normalize(StringRef Str) { // Parse into components. SmallVector<StringRef, 4> Components; - Str.split(Components, "-"); + Str.split(Components, '-'); // If the first component corresponds to a known architecture, preferentially // use it for the architecture. If the second component corresponds to a @@ -889,6 +954,8 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, return false; break; case IOS: + case TvOS: + case WatchOS: // Ignore the version from the triple. This is only handled because the // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the OS X version number even when targeting @@ -916,11 +983,38 @@ void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, Micro = 0; break; case IOS: + case TvOS: getOSVersion(Major, Minor, Micro); // Default to 5.0 (or 7.0 for arm64). if (Major == 0) Major = (getArch() == aarch64) ? 7 : 5; break; + case WatchOS: + llvm_unreachable("conflicting triple info"); + } +} + +void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, + unsigned &Micro) const { + switch (getOS()) { + default: llvm_unreachable("unexpected OS for Darwin triple"); + case Darwin: + case MacOSX: + // Ignore the version from the triple. This is only handled because the + // the clang driver combines OS X and IOS support into a common Darwin + // toolchain that wants to know the iOS version number even when targeting + // OS X. + Major = 2; + Minor = 0; + Micro = 0; + break; + case WatchOS: + getOSVersion(Major, Minor, Micro); + if (Major == 0) + Major = 2; + break; + case IOS: + llvm_unreachable("conflicting triple info"); } } @@ -993,6 +1087,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::UnknownArch: return 0; + case llvm::Triple::avr: case llvm::Triple::msp430: return 16; @@ -1059,9 +1154,8 @@ Triple Triple::get32BitArchVariant() const { Triple T(*this); switch (getArch()) { case Triple::UnknownArch: - case Triple::aarch64: - case Triple::aarch64_be: case Triple::amdgcn: + case Triple::avr: case Triple::bpfel: case Triple::bpfeb: case Triple::msp430: @@ -1095,17 +1189,19 @@ Triple Triple::get32BitArchVariant() const { // Already 32-bit. 
break; - case Triple::le64: T.setArch(Triple::le32); break; - case Triple::mips64: T.setArch(Triple::mips); break; - case Triple::mips64el: T.setArch(Triple::mipsel); break; - case Triple::nvptx64: T.setArch(Triple::nvptx); break; - case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::sparcv9: T.setArch(Triple::sparc); break; - case Triple::x86_64: T.setArch(Triple::x86); break; - case Triple::amdil64: T.setArch(Triple::amdil); break; - case Triple::hsail64: T.setArch(Triple::hsail); break; - case Triple::spir64: T.setArch(Triple::spir); break; - case Triple::wasm64: T.setArch(Triple::wasm32); break; + case Triple::aarch64: T.setArch(Triple::arm); break; + case Triple::aarch64_be: T.setArch(Triple::armeb); break; + case Triple::le64: T.setArch(Triple::le32); break; + case Triple::mips64: T.setArch(Triple::mips); break; + case Triple::mips64el: T.setArch(Triple::mipsel); break; + case Triple::nvptx64: T.setArch(Triple::nvptx); break; + case Triple::ppc64: T.setArch(Triple::ppc); break; + case Triple::sparcv9: T.setArch(Triple::sparc); break; + case Triple::x86_64: T.setArch(Triple::x86); break; + case Triple::amdil64: T.setArch(Triple::amdil); break; + case Triple::hsail64: T.setArch(Triple::hsail); break; + case Triple::spir64: T.setArch(Triple::spir); break; + case Triple::wasm64: T.setArch(Triple::wasm32); break; } return T; } @@ -1114,15 +1210,12 @@ Triple Triple::get64BitArchVariant() const { Triple T(*this); switch (getArch()) { case Triple::UnknownArch: - case Triple::arm: - case Triple::armeb: + case Triple::avr: case Triple::hexagon: case Triple::kalimba: case Triple::msp430: case Triple::r600: case Triple::tce: - case Triple::thumb: - case Triple::thumbeb: case Triple::xcore: case Triple::sparcel: case Triple::shave: @@ -1150,17 +1243,21 @@ Triple Triple::get64BitArchVariant() const { // Already 64-bit. 
break; - case Triple::le32: T.setArch(Triple::le64); break; - case Triple::mips: T.setArch(Triple::mips64); break; - case Triple::mipsel: T.setArch(Triple::mips64el); break; - case Triple::nvptx: T.setArch(Triple::nvptx64); break; - case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::sparc: T.setArch(Triple::sparcv9); break; - case Triple::x86: T.setArch(Triple::x86_64); break; - case Triple::amdil: T.setArch(Triple::amdil64); break; - case Triple::hsail: T.setArch(Triple::hsail64); break; - case Triple::spir: T.setArch(Triple::spir64); break; - case Triple::wasm32: T.setArch(Triple::wasm64); break; + case Triple::arm: T.setArch(Triple::aarch64); break; + case Triple::armeb: T.setArch(Triple::aarch64_be); break; + case Triple::le32: T.setArch(Triple::le64); break; + case Triple::mips: T.setArch(Triple::mips64); break; + case Triple::mipsel: T.setArch(Triple::mips64el); break; + case Triple::nvptx: T.setArch(Triple::nvptx64); break; + case Triple::ppc: T.setArch(Triple::ppc64); break; + case Triple::sparc: T.setArch(Triple::sparcv9); break; + case Triple::x86: T.setArch(Triple::x86_64); break; + case Triple::amdil: T.setArch(Triple::amdil64); break; + case Triple::hsail: T.setArch(Triple::hsail64); break; + case Triple::spir: T.setArch(Triple::spir64); break; + case Triple::thumb: T.setArch(Triple::aarch64); break; + case Triple::thumbeb: T.setArch(Triple::aarch64_be); break; + case Triple::wasm32: T.setArch(Triple::wasm64); break; } return T; } @@ -1172,6 +1269,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::amdgcn: case Triple::amdil64: case Triple::amdil: + case Triple::avr: case Triple::hexagon: case Triple::hsail64: case Triple::hsail: @@ -1244,6 +1342,7 @@ Triple Triple::getLittleEndianArchVariant() const { case Triple::amdil64: case Triple::amdil: case Triple::arm: + case Triple::avr: case Triple::bpfel: case Triple::hexagon: case Triple::hsail64: @@ -1281,10 +1380,10 @@ Triple Triple::getLittleEndianArchVariant() const { return T; } -const char *Triple::getARMCPUForArch(StringRef MArch) const { +StringRef Triple::getARMCPUForArch(StringRef MArch) const { if (MArch.empty()) MArch = getArchName(); - MArch = ARMTargetParser::getCanonicalArchName(MArch); + MArch = ARM::getCanonicalArchName(MArch); // Some defaults are forced. 
switch (getOS()) { @@ -1296,15 +1395,21 @@ const char *Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::Win32: // FIXME: this is invalid for WindowsCE return "cortex-a9"; + case llvm::Triple::MacOSX: + case llvm::Triple::IOS: + case llvm::Triple::WatchOS: + if (MArch == "v7k") + return "cortex-a7"; + break; default: break; } if (MArch.empty()) - return nullptr; + return StringRef(); - const char *CPU = ARMTargetParser::getDefaultCPU(MArch); - if (CPU) + StringRef CPU = ARM::getDefaultCPU(MArch); + if (!CPU.empty()) return CPU; // If no specific architecture version is requested, return the minimum CPU diff --git a/contrib/llvm/lib/Support/Unix/Memory.inc b/contrib/llvm/lib/Support/Unix/Memory.inc index c421ee8..d703191 100644 --- a/contrib/llvm/lib/Support/Unix/Memory.inc +++ b/contrib/llvm/lib/Support/Unix/Memory.inc @@ -50,9 +50,8 @@ int getPosixProtectionFlags(unsigned Flags) { return PROT_READ | PROT_WRITE; case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC: return PROT_READ | PROT_EXEC; - case llvm::sys::Memory::MF_READ | - llvm::sys::Memory::MF_WRITE | - llvm::sys::Memory::MF_EXEC: + case llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE | + llvm::sys::Memory::MF_EXEC: return PROT_READ | PROT_WRITE | PROT_EXEC; case llvm::sys::Memory::MF_EXEC: #if defined(__FreeBSD__) @@ -153,6 +152,7 @@ Memory::releaseMappedMemory(MemoryBlock &M) { std::error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { + static const size_t PageSize = Process::getPageSize(); if (M.Address == nullptr || M.Size == 0) return std::error_code(); @@ -161,7 +161,7 @@ Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { int Protect = getPosixProtectionFlags(Flags); - int Result = ::mprotect(M.Address, M.Size, Protect); + int Result = ::mprotect((void*)((uintptr_t)M.Address & ~(PageSize-1)), PageSize*((M.Size+PageSize-1)/PageSize), Protect); if (Result != 0) return std::error_code(errno, std::generic_category()); @@ -181,7 +181,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, std::string *ErrMsg) { if (NumBytes == 0) return MemoryBlock(); - size_t PageSize = Process::getPageSize(); + static const size_t PageSize = Process::getPageSize(); size_t NumPages = (NumBytes+PageSize-1)/PageSize; int fd = -1; @@ -265,15 +265,12 @@ bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) { } bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) { -#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) if (M.Address == 0 || M.Size == 0) return false; Memory::InvalidateInstructionCache(M.Address, M.Size); +#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address, (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY); return KERN_SUCCESS == kr; -#elif defined(__arm__) || defined(__aarch64__) - Memory::InvalidateInstructionCache(M.Address, M.Size); - return true; #else return true; #endif diff --git a/contrib/llvm/lib/Support/Unix/Path.inc b/contrib/llvm/lib/Support/Unix/Path.inc index 973d010..d85c37a 100644 --- a/contrib/llvm/lib/Support/Unix/Path.inc +++ b/contrib/llvm/lib/Support/Unix/Path.inc @@ -75,12 +75,12 @@ test_dir(char ret[PATH_MAX], const char *dir, const char *bin) char fullpath[PATH_MAX]; snprintf(fullpath, PATH_MAX, "%s/%s", dir, bin); - if (realpath(fullpath, ret) == NULL) - return (1); + if (!realpath(fullpath, ret)) + return 1; if (stat(fullpath, &sb) != 0) - return (1); + return 1; - return (0); 
+ return 0; } static char * @@ -91,34 +91,34 @@ getprogpath(char ret[PATH_MAX], const char *bin) /* First approach: absolute path. */ if (bin[0] == '/') { if (test_dir(ret, "/", bin) == 0) - return (ret); - return (NULL); + return ret; + return nullptr; } /* Second approach: relative path. */ - if (strchr(bin, '/') != NULL) { + if (strchr(bin, '/')) { char cwd[PATH_MAX]; - if (getcwd(cwd, PATH_MAX) == NULL) - return (NULL); + if (!getcwd(cwd, PATH_MAX)) + return nullptr; if (test_dir(ret, cwd, bin) == 0) - return (ret); - return (NULL); + return ret; + return nullptr; } /* Third approach: $PATH */ - if ((pv = getenv("PATH")) == NULL) - return (NULL); + if ((pv = getenv("PATH")) == nullptr) + return nullptr; s = pv = strdup(pv); - if (pv == NULL) - return (NULL); - while ((t = strsep(&s, ":")) != NULL) { + if (!pv) + return nullptr; + while ((t = strsep(&s, ":")) != nullptr) { if (test_dir(ret, t, bin) == 0) { free(pv); - return (ret); + return ret; } } free(pv); - return (NULL); + return nullptr; } #endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ @@ -153,8 +153,8 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { return std::string(exe_path, len); } else { // Fall back to the classical detection. - if (getprogpath(exe_path, argv0) != NULL) - return exe_path; + if (getprogpath(exe_path, argv0)) + return exe_path; } #elif defined(HAVE_DLFCN_H) // Use dladdr to get executable path if available. @@ -219,11 +219,12 @@ std::error_code current_path(SmallVectorImpl<char> &result) { return std::error_code(); } -std::error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting, + perms Perms) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); - if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) { + if (::mkdir(p.begin(), Perms) == -1) { if (errno != EEXIST || !IgnoreExisting) return std::error_code(errno, std::generic_category()); } @@ -324,6 +325,10 @@ std::error_code access(const Twine &Path, AccessMode Mode) { return std::error_code(); } +bool can_execute(const Twine &Path) { + return !access(Path, AccessMode::Execute); +} + bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); return A.fs_st_dev == B.fs_st_dev && @@ -555,6 +560,54 @@ bool home_directory(SmallVectorImpl<char> &result) { return false; } +static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) { + #if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR) + // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR. + // macros defined in <unistd.h> on darwin >= 9 + int ConfName = TempDir ? 
_CS_DARWIN_USER_TEMP_DIR + : _CS_DARWIN_USER_CACHE_DIR; + size_t ConfLen = confstr(ConfName, nullptr, 0); + if (ConfLen > 0) { + do { + Result.resize(ConfLen); + ConfLen = confstr(ConfName, Result.data(), Result.size()); + } while (ConfLen > 0 && ConfLen != Result.size()); + + if (ConfLen > 0) { + assert(Result.back() == 0); + Result.pop_back(); + return true; + } + + Result.clear(); + } + #endif + return false; +} + +static bool getUserCacheDir(SmallVectorImpl<char> &Result) { + // First try using XDS_CACHE_HOME env variable, + // as specified in XDG Base Directory Specification at + // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html + if (const char *XdsCacheDir = std::getenv("XDS_CACHE_HOME")) { + Result.clear(); + Result.append(XdsCacheDir, XdsCacheDir + strlen(XdsCacheDir)); + return true; + } + + // Try Darwin configuration query + if (getDarwinConfDir(false, Result)) + return true; + + // Use "$HOME/.cache" if $HOME is available + if (home_directory(Result)) { + append(Result, ".cache"); + return true; + } + + return false; +} + static const char *getEnvTempDir() { // Check whether the temporary directory is specified by an environment // variable. @@ -589,27 +642,8 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) { } } -#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR) - // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR. - // macros defined in <unistd.h> on darwin >= 9 - int ConfName = ErasedOnReboot? _CS_DARWIN_USER_TEMP_DIR - : _CS_DARWIN_USER_CACHE_DIR; - size_t ConfLen = confstr(ConfName, nullptr, 0); - if (ConfLen > 0) { - do { - Result.resize(ConfLen); - ConfLen = confstr(ConfName, Result.data(), Result.size()); - } while (ConfLen > 0 && ConfLen != Result.size()); - - if (ConfLen > 0) { - assert(Result.back() == 0); - Result.pop_back(); - return; - } - - Result.clear(); - } -#endif + if (getDarwinConfDir(ErasedOnReboot, Result)) + return; const char *RequestedDir = getDefaultTempDir(ErasedOnReboot); Result.append(RequestedDir, RequestedDir + strlen(RequestedDir)); diff --git a/contrib/llvm/lib/Support/Unix/Process.inc b/contrib/llvm/lib/Support/Unix/Process.inc index df13bd2..27083ee 100644 --- a/contrib/llvm/lib/Support/Unix/Process.inc +++ b/contrib/llvm/lib/Support/Unix/Process.inc @@ -430,13 +430,18 @@ const char *Process::ResetColor() { #if !defined(HAVE_DECL_ARC4RANDOM) || !HAVE_DECL_ARC4RANDOM static unsigned GetRandomNumberSeed() { // Attempt to get the initial seed from /dev/urandom, if possible. - if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) { + int urandomFD = open("/dev/urandom", O_RDONLY); + + if (urandomFD != -1) { unsigned seed; - int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource); - ::fclose(RandomSource); + // Don't use a buffered read to avoid reading more data + // from /dev/urandom than we need. + int count = read(urandomFD, (void *)&seed, sizeof(seed)); + + close(urandomFD); // Return the seed if the read was successful. 
- if (count == 1) + if (count == sizeof(seed)) return seed; } diff --git a/contrib/llvm/lib/Support/Unix/Program.inc b/contrib/llvm/lib/Support/Unix/Program.inc index 8947b62..7d3537e 100644 --- a/contrib/llvm/lib/Support/Unix/Program.inc +++ b/contrib/llvm/lib/Support/Unix/Program.inc @@ -323,7 +323,6 @@ namespace llvm { ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, bool WaitUntilTerminates, std::string *ErrMsg) { -#ifdef HAVE_SYS_WAIT_H struct sigaction Act, Old; assert(PI.Pid && "invalid pid to wait on, process not started?"); @@ -417,12 +416,6 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, // signal during execution as opposed to failing to execute. WaitResult.ReturnCode = -2; } -#else - if (ErrMsg) - *ErrMsg = "Program::Wait is not implemented on this platform yet!"; - ProcessInfo WaitResult; - WaitResult.ReturnCode = -2; -#endif return WaitResult; } @@ -453,7 +446,7 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, return EC; } -bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { +bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) { static long ArgMax = sysconf(_SC_ARG_MAX); // System says no practical limit. @@ -463,7 +456,7 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { // Conservatively account for space required by environment variables. long HalfArgMax = ArgMax / 2; - size_t ArgLength = 0; + size_t ArgLength = Program.size() + 1; for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { ArgLength += strlen(*I) + 1; diff --git a/contrib/llvm/lib/Support/Unix/Signals.inc b/contrib/llvm/lib/Support/Unix/Signals.inc index bfe2a3a..061cdb3 100644 --- a/contrib/llvm/lib/Support/Unix/Signals.inc +++ b/contrib/llvm/lib/Support/Unix/Signals.inc @@ -17,7 +17,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/Program.h" @@ -25,7 +24,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <string> -#include <vector> #if HAVE_EXECINFO_H # include <execinfo.h> // For backtrace(). #endif @@ -58,8 +56,6 @@ static ManagedStatic<SmartMutex<true> > SignalsMutex; static void (*InterruptFunction)() = nullptr; static ManagedStatic<std::vector<std::string>> FilesToRemove; -static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>> - CallBacksToRun; // IntSigs - Signals that represent requested termination. There's no bug // or failure, or if there is, it's not our direct responsibility. For whatever @@ -90,12 +86,11 @@ static unsigned NumRegisteredSignals = 0; static struct { struct sigaction SA; int SigNo; -} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])]; +} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)]; static void RegisterHandler(int Signal) { - assert(NumRegisteredSignals < - sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && + assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) && "Out of space for signal handlers!"); struct sigaction NewHandler; @@ -117,7 +112,7 @@ static void RegisterHandlers() { // during handling an actual signal because you can't safely call new in a // signal handler. *SignalsMutex; - + // If the handlers are already registered, we're done. 
if (NumRegisteredSignals != 0) return; @@ -148,9 +143,6 @@ static void RemoveFilesToRemove() { // memory. std::vector<std::string>& FilesToRemoveRef = *FilesToRemove; for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) { - // We rely on a std::string implementation for which repeated calls to - // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these - // strings to try to ensure this is safe. const char *path = FilesToRemoveRef[i].c_str(); // Get the status so we can determine if it's a file or directory. If we @@ -164,7 +156,7 @@ static void RemoveFilesToRemove() { // super-user permissions. if (!S_ISREG(buf.st_mode)) continue; - + // Otherwise, remove the file. We ignore any errors here as there is nothing // else we can do. unlink(path); @@ -205,11 +197,7 @@ static RETSIGTYPE SignalHandler(int Sig) { } // Otherwise if it is a fault (like SEGV) run any handler. - if (CallBacksToRun.isConstructed()) { - auto &CallBacksToRunRef = *CallBacksToRun; - for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i) - CallBacksToRunRef[i].first(CallBacksToRunRef[i].second); - } + llvm::sys::RunSignalHandlers(); #ifdef __s390__ // On S/390, certain signals are delivered with PSW Address pointing to @@ -239,21 +227,7 @@ bool llvm::sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) { { sys::SmartScopedLock<true> Guard(*SignalsMutex); - std::vector<std::string>& FilesToRemoveRef = *FilesToRemove; - std::string *OldPtr = - FilesToRemoveRef.empty() ? nullptr : &FilesToRemoveRef[0]; - FilesToRemoveRef.push_back(Filename); - - // We want to call 'c_str()' on every std::string in this vector so that if - // the underlying implementation requires a re-allocation, it happens here - // rather than inside of the signal handler. If we see the vector grow, we - // have to call it on every entry. If it remains in place, we only need to - // call it on the latest one. - if (OldPtr == &FilesToRemoveRef[0]) - FilesToRemoveRef.back().c_str(); - else - for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) - FilesToRemoveRef[i].c_str(); + FilesToRemove->push_back(Filename); } RegisterHandlers(); @@ -268,13 +242,6 @@ void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) { std::vector<std::string>::iterator I = FilesToRemove->end(); if (RI != FilesToRemove->rend()) I = FilesToRemove->erase(RI.base()-1); - - // We need to call c_str() on every element which would have been moved by - // the erase. These elements, in a C++98 implementation where c_str() - // requires a reallocation on the first call may have had the call to c_str() - // made on insertion become invalid by being copied down an element. 
- for (std::vector<std::string>::iterator E = FilesToRemove->end(); I != E; ++I) - I->c_str(); } /// AddSignalHandler - Add a function to be called when a signal is delivered @@ -285,10 +252,9 @@ void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { RegisterHandlers(); } -#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) - -#if HAVE_LINK_H && (defined(__linux__) || defined(__FreeBSD__) || \ - defined(__FreeBSD_kernel__) || defined(__NetBSD__)) +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) && HAVE_LINK_H && \ + (defined(__linux__) || defined(__FreeBSD__) || \ + defined(__FreeBSD_kernel__) || defined(__NetBSD__)) struct DlIteratePhdrData { void **StackTrace; int depth; @@ -321,108 +287,27 @@ static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) { return 0; } +/// If this is an ELF platform, we can find all loaded modules and their virtual +/// addresses with dl_iterate_phdr. static bool findModulesAndOffsets(void **StackTrace, int Depth, const char **Modules, intptr_t *Offsets, - const char *MainExecutableName) { + const char *MainExecutableName, + StringSaver &StrPool) { DlIteratePhdrData data = {StackTrace, Depth, true, Modules, Offsets, MainExecutableName}; dl_iterate_phdr(dl_iterate_phdr_cb, &data); return true; } #else +/// This platform does not have dl_iterate_phdr, so we do not yet know how to +/// find all loaded DSOs. static bool findModulesAndOffsets(void **StackTrace, int Depth, const char **Modules, intptr_t *Offsets, - const char *MainExecutableName) { + const char *MainExecutableName, + StringSaver &StrPool) { return false; } -#endif - -static bool printSymbolizedStackTrace(void **StackTrace, int Depth, - llvm::raw_ostream &OS) { - // FIXME: Subtract necessary number from StackTrace entries to turn return addresses - // into actual instruction addresses. - // Use llvm-symbolizer tool to symbolize the stack traces. - ErrorOr<std::string> LLVMSymbolizerPathOrErr = - sys::findProgramByName("llvm-symbolizer"); - if (!LLVMSymbolizerPathOrErr) - return false; - const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr; - // We don't know argv0 or the address of main() at this point, but try - // to guess it anyway (it's possible on some platforms). 
- std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr); - if (MainExecutableName.empty() || - MainExecutableName.find("llvm-symbolizer") != std::string::npos) - return false; - - std::vector<const char *> Modules(Depth, nullptr); - std::vector<intptr_t> Offsets(Depth, 0); - if (!findModulesAndOffsets(StackTrace, Depth, Modules.data(), Offsets.data(), - MainExecutableName.c_str())) - return false; - int InputFD; - SmallString<32> InputFile, OutputFile; - sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile); - sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile); - FileRemover InputRemover(InputFile.c_str()); - FileRemover OutputRemover(OutputFile.c_str()); - - { - raw_fd_ostream Input(InputFD, true); - for (int i = 0; i < Depth; i++) { - if (Modules[i]) - Input << Modules[i] << " " << (void*)Offsets[i] << "\n"; - } - } - - StringRef InputFileStr(InputFile); - StringRef OutputFileStr(OutputFile); - StringRef StderrFileStr; - const StringRef *Redirects[] = {&InputFileStr, &OutputFileStr, - &StderrFileStr}; - const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining", - "--demangle", nullptr}; - int RunResult = - sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects); - if (RunResult != 0) - return false; - - auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str()); - if (!OutputBuf) - return false; - StringRef Output = OutputBuf.get()->getBuffer(); - SmallVector<StringRef, 32> Lines; - Output.split(Lines, "\n"); - auto CurLine = Lines.begin(); - int frame_no = 0; - for (int i = 0; i < Depth; i++) { - if (!Modules[i]) { - OS << format("#%d %p\n", frame_no++, StackTrace[i]); - continue; - } - // Read pairs of lines (function name and file/line info) until we - // encounter empty line. - for (;;) { - if (CurLine == Lines.end()) - return false; - StringRef FunctionName = *CurLine++; - if (FunctionName.empty()) - break; - OS << format("#%d %p ", frame_no++, StackTrace[i]); - if (!FunctionName.startswith("??")) - OS << format("%s ", FunctionName.str().c_str()); - if (CurLine == Lines.end()) - return false; - StringRef FileLineInfo = *CurLine++; - if (!FileLineInfo.startswith("??")) - OS << format("%s", FileLineInfo.str().c_str()); - else - OS << format("(%s+%p)", Modules[i], (void *)Offsets[i]); - OS << "\n"; - } - } - return true; -} -#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) +#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) && ... // PrintStackTrace - In the case of a program crash or fault, print out a stack // trace so that the user has an indication of why and where we died. diff --git a/contrib/llvm/lib/Support/Unix/Unix.h b/contrib/llvm/lib/Support/Unix/Unix.h index e16a226..871e612 100644 --- a/contrib/llvm/lib/Support/Unix/Unix.h +++ b/contrib/llvm/lib/Support/Unix/Unix.h @@ -29,6 +29,7 @@ #include <cstring> #include <string> #include <sys/types.h> +#include <sys/wait.h> #ifdef HAVE_UNISTD_H #include <unistd.h> @@ -43,22 +44,10 @@ #endif #include <time.h> -#ifdef HAVE_SYS_WAIT_H -# include <sys/wait.h> -#endif - #ifdef HAVE_DLFCN_H # include <dlfcn.h> #endif -#ifndef WEXITSTATUS -# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8) -#endif - -#ifndef WIFEXITED -# define WIFEXITED(stat_val) (((stat_val) & 255) == 0) -#endif - /// This function builds an error message into \p ErrMsg using the \p prefix /// string and the Unix error number given by \p errnum. If errnum is -1, the /// default then the value of errno is used. 
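The printSymbolizedStackTrace body deleted from Signals.inc above is being hoisted into shared code; what it implements is a small line-oriented protocol for driving an external llvm-symbolizer process (spawned with --functions=linkage --inlining --demangle). Below is a minimal, self-contained sketch of that protocol only: the temporary-file plumbing of the original is elided, and writeQuery/readFrameGroup are hypothetical helper names for illustration, not LLVM API.

#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// One query per stack frame that resolved to a module: "<module> <offset>".
static void writeQuery(std::FILE *ToSymbolizer, const std::string &Module,
                       std::intptr_t Offset) {
  std::fprintf(ToSymbolizer, "%s 0x%llx\n", Module.c_str(),
               (unsigned long long)Offset);
}

// The symbolizer answers each query with (function, file:line) line pairs --
// one pair per inlined frame when --inlining is enabled -- and terminates
// the group with an empty line. "??" in either slot means "unknown".
static std::vector<std::pair<std::string, std::string>>
readFrameGroup(std::FILE *FromSymbolizer) {
  std::vector<std::pair<std::string, std::string>> Frames;
  char Fn[4096], Loc[4096];
  while (std::fgets(Fn, sizeof(Fn), FromSymbolizer) && Fn[0] != '\n') {
    if (!std::fgets(Loc, sizeof(Loc), FromSymbolizer))
      break; // truncated output; the caller falls back to raw addresses
    Frames.emplace_back(Fn, Loc);
  }
  return Frames;
}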
diff --git a/contrib/llvm/lib/Support/Valgrind.cpp b/contrib/llvm/lib/Support/Valgrind.cpp index facf8d9..8d852a6 100644 --- a/contrib/llvm/lib/Support/Valgrind.cpp +++ b/contrib/llvm/lib/Support/Valgrind.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/Valgrind.h" #include "llvm/Config/config.h" +#include <cstddef> #if HAVE_VALGRIND_VALGRIND_H #include <valgrind/valgrind.h> @@ -52,23 +53,3 @@ void llvm::sys::ValgrindDiscardTranslations(const void *Addr, size_t Len) { } #endif // !HAVE_VALGRIND_VALGRIND_H - -// These functions require no implementation, tsan just looks at the arguments -// they're called with. However, they are required to be weak as some other -// application or library may already be providing these definitions for the -// same reason we are. -extern "C" { -LLVM_ATTRIBUTE_WEAK void AnnotateHappensAfter(const char *file, int line, - const volatile void *cv); -void AnnotateHappensAfter(const char *file, int line, const volatile void *cv) { -} -LLVM_ATTRIBUTE_WEAK void AnnotateHappensBefore(const char *file, int line, - const volatile void *cv); -void AnnotateHappensBefore(const char *file, int line, - const volatile void *cv) {} -LLVM_ATTRIBUTE_WEAK void AnnotateIgnoreWritesBegin(const char *file, int line); -void AnnotateIgnoreWritesBegin(const char *file, int line) {} -LLVM_ATTRIBUTE_WEAK void AnnotateIgnoreWritesEnd(const char *file, int line); -void AnnotateIgnoreWritesEnd(const char *file, int line) {} -} - diff --git a/contrib/llvm/lib/Support/Windows/COM.inc b/contrib/llvm/lib/Support/Windows/COM.inc index 0c50d6f..54f3ecf 100644 --- a/contrib/llvm/lib/Support/Windows/COM.inc +++ b/contrib/llvm/lib/Support/Windows/COM.inc @@ -1,4 +1,4 @@ -//===- llvm/Support/Windows/COM.inc - Windows COM Implementation *- C++ -*-===// +//==- llvm/Support/Windows/COM.inc - Windows COM Implementation -*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc index d38f197..17418b0 100644 --- a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc +++ b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc @@ -76,14 +76,14 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, SmallVector<wchar_t, MAX_PATH> filenameUnicode; if (std::error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) { SetLastError(ec.value()); - MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: "); + MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16"); return DynamicLibrary(); } HMODULE a_handle = LoadLibraryW(filenameUnicode.data()); if (a_handle == 0) { - MakeErrMsg(errMsg, std::string(filename) + ": Can't open : "); + MakeErrMsg(errMsg, std::string(filename) + ": Can't open"); return DynamicLibrary(); } diff --git a/contrib/llvm/lib/Support/Windows/Memory.inc b/contrib/llvm/lib/Support/Windows/Memory.inc index 4b2ff2e..7eab9ff 100644 --- a/contrib/llvm/lib/Support/Windows/Memory.inc +++ b/contrib/llvm/lib/Support/Windows/Memory.inc @@ -192,14 +192,14 @@ static DWORD getProtection(const void *addr) { bool Memory::setWritable(MemoryBlock &M, std::string *ErrMsg) { if (!setRangeWritable(M.Address, M.Size)) { - return MakeErrMsg(ErrMsg, "Cannot set memory to writeable: "); + return MakeErrMsg(ErrMsg, "Cannot set memory to writeable"); } return true; } bool Memory::setExecutable(MemoryBlock &M, std::string *ErrMsg) { if (!setRangeExecutable(M.Address, M.Size)) { - return MakeErrMsg(ErrMsg, "Cannot set memory to executable: "); + return 
MakeErrMsg(ErrMsg, "Cannot set memory to executable"); } return true; } diff --git a/contrib/llvm/lib/Support/Windows/Path.inc b/contrib/llvm/lib/Support/Windows/Path.inc index 72da7c5..5ef77b1 100644 --- a/contrib/llvm/lib/Support/Windows/Path.inc +++ b/contrib/llvm/lib/Support/Windows/Path.inc @@ -38,6 +38,7 @@ typedef int errno_t; #ifdef _MSC_VER # pragma comment(lib, "advapi32.lib") // This provides CryptAcquireContextW. +# pragma comment(lib, "ole32.lib") // This provides CoTaskMemFree #endif using namespace llvm; @@ -182,7 +183,8 @@ std::error_code current_path(SmallVectorImpl<char> &result) { return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result); } -std::error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting, + perms Perms) { SmallVector<wchar_t, 128> path_utf16; if (std::error_code ec = widenPath(path, path_utf16)) @@ -252,17 +254,34 @@ std::error_code rename(const Twine &from, const Twine &to) { return ec; std::error_code ec = std::error_code(); + + // Retry while we see ERROR_ACCESS_DENIED. + // System scanners (eg. indexer) might open the source file when it is written + // and closed. + for (int i = 0; i < 2000; i++) { + // Try ReplaceFile first, as it is able to associate a new data stream with + // the destination even if the destination file is currently open. + if (::ReplaceFileW(wide_to.begin(), wide_from.begin(), NULL, 0, NULL, NULL)) + return std::error_code(); + + // We get ERROR_FILE_NOT_FOUND if the destination file is missing. + // MoveFileEx can handle this case. + DWORD ReplaceError = ::GetLastError(); + ec = mapWindowsError(ReplaceError); + if (ReplaceError != ERROR_ACCESS_DENIED && + ReplaceError != ERROR_FILE_NOT_FOUND && + ReplaceError != ERROR_SHARING_VIOLATION) + break; + if (::MoveFileExW(wide_from.begin(), wide_to.begin(), MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) return std::error_code(); - DWORD LastError = ::GetLastError(); - ec = mapWindowsError(LastError); - if (LastError != ERROR_ACCESS_DENIED) - break; - // Retry MoveFile() at ACCESS_DENIED. - // System scanners (eg. indexer) might open the source file when - // It is written and closed. 
+ + DWORD MoveError = ::GetLastError(); + ec = mapWindowsError(MoveError); + if (MoveError != ERROR_ACCESS_DENIED) break; + ::Sleep(1); } @@ -301,6 +320,11 @@ std::error_code access(const Twine &Path, AccessMode Mode) { return std::error_code(); } +bool can_execute(const Twine &Path) { + return !access(Path, AccessMode::Execute) || + !access(Path + ".exe", AccessMode::Execute); +} + bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); return A.FileIndexHigh == B.FileIndexHigh && @@ -325,10 +349,12 @@ std::error_code equivalent(const Twine &A, const Twine &B, bool &result) { static bool isReservedName(StringRef path) { // This list of reserved names comes from MSDN, at: // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx - static const char *sReservedNames[] = { "nul", "con", "prn", "aux", - "com1", "com2", "com3", "com4", "com5", "com6", - "com7", "com8", "com9", "lpt1", "lpt2", "lpt3", - "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" }; + static const char *const sReservedNames[] = { "nul", "con", "prn", "aux", + "com1", "com2", "com3", "com4", + "com5", "com6", "com7", "com8", + "com9", "lpt1", "lpt2", "lpt3", + "lpt4", "lpt5", "lpt6", "lpt7", + "lpt8", "lpt9" }; // First, check to see if this is a device namespace, which always // starts with \\.\, since device namespaces are not legal file paths. @@ -643,9 +669,10 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD) { if (std::error_code EC = widenPath(Name, PathUTF16)) return EC; - HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE H = + ::CreateFileW(PathUTF16.begin(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (H == INVALID_HANDLE_VALUE) { DWORD LastError = ::GetLastError(); std::error_code EC = mapWindowsError(LastError); @@ -728,30 +755,31 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD, } // end namespace fs namespace path { - -bool home_directory(SmallVectorImpl<char> &result) { - wchar_t Path[MAX_PATH]; - if (::SHGetFolderPathW(0, CSIDL_APPDATA | CSIDL_FLAG_CREATE, 0, - /*SHGFP_TYPE_CURRENT*/0, Path) != S_OK) +static bool getKnownFolderPath(KNOWNFOLDERID folderId, + SmallVectorImpl<char> &result) { + wchar_t *path = nullptr; + if (::SHGetKnownFolderPath(folderId, KF_FLAG_CREATE, nullptr, &path) != S_OK) return false; - if (UTF16ToUTF8(Path, ::wcslen(Path), result)) - return false; + bool ok = !UTF16ToUTF8(path, ::wcslen(path), result); + ::CoTaskMemFree(path); + return ok; +} - return true; +bool getUserCacheDir(SmallVectorImpl<char> &Result) { + return getKnownFolderPath(FOLDERID_LocalAppData, Result); } -static bool getTempDirEnvVar(const char *Var, SmallVectorImpl<char> &Res) { - SmallVector<wchar_t, 128> NameUTF16; - if (windows::UTF8ToUTF16(Var, NameUTF16)) - return false; +bool home_directory(SmallVectorImpl<char> &result) { + return getKnownFolderPath(FOLDERID_Profile, result); +} +static bool getTempDirEnvVar(const wchar_t *Var, SmallVectorImpl<char> &Res) { SmallVector<wchar_t, 1024> Buf; size_t Size = 1024; do { Buf.reserve(Size); - Size = - GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity()); + Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.capacity()); if (Size == 0) return false; @@ -759,14 +787,12 @@ static bool getTempDirEnvVar(const char *Var, SmallVectorImpl<char> &Res) { } while (Size > Buf.capacity()); 
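// Aside: the loop above is the usual Win32 size-negotiation pattern. When
// the supplied buffer is too small, GetEnvironmentVariableW returns the
// required length in characters (including the terminator) instead of the
// number of characters copied, so the caller grows the buffer and retries.
// A minimal sketch of the same pattern, assuming std::wstring storage
// rather than SmallVector (hypothetical, for illustration only):
//
//   DWORD Len = ::GetEnvironmentVariableW(L"TMP", nullptr, 0);
//   if (Len == 0)
//     return false;               // variable unset, or an error occurred
//   std::wstring Val(Len, L'\0');
//   Len = ::GetEnvironmentVariableW(L"TMP", &Val[0], Len);
//   Val.resize(Len);              // on success, Len excludes the terminator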
Buf.set_size(Size); - if (windows::UTF16ToUTF8(Buf.data(), Size, Res)) - return false; - return true; + return !windows::UTF16ToUTF8(Buf.data(), Size, Res); } static bool getTempDirEnvVar(SmallVectorImpl<char> &Res) { - const char *EnvironmentVariables[] = {"TMP", "TEMP", "USERPROFILE"}; - for (const char *Env : EnvironmentVariables) { + const wchar_t *EnvironmentVariables[] = {L"TMP", L"TEMP", L"USERPROFILE"}; + for (auto *Env : EnvironmentVariables) { if (getTempDirEnvVar(Env, Res)) return true; } @@ -777,13 +803,19 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) { (void)ErasedOnReboot; Result.clear(); - // Check whether the temporary directory is specified by an environment - // variable. - if (getTempDirEnvVar(Result)) + // Check whether the temporary directory is specified by an environment var. + // This matches GetTempPath logic to some degree. GetTempPath is not used + // directly as it cannot handle env vars longer than 130 chars on Windows 7 + // (fixed on Windows 8). + if (getTempDirEnvVar(Result)) { + assert(!Result.empty() && "Unexpected empty path"); + native(Result); // Some Unix-like shells use Unix path separator in $TMP. + fs::make_absolute(Result); // Make it absolute if not already. return; + } // Fall back to a system default. - const char *DefaultResult = "C:\\TEMP"; + const char *DefaultResult = "C:\\Temp"; Result.append(DefaultResult, DefaultResult + strlen(DefaultResult)); } } // end namespace path diff --git a/contrib/llvm/lib/Support/Windows/Process.inc b/contrib/llvm/lib/Support/Windows/Process.inc index 8164956..dae35a8 100644 --- a/contrib/llvm/lib/Support/Windows/Process.inc +++ b/contrib/llvm/lib/Support/Windows/Process.inc @@ -417,16 +417,23 @@ const char *Process::ResetColor() { return 0; } +// Include GetLastError() in a fatal error message. +static void ReportLastErrorFatal(const char *Msg) { + std::string ErrMsg; + MakeErrMsg(&ErrMsg, Msg); + report_fatal_error(ErrMsg); +} + unsigned Process::GetRandomNumber() { HCRYPTPROV HCPC; if (!::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) - report_fatal_error("Could not acquire a cryptographic context"); + ReportLastErrorFatal("Could not acquire a cryptographic context"); ScopedCryptContext CryptoProvider(HCPC); unsigned Ret; if (!::CryptGenRandom(CryptoProvider, sizeof(Ret), reinterpret_cast<BYTE *>(&Ret))) - report_fatal_error("Could not generate a random number"); + ReportLastErrorFatal("Could not generate a random number"); return Ret; } diff --git a/contrib/llvm/lib/Support/Windows/Program.inc b/contrib/llvm/lib/Support/Windows/Program.inc index c29d872..78fc538 100644 --- a/contrib/llvm/lib/Support/Windows/Program.inc +++ b/contrib/llvm/lib/Support/Windows/Program.inc @@ -75,8 +75,15 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name, do { U16Result.reserve(Len); - Len = ::SearchPathW(Path, c_str(U16Name), - U16Ext.empty() ? nullptr : c_str(U16Ext), + // Let's attach the extension manually. That is needed for files + // with a dot in the name, like aaa.bbb. SearchPathW will not add the extension + // from its argument to such files because it thinks they already have one.
+ SmallVector<wchar_t, MAX_PATH> U16NameExt; + if (std::error_code EC = + windows::UTF8ToUTF16(Twine(Name + Ext).str(), U16NameExt)) + return EC; + + Len = ::SearchPathW(Path, c_str(U16NameExt), nullptr, U16Result.capacity(), U16Result.data(), nullptr); } while (Len > U16Result.capacity()); @@ -132,7 +139,7 @@ static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) { FILE_ATTRIBUTE_NORMAL, NULL); if (h == INVALID_HANDLE_VALUE) { MakeErrMsg(ErrMsg, fname + ": Can't open file for " + - (fd ? "input: " : "output: ")); + (fd ? "input" : "output")); } return h; @@ -251,6 +258,14 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, return false; } + // can_execute may succeed by looking at Program + ".exe". CreateProcessW + // will implicitly add the .exe if we provide a command line without an + // executable path, but since we use an explicit executable, we have to add + // ".exe" ourselves. + SmallString<64> ProgramStorage; + if (!sys::fs::exists(Program)) + Program = Twine(Program + ".exe").toStringRef(ProgramStorage); + // Windows wants a command line, not an array of args, to pass to the new // process. We have to concatenate them all, while quoting the args that // have embedded spaces (or are empty). @@ -416,7 +431,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, if (SecondsToWait) { if (!TerminateProcess(PI.ProcessHandle, 1)) { if (ErrMsg) - MakeErrMsg(ErrMsg, "Failed to terminate timed-out program."); + MakeErrMsg(ErrMsg, "Failed to terminate timed-out program"); // -2 indicates a crash or timeout as opposed to failure to execute. WaitResult.ReturnCode = -2; @@ -441,7 +456,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, if (!rc) { SetLastError(err); if (ErrMsg) - MakeErrMsg(ErrMsg, "Failed getting status for program."); + MakeErrMsg(ErrMsg, "Failed getting status for program"); // -2 indicates a crash or timeout as opposed to failure to execute. WaitResult.ReturnCode = -2; @@ -520,14 +535,15 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, return EC; } -bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { +bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) { // The documented max length of the command line passed to CreateProcess. static const size_t MaxCommandStringLength = 32768; - size_t ArgLength = 0; + // Account for the trailing space for the program path and the + // trailing NULL of the last argument. + size_t ArgLength = ArgLenWithQuotes(Program.str().c_str()) + 2; for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { - // Account for the trailing space for every arg but the last one and the - // trailing NULL of the last argument. 
+ // Account for the trailing space for every arg ArgLength += ArgLenWithQuotes(*I) + 1; if (ArgLength > MaxCommandStringLength) { return false; diff --git a/contrib/llvm/lib/Support/Windows/Signals.inc b/contrib/llvm/lib/Support/Windows/Signals.inc index 5c8c239..f40ca72 100644 --- a/contrib/llvm/lib/Support/Windows/Signals.inc +++ b/contrib/llvm/lib/Support/Windows/Signals.inc @@ -14,7 +14,6 @@ #include <algorithm> #include <signal.h> #include <stdio.h> -#include <vector> #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" @@ -136,6 +135,10 @@ typedef BOOL (WINAPI *fpSymGetLineFromAddr64)(HANDLE, DWORD64, PDWORD, PIMAGEHLP_LINE64); static fpSymGetLineFromAddr64 fSymGetLineFromAddr64; +typedef BOOL(WINAPI *fpSymGetModuleInfo64)(HANDLE hProcess, DWORD64 dwAddr, + PIMAGEHLP_MODULE64 ModuleInfo); +static fpSymGetModuleInfo64 fSymGetModuleInfo64; + typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64); static fpSymFunctionTableAccess64 fSymFunctionTableAccess64; @@ -145,6 +148,9 @@ static fpSymSetOptions fSymSetOptions; typedef BOOL (WINAPI *fpSymInitialize)(HANDLE, PCSTR, BOOL); static fpSymInitialize fSymInitialize; +typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID); +static fpEnumerateLoadedModules fEnumerateLoadedModules; + static bool load64BitDebugHelp(void) { HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll"); if (hLib) { @@ -156,14 +162,20 @@ static bool load64BitDebugHelp(void) { ::GetProcAddress(hLib, "SymGetSymFromAddr64"); fSymGetLineFromAddr64 = (fpSymGetLineFromAddr64) ::GetProcAddress(hLib, "SymGetLineFromAddr64"); + fSymGetModuleInfo64 = (fpSymGetModuleInfo64) + ::GetProcAddress(hLib, "SymGetModuleInfo64"); fSymFunctionTableAccess64 = (fpSymFunctionTableAccess64) ::GetProcAddress(hLib, "SymFunctionTableAccess64"); fSymSetOptions = (fpSymSetOptions)::GetProcAddress(hLib, "SymSetOptions"); fSymInitialize = (fpSymInitialize)::GetProcAddress(hLib, "SymInitialize"); + fEnumerateLoadedModules = (fpEnumerateLoadedModules) + ::GetProcAddress(hLib, "EnumerateLoadedModules64"); } return fStackWalk64 && fSymInitialize && fSymSetOptions; } +using namespace llvm; + // Forward declare. static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep); static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType); @@ -172,7 +184,6 @@ static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType); static void (*InterruptFunction)() = 0; static std::vector<std::string> *FilesToRemove = NULL; -static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0; static bool RegisteredUnhandledExceptionFilter = false; static bool CleanupExecuted = false; static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL; @@ -183,23 +194,106 @@ static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL; static CRITICAL_SECTION CriticalSection; static bool CriticalSectionInitialized = false; -static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, - HANDLE hThread, STACKFRAME64 &StackFrame, - CONTEXT *Context) { - DWORD machineType; +enum { #if defined(_M_X64) - machineType = IMAGE_FILE_MACHINE_AMD64; + NativeMachineType = IMAGE_FILE_MACHINE_AMD64 #else - machineType = IMAGE_FILE_MACHINE_I386; + NativeMachineType = IMAGE_FILE_MACHINE_I386 #endif +}; + +static bool printStackTraceWithLLVMSymbolizer(llvm::raw_ostream &OS, + HANDLE hProcess, HANDLE hThread, + STACKFRAME64 &StackFrameOrig, + CONTEXT *ContextOrig) { + // StackWalk64 modifies the incoming stack frame and context, so copy them. 
+ STACKFRAME64 StackFrame = StackFrameOrig; + + // Copy the register context so that we don't modify it while we unwind. We + // could use InitializeContext + CopyContext, but that's only required to get + // at AVX registers, which typically aren't needed by StackWalk64. Reduce the + // flag set to indicate that there's less data. + CONTEXT Context = *ContextOrig; + Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + static void *StackTrace[256]; + size_t Depth = 0; + while (fStackWalk64(NativeMachineType, hProcess, hThread, &StackFrame, + &Context, 0, fSymFunctionTableAccess64, + fSymGetModuleBase64, 0)) { + if (StackFrame.AddrFrame.Offset == 0) + break; + StackTrace[Depth++] = (void *)(uintptr_t)StackFrame.AddrPC.Offset; + if (Depth >= array_lengthof(StackTrace)) + break; + } + + return printSymbolizedStackTrace(&StackTrace[0], Depth, OS); +} + +namespace { +struct FindModuleData { + void **StackTrace; + int Depth; + const char **Modules; + intptr_t *Offsets; + StringSaver *StrPool; +}; +} + +static BOOL CALLBACK findModuleCallback(WIN32_ELMCB_PCSTR ModuleName, + DWORD64 ModuleBase, ULONG ModuleSize, + void *VoidData) { + FindModuleData *Data = (FindModuleData*)VoidData; + intptr_t Beg = ModuleBase; + intptr_t End = Beg + ModuleSize; + for (int I = 0; I < Data->Depth; I++) { + if (Data->Modules[I]) + continue; + intptr_t Addr = (intptr_t)Data->StackTrace[I]; + if (Beg <= Addr && Addr < End) { + Data->Modules[I] = Data->StrPool->save(ModuleName); + Data->Offsets[I] = Addr - Beg; + } + } + return TRUE; +} +static bool findModulesAndOffsets(void **StackTrace, int Depth, + const char **Modules, intptr_t *Offsets, + const char *MainExecutableName, + StringSaver &StrPool) { + if (!fEnumerateLoadedModules) + return false; + FindModuleData Data; + Data.StackTrace = StackTrace; + Data.Depth = Depth; + Data.Modules = Modules; + Data.Offsets = Offsets; + Data.StrPool = &StrPool; + fEnumerateLoadedModules(GetCurrentProcess(), findModuleCallback, &Data); + return true; +} + +static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, + HANDLE hThread, STACKFRAME64 &StackFrame, + CONTEXT *Context) { // Initialize the symbol handler. fSymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_LOAD_LINES); fSymInitialize(hProcess, NULL, TRUE); + // Try llvm-symbolizer first. llvm-symbolizer knows how to deal with both PDBs + // and DWARF, so it should do a good job regardless of what debug info or + // linker is in use. + if (printStackTraceWithLLVMSymbolizer(OS, hProcess, hThread, StackFrame, + Context)) { + return; + } + while (true) { - if (!fStackWalk64(machineType, hProcess, hThread, &StackFrame, Context, 0, - fSymFunctionTableAccess64, fSymGetModuleBase64, 0)) { + if (!fStackWalk64(NativeMachineType, hProcess, hThread, &StackFrame, + Context, 0, fSymFunctionTableAccess64, + fSymGetModuleBase64, 0)) { break; } @@ -404,7 +498,6 @@ extern "C" VOID WINAPI RtlCaptureContext(PCONTEXT ContextRecord); #endif void llvm::sys::PrintStackTrace(raw_ostream &OS) { - STACKFRAME64 StackFrame = {}; CONTEXT Context = {}; ::RtlCaptureContext(&Context); @@ -436,8 +529,6 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) { /// to the process. The handler can have a cookie passed to it to identify /// what instance of the handler it is. 
void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { - if (CallBacksToRun == 0) - CallBacksToRun = new std::vector<std::pair<void(*)(void*), void*> >(); CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie)); RegisterHandler(); LeaveCriticalSection(&CriticalSection); } @@ -454,17 +545,12 @@ static void Cleanup() { CleanupExecuted = true; // FIXME: open files cannot be deleted. - if (FilesToRemove != NULL) while (!FilesToRemove->empty()) { llvm::sys::fs::remove(FilesToRemove->back()); FilesToRemove->pop_back(); } - - if (CallBacksToRun) - for (auto &I : *CallBacksToRun) - I.first(I.second); - + llvm::sys::RunSignalHandlers(); LeaveCriticalSection(&CriticalSection); } diff --git a/contrib/llvm/lib/Support/Windows/WindowsSupport.h b/contrib/llvm/lib/Support/Windows/WindowsSupport.h index 5bb0b8d..60490f2 100644 --- a/contrib/llvm/lib/Support/Windows/WindowsSupport.h +++ b/contrib/llvm/lib/Support/Windows/WindowsSupport.h @@ -26,12 +26,16 @@ #undef _WIN32_WINNT #undef _WIN32_IE -// Require at least Windows XP(5.1) API. -#define _WIN32_WINNT 0x0501 -#define _WIN32_IE 0x0600 // MinGW at it again. +// Require at least Windows 7 API. +#define _WIN32_WINNT 0x0601 +#define _WIN32_IE 0x0800 // MinGW at it again. FIXME: verify if still needed. #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" // Get build system configuration settings @@ -43,17 +47,42 @@ #include <string> #include <vector> +/// Determines if the program is running on Windows 8 or newer. This +/// reimplements one of the helpers in the Windows 8.1 SDK, which are intended +/// to supersede raw calls to GetVersionEx. Old SDKs, Cygwin, and MinGW don't +/// yet have VersionHelpers.h, so we have our own helper. +inline bool RunningWindows8OrGreater() { + // Windows 8 is version 6.2, service pack 0.
+ OSVERSIONINFOEXW osvi = {}; + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + osvi.dwMajorVersion = 6; + osvi.dwMinorVersion = 2; + osvi.wServicePackMajor = 0; + + DWORDLONG Mask = 0; + Mask = VerSetConditionMask(Mask, VER_MAJORVERSION, VER_GREATER_EQUAL); + Mask = VerSetConditionMask(Mask, VER_MINORVERSION, VER_GREATER_EQUAL); + Mask = VerSetConditionMask(Mask, VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL); + + return VerifyVersionInfoW(&osvi, VER_MAJORVERSION | VER_MINORVERSION | + VER_SERVICEPACKMAJOR, + Mask) != FALSE; +} + inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) { if (!ErrMsg) return true; char *buffer = NULL; + DWORD LastError = GetLastError(); DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM, - NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL); + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_MAX_WIDTH_MASK, + NULL, LastError, 0, (LPSTR)&buffer, 1, NULL); if (R) - *ErrMsg = prefix + buffer; + *ErrMsg = prefix + ": " + buffer; else - *ErrMsg = prefix + "Unknown error"; + *ErrMsg = prefix + ": Unknown error"; + *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")"; LocalFree(buffer); return R != 0; diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp index d55da5e..c4384ca 100644 --- a/contrib/llvm/lib/Support/YAMLParser.cpp +++ b/contrib/llvm/lib/Support/YAMLParser.cpp @@ -801,7 +801,7 @@ Token &Scanner::peekNext() { removeStaleSimpleKeyCandidates(); SimpleKey SK; - SK.Tok = TokenQueue.front(); + SK.Tok = TokenQueue.begin(); if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) == SimpleKeys.end()) break; @@ -962,10 +962,8 @@ void Scanner::skip(uint32_t Distance) { bool Scanner::isBlankOrBreak(StringRef::iterator Position) { if (Position == End) return false; - if ( *Position == ' ' || *Position == '\t' - || *Position == '\r' || *Position == '\n') - return true; - return false; + return *Position == ' ' || *Position == '\t' || *Position == '\r' || + *Position == '\n'; } bool Scanner::consumeLineBreakIfPresent() { @@ -1163,7 +1161,7 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) { TokenQueue.push_back(T); // [ and { may begin a simple key. - saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); + saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); // And may also be followed by a simple key. IsSimpleKeyAllowed = true; @@ -1326,7 +1324,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { T.Range = StringRef(Start, Current - Start); TokenQueue.push_back(T); - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1404,7 +1402,7 @@ bool Scanner::scanPlainScalar() { TokenQueue.push_back(T); // Plain scalars can be simple keys. - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1439,7 +1437,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) { TokenQueue.push_back(T); // Alias and anchors can be simple keys. - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1669,7 +1667,7 @@ bool Scanner::scanTag() { TokenQueue.push_back(T); // Tags can be simple keys. 
- saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; diff --git a/contrib/llvm/lib/Support/YAMLTraits.cpp b/contrib/llvm/lib/Support/YAMLTraits.cpp index 6b59a16..2aa6e9b 100644 --- a/contrib/llvm/lib/Support/YAMLTraits.cpp +++ b/contrib/llvm/lib/Support/YAMLTraits.cpp @@ -332,17 +332,12 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) { StringRef KeyStr = SN->getValue(StringStorage); if (!StringStorage.empty()) { // Copy string to permanent storage - unsigned Len = StringStorage.size(); - char *Buf = StringAllocator.Allocate<char>(Len); - memcpy(Buf, &StringStorage[0], Len); - KeyStr = StringRef(Buf, Len); + KeyStr = StringStorage.str().copy(StringAllocator); } return llvm::make_unique<ScalarHNode>(N, KeyStr); } else if (BlockScalarNode *BSN = dyn_cast<BlockScalarNode>(N)) { - StringRef Value = BSN->getValue(); - char *Buf = StringAllocator.Allocate<char>(Value.size()); - memcpy(Buf, Value.data(), Value.size()); - return llvm::make_unique<ScalarHNode>(N, StringRef(Buf, Value.size())); + StringRef ValueCopy = BSN->getValue().copy(StringAllocator); + return llvm::make_unique<ScalarHNode>(N, ValueCopy); } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) { auto SQHNode = llvm::make_unique<SequenceHNode>(N); for (Node &SN : *SQ) { @@ -365,10 +360,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) { StringRef KeyStr = KeyScalar->getValue(StringStorage); if (!StringStorage.empty()) { // Copy string to permanent storage - unsigned Len = StringStorage.size(); - char *Buf = StringAllocator.Allocate<char>(Len); - memcpy(Buf, &StringStorage[0], Len); - KeyStr = StringRef(Buf, Len); + KeyStr = StringStorage.str().copy(StringAllocator); } auto ValueHNode = this->createHNodes(KVN.getValue()); if (EC) diff --git a/contrib/llvm/lib/Support/raw_ostream.cpp b/contrib/llvm/lib/Support/raw_ostream.cpp index 42f830b..15813fd 100644 --- a/contrib/llvm/lib/Support/raw_ostream.cpp +++ b/contrib/llvm/lib/Support/raw_ostream.cpp @@ -57,6 +57,10 @@ #endif #endif +#ifdef LLVM_ON_WIN32 +#include "Windows/WindowsSupport.h" +#endif + using namespace llvm; raw_ostream::~raw_ostream() { @@ -517,7 +521,7 @@ raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, /// closes the file when the stream is destroyed. raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered) : raw_pwrite_stream(unbuffered), FD(fd), ShouldClose(shouldClose), - Error(false), UseAtomicWrites(false) { + Error(false) { if (FD < 0 ) { ShouldClose = false; return; @@ -567,22 +571,21 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { assert(FD >= 0 && "File already closed."); pos += Size; - do { - ssize_t ret; - - // Check whether we should attempt to use atomic writes. - if (LLVM_LIKELY(!UseAtomicWrites)) { - ret = ::write(FD, Ptr, Size); - } else { - // Use ::writev() where available. -#if defined(HAVE_WRITEV) - const void *Addr = static_cast<const void *>(Ptr); - struct iovec IOV = {const_cast<void *>(Addr), Size }; - ret = ::writev(FD, &IOV, 1); +#ifndef LLVM_ON_WIN32 + bool ShouldWriteInChunks = false; #else - ret = ::write(FD, Ptr, Size); + // Writing a large amount of output to the Windows console returns ENOMEM. + // It seems that, prior to Windows 8, WriteFile() redirects to WriteConsole(), + // and the latter has a size limit (66000 bytes or less, depending on heap usage).
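// Aside: the chunking below caps each ::write() at 32767 bytes, comfortably
// under the observed ~66000-byte WriteConsole() ceiling, and only when FD is
// actually a console (::_isatty) on a pre-Windows 8 system; files and pipes
// keep full-sized writes. Roughly, as a sketch (EINTR/retry handling elided):
//
//   while (Size > 0) {
//     size_t Chunk = ShouldWriteInChunks ? std::min<size_t>(Size, 32767)
//                                        : Size;
//     ssize_t Ret = ::write(FD, Ptr, Chunk);
//     if (Ret < 0)
//       break;                    // the real code retries recoverable errors
//     Ptr += Ret;
//     Size -= Ret;
//   }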
+ bool ShouldWriteInChunks = !!::_isatty(FD) && !RunningWindows8OrGreater(); #endif - } + + do { + size_t ChunkSize = Size; + if (ChunkSize > 32767 && ShouldWriteInChunks) + ChunkSize = 32767; + + ssize_t ret = ::write(FD, Ptr, ChunkSize); if (ret < 0) { // If it's a recoverable error, swallow it and retry the write. @@ -755,72 +758,15 @@ void raw_string_ostream::write_impl(const char *Ptr, size_t Size) { // raw_svector_ostream //===----------------------------------------------------------------------===// -// The raw_svector_ostream implementation uses the SmallVector itself as the -// buffer for the raw_ostream. We guarantee that the raw_ostream buffer is -// always pointing past the end of the vector, but within the vector -// capacity. This allows raw_ostream to write directly into the correct place, -// and we only need to set the vector size when the data is flushed. +uint64_t raw_svector_ostream::current_pos() const { return OS.size(); } -raw_svector_ostream::raw_svector_ostream(SmallVectorImpl<char> &O, unsigned) - : OS(O) {} - -raw_svector_ostream::raw_svector_ostream(SmallVectorImpl<char> &O) : OS(O) { - init(); -} - -void raw_svector_ostream::init() { - // Set up the initial external buffer. We make sure that the buffer has at - // least 128 bytes free; raw_ostream itself only requires 64, but we want to - // make sure that we don't grow the buffer unnecessarily on destruction (when - // the data is flushed). See the FIXME below. - OS.reserve(OS.size() + 128); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -raw_svector_ostream::~raw_svector_ostream() { - // FIXME: Prevent resizing during this flush(). - flush(); +void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) { + OS.append(Ptr, Ptr + Size); } void raw_svector_ostream::pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) { - flush(); - memcpy(OS.begin() + Offset, Ptr, Size); -} - -/// resync - This is called when the SmallVector we're appending to is changed -/// outside of the raw_svector_ostream's control. It is only safe to do this -/// if the raw_svector_ostream has previously been flushed. -void raw_svector_ostream::resync() { - assert(GetNumBytesInBuffer() == 0 && "Didn't flush before mutating vector"); - - if (OS.capacity() - OS.size() < 64) - OS.reserve(OS.capacity() * 2); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) { - if (Ptr == OS.end()) { - // Grow the buffer to include the scratch area without copying. 
- size_t NewSize = OS.size() + Size; - assert(NewSize <= OS.capacity() && "Invalid write_impl() call!"); - OS.set_size(NewSize); - } else { - assert(!GetNumBytesInBuffer()); - OS.append(Ptr, Ptr + Size); - } - - OS.reserve(OS.size() + 64); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -uint64_t raw_svector_ostream::current_pos() const { - return OS.size(); -} - -StringRef raw_svector_ostream::str() { - flush(); - return StringRef(OS.begin(), OS.size()); + memcpy(OS.data() + Offset, Ptr, Size); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/TableGen/Record.cpp b/contrib/llvm/lib/TableGen/Record.cpp index c9a31b6..11e35b7 100644 --- a/contrib/llvm/lib/TableGen/Record.cpp +++ b/contrib/llvm/lib/TableGen/Record.cpp @@ -673,6 +673,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { PrintFatalError(CurRec->getLoc(), "Undefined reference:'" + Name + "'\n"); } + + if (isa<IntRecTy>(getType())) { + if (BitsInit *BI = dyn_cast<BitsInit>(LHS)) { + if (Init *NewInit = BI->convertInitializerTo(IntRecTy::get())) + return NewInit; + break; + } + } } break; } @@ -714,7 +722,7 @@ Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const { std::string UnOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break; case HEAD: Result = "!head"; break; case TAIL: Result = "!tail"; break; @@ -842,7 +850,7 @@ Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const { std::string BinOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case CONCAT: Result = "!con"; break; case ADD: Result = "!add"; break; case AND: Result = "!and"; break; @@ -1046,7 +1054,7 @@ Init *TernOpInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *lhs = LHS->resolveReferences(R, RV); - if (Opc == IF && lhs != LHS) { + if (getOpcode() == IF && lhs != LHS) { IntInit *Value = dyn_cast<IntInit>(lhs); if (Init *I = lhs->convertInitializerTo(IntRecTy::get())) Value = dyn_cast<IntInit>(I); @@ -1074,7 +1082,7 @@ Init *TernOpInit::resolveReferences(Record &R, std::string TernOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case SUBST: Result = "!subst"; break; case FOREACH: Result = "!foreach"; break; case IF: Result = "!if"; break; @@ -1633,7 +1641,7 @@ void Record::dump() const { errs() << *this; } raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) { OS << R.getNameInitAsString(); - const std::vector<Init *> &TArgs = R.getTemplateArgs(); + ArrayRef<Init *> TArgs = R.getTemplateArgs(); if (!TArgs.empty()) { OS << "<"; bool NeedComma = false; diff --git a/contrib/llvm/lib/TableGen/SetTheory.cpp b/contrib/llvm/lib/TableGen/SetTheory.cpp index 07c5381..f56b17a 100644 --- a/contrib/llvm/lib/TableGen/SetTheory.cpp +++ b/contrib/llvm/lib/TableGen/SetTheory.cpp @@ -196,7 +196,7 @@ struct SequenceOp : public SetTheory::Operator { if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[2])) To = II->getValue(); else - PrintFatalError(Loc, "From must be an integer: " + Expr->getAsString()); + PrintFatalError(Loc, "To must be an integer: " + Expr->getAsString()); if (To < 0 || To >= (1 << 30)) PrintFatalError(Loc, "To out of range"); diff --git a/contrib/llvm/lib/TableGen/TGParser.cpp b/contrib/llvm/lib/TableGen/TGParser.cpp index 5c36fda..1506a71 100644 --- a/contrib/llvm/lib/TableGen/TGParser.cpp +++ 
b/contrib/llvm/lib/TableGen/TGParser.cpp @@ -77,7 +77,8 @@ bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) { /// SetValue - /// Return true on error, false on success. bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, - const std::vector<unsigned> &BitList, Init *V) { + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment) { if (!V) return false; if (!CurRec) CurRec = &CurMultiClass->Rec; @@ -91,8 +92,8 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, // in the resolution machinery. if (BitList.empty()) if (VarInit *VI = dyn_cast<VarInit>(V)) - if (VI->getNameInit() == ValName) - return false; + if (VI->getNameInit() == ValName && !AllowSelfAssignment) + return true; // If we are assigning to a subset of the bits in the value... then we must be // assigning to a field of BitsRecTy, which must have a BitsInit @@ -152,7 +153,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { if (AddValue(CurRec, SubClass.RefRange.Start, Val)) return true; - const std::vector<Init *> &TArgs = SC->getTemplateArgs(); + ArrayRef<Init *> TArgs = SC->getTemplateArgs(); // Ensure that an appropriate number of template arguments are specified. if (TArgs.size() < SubClass.TemplateArgs.size()) @@ -165,7 +166,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { if (i < SubClass.TemplateArgs.size()) { // If a value is specified for this template arg, set it now. if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i], - std::vector<unsigned>(), SubClass.TemplateArgs[i])) + None, SubClass.TemplateArgs[i])) return true; // Resolve it next. @@ -228,7 +229,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, CurMC->DefPrototypes.push_back(std::move(NewDef)); } - const std::vector<Init *> &SMCTArgs = SMC->Rec.getTemplateArgs(); + ArrayRef<Init *> SMCTArgs = SMC->Rec.getTemplateArgs(); // Ensure that an appropriate number of template arguments are // specified. @@ -243,8 +244,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, // If a value is specified for this template arg, set it in the // superclass now. if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i], - std::vector<unsigned>(), - SubMultiClass.TemplateArgs[i])) + None, SubMultiClass.TemplateArgs[i])) return true; // Resolve it next. @@ -258,8 +258,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, for (const auto &Def : makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) { if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i], - std::vector<unsigned>(), - SubMultiClass.TemplateArgs[i])) + None, SubMultiClass.TemplateArgs[i])) return true; // Resolve it next. @@ -332,8 +331,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); - if (SetValue(IterRec.get(), Loc, IterVar->getName(), - std::vector<unsigned>(), IVal)) + if (SetValue(IterRec.get(), Loc, IterVar->getName(), None, IVal)) return Error(Loc, "when instantiating this def"); // Resolve it next. 
@@ -1641,7 +1639,7 @@ std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, RecTy *ItemType = EltTy; unsigned int ArgN = 0; if (ArgsRec && !EltTy) { - const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs(); + ArrayRef<Init *> TArgs = ArgsRec->getTemplateArgs(); if (TArgs.empty()) { TokError("template argument provided to non-template class"); return std::vector<Init*>(); @@ -1662,7 +1660,7 @@ std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, Lex.Lex(); // Eat the comma if (ArgsRec && !EltTy) { - const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs(); + ArrayRef<Init *> TArgs = ArgsRec->getTemplateArgs(); if (ArgN >= TArgs.size()) { TokError("too many template arguments"); return std::vector<Init*>(); @@ -1728,7 +1726,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, SMLoc ValLoc = Lex.getLoc(); Init *Val = ParseValue(CurRec, Type); if (!Val || - SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val)) + SetValue(CurRec, ValLoc, DeclName, None, Val)) // Return the name, even if an error is thrown. This is so that we can // continue to make some progress, even without the value having been // initialized. @@ -2313,13 +2311,11 @@ bool TGParser::ParseMultiClass() { return false; } -Record *TGParser:: -InstantiateMulticlassDef(MultiClass &MC, - Record *DefProto, - Init *&DefmPrefix, - SMRange DefmPrefixRange, - const std::vector<Init *> &TArgs, - std::vector<Init *> &TemplateVals) { +Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, + Init *&DefmPrefix, + SMRange DefmPrefixRange, + ArrayRef<Init *> TArgs, + std::vector<Init *> &TemplateVals) { // We need to preserve DefProto so it can be reused for later // instantiations, so create a new Record to inherit from it. @@ -2360,8 +2356,8 @@ InstantiateMulticlassDef(MultiClass &MC, // Set the value for NAME. We don't resolve references to it 'til later, // though, so that uses in nested multiclass names don't get // confused. - if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", - std::vector<unsigned>(), DefmPrefix)) { + if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", None, DefmPrefix, + /*AllowSelfAssignment*/true)) { Error(DefmPrefixRange.Start, "Could not resolve " + CurRec->getNameInitAsString() + ":NAME to '" + DefmPrefix->getAsUnquotedString() + "'"); @@ -2437,11 +2433,9 @@ InstantiateMulticlassDef(MultiClass &MC, return CurRec.release(); } -bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, - Record *CurRec, - SMLoc DefmPrefixLoc, - SMLoc SubClassLoc, - const std::vector<Init *> &TArgs, +bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, Record *CurRec, + SMLoc DefmPrefixLoc, SMLoc SubClassLoc, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals, bool DeleteArgs) { // Loop over all of the template arguments, setting them to the specified @@ -2450,8 +2444,7 @@ bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, // Check if a value is specified for this temp-arg. if (i < TemplateVals.size()) { // Set it now. - if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(), - TemplateVals[i])) + if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], None, TemplateVals[i])) return true; // Resolve it next. @@ -2540,7 +2533,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { std::vector<Init*> &TemplateVals = Ref.TemplateArgs; // Verify that the correct number of template arguments were specified. 
- const std::vector<Init *> &TArgs = MC->Rec.getTemplateArgs(); + ArrayRef<Init *> TArgs = MC->Rec.getTemplateArgs(); if (TArgs.size() < TemplateVals.size()) return Error(SubClassLoc, "more template args specified than multiclass expects"); diff --git a/contrib/llvm/lib/TableGen/TGParser.h b/contrib/llvm/lib/TableGen/TGParser.h index d69d1f4..739d9a9 100644 --- a/contrib/llvm/lib/TableGen/TGParser.h +++ b/contrib/llvm/lib/TableGen/TGParser.h @@ -105,10 +105,13 @@ public: private: // Semantic analysis methods. bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV); bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName, - const std::vector<unsigned> &BitList, Init *V); + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment = false); bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName, - const std::vector<unsigned> &BitList, Init *V) { - return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V); + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment = false) { + return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V, + AllowSelfAssignment); } bool AddSubClass(Record *Rec, SubClassReference &SubClass); bool AddSubMultiClass(MultiClass *CurMC, @@ -135,17 +138,13 @@ private: // Parser methods. bool ParseObject(MultiClass *MC); bool ParseClass(); bool ParseMultiClass(); - Record *InstantiateMulticlassDef(MultiClass &MC, - Record *DefProto, - Init *&DefmPrefix, - SMRange DefmPrefixRange, - const std::vector<Init *> &TArgs, + Record *InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, + Init *&DefmPrefix, SMRange DefmPrefixRange, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals); - bool ResolveMulticlassDefArgs(MultiClass &MC, - Record *DefProto, - SMLoc DefmPrefixLoc, - SMLoc SubClassLoc, - const std::vector<Init *> &TArgs, + bool ResolveMulticlassDefArgs(MultiClass &MC, Record *DefProto, + SMLoc DefmPrefixLoc, SMLoc SubClassLoc, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals, bool DeleteArgs); bool ResolveMulticlassDef(MultiClass &MC, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 9a7d6c8..46ef2c1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". 
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", + "Reserve X18, making it unavailable " + "as a GPR">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -70,19 +91,29 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone", @@ -90,17 +121,31 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureNEON, FeatureCrypto, FeatureCRC, + FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing]>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; +// FIXME: Exynos-M1 is currently modelled without a specific SchedModel. 
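[Note: the `ProcessorModel` records above are what `-mcpu=` resolves against: a CPU name selects a scheduling model plus a feature bundle, which is why `cortex-a35` can borrow `CortexA53Model` while `exynos-m1` falls back to `NoSchedModel` until a dedicated model lands. Sketched as a plain lookup table; the names come from the diff, the structure is hypothetical:

    #include <map>
    #include <string>
    #include <vector>

    struct CPUInfo {
      std::string SchedModel;            // e.g. "CortexA53Model"
      std::vector<std::string> Features; // bundle from the Proc* record
    };

    static const std::map<std::string, CPUInfo> CPUTable = {
        {"cortex-a35", {"CortexA53Model", {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
        {"cortex-a53", {"CortexA53Model", {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
        {"exynos-m1",  {"NoSchedModel",   {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
    };
]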
+def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser @@ -109,11 +154,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def GenericAsmParserVariant : AsmParserVariant { int Variant = 0; string Name = "generic"; + string BreakCharacters = "."; } def AppleAsmParserVariant : AsmParserVariant { int Variant = 1; string Name = "apple-neon"; + string BreakCharacters = "."; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index d7ef3f4..d215d9e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // Get the previous machine basic block in the function. - MachineFunction::iterator MBBI = *MBB; + MachineFunction::iterator MBBI(MBB); // Can't go off top of function. if (MBBI == MBB->getParent()->begin()) @@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 2> Cond; - MachineBasicBlock *PrevBB = std::prev(MBBI); + MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && !FBB) @@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, // If there is no non-pseudo in the current block, loop back around and try // the previous block (if there is one). while ((FMBB = getBBFallenThrough(FMBB, TII))) { - for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) { - if (!I->isPseudo()) - return &*I; - } + for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend())) + if (!I.isPseudo()) + return &I; } // There was no previous non-pseudo in the fallen through blocks @@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { ++Idx; } - DEBUG(dbgs() << "Scan complete, "<< Sequences.size() - << " occurences of pattern found.\n"); + DEBUG(dbgs() << "Scan complete, " << Sequences.size() + << " occurrences of pattern found.\n"); // Then update the basic block, inserting nops between the detected sequences. for (auto &MI : Sequences) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 9d6dbd6..3d1ab4e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -158,7 +158,7 @@ INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, "AArch64 A57 FP Load-Balancing", false, false) namespace { -/// A Chain is a sequence of instructions that are linked together by +/// A Chain is a sequence of instructions that are linked together by /// an accumulation operand. For example: /// /// fmul d0<def>, ? @@ -285,7 +285,7 @@ public: std::string str() const { std::string S; raw_string_ostream OS(S); - + OS << "{"; StartInst->print(OS, /* SkipOpers= */true); OS << " -> "; @@ -427,7 +427,7 @@ Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, return Ch; } } - + // Bailout case - just return the first item. 
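[Note: the `MachineFunction::iterator MBBI(MBB)` and `&*std::prev(MBBI)` rewrites above track an LLVM-wide ilist change that removed the implicit conversions between node pointers and iterators, so positions and nodes must now be converted explicitly. The same discipline shown with a plain `std::list`, as a standalone analogy rather than the ilist code itself:

    #include <cassert>
    #include <list>

    int demo() {
      std::list<int> Blocks = {10, 20, 30};
      auto It = std::next(Blocks.begin()); // iterator from a position, like MBBI(MBB)
      int *Prev = &*std::prev(It);         // node pointer via explicit &*, like &*std::prev(MBBI)
      assert(*Prev == 10);
      return *Prev;
    }
]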
Chain *Ch = L.front(); L.erase(L.begin()); @@ -495,7 +495,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, RS.enterBasicBlock(&MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); - // Can we find an appropriate register that is available throughout the life + // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); @@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 716e1a3..3afcdfb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden, " the other."), cl::init(true)); +#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" + //===----------------------------------------------------------------------===// // AArch64AddressTypePromotion //===----------------------------------------------------------------------===// @@ -76,7 +78,7 @@ public: } const char *getPassName() const override { - return "AArch64 Address Type Promotion"; + return AARCH64_TYPE_PROMO_NAME; } /// Iterate over the functions and promote the computation of interesting @@ -143,10 +145,10 @@ private: char AArch64AddressTypePromotion::ID = 0; INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) FunctionPass *llvm::createAArch64AddressTypePromotionPass() { return new AArch64AddressTypePromotion(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 18d21fd..1644d71 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); +namespace llvm { +void initializeAArch64AdvSIMDScalarPass(PassRegistry &); +} + +#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" + namespace { class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; @@ -82,12 +88,14 @@ private: public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { + initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; + return AARCH64_ADVSIMD_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -98,6 +106,9 @@ public: char AArch64AdvSIMDScalar::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", + AARCH64_ADVSIMD_NAME, false, false) + static bool isGPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (SubReg) @@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) + if (processMachineBasicBlock(&*I)) Changed = true; return Changed; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp index d973234..a614f55 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); +namespace llvm { +void initializeAArch64BranchRelaxationPass(PassRegistry &); +} + +#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass" + namespace { class AArch64BranchRelaxation : public MachineFunctionPass { /// BasicBlockInfo - Information about the offset and size of a single @@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass { public: static char ID; - AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + AArch64BranchRelaxation() : MachineFunctionPass(ID) { + initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 branch relaxation pass"; + return AARCH64_BR_RELAX_NAME; } }; char AArch64BranchRelaxation::ID = 0; } +INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax", + AARCH64_BR_RELAX_NAME, false, false) + /// verify - check BBOffsets, BBSizes, alignment of islands void AArch64BranchRelaxation::verify() { #ifndef NDEBUG @@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI(MBB); // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); + auto NextBB = std::next(MBBI); if (NextBB == MBB->getParent()->end()) return false; for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) + if (S == &*NextBB) return true; return false; @@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
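[Note: the recurring hunk shape above (declare `initializeFooPass(PassRegistry &)`, call it from the constructor, add an `INITIALIZE_PASS` sharing a name macro with `getPassName`) is the standard way to register a machine pass so tools can address it by name. A condensed skeleton of the pattern with placeholder names, compilable only against LLVM headers of this era:

    #include "llvm/CodeGen/MachineFunctionPass.h"
    #include "llvm/PassRegistry.h"
    using namespace llvm;

    namespace llvm {
    void initializeExamplePassPass(PassRegistry &); // INITIALIZE_PASS generates the definition
    }

    #define EXAMPLE_PASS_NAME "Example machine pass"

    namespace {
    class ExamplePass : public MachineFunctionPass {
    public:
      static char ID;
      ExamplePass() : MachineFunctionPass(ID) {
        initializeExamplePassPass(*PassRegistry::getPassRegistry());
      }
      bool runOnMachineFunction(MachineFunction &) override { return false; }
      const char *getPassName() const override { return EXAMPLE_PASS_NAME; }
    };
    char ExamplePass::ID = 0;
    } // end anonymous namespace

    INITIALIZE_PASS(ExamplePass, "example-pass", EXAMPLE_PASS_NAME, false, false)

The macro keeps the human-readable name in one place, which is exactly what the AARCH64_*_NAME defines in these hunks do.]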
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); + MF->insert(++OrigBB->getIterator(), NewBB); // Splice the instructions starting with MI over to NewBB. NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); @@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MBB->replaceSuccessor(FBB, NewBB); NewBB->addSuccessor(FBB); } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB)); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << ", invert condition and change dest. to BB#" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 1e2d1c3..bc44bc5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,30 +25,28 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = State.getMachineFunction() - .getTarget() - .getDataLayout() - ->getStackAlignment(); + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); for (auto &It : PendingMembers) { @@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
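[Note: switching the register tables from `uint16_t` to `MCPhysReg` above is type hygiene (the width was the same at the time, but the intent becomes explicit), and `ArrayRef<MCPhysReg>` then binds directly to whichever C array matches. How block allocation consumes such a table, as a simplified standalone sketch of "N consecutive free registers", not the real calling-convention code:

    #include <cstdint>
    #include <set>
    #include <vector>

    using MCPhysRegLike = uint16_t; // stand-in for llvm::MCPhysReg

    // Find NumMembers consecutive unallocated entries in RegList, mimicking
    // the contiguous-block search a custom CC routine performs.
    static int findRegBlock(const std::vector<MCPhysRegLike> &RegList,
                            unsigned NumMembers,
                            const std::set<MCPhysRegLike> &Allocated) {
      for (size_t I = 0; I + NumMembers <= RegList.size(); ++I) {
        bool Free = true;
        for (unsigned J = 0; J < NumMembers; ++J)
          Free &= !Allocated.count(RegList[I + J]);
        if (Free)
          return static_cast<int>(I);
      }
      return -1; // no block; fall back to the stack
    }
]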
- ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 815ebef..388d64e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 06ff9af..9310ac4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass { *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); // Insert a copy from X0 to TLSBaseAddrReg for later. 
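[Note: the new `CSR_AArch64_CXX_TLS_Darwin` definitions above are built with TableGen set algebra: `add` unions register sequences and `sub` removes members, so "X1..X28 except X15..X18" is expressed directly. The same computation in ordinary code, register names only and purely illustrative:

    #include <set>
    #include <string>

    static std::set<std::string> xRange(unsigned Lo, unsigned Hi) {
      std::set<std::string> S;
      for (unsigned I = Lo; I <= Hi; ++I)
        S.insert("X" + std::to_string(I));
      return S;
    }

    // (sub (sequence "X%u", 1, 28), X15, X16, X17, X18)
    static std::set<std::string> cxxTLSGPRs() {
      std::set<std::string> S = xRange(1, 28);
      for (unsigned R : {15u, 16u, 17u, 18u})
        S.erase("X" + std::to_string(R));
      return S;
    }
]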
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(AArch64::X0); + MachineInstr *Copy = + BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(AArch64::X0); return Copy; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index efdb2e3..78c239b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -168,6 +168,8 @@ namespace llvm { void initializeAArch64CollectLOHPass(PassRegistry &); } +#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)" + namespace { struct AArch64CollectLOH : public MachineFunctionPass { static char ID; @@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 Collect Linker Optimization Hint (LOH)"; + return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg; char AArch64CollectLOH::ID = 0; INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) /// Given a couple (MBB, reg) get the corresponding set of instruction from /// the given "sets". @@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF, for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); + // If this alias has not been recorded, then it is not interesting + // for the current analysis. + // We can end up in this situation because of tuple registers. + // E.g., Let say we are interested in S1. When we register + // S1, we will also register its aliases and in particular + // the tuple Q1_Q2. + // Now, when we encounter Q1_Q2, we will look through its aliases + // and will find that S2 is not registered. + if (ItRegId == RegToId.end()) + continue; + BBKillSet.set(ItRegId->second); BBGen[ItRegId->second] = &MI; } @@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; + case AArch64::STRBBui: + case AArch64::STRHHui: case AArch64::STRBui: case AArch64::STRHui: case AArch64::STRWui: @@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs, bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); // If the chain is three instructions long and ldr is the second element, // then this ldr must load form GOT, otherwise this is not a correct chain. 
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + if (L2 && !IsL2Add && + !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) continue; SmallVector<const MachineInstr *, 3> Args; MCLOHType Kind; @@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, DEBUG(dbgs() << "** Collect Involved Register\n"); for (const auto &MBB : MF) { for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) + if (!canDefBePartOfLOH(&MI) && + !isCandidateLoad(&MI) && !isCandidateStore(&MI)) continue; // Process defs diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index b9e41c6..fc27bfe 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -59,6 +59,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. case AArch64::ADDSWri: - case AArch64::ADDSXri: - if (MRI->use_empty(I->getOperand(0).getReg())) - return I; - - DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); - return nullptr; - + case AArch64::ADDSXri: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); + if (!I->getOperand(2).isImm()) { + DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + return nullptr; + } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { + DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n'); + return nullptr; + } else if (!MRI->use_empty(I->getOperand(0).getReg())) { + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + } + return I; + } // Prevent false positive case like: // cmp w19, #0 // cinc w0, w19, gt diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2b0c92f..df1320f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { MIOperands::PhysRegInfo PRI = MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); - if (PRI.Reads) { + if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original // compare, so reject the transform if there are uses of the flags // besides the terminators. @@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } - if (PRI.Clobbers) { + if (PRI.Defined || PRI.Clobbered) { DEBUG(dbgs() << "Not convertible compare: " << *I); ++NumUnknNZCVDefs; return nullptr; @@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. 
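[Note: the CollectLOH change at the top of this hunk, from `getTargetFlags() != AArch64II::MO_GOT` to `!(... & AArch64II::MO_GOT)`, is the classic flags-as-bitmask fix: once operand flags can carry several bits at once, an equality test silently misses combined values. In miniature, with illustrative flag values rather than AArch64II's:

    enum OperandFlags : unsigned {
      MO_PAGE    = 1u << 0,
      MO_PAGEOFF = 1u << 1,
      MO_GOT     = 1u << 2,
      MO_NC      = 1u << 3,
    };

    // Wrong: fails for (MO_GOT | MO_PAGE) even though the GOT bit is set.
    bool loadsFromGOTBuggy(unsigned Flags) { return Flags == MO_GOT; }

    // Right: test the bit, ignore whatever else is set.
    bool loadsFromGOT(unsigned Flags) { return (Flags & MO_GOT) != 0; }
]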
updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); + Head->removeSuccessor(CmpBB, true); + CmpBB->removeSuccessor(Tail, true); Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->RemoveBranch(*Head); @@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree( // convert() removes CmpBB which was previously dominated by Head. // CmpBB children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB); assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); + DomTree->eraseNode(RemovedMBB); } } @@ -801,8 +801,8 @@ void AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { if (!Loops) return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) + Loops->removeBlock(RemovedMBB); } /// Invalidate MachineTraceMetrics before if-conversion. @@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { Loops = getAnalysisIfAvailable<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 74fc167..576cf4a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -26,6 +26,12 @@ using namespace llvm; STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); +namespace llvm { +void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &); +} + +#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions" + namespace { class AArch64DeadRegisterDefinitions : public MachineFunctionPass { private: @@ -35,11 +41,14 @@ private: bool usesFrameIndex(const MachineInstr &MI); public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeAArch64DeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; - const char *getPassName() const override { return "Dead register definitions"; } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -49,6 +58,9 @@ public: char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs", + AARCH64_DEAD_REG_DEF_NAME, false, false) + bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( unsigned Reg, const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c2470f7..d24e42a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -22,18 +22,26 @@ #include "llvm/Support/MathExtras.h" using namespace llvm; +namespace llvm { +void initializeAArch64ExpandPseudoPass(PassRegistry &); +} + +#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" + namespace { class AArch64ExpandPseudo : public MachineFunctionPass { public: static char ID; - AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + AArch64ExpandPseudo() : MachineFunctionPass(ID) { + initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 pseudo instruction expansion pass"; + return AARCH64_EXPAND_PSEUDO_NAME; } private: @@ -45,6 +53,9 @@ private: char AArch64ExpandPseudo::ID = 0; } +INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", + AARCH64_EXPAND_PSEUDO_NAME, false, false) + /// \brief Transfer implicit operands on the pseudo instruction to the /// instructions created from the expansion. static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 0728198..0ac4b39 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) U = C; } - if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (auto *Ty = dyn_cast<PointerType>(Obj->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. @@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and - // emit an additonal add to take care of the offset register. + // emit an additional add to take care of the offset register. 
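[Note: the `simplifyAddress` comment just above describes a real encoding limit: an AArch64 load/store takes a base plus either an immediate offset or a register offset, never both, so when both are present the immediate is folded away through an extra `add`. Roughly, with a hypothetical helper and pseudo-lowering:

    // Lowering choice for addr = Base + Imm + OffReg on AArch64-like ISAs:
    //   ldr x0, [xBase, #Imm]   ; base + immediate  (one addend)
    //   ldr x0, [xBase, xOff]   ; base + register   (one addend)
    // Both addends at once are not encodable, so materialize one of them:
    //   add xTmp, xBase, #Imm
    //   ldr x0, [xTmp, xOff]
    struct Address {
      unsigned BaseReg;
      unsigned OffsetReg; // 0 means "none"
      long Offset;
    };

    bool needsExtraAdd(const Address &A) {
      return A.Offset != 0 && A.OffsetReg != 0;
    }
]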
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; @@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } // Check if the mul can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } } } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrs, AArch64::SUBXrs }, { AArch64::ADDWrs, AArch64::ADDXrs } }, @@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + if (ShiftImm >= 4) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrx, AArch64::SUBXrx }, { AArch64::ADDWrx, AArch64::ADDXrx } }, @@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, return ResultReg; // Check if the mul can be folded into the instruction. 
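[Note: the new `ShiftImm >= RetVT.getSizeInBits()` bail-outs above guard against shift amounts that are undefined behavior in C++ and unencodable in the instruction anyway; a folded `shl` whose amount reaches the operand width must be rejected rather than emitted. The invariant as a checked standalone helper:

    #include <cassert>
    #include <cstdint>

    // Shifting by >= the bit width is UB in C++ and has no valid encoding
    // in AArch64's shifted-register forms, so callers must pre-filter it.
    uint64_t shiftLeftChecked(uint64_t V, unsigned Amt, unsigned Bits) {
      assert(Bits <= 64 && Amt < Bits && "shift amount out of range");
      return (V << Amt) & (Bits == 64 ? ~0ULL : ((1ULL << Bits) - 1));
    }
]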
- if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); @@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, { AArch64::ORRWrs, AArch64::ORRXrs }, { AArch64::EORWrs, AArch64::EORXrs } }; + + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + const TargetRegisterClass *RC; unsigned Opc; switch (RetVT.SimpleTy) { @@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { MIB.addImm(TestBit); MIB.addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - fastEmitBranch(FBB, DbgLoc); - + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. - CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. 
- uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { @@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); - // Obtain the branch weight and add the target to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - Target->getBasicBlock()); - FuncInfo.MBB->addSuccessor(Target, BranchWeight); + // Obtain the branch probability and add the target to the successor list. + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + BI->getParent(), Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; - - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + for (auto *Succ : BI->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); return true; } @@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { bool AArch64FastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); + // Vectors of i1 are weird: bail out. + if (CI->getType()->isVectorTy()) + return false; + // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; @@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, .addImm(NumBytes); // Process the args. 
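[Note: the FastISel rewrite above replaces raw `uint32_t` edge weights with `BranchProbability` and routes the conditional-branch epilogues through one `finishCondBranch` helper; when no BranchProbabilityInfo exists, `addSuccessorWithoutProb` lets the remaining probability be shared instead of inventing a magic weight. A toy model of that idea, not MachineBasicBlock's actual bookkeeping:

    #include <cstdint>
    #include <vector>

    struct Prob {
      uint32_t N; // numerator over a fixed denominator
      static constexpr uint32_t D = 1u << 31;
    };

    // Successors added "without prob" split whatever probability the
    // explicit edges have not claimed.
    std::vector<Prob> fillUnknown(std::vector<Prob> Edges, unsigned NumUnknown) {
      uint64_t Claimed = 0;
      for (const Prob &P : Edges)
        Claimed += P.N;
      uint32_t Left = Claimed >= Prob::D ? 0 : (uint32_t)(Prob::D - Claimed);
      for (unsigned I = 0; I < NumUnknown; ++I)
        Edges.push_back({Left / NumUnknown});
      return Edges;
    }
]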
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (CCValAssign &VA : ArgLocs) { const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; @@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), - MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; @@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. @@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned RetReg : RetRegs) + MIB.addReg(RetReg, RegState::Implicit); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a76473f..11ae800 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -72,9 +72,9 @@ // // For most functions, some of the frame areas are empty. For those functions, // it may not be necessary to set up fp or bp: -// * A base pointer is definitly needed when there are both VLAs and local +// * A base pointer is definitely needed when there are both VLAs and local // variables with more-than-default alignment requirements. -// * A frame pointer is definitly needed when there are local variables with +// * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // // In some cases when a base pointer is not strictly needed, it is generated @@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout &TD = MF.getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); + int stackGrowth = -TD.getPointerSize(0); // Calculate offsets. int64_t saveAreaOffset = (HasFP ? 
2 : 1) * stackGrowth; @@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; - MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. @@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); + const DataLayout &TD = MF.getDataLayout(); + const int StackGrowth = -TD.getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // @@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { return false; } -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { +/// Checks whether the given instruction restores callee save registers +/// and if so returns how many. 
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { unsigned RtIdx = 0; - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost) + switch (MI.getOpcode()) { + case AArch64::LDPXpost: + case AArch64::LDPDpost: RtIdx = 1; - - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost || - MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) - return false; - return true; + // FALLTHROUGH + case AArch64::LDPXi: + case AArch64::LDPDi: + if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || + MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) + return 0; + return 2; } - - return false; + return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; if (MBB.end() != MBBI) { @@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | + // | (NumRestores * 8) | | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastPopI != Begin) { + --LastPopI; + unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); + NumRestores += Restores; + if (Restores == 0) { ++LastPopI; - --NumRestores; + break; } } - NumBytes -= NumRestores * 16; + NumBytes -= NumRestores * 8; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (NumBytes || MFI->hasVarSizedObjects()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. 
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); + -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { unsigned idx = Count - i - 2; unsigned Reg1 = CSI[idx].getReg(); @@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Check pairs of consecutive callee-saved registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 731f031..427afdf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -37,7 +37,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, @@ -61,6 +60,11 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 772e894..6c86888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -34,7 +34,6 @@ using namespace llvm; namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { - AArch64TargetMachine &TM; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
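[Note: the epilogue hunk above fixes an accounting subtlety: each `LDP` restores two 8-byte callee-saved registers, and the rewritten `getNumCSRestores` now counts registers (2 per `LDP`) rather than instructions, so the stack adjustment becomes `NumRestores * 8` where the old code used instruction count times 16. The arithmetic spelled out with a hypothetical helper:

    // Bytes of callee-save area popped by the epilogue's LDP sequence.
    // Each "ldp xA, xB, [sp, ...]" restores two 8-byte registers.
    unsigned calleeSaveBytes(unsigned NumLdpInstrs) {
      unsigned NumRestoredRegs = NumLdpInstrs * 2; // what getNumCSRestores sums
      return NumRestoredRegs * 8;                  // == NumLdpInstrs * 16, as before
    }

Counting registers also handles a mix of paired and (future) unpaired restores uniformly, which the instruction-count version could not.]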
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {} const char *getPassName() const override { @@ -53,9 +52,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); + ForCodeSize = MF.getFunction()->optForSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -79,6 +76,21 @@ public: bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { return SelectShiftedRegister(N, true, Reg, Shift); } + bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 16, Base, OffImm); + } bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed(N, 1, Base, OffImm); } @@ -153,8 +165,7 @@ public: SDNode *SelectBitfieldExtractOp(SDNode *N); SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); + SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); SDNode *SelectReadRegister(SDNode *N); SDNode *SelectWriteRegister(SDNode *N); @@ -165,6 +176,8 @@ public: private: bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, SDValue &Shift); + bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm); bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, @@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { return true; } -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a // high lane extract. static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, SDValue &LaneOp, int &LaneIdx) { @@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, } // AArch64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, + // register class that could contain the size being extended from. Thus, // if we're folding a (sext i8), we need the RHS to be a GPR32, even though // there might not be an actual 32-bit value in the program. We can // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. @@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, /// need to create a real ADD instruction from it anyway and there's no point in /// folding it into the mem op. 
Theoretically, it shouldn't matter, but there's /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding -/// leads to duplaicated ADRP instructions. +/// leads to duplicated ADRP instructions. static bool isWorthFoldingADDlow(SDValue N) { for (auto Use : N->uses()) { if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && @@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) { return true; } +/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // variant selected here doesn't support labels/immediates, only base+offset. + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) && + RHSC < (0x40 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // stp x1, x2, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. @@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, if (isa<ConstantSDNode>(RHS)) { int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); unsigned Scale = Log2_32(Size); - // Skip the immediate can be seleced by load/store addressing mode. + // Skip immediates that can be selected by the load/store addressing mode. // Also skip the immediate can be encoded by a single ADD (SUB is also // checked by using -ImmOff). if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || @@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { // it into an i64. DstVT = MVT::i32; } + } else if (VT == MVT::f16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ?
AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue SuperReg = SDValue(Ld, 0); EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) @@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); } else { EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); @@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // The resulting code will be at least as good as the original one // plus it may expose more opportunities for bitfield insert pattern. // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM + // some optimizations expect AND and not UBFM. Opd0 = N->getOperand(0); } else return false; @@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { /// Does this tree qualify as an attempt to move a bitfield into position, /// essentially "(and (shl VAL, N), Mask)". static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + bool BiggerPattern, SDValue &Src, int &ShiftAmount, int &MaskWidth) { EVT VT = Op.getValueType(); @@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, Op = Op.getOperand(0); } + // Don't match if the SHL has more than one use, since then we'll end up + // generating SHL+UBFIZ instead of just keeping SHL+AND. + if (!BiggerPattern && !Op.hasOneUse()) + return false; + uint64_t ShlImm; if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) return false; @@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // BFI encompasses sufficiently many nodes that it's worth inserting an extra // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. + // amount. BiggerPattern is true when this pattern is being matched for BFI, + // BiggerPattern is false when this pattern is being matched for UBFIZ, in + // which case it is not profitable to insert an extra shift. 
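// (Illustrative sketch, not part of the patch: with BiggerPattern=false the
// exact match "(and (shl x, 3), 0xf8)" - a 5-bit field moved up to bit 3 -
// selects directly to "UBFIZ Wd, Wn, #3, #5", while a mask that disagrees
// with the shift amount is rejected below and left for the BFI path above.)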
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern) + return false; Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); return true; @@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + // OR is commutative, check all combinations of operand order and values of + // BiggerPattern, i.e. + // Opd0, Opd1, BiggerPattern=false + // Opd1, Opd0, BiggerPattern=false + // Opd0, Opd1, BiggerPattern=true + // Opd1, Opd0, BiggerPattern=true + // Several of these combinations may match, so check with BiggerPattern=false + // first since that will produce better results by matching more instructions + // and/or inserting fewer extra instructions. + for (int I = 0; I < 4; ++I) { + + bool BiggerPattern = I / 2; + SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd1Val = N->getOperand((I + 1) % 2); + SDNode *OrOpd1 = OrOpd1Val.getNode(); + unsigned BFXOpc; int DstLSB, Width; if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { + NumberOfIgnoredLowBits, BiggerPattern)) { // Check that the returned opcode is compatible with the pattern, // i.e., same type and zero extended (U and not S) if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || @@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + BiggerPattern, + Src, DstLSB, Width)) { ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); ImmS = Width - 1; } else @@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, @@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { +/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the +/// equivalent of a left shift by a constant amount followed by an and masking +/// out a contiguous set of bits. +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return nullptr; + EVT VT = N->getValueType(0); - unsigned Variant; unsigned Opc; - unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } + if (VT == MVT::i32) + Opc = AArch64::UBFMWri; + else if (VT == MVT::i64) + Opc = AArch64::UBFMXri; + else + return nullptr; - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector<SDValue, 2> Ops; - Ops.push_back(In); + SDValue Op0; + int DstLSB, Width; + if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, + Op0, DstLSB, Width)) + return nullptr; - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } + // ImmR is the rotate right amount. + unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + // ImmS is the most significant bit of the source to be moved. + unsigned ImmS = Width - 1; - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + SDLoc DL(N); + SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } bool @@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, // into a single value to be used in the MRS/MSR instruction. 
static int getIntOperandFromRegisterString(StringRef RegString) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() == 1) return -1; @@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other, + unsigned State; + if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { + assert(Immed < 2 && "Bad imm"); + State = AArch64::MSRpstateImm1; + } else { + assert(Immed < 16 && "Bad imm"); + State = AArch64::MSRpstateImm4; + } + return CurDAG->getMachineNode(State, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0)); @@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case ISD::SRA: if (SDNode *I = SelectBitfieldExtractOp(Node)) return I; + if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) + return I; break; case ISD::OR: @@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { break; } } + break; } case AArch64ISD::LD2post: { if (VT == MVT::v8i8) @@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); break; } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; } // Select the default instruction diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3e8f46c..4ecfbe9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,23 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -namespace { -enum AlignMode { - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - // Place holder until extr generation is tested fully. static cl::opt<bool> EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, @@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? 
- setExceptionPointerRegister(AArch64::X0); - setExceptionSelectorRegister(AArch64::X1); - // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory @@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. 
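For readers skimming the legalization hunks above, the three verbs in play behave as follows (illustrative sketch, not lines from this patch; the operations and types mirror the hunks):

    // Legal:   select the node as-is (READCYCLECOUNTER once PerfMon exists).
    // Promote: perform the operation in a wider type (f16 math done in f32).
    // Expand:  let the legalizer rewrite the node into supported operations
    //          (for example, a vector ROTL becomes shifts plus an OR).
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::ROTL, MVT::v4i32, Expand);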
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(2); - RequireStrictAlign = (Align == StrictAlign); - setHasExtractBitsInsn(true); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: @@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); @@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). + if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, + ISD::FMINNUM, ISD::FMAXNUM}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); switch (IntID) { default: return; @@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, return MVT::i64; } +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. 
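(Illustrative sketch, not part of the patch: a typical caller probes this hook as below, so the value stored through Fast steers memcpy/memset lowering toward or away from wide unaligned accesses.)

    bool Fast;
    if (TLI.allowsMisalignedMemoryAccesses(MVT::v2i64, /*AddrSpace=*/0,
                                           /*Align=*/1, &Fast) &&
        Fast) {
      // Prefer wide, unaligned loads/stores for the copy.
    }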
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; - case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; @@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; + MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); @@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) && - cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 && + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags @@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one @@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. 
This allows expressing arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. We can also implement some "or" expressions, because "(or A B)" +/// is equivalent to "not (and (not A) (not B))" and we can implement some +/// negation operations: +/// We can negate the results of a single comparison by inverting the flags +/// used when the predicate fails and inverting the flags tested in the next +/// instruction; We can also negate the results of the whole previous +/// conditional compare sequence by inverting the flags tested in the next +/// instruction. However, there is no way to negate the result of a partial +/// sequence. +/// +/// Therefore, on encountering an "or" expression we can negate the subtree on +/// one side and have to be able to push the negate to the leaves of the subtree +/// on the other side (see also the comments in code). As a complete example: +/// "or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// is transformed to +/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) +/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))" +/// and implemented as: +/// cmp C +/// ccmp D, inv(CD), CC +/// ccmp A, CA, inv(CD) +/// ccmp B, CB, inv(CA) +/// check for CB flags +/// A counterexample is "or (and A B) (and C D)" which cannot be implemented +/// by conditional compare sequences. +/// @{ + +/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +/// CanPushNegate is set to true if we can push a negate operation through +/// the tree in a way that we are left with AND operations and negate operations +/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to +/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be +/// brought into such a form. +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, + unsigned Depth = 0) { + if (!Val.hasOneUse()) + return false; + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + CanPushNegate = true; + return true; + } + // Protect against stack overflow.
+ if (Depth > 15) + return false; + if (Opcode == ISD::AND || Opcode == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + return false; + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + return false; + // We cannot push a negate through an AND operation (it would become an OR); + // we can, however, change a (not (or x y)) to (and (not x) (not y)) if we can + // push the negate through the x/y subtrees. + CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + return true; + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/FCCMP ops. See @ref AArch64CCMP. +/// Tries to transform the given i1 producing node @p Val to a series of compare +/// and conditional compare operations. @returns an NZCV flags producing node +/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if +/// the transformation was not possible. +/// On recursive invocations @p PushNegate may be set to true to have negation +/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate +/// for the comparisons in the current subtree; @p Depth limits the search +/// depth to avoid stack overflow. +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool PushNegate = false, + SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, + unsigned Depth = 0) { + // We're at a tree leaf; produce a conditional comparison operation. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (PushNegate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surprisingly, some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inverted. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Predicate = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain. + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp.
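// (Illustrative, not part of the patch: for an i1 such as "(a == 0) && (b > 5)"
// this path emits roughly
//   cmp  w_a, #0
//   ccmp w_b, #5, #4, eq  ; on eq compare b, else NZCV := 0100 so "gt" fails
//   cset w_r, gt
// where #4 is the NZCV immediate satisfying the inverted final condition and
// w_a, w_b, w_r stand in for whatever registers the values live in.)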
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) + return SDValue(); + + assert((Opcode == ISD::OR || !PushNegate) + && "Can only push negate through OR operation"); + + // Check if both sides can be transformed. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) + return SDValue(); + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) + return SDValue(); + + // Do we need to negate our operands? + bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags, giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leaves of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case; otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to an AND then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { - SDValue Cmp; - AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
The checks are in place to ensure - both the LHS and RHS are truely zero extended and to make sure the - transformation is profitable. + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && - isa<LoadSDNode>(LHS)) { - if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + } + } + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise.
That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. + if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2128,7 +2426,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } - + if (VA.isRegLoc()) { // Arguments stored in registers. 
EVT RegVT = VA.getLocVT(); @@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, 0); + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) return false; } @@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = MachinePointerInfo::getFixedStack(FI); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be @@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); InFlag = Chain.getValue(1); } @@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are known live // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &RegToPass : RegsToPass) + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; @@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); @@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; @@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. 
unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isOne() && + if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && @@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, - DAG.getIntPtrConstant(0, DL)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; @@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; - VecVT = MVT::v4i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { @@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return (Cmp.getValueType() == MVT::f32 || - Cmp.getValueType() == MVT::f64); - - ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); - ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, @@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } - // Handle integers first. + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. 
if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); @@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { + if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { + if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? 
DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); @@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) @@ -4763,7 +5074,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. + // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. @@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op, return Op; SmallVector<SDValue, 16> Ops; - for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { - SDValue Lane = Op.getOperand(I); - if (Lane.getOpcode() == ISD::Constant) { + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { APInt LowBits(EltTy.getSizeInBits(), - cast<ConstantSDNode>(Lane)->getZExtValue()); + CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); @@ -5997,8 +6307,7 @@ FailedModImm: // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } @@ -6017,7 +6326,10 @@ FailedModImm: // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, @@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) @@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) - Cnt = -Cnt; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } @@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { + if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), @@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
- uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip illegal vector types. - if (VecSize != 64 && VecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to @@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip illegal vector types. - if (SubVecSize != 64 && SubVecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) return false; Value *Op0 = SVI->getOperand(0); @@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); @@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. @@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. 
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? 
MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); - break; case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: @@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: - return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: - return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: @@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. + // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); @@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { return NewST1; } -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N, if (S->isVolatile()) return SDValue(); + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); - // Don't split at Oz. 
- MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); - if (IsMinSize) + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); @@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); @@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. +static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. 
+ // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the <M, U> form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. 
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. 
+ if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N, if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); - if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) + if (isNullConstant(LHS)) std::swap(LHS, RHS); - if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) + if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || @@ -8774,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N, return SDValue(); } +// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test +// as well as whether the test should be inverted. This code is required to +// catch these cases (as opposed to standard dag combines) because +// AArch64ISD::TBZ is matched during legalization. +static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, + SelectionDAG &DAG) { + + if (!Op->hasOneUse()) + return Op; + + // We don't handle undef/constant-fold cases below, as they should have + // already been taken care of (e.g. and of 0, test of undefined shifted bits, + // etc.) + + // (tbz (trunc x), b) -> (tbz x, b) + // This case is just here to enable more of the below cases to be caught. 
+ if (Op->getOpcode() == ISD::TRUNCATE && + Bit < Op->getValueType(0).getSizeInBits()) { + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + + if (Op->getNumOperands() != 2) + return Op; + + auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!C) + return Op; + + switch (Op->getOpcode()) { + default: + return Op; + + // (tbz (and x, m), b) -> (tbz x, b) + case ISD::AND: + if ((C->getZExtValue() >> Bit) & 1) + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + return Op; + + // (tbz (shl x, c), b) -> (tbz x, b-c) + case ISD::SHL: + if (C->getZExtValue() <= Bit && + (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit - C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x + case ISD::SRA: + Bit = Bit + C->getZExtValue(); + if (Bit >= Op->getValueType(0).getSizeInBits()) + Bit = Op->getValueType(0).getSizeInBits() - 1; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + + // (tbz (srl x, c), b) -> (tbz x, b+c) + case ISD::SRL: + if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit + C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (xor x, -1), b) -> (tbnz x, b) + case ISD::XOR: + if ((C->getZExtValue() >> Bit) & 1) + Invert = !Invert; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } +} + +// Optimize test single bit zero/non-zero and branch. +static SDValue performTBZCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + bool Invert = false; + SDValue TestSrc = N->getOperand(1); + SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); + + if (TestSrc == NewTestSrc) + return SDValue(); + + unsigned NewOpc = N->getOpcode(); + if (Invert) { + if (NewOpc == AArch64ISD::TBZ) + NewOpc = AArch64ISD::TBNZ; + else { + assert(NewOpc == AArch64ISD::TBNZ); + NewOpc = AArch64ISD::TBZ; + } + } + + SDLoc DL(N); + return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, + DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as @@ -8868,75 +9682,6 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } -/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match FMIN/FMAX patterns. -static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { - // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y". - // Unless the NoNaNsFPMath option is set, be careful about NaNs: - // vmax/vmin return NaN if either operand is a NaN; - // only do the transformation when it matches that behavior. - - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode; - bool IsReversed; - if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) && - selectCCOpsAreFMaxCompatible(CondRHS, RHS)) { - IsReversed = false; // x CC y ? 
x : y - } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) && - selectCCOpsAreFMaxCompatible(CondLHS, RHS)) { - IsReversed = true ; // x CC y ? y : x - } else { - return SDValue(); - } - - bool IsUnordered = false, IsOrEqual; - switch (CC) { - default: - return SDValue(); - case ISD::SETULT: - case ISD::SETULE: - IsUnordered = true; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE); - Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN; - break; - - case ISD::SETUGT: - case ISD::SETUGE: - IsUnordered = true; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE); - Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX; - break; - } - - // If LHS is NaN, an ordered comparison will be false and the result will be - // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking - // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - return SDValue(); - - // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true, - // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be - // used for unsafe math or if one of the operands is known to be nonzero. - if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - return SDValue(); - - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); -} - /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -8961,6 +9706,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8973,16 +9723,25 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); - case ISD::SELECT_CC: - return performSelectCCCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::TBNZ: + case AArch64ISD::TBZ: + return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: @@ -8991,6 +9750,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case 
ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -9157,6 +9918,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9165,6 +9940,24 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -9177,10 +9970,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } -bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. - return NumUsers > 2; + return 3; } TargetLoweringBase::LegalizeTypeAction @@ -9206,20 +9999,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. -bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128; + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + return Size <= 128 ? 
AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { return true; } @@ -9258,6 +10052,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -9294,3 +10095,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c73ce1e..e99616c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" @@ -58,13 +59,14 @@ enum NodeType : unsigned { SBCS, ANDS, + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + // Floating point comparison FCMP, - // Floating point max and min instructions. - FMAX, - FMIN, - // Scalar extract EXTR, @@ -217,8 +219,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); @@ -226,46 +226,35 @@ public: /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; - /// allowsMisalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses of the specified type. + /// Returns true if the target allows unaligned memory accesses of the + /// specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } + bool *Fast = nullptr) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. 
+ /// This method returns a target specific FastISel object, or null if the + /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; @@ -273,11 +262,11 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. + /// Return true if the given shuffle mask can be codegen'd directly, or if it + /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; - /// getSetCCResultType - Return the ISD::SETCC ValueType + /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -322,8 +311,8 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. + /// Return true if the addressing mode represented by AM is legal for this + /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; @@ -335,10 +324,9 @@ public: int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; @@ -351,25 +339,65 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the unsafe stack pointer, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. 
+ unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X1; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; @@ -392,6 +420,8 @@ private: SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, @@ -470,7 +500,7 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, @@ -516,6 +546,8 @@ private: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; namespace AArch64 { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 3f2e772..6ac2175 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } +def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; + class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; @@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT> let ParserMatchClass = Imm1_64Operand; } +def fixedpoint_f16_i32 : fixedpoint_i32<f16>; def fixedpoint_f32_i32 : fixedpoint_i32<f32>; def fixedpoint_f64_i32 : fixedpoint_i32<f64>; +def fixedpoint_f16_i64 : fixedpoint_i64<f16>; def fixedpoint_f32_i64 : 
fixedpoint_i64<f32>; def fixedpoint_f64_i64 : fixedpoint_i64<f64>; @@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm1_32Operand; } +def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; @@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } +// True if the 32-bit immediate is in the range [0,31] +def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_1 predicate - True if the immediate is in the range [0,1] +def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; -}]>; +}]> { + let ParserMatchClass = Imm0_15Operand; +} // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr @@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, } // Floating-point immediate. +def fpimm16 : Operand<f16>, + PatLeaf<(f16 fpimm), [{ + return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} def fpimm32 : Operand<f32>, PatLeaf<(f32 fpimm), [{ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; @@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> // model patterns with sufficiently fine granularity let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in class HintI<string mnemonic> - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "", + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", [(int_aarch64_hint imm0_127:$imm)]>, Sched<[WriteHint]> { bits <7> imm; @@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand<i32> { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. 
+ if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; @@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), let Inst{20-5} = systemreg; } -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; +def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_15"; let ParserMethod = "tryParseSysReg"; } -def pstatefield_op : Operand<i32> { - let ParserMatchClass = SystemPStateFieldOperand; +def pstatefield4_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, +class MSRpstateImm0_15 + : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), + "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bits<4> imm; @@ -913,6 +968,37 @@ class MSRpstateI let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; +} + +def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_1"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield1_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_1 + : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bit imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-9} = 0b0100000; + let Inst{8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; } // SYS and SYSL generic system instructions. 
@@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> { } class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>; class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, @@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode> } class MulAccumWAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>; class MulAccumXAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>; class WideMulAccumAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>; class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, @@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, RegisterClass src1Regtype, RegisterClass src2Regtype, int shiftExt> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, shiftExt)>; @@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. 
adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Compare aliases - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. @@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, // Aliases for register+register logical instructions. 
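The InstAlias strings in the surrounding hunks all switch from a plain space to "\t" between mnemonic and operands, matching the asm strings of the instruction definitions themselves (compare the existing "\t$Rt, $systemreg" and "\t$Rn, $imm, $nzcv, $cond" forms). A toy illustration of why the separator matters, assuming a printer that treats the first tab as the mnemonic/operand boundary; this sketches the convention, not LLVM's actual printer:

    #include <cstdio>
    #include <cstring>

    // Pad the mnemonic (everything before the first '\t') to a fixed
    // column, the way tab-aware assembly printers typically render output.
    static void printInst(const char *AsmString) {
      const char *Tab = strchr(AsmString, '\t');
      if (!Tab) {
        puts(AsmString);
        return;
      }
      printf("%-8.*s%s\n", (int)(Tab - AsmString), AsmString, Tab + 1);
    }

    int main() {
      printInst("cmp\tx0, #4");     // alias renders like any instruction
      printInst("mul\tw0, w1, w2"); // MulAccumWAlias shorthand for madd
      return 0;
    }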
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, @@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, +class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, + string mnemonic, SDNode OpNode> + : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsImm<bit op, string asm> { - def Wi : BaseCondSetFlagsImm<op, GPR32, asm> { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm<op, GPR64, asm> { - let Inst{31} = 1; - } -} - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, + SDNode OpNode> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsReg<bit op, string asm> { - def Wr : BaseCondSetFlagsReg<op, GPR32, asm> { +multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { + // immediate operand variants + def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { let Inst{31} = 0; } - def Xr : BaseCondSetFlagsReg<op, GPR64, asm> { + def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> { + let Inst{31} = 1; + } + // register operand variants + def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { + let Inst{31} = 
0; + } + def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, } class ROInstAlias<string asm, RegisterClass regtype, Instruction INST> - : InstAlias<asm # " $Rt, [$Rn, $Rm]", + : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, @@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, asm, pat>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; } @@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, (ins GPR64sp:$Rn, simm9:$offset), asm>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, asm>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype, (ins GPR64sp:$Rn, indextype:$offset), asm>, Sched<[WriteLD, WriteLDHi]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, asm>, Sched<[WriteSTP]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype, let mayStore = 1, mayLoad = 0 in class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype, Operand idxtype, string asm> - : BaseLoadStorePairPostIdx<opc, V, 0, (outs), - (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2, + : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, idxtype:$offset), asm>, Sched<[WriteAdr, WriteSTP]>; @@ -3477,6 +3568,20 @@ class 
BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Unscaled half-precision to 32-bit + def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled half-precision to 64-bit + def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Unscaled single-precision to 32-bit def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { @@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Scaled half-precision to 32-bit + def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, + fixedpoint_f16_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + // Scaled half-precision to 64-bit + def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, + fixedpoint_f16_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Scaled single-precision to 32-bit def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, fixedpoint_f32_i32, asm, @@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b00001; let Inst{16} = isUnsigned; let Inst{15-10} = scale; @@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b10001; let Inst{16} = isUnsigned; let Inst{15-10} = 0b000000; @@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { // Unscaled + def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } // 
Scaled + def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, + [(set FPR16:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f16_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm, [(set FPR32:$Rd, (fdiv (node GPR32:$Rn), fixedpoint_f32_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag let scale{5} = 1; } @@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR32:$Rn), fixedpoint_f64_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag let scale{5} = 1; } + def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, + [(set FPR16:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f16_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm, [(set FPR32:$Rd, (fdiv (node GPR64:$Rn), fixedpoint_f32_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm, @@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR64:$Rn), fixedpoint_f64_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } } @@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; @@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode, } - multiclass UnscaledConversion<string asm> { + def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def DXr : 
BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, @@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21-19} = 0b100; let Inst{18-15} = opcode; let Inst{14-10} = 0b10000; @@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass SingleOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, bits<5> Rd; bits<5> Rn; bits<5> Rm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; @@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass TwoOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set (f16 FPR16:$Rd), + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set (f32 FPR32:$Rd), (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set (f64 FPR64:$Rd), (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, bits<5> Rn; bits<5> Rm; bits<5> Ra; - let Inst{31-23} = 0b000111110; + let Inst{31-24} = 0b00011111; let Inst{21} = isNegated; let Inst{20-16} = Rm; let Inst{15} = isSub; @@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, SDPatternOperator node> { + def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, + [(set FPR16:$Rd, + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let 
Predicates = [HasFullFP16]; + } + def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm, [(set FPR32:$Rd, (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm, [(set FPR64:$Rd, (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans, : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, Sched<[WriteFCmp]> { bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{15-10} = 0b001000; @@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, Sched<[WriteFCmp]> { bits<5> Rm; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-10} = 0b001000; @@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, multiclass FPComparison<bit signalAllNans, string asm, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV] in { + def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Defs = [NZCV] } @@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison<bit signalAllNans, - RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, + string mnemonic, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, Sched<[WriteFCmp]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans, let Inst{3-0} = nzcv; } -multiclass FPCondComparison<bit signalAllNans, string asm> { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> { - let Inst{22} = 0; +multiclass FPCondComparison<bit signalAllNans, string mnemonic, + SDPatternOperator OpNode = null_frag> { + def 
Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; } - def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> { - let Inst{22} = 1; + def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, + [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, + [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b01; } - } // Defs = [NZCV], Uses = [NZCV] } //--- @@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> bits<5> Rm; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> multiclass FPCondSelect<string asm> { let Uses = [NZCV] in { + def Hrrr : BaseFPCondSelect<FPR16, f16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srrr : BaseFPCondSelect<FPR32, f32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drrr : BaseFPCondSelect<FPR64, f64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Uses = [NZCV] } @@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> Sched<[WriteFImm]> { bits<5> Rd; bits<8> imm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-13} = imm; let Inst{12-5} = 0b10000000; @@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> } multiclass FPMoveImmediate<string asm> { + def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } } // end of 'let Predicates = [HasFPARMv8]' @@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 
0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, // As above, but D sized elements unsupported. 
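Two mechanical rewrites recur throughout the hunks above. First, the scalar FP formats free up bit 23 (Inst{30-23} = 0b00111100 becomes Inst{30-24} = 0b0011110) so the old one-bit type flag at Inst{22} can grow into a two-bit field at Inst{23-22}, with 0b11 selecting the new half-precision forms behind HasFullFP16. Second, BaseSIMDThreeSameVector folds the constant Inst{21} = 1 into a three-bit size field. A small sketch of both mappings; the helper names are illustrative only:

    // Scalar FP type field at Inst{23-22} (was a single bit at Inst{22}).
    enum class FPType { F32, F64, F16 };
    unsigned ftypeBits(FPType T) {
      switch (T) {
      case FPType::F32: return 0b00;
      case FPType::F64: return 0b01;
      case FPType::F16: return 0b11; // new; gated on HasFullFP16
      }
      return 0; // unreachable
    }

    // BaseSIMDThreeSameVector: the old 2-bit size plus the constant
    // Inst{21} = 1 become one 3-bit field, so every existing def now
    // passes (size << 1) | 1:
    // 0b00 -> 0b001, 0b01 -> 0b011, 0b10 -> 0b101, 0b11 -> 0b111.
    unsigned threeSameSize(unsigned OldSize2) { return (OldSize2 << 1) | 1; }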
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc, +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode 
(v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ 
-4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. 
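BaseSIMDTwoSameVector and BaseSIMDTwoSameVectorTied get the same treatment: the constant Inst{21-17} = 0b10000 is split so a new size2 field lands in Inst{20-19}. Every pre-existing def passes size2 = 0b00, which keeps its encoding bit-identical, while the half-precision vector forms pass 0b11. A quick sanity check of the split (illustrative code, not from the patch):

    #include <cassert>

    // Inst{21-17} decomposed as Inst{21} = 1, Inst{20-19} = size2,
    // Inst{18-17} = 0b00.
    unsigned bits21to17(unsigned Size2) { return (1u << 4) | (Size2 << 2); }

    int main() {
      assert(bits21to17(0b00) == 0b10000); // legacy constant, unchanged
      assert(bits21to17(0b11) == 0b11100); // new FP16 .4h/.8h variants
      return 0;
    }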
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, // Supports all element sizes, except 1xD. 
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, // Supports only B element sizes. 
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; @@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, // Supports only B and H element sizes. multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } @@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, // as an extra opcode bit. multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } @@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, // Supports only S element size. 
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - 
let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; - def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0", + def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0", + def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2s $Vd, $Vn, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # ".4h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # 
".8h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # ".2s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # ".4s $Vd, $Vn, #0", + def : InstAlias<asm # ".4s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2d $Vd, $Vn, #0", + def : InstAlias<asm # ".2d\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } @@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, RegisterClass regtype, string asm, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; } multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; - def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>; + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; + def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; } multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, @@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, 
{S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + []>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; } -multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 
{S,1}, opc, FPR64, asm, "0.0">; - def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; + def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; + } - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; @@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; } -multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>; +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; + } } -multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, + [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + } } multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ 
-5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm, [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>; } //---------------------------------------------------------------------------- @@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { asm, ".2d">; } -multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64, +multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128, + def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, asm, ".2d">; } @@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst, class SIMDInsElementMovAlias<string size, Instruction inst, Operand idxtype> : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", + # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, +class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops, string asm, string op_string, string cstr, list<dag> pattern> : I<oops, iops, asm, op_string, cstr, pattern>, @@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, let Inst{29} = op; let Inst{28-19} = 0b0111100000; let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; + let Inst{11} = op2; + let Inst{10} = 1; let Inst{9-5} = imm8{4-0}; let Inst{4-0} = Rd; } -class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype, +class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> 
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd), + : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd), !con((ins immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst), + : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst), !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12, class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_hw_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode, class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins move_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<1> shift; @@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode, +class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "", + : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "", asm, kind, pattern> { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, + : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, "\t$Rd, $imm8", "", pattern> { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } -multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode> { +multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + 
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, } } -multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { +multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { V128:$Rm, VectorIndexD:$idx)>; } -multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { +multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> { +multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, FPR32, FPR32, 
vecshiftR32, asm, []> { let Inst{20-16} = imm{4-0}; @@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm, +multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> - : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, + : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, pattern> { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, SDPatternOperator Accum> { @@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c0b3f2c..f398117 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" -#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC); } +/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { + uint64_t Imm = MI->getOperand(1).getImm(); + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { @@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or + // ORRXri, it is as cheap as MOV + case AArch64::MOVi32imm: + return canBeExpandedToORR(MI, 32); + case AArch64::MOVi64imm: + return canBeExpandedToORR(MI, 64); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; case AArch64::LDRXui: + case AArch64::LDRDui: case AArch64::STRXui: + case AArch64::STRDui: Scale = Width = 8; break; case AArch64::LDRWui: + case AArch64::LDRSui: case AArch64::STRWui: + case AArch64::STRSui: Scale = Width = 4; break; - case AArch64::LDRBui: - case AArch64::STRBui: - Scale = Width = 1; - break; case AArch64::LDRHui: + case AArch64::LDRHHui: case AArch64::STRHui: + case AArch64::STRHHui: Scale = Width = 2; break; - case AArch64::LDRSui: - case AArch64::STRSui: - Scale = Width = 4; - break; - case AArch64::LDRDui: - case AArch64::STRDui: - Scale = Width = 8; - break; - case AArch64::LDRQui: - case AArch64::STRQui: - Scale = Width = 16; - break; + case AArch64::LDRBui: case AArch64::LDRBBui: + case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; break; - case AArch64::LDRHHui: - case AArch64::STRHHui: - Scale = Width = 2; - break; }; BaseReg = LdSt->getOperand(1).getReg(); @@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != AArch64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case AArch64::SUBSWri: - case AArch64::ADDSWri: - case AArch64::ANDSWri: - case AArch64::SUBSXri: - case AArch64::ADDSXri: - case AArch64::ANDSXri: - return true; + if (Subtarget.isCyclone()) { + // Cyclone can fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second->getOpcode(); + if (SecondOpcode == AArch64::Bcc) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } + } + // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. 
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + } + } } + return false; } MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( @@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot( MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; @@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); @@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPDi: case AArch64::STPXi: case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + ImmIdx = 3; IsSigned = true; Scale = 8; break; case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + ImmIdx = 3; IsSigned = true; Scale = 16; break; @@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPSi: case AArch64::STPWi: case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + ImmIdx = 3; IsSigned = true; Scale = 4; break; @@ -2449,15 +2487,36 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, return true; } -/// Return true when there is potentially a faster code sequence -/// for an instruction chain ending in \p Root. All potential patterns are -/// listed -/// in the \p Pattern vector. Pattern should be sorted in priority order since -/// the pattern evaluator stops checking as soon as it finds a faster sequence. +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. 
Other forms of the same operation (intrinsics and other variants) +bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case AArch64::FADDDrr: + case AArch64::FADDSrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FMULDrr: + case AArch64::FMULSrr: + case AArch64::FMULX32: + case AArch64::FMULX64: + case AArch64::FMULXv2f32: + case AArch64::FMULXv2f64: + case AArch64::FMULXv4f32: + case AArch64::FMULv2f32: + case AArch64::FMULv2f64: + case AArch64::FMULv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} -bool AArch64InstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { +/// Find instructions that can be turned into madd. +static bool getMaddPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2485,76 +2544,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); Found = true; } 
break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); Found = true; } break; @@ -2562,6 +2621,20 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( return Found; } +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + if (getMaddPatterns(Root, Patterns)) + return true; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// genMadd - Generate madd instruction and combine mul and add. /// Example: /// MUL I=A,B,0 @@ -2661,7 +2734,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { @@ -2675,15 +2748,17 @@ void AArch64InstrInfo::genAlternativeCodeSequence( unsigned Opc; switch (Pattern) { default: - // signal error. - break; - case MachineCombinerPattern::MC_MULADDW_OP1: - case MachineCombinerPattern::MC_MULADDX_OP1: + // Reassociate instructions. 
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; + case MachineCombinerPattern::MULADDW_OP1: + case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) { + if (Pattern == MachineCombinerPattern::MULADDW_OP1) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2692,13 +2767,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDW_OP2: - case MachineCombinerPattern::MC_MULADDX_OP2: + case MachineCombinerPattern::MULADDW_OP2: + case MachineCombinerPattern::MULADDX_OP2: // MUL I=A,B,0 // ADD R,C,I // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) { + if (Pattern == MachineCombinerPattern::MULADDW_OP2) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2707,8 +2782,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDWI_OP1: - case MachineCombinerPattern::MC_MULADDXI_OP1: { + case MachineCombinerPattern::MULADDWI_OP1: + case MachineCombinerPattern::MULADDXI_OP1: { // MUL I=A,B,0 // ADD R,I,Imm // ==> ORR V, ZR, Imm @@ -2716,7 +2791,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2751,8 +2826,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } - case MachineCombinerPattern::MC_MULSUBW_OP1: - case MachineCombinerPattern::MC_MULSUBX_OP1: { + case MachineCombinerPattern::MULSUBW_OP1: + case MachineCombinerPattern::MULSUBX_OP1: { // MUL I=A,B,0 // SUB R,I, C // ==> SUB V, 0, C @@ -2760,7 +2835,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *SubRC; unsigned SubOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { SubOpc = AArch64::SUBWrr; SubRC = &AArch64::GPR32spRegClass; ZeroReg = AArch64::WZR; @@ -2784,13 +2859,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); break; } - case MachineCombinerPattern::MC_MULSUBW_OP2: - case MachineCombinerPattern::MC_MULSUBX_OP2: + case MachineCombinerPattern::MULSUBW_OP2: + case MachineCombinerPattern::MULSUBX_OP2: // MUL I=A,B,0 // SUB R,C,I // ==> MSUB R,A,B,C (computes C - A*B) // --- Create(MSUB); - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { Opc = AArch64::MSUBWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2799,8 +2874,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULSUBWI_OP1: - case MachineCombinerPattern::MC_MULSUBXI_OP1: { + case MachineCombinerPattern::MULSUBWI_OP1: + case MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 // SUB R,I, Imm // ==> ORR V, ZR, -Imm @@ -2808,7 +2883,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned 
BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2944,3 +3019,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { MI->eraseFromParent(); return true; } + +std::pair<unsigned, unsigned> +AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = AArch64II::MO_FRAGMENT; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PAGE, "aarch64-page"}, + {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, + {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, + {MO_G0, "aarch64-g0"}, + {MO_HI12, "aarch64-hi12"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_TLS, "aarch64-tls"}, + {MO_CONSTPOOL, "aarch64-constant-pool"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 68c2a28..b5bb446 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -167,13 +167,15 @@ public: /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. bool getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) + SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; - + /// Return true when Inst is associative and commutative so that it can be + /// reassociated. 
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; @@ -181,6 +183,14 @@ public: bool useMachineCombiner() const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa1a46a..d02bc9f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -16,6 +16,8 @@ // def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, @@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; + def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; +def SDT_AArch64CCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; @@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; +def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; +def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; +def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; + def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; -def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; -def AArch64fmin : 
SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; - def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; @@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; @@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>; def MRS : MRSI; def MSR : MSRI; -def MSRpstate: MSRpstateI; +def MSRpstateImm1 : MSRpstateImm0_1; +def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// The cycle counter PMC register is PMCCNTR_EL0. +let Predicates = [HasPerfMon] in +def : Pat<(readcyclecounter), (MRS 0xdce8)>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; @@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +let AddedComplexity = 1 in { def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +} // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). @@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>; defm BIC : LogicalReg<0b00, 1, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; + BinOpFrag<(not (xor node:$LHS, node:$RHS))>>; defm EOR : LogicalReg<0b10, 0, "eor", xor>; defm ORN : LogicalReg<0b01, 1, "orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>; @@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Conditionally set flags instructions. +// Conditional comparison instructions. //===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; +defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; +defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; //===----------------------------------------------------------------------===// // Conditional select instructions. 
@@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } +multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; + //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// @@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), (FRINTNDr FPR64:$Rn)>; -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. Setting hasSideEffects ensures it is not DCE'd. -// <rdar://problem/13715968> -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; let SchedRW = [WriteFDiv] in { @@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; -def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// @@ 
-2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; +defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. @@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">; // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +// Match UABDL in log2-shuffle patterns. +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; + defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), @@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm 
FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. -defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. 
@@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>; defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; @@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>; defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; @@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; -def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)), - (SMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)), - (SMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)), - (SMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)), - (SMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)), - (SMINv8i16 V128:$Rn, V128:$Rm)>; 
-def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)), - (SMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)), - (SMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)), - (SMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)), - (SMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)), - (SMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)), - (SMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)), - (SMAXv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)), - (UMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)), - (UMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)), - (UMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)), - (UMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)), - (UMINv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)), - (UMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)), - (UMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)), - (UMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)), - (UMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)), - (UMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)), - (UMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)), - (UMAXv4i32 V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; @@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3082,6 +3135,14 @@ def : 
InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # "|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 
0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", int_aarch64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_aarch64_neon_uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", @@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns< // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - (vector_extract (v2i64 V128:$Rm), (i64 1))), +def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), + (extractelt (v2i64 V128:$Rm), (i64 1))), (PMULLv2i64 V128:$Rn, 
V128:$Rm)>; // CodeGen patterns for addhn and subhn instructions, which can actually be @@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), imm:$idx))))), (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), - imm:$idx))))), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), + imm:$idx))))), (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; } @@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; + // Extracting i8 or i16 elements will have the zero-extend transformed to // an 'and' mask by type legalization since neither i8 nor i16 are legal types // for AArch64. 
Match these patterns here since UMOV already zeroes out the high @@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. @@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 
imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", //---------------------------------------------------------------------------- let hasSideEffects = 0 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; + defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">; } // NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the @@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { @@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), + (vector_extract (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; @@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns< defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; +defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, @@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : 
SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. @@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; def : Pat<(i64 (anyext GPR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. +// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and +// then assert the extension has happened. def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; // To sign extend, we use a signed bitfield move instruction (SBFM) on the // containing super-reg. @@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; } +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; + def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), @@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +// Patterns for nontemporal/no-allocate stores. 
+// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. +let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat<v2i64>; +def : NTStore128Pat<v4i32>; +def : NTStore128Pat<v8i16>; +def : NTStore128Pat<v16i8>; + +class NTStore64Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat<v1f64>; +def : NTStore64Pat<v1i64>; +def : NTStore64Pat<v2i32>; +def : NTStore64Pat<v4i16>; +def : NTStore64Pat<v8i8>; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 82f77a7..43664df 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); +STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); +STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); +STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); -// Place holder while testing unscaled load/store combining -static cl::opt<bool> EnableAArch64UnscaledMemOp( - "aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); +namespace llvm { +void initializeAArch64LoadStoreOptPass(PassRegistry &); +} + +#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { + +typedef struct LdStPairFlags { + // If a matching instruction is found, MergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + bool MergeForward; + + // SExtIdx gives the index of the result of the load pair that must be + // extended. The value of SExtIdx assumes that the paired load produces the + // value in this order: (I, returned iterator), i.e., -1 means no value has + // to be extended, 0 means I, and 1 means the returned iterator. 
+ int SExtIdx; + + LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + + void setMergeForward(bool V = true) { MergeForward = V; } + bool getMergeForward() const { return MergeForward; } + + void setSExtIdx(int V) { SExtIdx = V; } + int getSExtIdx() const { return SExtIdx; } + +} LdStPairFlags; + struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { + initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; + const AArch64Subtarget *Subtarget; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, MergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. - // \p SExtIdx[out] gives the index of the result of the load pair that - // must be extended. The value of SExtIdx assumes that the paired load - // produces the value in this order: (I, returned iterator), i.e., - // -1 means no value has to be extended, 0 means I, and 1 means the - // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, + LdStPairFlags &Flags, unsigned Limit); + + // Scan the instructions looking for a store that writes to the address from + // which the current load instruction reads. Return true if one is found. + bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). - // \p SExtIdx index of the result that must be extended for a paired load. - // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward, - int SExtIdx); + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags); + + // Promote the load that reads directly from the address stored to. + MachineBasicBlock::iterator + promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); + int UnscaledOffset); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - // Merge a pre-index base register update into a ld/st instruction. 
- MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); - // Merge a post-index base register update into a ld/st instruction. + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); - bool optimizeBlock(MachineBasicBlock &MBB); + // Find and promote load instructions which read directly from store. + bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. + bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 load / store optimization pass"; + return AARCH64_LOAD_STORE_OPT_NAME; } - -private: - int getMemSize(MachineInstr *MemMI); }; char AArch64LoadStoreOpt::ID = 0; } // namespace -static bool isUnscaledLdst(unsigned Opc) { +INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", + AARCH64_LOAD_STORE_OPT_NAME, false, false) + +static bool isUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STURSi: - return true; case AArch64::STURDi: - return true; case AArch64::STURQi: - return true; + case AArch64::STURBBi: + case AArch64::STURHHi: case AArch64::STURWi: - return true; case AArch64::STURXi: - return true; case AArch64::LDURSi: - return true; case AArch64::LDURDi: - return true; case AArch64::LDURQi: - return true; case AArch64::LDURWi: - return true; case AArch64::LDURXi: - return true; case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } +} + +static bool isUnscaledLdSt(MachineInstr *MI) { + return isUnscaledLdSt(MI->getOpcode()); +} + +static unsigned getBitExtrOpcode(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode."); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return AArch64::UBFMWri; + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + return AArch64::SBFMWri; + } +} + +static bool isNarrowStore(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return true; + } +} + +static bool isNarrowStore(MachineInstr *MI) { + return isNarrowStore(MI->getOpcode()); +} + +static bool isNarrowLoad(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: return true; } } -// Size in bytes of the data moved by an unscaled load or store -int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - 
switch (MemMI->getOpcode()) { +static bool isNarrowLoad(MachineInstr *MI) { + return isNarrowLoad(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. +static int getMemScale(MachineInstr *MI) { + switch (MI->getOpcode()) { default: - llvm_unreachable("Opcode has unknown size!"); + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: - return 4; - case AArch64::STRDui: - case AArch64::STURDi: - return 8; - case AArch64::STRQui: - case AArch64::STURQi: - return 16; case AArch64::STRWui: case AArch64::STURWi: - return 4; - case AArch64::STRXui: - case AArch64::STURXi: - return 8; - case AArch64::LDRSui: - case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: return 16; - case AArch64::LDRWui: - case AArch64::LDURWi: - return 4; - case AArch64::LDRXui: - case AArch64::LDURXi: - return 8; - case AArch64::LDRSWui: - case AArch64::LDURSWi: - return 4; } } @@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURDi: case AArch64::STRQui: case AArch64::STURQi: + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::STRXui: @@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; + case AArch64::LDRSBWui: + return AArch64::LDRBBui; + case AArch64::LDRSHWui: + return AArch64::LDRHHui; + case AArch64::LDURSBWi: + return AArch64::LDURBBi; + case AArch64::LDURSHWi: + return AArch64::LDURHHi; } } @@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + 
return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + +static unsigned isMatchingStore(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + unsigned LdOpc = LoadInst->getOpcode(); + unsigned StOpc = StoreInst->getOpcode(); + switch (LdOpc) { + default: + llvm_unreachable("Unsupported load instruction!"); + case AArch64::LDRBBui: + return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || + StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURBBi: + return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || + StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRHHui: + return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || + StOpc == AArch64::STRXui; + case AArch64::LDURHHi: + return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || + StOpc == AArch64::STURXi; + case AArch64::LDRWui: + return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURWi: + return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRXui: + return StOpc == AArch64::STRXui; + case AArch64::LDURXi: + return StOpc == AArch64::STURXi; } } @@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: @@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case AArch64::LDRHHui: + return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: @@ -316,19 +527,96 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return AArch64::STPSpost; 
+ case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; } } +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; + } +} + +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); + int LoadSize = getMemScale(LoadInst); + int StoreSize = getMemScale(StoreInst); + int UnscaledStOffset = isUnscaledLdSt(StoreInst) + ? getLdStOffsetOp(StoreInst).getImm() + : getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + ? getLdStOffsetOp(LoadInst).getImm() + : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + return (UnscaledStOffset <= UnscaledLdOffset) && + (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward, int SExtIdx) { + const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -338,25 +626,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; + int SExtIdx = Flags.getSExtIdx(); unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdst(Opc); - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + bool IsUnscaled = isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + bool MergeForward = Flags.getMergeForward(); unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - MergeForward ? Paired->getOperand(1) : I->getOperand(1); + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { + if (getLdStOffsetOp(I).getImm() == + getLdStOffsetOp(Paired).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; // Here we swapped the assumption made for SExtIdx. 
@@ -368,18 +657,132 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = I; Rt2MI = Paired; } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableAArch64UnscaledMemOp) - OffsetImm /= OffsetStride; + + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + + if (isNarrowLoad(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // When merging small (< 32 bit) loads for big-endian targets, the order of + // the component parts gets swapped. + if (!Subtarget->isLittleEndian()) + std::swap(RtMI, Rt2MI); + // Construct the new load instruction. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); + + DEBUG( + dbgs() + << "Creating the new load and extract. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + int Width = getMemScale(I) == 1 ? 8 : 16; + int LSBLow = 0; + int LSBHigh = Width; + int ImmsLow = LSBLow + Width - 1; + int ImmsHigh = LSBHigh + Width - 1; + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { + // Create the bitfield extract for high bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + } else { + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + + // Create the bitfield extract for high bits. 
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } // Construct the new instruction. - MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + MachineInstrBuilder MIB; + if (isNarrowStore(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(I)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); + } else { + // Handle Unscaled + if (IsUnscaled) + OffsetImm /= OffsetStride; + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtMI)) + .addOperand(getLdStRegOp(Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + } + (void)MIB; // FIXME: Do we need/want to copy the mem operands from the source @@ -439,13 +842,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI) { + MachineBasicBlock::iterator NextI = LoadI; + ++NextI; + + int LoadSize = getMemScale(LoadI); + int StoreSize = getMemScale(StoreI); + unsigned LdRt = getLdStRegOp(LoadI).getReg(); + unsigned StRt = getLdStRegOp(StoreI).getReg(); + bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); + + assert((IsStoreXReg || + TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && + "Unexpected RegClass"); + + MachineInstr *BitExtMI; + if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { + // Remove the load, if the destination register of the loads is the same + // register for stored value. + if (StRt == LdRt && LoadSize == 8) { + DEBUG(dbgs() << "Remove load instruction:\n "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << "\n"); + LoadI->eraseFromParent(); + return NextI; + } + // Replace the load with a mov if the load and store are in the same size. + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) + .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) + .addReg(StRt) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // FIXME: Currently we disable this transformation in big-endian targets as + // performance and correctness are verified only in little-endian. + if (!Subtarget->isLittleEndian()) + return NextI; + bool IsUnscaled = isUnscaledLdSt(LoadI); + assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + assert(LoadSize <= StoreSize && "Invalid load size"); + int UnscaledLdOffset = IsUnscaled + ? getLdStOffsetOp(LoadI).getImm() + : getLdStOffsetOp(LoadI).getImm() * LoadSize; + int UnscaledStOffset = IsUnscaled + ? 
getLdStOffsetOp(StoreI).getImm() + : getLdStOffsetOp(StoreI).getImm() * StoreSize; + int Width = LoadSize * 8; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; + unsigned DestReg = IsStoreXReg + ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, + &AArch64::GPR64RegClass) + : LdRt; + + assert((UnscaledLdOffset >= UnscaledStOffset && + (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && + "Invalid offset"); + + Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + Imms = Immr + Width - 1; + if (UnscaledLdOffset == UnscaledStOffset) { + uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N + | ((Immr) << 6) // immr + | ((Imms) << 0) // imms + ; + + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), + DestReg) + .addReg(StRt) + .addImm(AndMaskEncoded); + } else { + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), + DestReg) + .addReg(StRt) + .addImm(Immr) + .addImm(Imms); + } + } + + DEBUG(dbgs() << "Promoting load by replacing :\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + LoadI->eraseFromParent(); + return NextI; +} + /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); @@ -464,16 +966,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int ElemOffset = Offset / OffsetStride; - if (ElemOffset > 63 || ElemOffset < -64) - return false; - } - return true; + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, @@ -507,12 +1005,65 @@ static bool mayAlias(MachineInstr *MIa, return false; } +bool AArch64LoadStoreOpt::findMatchingStore( + MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI) { + MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + + // Track which registers have been modified and used between the first insn + // and the second insn. 
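The immr/imms arithmetic in promoteLoadFromStore above is dense, so here is a minimal, self-contained C++ sketch of the value it computes. This is an illustration only, not pass code: the helper name is invented and the little-endian layout is an assumption of the example.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Model of the UBFM the pass emits: immr = 8 * (LdOff - StOff) and
// imms = immr + 8 * LoadSize - 1 select exactly the bytes the narrow load
// would have read out of the stored value (little-endian layout assumed).
static uint32_t extractLoadedBits(uint32_t StoredVal, int LdOff, int StOff,
                                  int LoadSize) {
  assert(LdOff >= StOff && "load must begin inside the stored bytes");
  int Immr = 8 * (LdOff - StOff);                  // right-shift amount
  int Width = 8 * LoadSize;                        // bits the load reads
  uint32_t Mask = Width == 32 ? ~0u : ((1u << Width) - 1);
  return (StoredVal >> Immr) & Mask;               // ubfm/lsr equivalent
}

int main() {
  // str w1, [x0, #4] ; ldrh w2, [x0, #6]  -->  lsr w2, w1, #16
  std::printf("%#x\n", extractLoadedBits(0xAABBCCDDu, 6, 4, 2)); // 0xaabb
}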
+ BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + for (unsigned Count = 0; MBBI != E && Count < Limit;) { + --MBBI; + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + // Now that we know this is a real instruction, count it. + ++Count; + + // If the load instruction reads directly from the address to which the + // store instruction writes and the stored value is not modified, we can + // promote the load. Since we do not handle stores with pre-/post-index, + // it's unnecessary to check if BaseReg is modified by the store itself. + if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + BaseReg == getLdStBaseOp(MI).getReg() && + isLdOffsetInRangeOfSt(FirstMI, MI) && + !ModifiedRegs[getLdStRegOp(MI).getReg()]) { + StoreI = MBBI; + return true; + } + + if (MI->isCall()) + return false; + + // Update modified / used register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return false; + + // If we encounter a store aliased with the load, return early. + if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + return false; + } + return false; +} + /// findMatchingInsn - Scan the instructions looking for a load/store that can /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, - unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -520,21 +1071,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, unsigned Opc = FirstMI->getOpcode(); bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); + bool IsUnscaled = isUnscaledLdSt(FirstMI); + unsigned Reg = getLdStRegOp(FirstMI).getReg(); + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + int Offset = getLdStOffsetOp(FirstMI).getImm(); + bool IsNarrowStore = isNarrowStore(Opc); + + // For narrow stores, find only the case where the stored value is WZR. + if (IsNarrowStore && Reg != AArch64::WZR) + return E; // Early exit if the first instruction modifies the base register. // e.g., ldr x0, [x0] - // Early exit if the offset is not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 if (FirstMI->modifiesRegister(BaseReg, TRI)) return E; - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + int OffsetStride = IsUnscaled ? 
getMemScale(FirstMI) : 1; + if (!(isNarrowLoad(Opc) || IsNarrowStore) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; // Track which registers have been modified and used between the first insn @@ -557,18 +1114,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, ++Count; bool CanMergeOpc = Opc == MI->getOpcode(); - SExtIdx = -1; + Flags.setSExtIdx(-1); if (!CanMergeOpc) { bool IsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - if (!IsValidLdStrOpc) - continue; + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); // Opc will be the first instruction in the pair. - SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0; + Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); } - if (CanMergeOpc && MI->getOperand(2).isImm()) { + if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -579,8 +1137,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Pairwise instructions have a 7-bit signed offset field. Single insns // have a 12-bit unsigned offset field. To be a valid combine, the // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); + unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + int MIOffset = getLdStOffsetOp(MI).getImm(); if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; @@ -591,30 +1149,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // If the resultant immediate offset of merging these instructions // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + bool MIIsUnscaled = isUnscaledLdSt(MI); + bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); + if (!IsNarrowLoad && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableAArch64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) + + if (IsNarrowLoad || IsNarrowStore) { + // If the alignment requirements of the scaled wide load/store + // instruction can't express the offset of the scaled narrow + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); - continue; + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. 
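For intuition, here is a rough standalone model of the adjacency and range tests this scan performs. It is hedged: the function names are invented, and the model ignores the register, alignment, and alias checks the real code layers on top.

#include <cstdio>

// Two offsets can pair when they are exactly one element apart and the lower
// one fits LDP/STP's signed 7-bit scaled immediate (mirrors inBoundsForPair).
static bool inBoundsForPairModel(bool IsUnscaled, int Offset,
                                 int OffsetStride) {
  if (IsUnscaled)
    Offset /= OffsetStride; // byte offset -> element offset
  return Offset <= 63 && Offset >= -64;
}

static bool offsetsCanPair(int OffsetA, int OffsetB, int OffsetStride,
                           bool IsUnscaled) {
  if (OffsetA != OffsetB + OffsetStride && OffsetA + OffsetStride != OffsetB)
    return false; // not adjacent elements
  int MinOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  return inBoundsForPairModel(IsUnscaled, MinOffset, OffsetStride);
}

int main() {
  // ldr x0, [x2] and ldr x1, [x2, #8] carry scaled immediates 0 and 1.
  std::printf("%d\n", offsetsCanPair(0, 1, /*OffsetStride=*/1,
                                     /*IsUnscaled=*/false)); // 1
}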
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (MayLoad && Reg == MI->getOperand(0).getReg()) { + // For narrow stores, allow only when the stored value is the same + // (i.e., WZR). + if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || + (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } @@ -622,10 +1193,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the // first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && + !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { - MergeForward = false; + Flags.setMergeForward(false); return MBBI; } @@ -633,11 +1204,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !(FirstMI->mayLoad() && - UsedRegs[FirstMI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { - MergeForward = true; + Flags.setMergeForward(true); return MBBI; } // Unable to combine these instructions due to interference in between. @@ -666,51 +1236,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == AArch64::ADDXri || - Update->getOpcode() == AArch64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
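The two symmetric legality tests above decide the merge direction. The following condensed sketch (plain C++, with invented names, and with booleans standing in for the bitvector and memory-alias queries) captures just that decision.

// MergeForward == false: pull the second instruction up to the first;
// MergeForward == true: push the first instruction down to the second.
struct MergeDecision {
  bool CanMerge;
  bool MergeForward;
};

static MergeDecision chooseMergeDirection(bool SecondRegModified,
                                          bool SecondLoadRegUsed,
                                          bool SecondAliasesInBetween,
                                          bool FirstRegModified,
                                          bool FirstLoadRegUsed,
                                          bool FirstAliasesInBetween) {
  if (!SecondRegModified && !SecondLoadRegUsed && !SecondAliasesInBetween)
    return {true, false};
  if (!FirstRegModified && !FirstLoadRegUsed && !FirstAliasesInBetween)
    return {true, true};
  return {false, false}; // interference in between: keep scanning or give up
}

int main() {
  return chooseMergeDirection(false, false, false,
                              true, true, true).MergeForward ? 1 : 0;
}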
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == AArch64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); @@ -723,20 +1251,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; - DEBUG(dbgs() << "Creating post-indexed load/store."); + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -752,8 +1296,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -769,44 +1314,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 63 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); - // If the base register overlaps the destination register, we can't - // merge the update. 
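A small self-contained restatement of the immediate constraints just encoded (illustrative only; the helper name is ours): non-paired writeback forms take a signed 9-bit byte offset, paired forms a signed 7-bit offset in units of the element size.

#include <cstdio>

static bool isFoldableUpdateImm(bool IsPaired, int ScaleBytes,
                                int UpdateBytes) {
  if (!IsPaired)
    return UpdateBytes <= 255 && UpdateBytes >= -256; // simm9, in bytes
  if (UpdateBytes % ScaleBytes != 0)
    return false;              // must advance a whole number of elements
  int Scaled = UpdateBytes / ScaleBytes;
  return Scaled <= 63 && Scaled >= -64;               // simm7, in elements
}

int main() {
  // add x2, x2, #32 after an X-register LDP (8-byte elements): 32/8 = 4,
  // which is in range, so the add can fold into a writeback form.
  std::printf("%d\n", isFoldableUpdateImm(true, 8, 32)); // 1
}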
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) return E; - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -825,7 +1391,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -845,21 +1411,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -878,7 +1445,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -892,17 +1459,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + // If this is a volatile load, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm. + // FIXME: It is possible to extend it to handle reg+reg cases. 
+ if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Look backward up to ScanLimit instructions. + MachineBasicBlock::iterator StoreI; + if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + ++NumLoadsFromStoresPromoted; + // Promote the load. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = promoteLoadFromStore(MBBI, StoreI); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::tryToMergeLdStInst( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + MachineBasicBlock::iterator E = MI->getParent()->end(); + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) + return false; + + // Look ahead up to ScanLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); + if (Paired != E) { + if (isNarrowLoad(MI)) { + ++NumNarrowLoadsPromoted; + } else if (isNarrowStore(MI)) { + ++NumZeroStoresPromoted; + } else { + ++NumPairCreated; + if (isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + } + + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, Flags); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, + bool enableNarrowLdOpt) { bool Modified = false; - // Two transformations to do here: - // 1) Find loads and stores that can be merged into a single load or store + // Four transformations to do here: + // 1) Find loads that directly read from stores and promote them by + // replacing with mov instructions. If the store is wider than the load, + // the load will be replaced with a bitfield extract. + // e.g., + // str w1, [x0, #4] + // ldrh w2, [x0, #6] + // ; becomes + // str w1, [x0, #4] + // lsr w2, w1, #16 + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store + // 4) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] @@ -918,6 +1569,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + if (tryToPromoteLoadFromStore(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. 
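Condensing the gating at the top of tryToPromoteLoadFromStore and tryToMergeLdStInst above into one place (a sketch with invented field names, not LLVM API): a memory instruction is even considered only when all three gates pass.

struct MemCandidate {
  bool HasOrderedMemRef; // volatile/atomic ordering: leave it alone
  bool OffsetIsImm;      // reg+imm addressing, not an address relocation
  bool PairSuppressed;   // hint set by AArch64StorePairSuppress
};

static bool isMergeCandidate(const MemCandidate &C) {
  return !C.HasOrderedMemRef && C.OffsetIsImm && !C.PairSuppressed;
}

int main() {
  MemCandidate C{false, true, false};
  return isMergeCandidate(C) ? 0 : 1;
}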
+ } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + enableNarrowLdOpt && MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::STRBBui: + case AArch64::STRHHui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::STURBBi: + case AArch64::STURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -929,7 +1643,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -941,37 +1655,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool MergeForward = false; - int SExtIdx = -1; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); - + if (tryToMergeLdStInst(MBBI)) { Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; break; } ++MBBI; @@ -992,17 +1677,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -1012,25 +1702,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. 
+ case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { + if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdst(Opc)) { + if (isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1043,28 +1749,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; @@ -1081,13 +1784,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } +bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { + bool ProfitableArch = Subtarget->isCortexA57(); + // FIXME: The benefit from converting narrow loads into a wider load could be + // microarchitectural as it assumes that a single load with two bitfield + // extracts is cheaper than two narrow loads. Currently, this conversion is + // enabled only in cortex-a57 on which performance benefits were verified. 
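As a reminder of the unit mismatch the UnscaledOffset computation above corrects (a sketch; the helper name is invented): scaled loads and stores encode their immediate in elements, while unscaled (LDUR/STUR) forms and the add/sub being folded count bytes.

#include <cstdio>

static int byteOffsetOfMemImm(int Imm, int MemScaleBytes, bool IsUnscaled) {
  return IsUnscaled ? Imm : Imm * MemScaleBytes;
}

int main() {
  // ldr x1, [x0, #64] carries machine immediate 8 (elements of 8 bytes),
  // so it matches "add x0, x0, #64" only after scaling: 8 * 8 == 64.
  std::printf("%d\n", byteOffsetOfMemImm(8, 8, /*IsUnscaled=*/false)); // 64
}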
+ return ProfitableArch && !Subtarget->requiresStrictAlign(); +} + bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo()); - TRI = Fn.getSubtarget().getRegisterInfo(); + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + TRI = Subtarget->getRegisterInfo(); bool Modified = false; + bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; } @@ -1095,8 +1809,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. +/// createAArch64LoadStoreOptimizationPass - returns an instance of the +/// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { return new AArch64LoadStoreOpt(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 580427a..2b4cdf1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h deleted file mode 100644 index 4164b33..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h +++ /dev/null @@ -1,42 +0,0 @@ -//===- AArch64MachineCombinerPattern.h -===// -//===- AArch64 instruction pattern supported by combiner -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines instruction pattern supported by combiner -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H - -namespace llvm { - -/// Enumeration of instruction pattern supported by machine combiner -/// -/// -namespace MachineCombinerPattern { -enum MC_PATTERN : int { - MC_NONE = 0, - MC_MULADDW_OP1 = 1, - MC_MULADDW_OP2 = 2, - MC_MULSUBW_OP1 = 3, - MC_MULSUBW_OP2 = 4, - MC_MULADDWI_OP1 = 5, - MC_MULSUBWI_OP1 = 6, - MC_MULADDX_OP1 = 7, - MC_MULADDX_OP2 = 8, - MC_MULSUBX_OP1 = 9, - MC_MULSUBX_OP2 = 10, - MC_MULADDXI_OP1 = 11, - MC_MULSUBXI_OP1 = 12 -}; -} // end namespace MachineCombinerPattern -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 536a8d0..318f839 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackToRestore; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// \brief Amount of stack frame size, not including callee-saved registers. @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index e1b93bf..79c09d9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions( for (const auto &IPI : InsertPts) { // Create the load of the global variable. 
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first); + IRBuilder<> Builder(IPI.first); LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); DEBUG(dbgs() << "**********\n"); DEBUG(dbgs() << "New def: "); @@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { bool LocalChange = false; SmallPtrSet<Constant *, 8> AlreadyChecked; - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. for (Value *Op : I.operand_values()) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 841af55..32b4888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -34,10 +35,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -static cl::opt<bool> -ReserveX18("aarch64-reserve-x18", cl::Hidden, - cl::desc("Reserve X18, making it unavailable as GPR")); - AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} @@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? 
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::W29); } - if (TT.isOSDarwin() || ReserveX18) { + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return TT.isOSDarwin() || ReserveX18; + return MF.getSubtarget<AArch64Subtarget>().isX18Reserved(); case AArch64::FP: case AArch64::W29: return TFI->hasFP(MF) || TT.isOSDarwin(); @@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } -bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { - - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -// FIXME: share this with other backends with identical implementation? 
-bool -AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64RegClassID: case AArch64::GPR32commonRegClassID: case AArch64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP - - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register - - hasBasePointer(MF); // X19 + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - MF.getSubtarget<AArch64Subtarget>() + .isX18Reserved() // X18 reserved as platform register + - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: case AArch64::FPR32RegClassID: diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 8c379d9..f33f788 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,8 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; @@ -93,9 +95,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca0..a8c8b17 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 486efd6..f6ee8cf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -31,6 +31,11 @@ static cl::opt<bool> EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); +// If OS supports TBI, use this flag to enable it. 
+static cl::opt<bool> +UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " + "an address is ignored"), cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { // Determine default and user-specified characteristics @@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), - HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), + HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), + CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), TLInfo(TM, *this) {} @@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; Policy.OnlyBottomUp = false; + // Enabling or disabling the latency heuristic is a close call: it seems to + // help nearly no benchmark on out-of-order architectures; on the other hand, + // it regresses register pressure on a few benchmarks. + if (isCyclone()) + Policy.DisableLatencyHeuristic = true; } bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } +bool AArch64Subtarget::supportsAddressTopByteIgnored() const { + if (!UseAddressTopByteIgnored) + return false; + + if (TargetTriple.isiOS()) { + unsigned Major, Minor, Micro; + TargetTriple.getiOSVersion(Major, Minor, Micro); + return Major >= 8; + } + + return false; +} + std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { if (!isCortexA57()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 6bb0694..151133b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -33,17 +33,28 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum { + Others, + CortexA35, + CortexA53, + CortexA57, + Cyclone, + ExynosM1 + }; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; bool HasV8_1aOps; + bool HasV8_2aOps; bool HasFPARMv8; bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasPerfMon; + bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -51,6 +62,12 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing; + // StrictAlign - Disallow unaligned memory accesses. + bool StrictAlign; + + // ReserveX18 - X18 is not available as a general purpose register. + bool ReserveX18; + bool IsLittle; /// CPUString - String name of used CPU. 
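For background on what the aarch64-use-tbi flag above assumes (an illustrative sketch, not LLVM code): with top-byte-ignore, bits [63:56] of a pointer do not participate in address translation, so software can carry a tag there. This sketch only packs and strips the bits and never dereferences, so it runs anywhere.

#include <cstdint>
#include <cstdio>

static uint64_t withTopByteTag(uint64_t Addr, uint8_t Tag) {
  return (Addr & 0x00FFFFFFFFFFFFFFull) | (uint64_t(Tag) << 56);
}

static uint64_t stripTopByteTag(uint64_t Tagged) {
  return Tagged & 0x00FFFFFFFFFFFFFFull;
}

int main() {
  uint64_t P = 0x00007FDEADBEEF00ull;
  uint64_t T = withTopByteTag(P, 0x2A);
  std::printf("%#llx %#llx\n", (unsigned long long)T,
              (unsigned long long)stripTopByteTag(T)); // tagged, then clean
}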
@@ -92,19 +109,30 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isCortexA53() || isCortexA57(); + return isGeneric() || isCortexA53() || isCortexA57(); } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool requiresStrictAlign() const { return StrictAlign; } + + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + /// CPU has TBI (top byte of addresses is ignored during HW address + /// translation) and OS enables it. + bool supportsAddressTopByteIgnored() const; + + bool hasPerfMon() const { return HasPerfMon; } + bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } @@ -112,14 +140,17 @@ public: bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isGeneric() const { return CPUString == "generic"; } bool isCyclone() const { return CPUString == "cyclone"; } bool isCortexA57() const { return CPUString == "cortex-a57"; } bool isCortexA53() const { return CPUString == "cortex-a53"; } + bool isExynosM1() const { return CPUString == "exynos-m1"; } bool useAA() const override { return isCortexA53(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index db6e244..c52c554 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -203,7 +203,7 @@ public: } // namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(AArch64TTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e085cca..9af0e64 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { +int AArch64TTIImpl::getIntImmCost(int64_t Val) { // Check if the immediate can be encoded within an instruction. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) return 0; @@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// \brief Calculate the cost of materializing the given constant. 
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialize the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<int>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } break; @@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); - static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { + static const TypeConversionCostTblEntry + ConversionTbl[] = { + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + + // The number of shll instructions for the extension. 
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + // LowerVectorINT_TO_FP: { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, @@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + // Complex: to v8f32 + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + + // Complex: to v16f32 + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + // Complex: to v2f64 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, @@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, }; - int Idx = ConvertCostTableLookup<MVT>( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 2; + return 3; } -unsigned AArch64TTIImpl::getArithmeticInstrCost( +int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties may not be the same as those of the previous // operation; conservatively assume OP_None. 
- unsigned Cost = - getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( } } -unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. + // We don't lower some vector selects well that are wider than the register + // width. if (ValTy->isVectorTy() && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. - const unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> + const int AmortizationCost = 20; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } @@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isIntegerTy(64)) { @@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // practice on inlined memcpy code. 
// We make v2i64 stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; + int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } @@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) { +int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) @@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( Alignment, AddressSpace); } -unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { - unsigned Cost = 0; +int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { + int Cost = 0; for (auto *I : Tys) { if (!I->isVectorTy()) continue; @@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 444d3cc..ec58c4f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { }; public: - explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F) + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -63,12 +63,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(int64_t Val); - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(int64_t Val); + int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -76,6 +75,8 @@ public: /// \name Vector TTI Implementations /// @{ + bool enableInterleavedAccessVectorization() { 
return true; } + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) @@ -96,25 +97,25 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, bool IsComplex); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -123,11 +124,9 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38e8b4d..394c8e7 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -43,7 +43,6 @@ class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; // Map of register aliases registers via the .req directive. StringMap<std::pair<bool, unsigned> > RegisterReqs; @@ -101,6 +100,7 @@ private: OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -115,16 +115,16 @@ public: #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI) { + : MCTargetAsmParser(Options, STI) { MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); // Initialize the set of available features. 
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -160,7 +160,8 @@ private: k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -228,6 +229,12 @@ private: unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -251,6 +258,7 @@ private: struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -302,6 +310,9 @@ public: case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -393,6 +404,16 @@ public: return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -497,6 +518,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } + bool isImm0_1() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 2); + } bool isImm0_7() const { if (!isImm()) return false; @@ -876,12 +906,15 @@ public: } bool isMSRSystemRegister() const { if (!isSysReg()) return false; - return SysReg.MSRReg != -1U; } - bool isSystemPStateField() const { + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; - + return (SysReg.PStateField == AArch64PState::PAN || + SysReg.PStateField == AArch64PState::UAO); + } + bool isSystemPStateFieldWithImm0_15() const { + if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } bool isReg() const override { return Kind == k_Register && !Reg.isVector; } @@ -950,6 +983,7 @@ public: } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1175,8 +1209,10 @@ public: template <unsigned NumRegs> void addVectorList64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, - AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + static const unsigned FirstRegs[] = { AArch64::D0, + AArch64::D0_D1, + AArch64::D0_D1_D2, + AArch64::D0_D1_D2_D3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1186,8 +1222,10 @@ public: template <unsigned NumRegs> void addVectorList128Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, - AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + static const unsigned FirstRegs[] = { AArch64::Q0, + AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, + AArch64::Q0_Q1_Q2_Q3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1304,6 +1342,12 @@ public: 
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16)); } + void addImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); @@ -1491,7 +1535,13 @@ public: Inst.addOperand(MCOperand::createImm(SysReg.MSRReg)); } - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); @@ -1507,6 +1557,11 @@ public: Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1703,6 +1758,19 @@ public: return Op; } + static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "<prfop invalid #" << getPrefetch() << ">"; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } @@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, S, getContext())); return MatchOperand_Success; @@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); unsigned prfop = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; @@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = 
getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy @@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("cisw")) { // SYS #0, C7, C14, #2 SYS_ALIAS(0, 7, 14, 2); + } else if (!Op.compare_lower("cvap")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #3, C7, C12, #1 + SYS_ALIAS(3, 7, 12, 1); + } else { + return TokError("DC CVAP requires ARMv8.2a"); + } } else { return TokError("invalid operand for DC instruction"); } @@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("s12e0w")) { // SYS #4, C7, C8, #7 SYS_ALIAS(4, 7, 8, 7); + } else if (!Op.compare_lower("s1e1rp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #0 + SYS_ALIAS(0, 7, 9, 0); + } else { + return TokError("AT S1E1RP requires ARMv8.2a"); + } + } else if (!Op.compare_lower("s1e1wp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #1 + SYS_ALIAS(0, 7, 9, 1); + } else { + return TokError("AT S1E1WP requires ARMv8.2a"); + } } else { return TokError("invalid operand for AT instruction"); } @@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, ExprLoc, getContext())); return MatchOperand_Success; @@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); unsigned Opt = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { bool IsKnown; auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MRSReg != -1U) && "register should be -1 if and only if it's unknown"); auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); auto PStateMapper = AArch64PState::PStateMapper(); uint32_t PStateField = - 
PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown); + PStateMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); @@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, if (Operands.size() < 2 || !static_cast<AArch64Operand &>(*Operands[1]).isReg()) - return true; + return Error(Loc, "Only valid when first operand is register"); bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( @@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, } // If it is a label or an imm that cannot fit in a movz, put it into CP. const MCExpr *CPLoc = - getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4); + getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc); Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); return false; } @@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); case Match_InvalidMemoryIndexed16: return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_1: + return Error(Loc, "immediate must be an integer in range [0, 1]."); case Match_InvalidImm0_7: return Error(Loc, "immediate must be an integer in range [0, 7]."); case Match_InvalidImm0_15: @@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { unsigned zreg = - AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( RegOp.getReg()) ? AArch64::WZR : AArch64::XZR; @@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If that fails, try against the alternate table containing long-form NEON: // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) + if (MatchResult != Match_Success) { + // But first, save the short-form match result: we can use it in case the + // long-form match also fails. + auto ShortFormNEONErrorInfo = ErrorInfo; + auto ShortFormNEONMatchResult = MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + // Now, both matches failed, and the long-form match failed on the mnemonic + // suffix token operand. The short-form match failure is probably more + // relevant: use it instead. 
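The fallback-matching strategy in this hunk is a small state machine worth spelling out: run the short-form match, stash its diagnostic, retry against the long-form NEON table, and if the retry dies on the mnemonic-suffix token (operand index 1), surface the saved short-form error as the more useful one. A hedged sketch of the same shape, with invented names:

#include <functional>

enum MatchResult { Success, InvalidOperand, NoMatch };
struct Diag { MatchResult Result; unsigned ErrorInfo; };

// Returns the diagnostic most likely to describe the user's actual mistake.
Diag matchWithFallback(const std::function<Diag()> &ShortForm,
                       const std::function<Diag()> &LongForm) {
  Diag Short = ShortForm();
  if (Short.Result == Success)
    return Short;
  Diag Long = LongForm();
  if (Long.Result == Success)
    return Long;
  // A long-form failure on operand 1 (the suffix token) means the input was
  // probably short-form syntax all along; prefer the short-form diagnostic.
  if (Long.Result == InvalidOperand && Long.ErrorInfo == 1)
    return Short;
  return Long;
}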
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 && + Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() && + ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { + MatchResult = ShortFormNEONMatchResult; + ErrorInfo = ShortFormNEONErrorInfo; + } + } + + switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return showMatchError(IDLoc, MatchResult); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_1: case Match_InvalidImm0_7: case Match_InvalidImm0_15: case Match_InvalidImm0_31: @@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, STI); + getParser().getStreamer().EmitInstruction(Inst, getSTI()); return false; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index db9fb0e..f1f968e 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, uint64_t pstate_field = (op1 << 3) | op2; + if ((pstate_field == AArch64PState::PAN || + pstate_field == AArch64PState::UAO) && crm > 1) + return Fail; + Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 7f56c2c..d8a8108 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, unsigned Opcode = MI->getOpcode(); if (Opcode == AArch64::SYSxt) - if (printSysAlias(MI, O)) { + if (printSysAlias(MI, STI, O)) { printAnnotation(O, Annot); return; } @@ -269,7 +270,7 @@ struct LdStNInstrDesc { int NaturalOffset; }; -static LdStNInstrDesc LdStNInstInfo[] = { +static const LdStNInstrDesc LdStNInstInfo[] = { { AArch64::LD1i8, "ld1", 
".b", 1, true, 0 }, { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, @@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = { { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, }; -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { unsigned Idx; for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) if (LdStNInstInfo[Idx].Opcode == Opcode) @@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; // Now onto the operands: first a vector list with possible lane @@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, AArch64InstPrinter::printInst(MI, O, Annot, STI); } -bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { #ifndef NDEBUG unsigned Opcode = MI->getOpcode(); assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); @@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcvau"; break; + case 12: + if (Op1Val == 3 && Op2Val == 1 && + (STI.getFeatureBits()[AArch64::HasV8_2aOps])) + Asm = "dc\tcvap"; + break; case 14: if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcivac"; @@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { break; } break; + case 9: + switch (Op1Val) { + default: + break; + case 0: + if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1rp"; break; + case 1: Asm = "at\ts1e1wp"; break; + } + } + break; + } } } else if (CnVal == 8) { // TLBI aliases @@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 15dee97..ea68d98 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -15,14 +15,10 @@ #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class AArch64InstPrinter : public MCInstPrinter { public: AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, @@ -48,7 +44,8 @@ public: unsigned AltIdx = AArch64::NoRegAltName); protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); + bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, + 
raw_ostream &O); // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -122,6 +119,9 @@ protected: void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index ed24343..648b1df 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) { return FPUnion.F; } +/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP16Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 + int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x3f) + return -1; + Mantissa >>= 6; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP16Imm(const APFloat &FPImm) { + return getFP16Imm(FPImm.bitcastToAPInt()); +} + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit /// floating-point value. If the value cannot be represented as an 8-bit /// floating-point value, then return -1. diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 16d5356..d26604f 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -128,10 +128,9 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } private: diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 921c4b9..fbce26e 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { UseDataRegionDirectives = true; ExceptionsType = ExceptionHandling::DwarfCFI; - - // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use - // LShr instead of AShr. 
- UseLogicalShr = true; } const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 2870341..a540f49 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { Streamer.visitUsedExpr(*getSubExpr()); } -MCSection *AArch64MCExpr::findAssociatedSection() const { +MCFragment *AArch64MCExpr::findAssociatedFragment() const { llvm_unreachable("FIXME: what goes here?"); } bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) return false; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 1165314..db36a65 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -149,11 +149,10 @@ public: void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override; + MCFragment *findAssociatedFragment() const override; - bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; @@ -162,7 +161,6 @@ public: } static bool classof(const AArch64MCExpr *) { return true; } - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 741b273..61c96f1 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( Log2Size = llvm::Log2_32(4); // This encompasses the relocation for the whole 21-bit value. switch (Sym->getKind()) { - default: - Asm.getContext().reportFatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); + default: { + Asm.getContext().reportError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + return false; + } case MCSymbolRefExpr::VK_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; @@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation( // assembler local symbols. If we got here, that's not what we have, // so complain loudly. if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); + Asm.getContext().reportError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); return; } // 14-bit branch relocations should only target internal labels, and so // should never get here. 
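The getFP16Imm helper introduced in the AArch64AddressingModes.h hunk earlier packs a half-precision value into the 8-bit abcdefgh immediate: four mantissa bits (the value must have the form (16+efgh)/16) and three exponent bits (the exponent must fall in [-3, 4], stored as NOT(b):c:d biased by 3). A self-contained restatement operating on raw IEEE half bits, offered as an illustrative cross-check rather than the LLVM code itself:

#include <cassert>
#include <cstdint>

int fp16ToImm8(uint16_t Bits) {
  uint32_t Sign = (Bits >> 15) & 1;
  int32_t Exp = ((Bits >> 10) & 0x1f) - 15; // unbiased exponent
  int32_t Mantissa = Bits & 0x3ff;          // 10 fraction bits

  if (Mantissa & 0x3f)     // only the top 4 mantissa bits are encodable
    return -1;
  Mantissa >>= 6;
  if (Exp < -3 || Exp > 4) // only 3 exponent bits are encodable
    return -1;
  Exp = ((Exp + 3) & 0x7) ^ 4; // NOT(b):c:d form

  return (static_cast<int>(Sign) << 7) | (Exp << 4) | Mantissa;
}

int main() {
  assert(fp16ToImm8(0x3C00) == 0x70); // 1.0 (fp16) -> imm8 0x70
  assert(fp16ToImm8(0x3800) == 0x60); // 0.5 (fp16) -> imm8 0x60
  assert(fp16ToImm8(0x3C01) == -1);   // 1.0 plus one ulp: not encodable
}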
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); + Asm.getContext().reportError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); return; } if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + Asm)) { + Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; } @@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation( Type = MachO::ARM64_RELOC_UNSIGNED; if (IsPCRel) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); + Asm.getContext().reportError(Fixup.getLoc(), + "PC relative absolute relocation!"); + return; // FIXME: x86_64 sets the type to a branch reloc here. Should we do // something similar? @@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation( Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { // Otherwise, neither symbol can be modified. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + return; + } // AArch64 always uses external relocations. If there is no symbol to use as // a base address (a local symbol with no preceding non-local symbol), @@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation( // // FIXME: We should probably just synthesize an external symbol and use // that. - if (!A_Base) - Asm.getContext().reportFatalError( + if (!A_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + A->getName() + "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().reportFatalError( + return; + } + if (!B_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + B->getName() + "'. Must have non-local symbol earlier in section."); + return; + } - if (A_Base == B_Base && A_Base) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) - (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress( @@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation( // we need to preserve and merge with the new Target? How about // the FixedValue? 
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout, - &Fixup)) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); + &Fixup)) { + Asm.getContext().reportError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return; + } return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); } @@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation( Value += Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base); } else if (Symbol->isInSection()) { - if (!CanUseLocalRelocation) - Asm.getContext().reportFatalError( + if (!CanUseLocalRelocation) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + "'. Must have non-local symbol earlier in section."); + return; + } // Adjust the relocation to be section-relative. // The index is the section ordinal (1-based). const MCSection &Sec = Symbol->getSection(); @@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation( return; } } - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of variable '" + Symbol->getName() + "'"); + return; } } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 52b000d..3e86a42 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {} // The constant pool handling is shared by all AArch64TargetStreamer // implementations. const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr, - unsigned Size) { - return ConstantPools->addEntry(Streamer, Expr, Size); + unsigned Size, + SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, Size, Loc); } void AArch64TargetStreamer::emitCurrentConstantPool() { diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index fcc0d05..51432830 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -24,7 +24,7 @@ public: /// Callback used to implement the ldr= pseudo. /// Add a new entry to the constant pool for the current section and return an /// MCExpr that can be used to refer to the constant pool location. - const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size); + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc); /// Callback used to implement the .ltorg directive. /// Emit contents of constant pool for the current section.
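The SMLoc parameter threaded through addConstantPoolEntry above is purely for diagnostics: the pool is emitted long after the ldr= pseudo that filled it, so each entry has to carry its originating source location along. A simplified sketch of the idea, with stand-in types rather than LLVM's:

#include <cstdint>
#include <string>
#include <vector>

struct SrcLoc { std::string File; unsigned Line; };

struct PoolEntry {
  int64_t Value;
  unsigned Size;
  SrcLoc Loc; // kept so emission-time errors can point at the ldr= site
};

class ConstantPool {
  std::vector<PoolEntry> Entries;
public:
  // Returns the entry's index, playing the role of the MCExpr label the real
  // API hands back for the load to reference.
  unsigned addEntry(int64_t V, unsigned Size, SrcLoc Loc) {
    Entries.push_back({V, Size, std::move(Loc)});
    return static_cast<unsigned>(Entries.size() - 1);
  }
};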
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index ee85b65b..cde1c6d 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings // v8.1a "Privileged Access Never" extension-specific PStates {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.2a + {"uao", UAO, {AArch64::HasV8_2aOps}}, }; AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, + {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, {"mvfr0_el1", MVFR0_EL1, {}}, {"mvfr1_el1", MVFR1_EL1, {}}, {"mvfr2_el1", MVFR2_EL1, {}}, @@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, }; AArch64SysReg::MSRMapper::MSRMapper() { @@ -804,10 +813,28 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, + + // v8.2a registers + {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, +AArch64SysReg::SysRegMapper::fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const { std::string NameLower = Name.lower(); @@ -851,7 +878,7 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, } std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, const FeatureBitset& FeatureBits) const { // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { diff --git 
a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7e42f8e..e63627e 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -285,17 +285,17 @@ struct AArch64NamedImmMapper { // Zero value of FeatureBitSet means the mapping is always available FeatureBitset FeatureBitSet; - bool isNameEqual(std::string Other, + bool isNameEqual(std::string Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Name == Other; } - bool isValueEqual(uint32_t Other, + bool isValueEqual(uint32_t Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Value == Other; @@ -310,7 +310,7 @@ struct AArch64NamedImmMapper { StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, bool &Valid) const; // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, + uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const; /// Many of the instructions allow an alternative assembly form consisting of @@ -337,7 +337,9 @@ namespace AArch64AT { S12E1R = 0x63c4, // 01 100 0111 1000 100 S12E1W = 0x63c5, // 01 100 0111 1000 101 S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 + S12E0W = 0x63c7, // 01 100 0111 1000 111 + S1E1RP = 0x43c8, // 01 000 0111 1001 000 + S1E1WP = 0x43c9 // 01 000 0111 1001 001 }; struct ATMapper : AArch64NamedImmMapper { @@ -463,6 +465,9 @@ namespace AArch64PState { // v8.1a "Privileged Access Never" extension-specific PStates PAN = 0x04, + + // v8.2a "User Access Override" extension-specific PStates + UAO = 0x03 }; struct PStateMapper : AArch64NamedImmMapper { @@ -473,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -594,6 +614,7 @@ namespace AArch64SysReg { ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -1190,6 +1211,24 @@ namespace AArch64SysReg { SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + // v8.2a registers + UAO = 0xc214, // 11 000 0100 0010 100 + + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 
000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; @@ -1283,7 +1322,7 @@ namespace AArch64TLBI { return true; } } -} +} namespace AArch64II { /// Target Operand Flag enum. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 0a05d25..5d00e1c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,8 +20,10 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class MachineSchedContext; class MCAsmInfo; class raw_ostream; +class ScheduleDAGInstrs; class Target; class TargetMachine; @@ -44,15 +46,23 @@ FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); + +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); +extern char &AMDGPUAnnotateKernelFeaturesID; void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIFixSGPRCopiesPass(PassRegistry &); +extern char &SIFixSGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -64,6 +74,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +FunctionPass *createAMDGPUAnnotateUniformValues(); void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; @@ -71,6 +83,8 @@ extern char &SIFixControlFlowLiveIntervalsID; void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; +void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); +extern char &AMDGPUAnnotateUniformValuesPassID; extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -85,8 +99,6 @@ enum TargetIndex { }; } -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - } // End namespace llvm namespace ShaderType { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index 68b5050..db869cf 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -108,11 +108,21 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol "true", "Force using DS instruction immediate offsets on SI">; +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global">; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", "Support flat address space">; +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support">; + def FeatureVGPRSpilling : 
SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", @@ -272,9 +282,14 @@ def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" >, AssemblerPredicate<"FeatureGCN1Encoding">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; list<Predicate> AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; list<Predicate> OtherPredicates = []; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp new file mode 100644 index 0000000..3781839 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -0,0 +1,126 @@ +//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass adds target attributes to functions which use intrinsics +/// which will impact calling convention lowering. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "amdgpu-annotate-kernel-features" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateKernelFeatures : public ModulePass { +private: + void addAttrToCallers(Function *Intrin, StringRef AttrName); + bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + +public: + static char ID; + + AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { + return "AMDGPU Annotate Kernel Features"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } +}; + +} + +char AMDGPUAnnotateKernelFeatures::ID = 0; + +char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; + + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) +INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, + StringRef AttrName) { + SmallPtrSet<Function *, 4> SeenFuncs; + + for (User *U : Intrin->users()) { + // CallInst is the only valid user for an intrinsic. 
+ CallInst *CI = cast<CallInst>(U); + + Function *CallingFunction = CI->getParent()->getParent(); + if (SeenFuncs.insert(CallingFunction).second) + CallingFunction->addFnAttr(AttrName); + } +} + +bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( + Module &M, + ArrayRef<StringRef[2]> IntrinsicToAttr) { + bool Changed = false; + + for (const StringRef *Arr : IntrinsicToAttr) { + if (Function *Fn = M.getFunction(Arr[0])) { + addAttrToCallers(Fn, Arr[1]); + Changed = true; + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + + static const StringRef IntrinsicToAttr[][2] = { + // .x omitted + { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, + { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, + + // .x omitted + { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, + { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } + + }; + + static const StringRef HSAIntrinsicToAttr[][2] = { + { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" }, + + { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }, + { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" } + }; + + // TODO: Intrinsics that require queue ptr. + + // We do not need to note the x workitem or workgroup id because they are + // always initialized. + + bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); + if (TT.getOS() == Triple::AMDHSA) + Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); + + return Changed; +} + +ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { + return new AMDGPUAnnotateKernelFeatures(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp new file mode 100644 index 0000000..dfddc34 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -0,0 +1,84 @@ +//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass adds amdgpu.uniform metadata to IR values so this information +/// can be used during instruction selection. 
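As a consumer-side sketch (not from this patch; the helper name is hypothetical), later code can test for the marker the pass attaches:

    // True if AMDGPUAnnotateUniformValues proved this pointer-producing
    // instruction uniform across the wavefront.
    static bool hasUniformAddress(const Instruction &PtrI) {
      return PtrI.getMetadata("amdgpu.uniform") != nullptr;
    }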
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-annotate-uniform" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateUniformValues : public FunctionPass, + public InstVisitor<AMDGPUAnnotateUniformValues> { + DivergenceAnalysis *DA; + +public: + static char ID; + AMDGPUAnnotateUniformValues() : + FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DivergenceAnalysis>(); + AU.setPreservesAll(); + } + + void visitLoadInst(LoadInst &I); + +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) + +char AMDGPUAnnotateUniformValues::ID = 0; + +void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + if (!DA->isUniform(Ptr)) + return; + + if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) + PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + +} + +bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + return false; +} + +bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + DA = &getAnalysis<DivergenceAnalysis>(); + visit(F); + + return true; +} + +FunctionPass * +llvm::createAMDGPUAnnotateUniformValues() { + return new AMDGPUAnnotateUniformValues(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0a5309b..1239dfb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -91,6 +91,25 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) {} +void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + // Need to construct an MCSubtargetInfo here in case we have no functions + // in the module. 
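// (No MachineFunction exists yet when EmitStartOfAsmFile runs, so the
// usual per-function subtarget is unavailable; a standalone
// MCSubtargetInfo keeps the HSA ISA directives emitted below correct
// even for a module with no function definitions.)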
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + TS->EmitDirectiveHSACodeObjectVersion(1, 0); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, + "AMD", "AMDGPU"); +} + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; @@ -100,14 +119,67 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { } } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); + if (MFI->isKernel() && STM.isAmdHsaOS()) { + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(), + ELF::STT_AMDGPU_HSA_KERNEL); + } + + AsmPrinter::EmitFunctionEntryLabel(); +} - // This label is used to mark the end of the .text section. - const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. 
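// (The group segment is LDS: it is allocated per work-group at dispatch
// time and has no initialized backing store, so there is nothing to
// emit into the object file for it.)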
+ if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); + const DataLayout &DL = getDataLayout(); + + // Emit the size + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -125,17 +197,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); if (!STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } - // Emit directives - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); - AMDGPU::IsaVersion ISA = STM.getIsaVersion(); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); } else { EmitProgramInfoR600(MF); } @@ -165,6 +230,23 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), false); + + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), + false); + } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( @@ -278,27 +360,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned width = 0; bool isSGPR = false; - if (!MO.isReg()) { + if (!MO.isReg()) continue; - } + unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { + switch (reg) { + case AMDGPU::EXEC: + case AMDGPU::SCC: + case AMDGPU::M0: + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: VCCUsed = true; continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: FlatUsed = true; continue; - } - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; + default: + break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { @@ -348,11 +433,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } + unsigned 
ExtraSGPRs = 0; + if (VCCUsed) - MaxSGPR += 2; + ExtraSGPRs = 2; - if (FlatUsed) - MaxSGPR += 2; + if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (FlatUsed) + ExtraSGPRs = 4; + } else { + if (STM.isXNACKEnabled()) + ExtraSGPRs = 4; + + if (FlatUsed) + ExtraSGPRs = 6; + } + + MaxSGPR += ExtraSGPRs; // We found the maximum register index. They start at 0, so add one to get the // number of registers. @@ -368,6 +465,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode @@ -419,18 +521,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -476,7 +587,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } } @@ -491,14 +604,56 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if 
(MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (STM.isXNACKEnabled()) + header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; header.kernarg_segment_byte_size = MFI->ABIArgOffset; header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; - + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 345af9b..99d4091 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -99,7 +99,11 @@ public: void EmitFunctionBodyStart() override; - void EmitEndOfAsmFile(Module &M) override; + void EmitFunctionEntryLabel() override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; + + void EmitStartOfAsmFile(Module &M) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 6ffa7a0..b0db261 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -20,28 +20,83 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>>, + // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
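// (For the count: 32 vec4 inputs need 32 * 4 = 128 VGPRs, and the "+ 4"
// presumably covers a few system-generated values, giving a minimum of
// 132, which fits in the 136 registers (VGPR0..VGPR135) listed below.)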
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, - VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 ]>>>, CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>> ]>; +def RetCC_SI : CallingConv<[ + CCIfType<[i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>, + + // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
+ CCIfType<[f32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>> +]>; + // Calling convention for R600 def CC_R600 : CallingConv<[ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp new file mode 100644 index 0000000..2f6b302 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp @@ -0,0 +1,26 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" + +using namespace llvm; + +DiagnosticInfoUnsupported::DiagnosticInfoUnsupported( + const Function &Fn, + const Twine &Desc, + DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + +int DiagnosticInfoUnsupported::KindID = 0; + +void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h new file mode 100644 index 0000000..0fd37e1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h @@ -0,0 +1,48 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H + +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +namespace llvm { + +/// Diagnostic information for unimplemented or unsupported feature reporting. 
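The DiagnosticInfoUnsupported class declared below is used by other files in this patch (see SelectAddrSpaceCast and LowerDYNAMIC_STACKALLOC); a minimal usage sketch, assuming Fn and DAG are in scope:

    DiagnosticInfoUnsupported Diag(Fn, "dynamic alloca"); // severity defaults to DS_Error
    DAG.getContext()->diagnose(Diag);
    // Prints: "unsupported dynamic alloca in <function name>"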
+class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error); + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 8175786..4d84d28 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { } /// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + // Start the offset at 2 so we don't overwrite work group information. // XXX: We should only do this when the shader actually uses this // information. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 9f31be1..257a3da 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -8,14 +8,12 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. +/// \brief Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -34,7 +32,8 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 64c54cc..b33040b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -11,6 +11,8 @@ /// \brief Defines an instruction selector for the AMDGPU target. 
// //===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" @@ -20,9 +22,9 @@ #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const AMDGPUSubtarget *Subtarget; + public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); bool runOnMachineFunction(MachineFunction &MF) override; SDNode *Select(SDNode *N) override; const char *getPassName() const override; + void PreprocessISelDAG() override; void PostprocessISelDAG() override; private: @@ -91,7 +95,7 @@ private: bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -108,6 +112,16 @@ private: SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &GLC) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, + bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, + bool &Imm) const; + bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; SDNode *SelectAddrSpaceCast(SDNode *N); bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { return N; } +static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { + switch (NumVectorElts) { + case 1: + return AMDGPU::SReg_32RegClassID; + case 2: + return AMDGPU::SReg_64RegClassID; + case 4: + return AMDGPU::SReg_128RegClassID; + case 8: + return AMDGPU::SReg_256RegClassID; + case 16: + return AMDGPU::SReg_512RegClassID; + } + + llvm_unreachable("invalid vector size"); +} + SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsEq(MVT::i32)); if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if 
(!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } + RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG // that adds a 128 bits reg copy when going through TwoAddressInstructions @@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, N->getValueType(0), Ops); } - - case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patters, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization, however, so places (ExpandUnalignedLoad) - // in the DAG legalizer assume that if i64 is legal, so doing this - // promotion early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - + case ISD::LOAD: case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. - StoreSDNode *ST = cast<StoreSDNode>(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. 
- N = NewStore.getNode(); - } - N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal); - } case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); @@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } - bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { assert(AS != 0 && "Use checkPrivateAddress instead."); if (!Ptr) @@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + if (PSV && PSV->isConstantPool()) { return true; } } @@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod SDValue Ops[8]; SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); @@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Offset = N1; return true; } - } + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + int64_t ByteOffset = C->getSExtValue(); + if (isUInt<16>(ByteOffset)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset = Addr.getOperand(0); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + // If we have a constant address, prefer to put the constant into the + // offset. 
This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. - SDLoc DL(Addr); + SDLoc DL(Addr); - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { if (isUInt<16>(CAddr->getZExtValue())) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, @@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // default case Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); return true; } +// TODO: If offset is too big, put low 16-bit into offset. bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const { @@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); return true; } - } - - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + unsigned DWordOffset0 = C->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + + if (isUInt<8>(DWordOffset0)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { unsigned DWordOffset0 = CAddr->getZExtValue() / 4; unsigned DWordOffset1 = DWordOffset0 + 1; assert(4 * DWordOffset0 == CAddr->getZExtValue()); @@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { return isUInt<12>(Imm->getZExtValue()); } -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const { + // Subtarget prefers to use flat instruction + if (Subtarget->useFlatForGlobal()) + return false; + SDLoc DL(Addr); GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (isLegalMUBUFImmOffset(C1)) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; + return true; } else if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. 
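// (The MUBUF immediate offset field is only 12 bits wide, per
// isLegalMUBUFImmOffset above, so a larger 32-bit constant must be
// materialized with S_MOV_B32 and carried in the SOffset operand
// instead.)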
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 0); - return; + return true; } } @@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; VAddr = N1; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; + return true; } // default case -> offset @@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = Addr; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, @@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return false; - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); if (C->getSExtValue()) { @@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { + SDValue &Offset, + SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); SDValue GLC, TFE; @@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, 
Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && !cast<ConstantSDNode>(Idxen)->getSExtValue() && @@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +/// +/// \param EncodedOffset This is the immediate value that will be encoded +/// directly into the instruction. On SI/CI the \p EncodedOffset +/// will be in units of dwords and on VI+ it will be units of bytes. +static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, + int64_t EncodedOffset) { + return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, + SDValue &Offset, bool &Imm) const { + + // FIXME: Handle non-constant offsets. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); + if (!C) + return false; + + SDLoc SL(ByteOffsetNode); + AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + int64_t ByteOffset = C->getSExtValue(); + int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + ByteOffset >> 2 : ByteOffset; + + if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + Imm = true; + return true; + } + + if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) + return false; + + if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { + // 32-bit Immediates are supported on Sea Islands. + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + } else { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, + C32Bit), 0); + } + Imm = false; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, + SDValue &Offset, bool &Imm) const { + + SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + + if (SelectSMRDOffset(N1, Offset, Imm)) { + SBase = N0; + return true; + } + } + SBase = Addr; + Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + Imm = true; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRD(Addr, SBase, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, + SDValue &Offset) const { + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRDOffset(Addr, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool 
AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + // FIXME: This is incorrect and only enough to be able to compile. SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); SDLoc DL(N); + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && "Can only cast to / from flat address space!"); @@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); } - if (DestSize > SrcSize) { assert(SrcSize == 32 && DestSize == 64); @@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +void AMDGPUDAGToDAGISel::PreprocessISelDAG() { + bool Modified = false; + + // XXX - Other targets seem to be able to do this without a worklist. + SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patterns, we replace all i64 loads with v2i32 + // loads. We could instead promote i64 loads to v2i32 during DAG + // legalization, but places in the DAG legalizer (e.g. ExpandUnalignedLoad) + // assume that if i64 is a legal type its loads stay i64, so doing the + // promotion that early can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary?
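In DAG terms, the rewrite performed by PreprocessISelDAG above amounts to (sketch):

    (i64 (load addr))          -->  (i64 (bitcast (v2i32 (load addr))))
    (store (i64 val), addr)    -->  (store (v2i32 (bitcast val)), addr)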
+ if (Modified) + CurDAG->RemoveDeadNodes(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a65f3b..1a59a46 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -27,50 +28,9 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. -class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + // This is totally unsupported, just custom lower to produce an error. 
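For reference, the kind of input that reaches this path is a function whose alloca size is not a compile-time constant; hypothetical IR:

    %buf = alloca i32, i32 %n    ; dynamic size
    ; LowerDYNAMIC_STACKALLOC diagnoses this as
    ; "unsupported dynamic alloca in <function>"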
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -319,12 +282,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (!Subtarget->hasFFBH()) + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); if (!Subtarget->hasFFBL()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -352,7 +322,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -429,12 +399,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); setFsqrtIsCheap(true); + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; @@ -534,6 +510,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, return true; } +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. + // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. 
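// For example, truncating i64 to i32 is free: the result is just the
// low 32-bit subregister (sub0) of the 64-bit register pair, so no
// instruction is needed. The % 32 check below restricts this to
// register-sized destinations.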
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -584,6 +572,12 @@ void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } +void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const { + + State.AnalyzeReturn(Outs, RetCC_SI); +} + SDValue AMDGPUTargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, @@ -617,6 +611,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -643,6 +646,10 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + return LowerCTLZ(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; } @@ -892,7 +899,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -1043,9 +1052,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1057,6 +1063,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -1077,6 +1085,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); @@ -1167,45 +1176,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? 
ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1260,7 +1230,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); EVT LoVT, HiVT; EVT LoMemVT, HiMemVT; @@ -1269,23 +1240,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), BaseAlign); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + DAG.getConstant(Size, SL, PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), HiAlign); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1415,7 +1390,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, DAG.getConstant(LoMemVT.getStoreSize(), SL, PtrVT)); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, @@ -1423,15 +1402,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, LoMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + BaseAlign); SDValue HiStore = DAG.getTruncStore(Chain, SL, Hi, HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), + SrcValue.getWithOffset(Size), HiMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + HiAlign); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -1529,7 +1508,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1630,6 +1609,7 @@ SDValue 
AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + // TODO: Should this propagate fast-math-flags? // float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1940,6 +1920,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); + // TODO: Should this propagate fast-math-flags? + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); @@ -1968,6 +1950,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2045,6 +2028,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + // TODO: Should this propagate fast-math-flags? + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); @@ -2074,6 +2059,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + // TODO: Should this propagate fast-math-flags? + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); @@ -2184,9 +2171,149 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + + SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); + + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. 
+ SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCtlz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? -r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); @@ -2206,40 +2333,35 @@ SDValue 
AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); - + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); EVT DestVT = Op.getValueType(); if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, false); - assert(DestVT == MVT::f32); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, true); return SDValue(); @@ -2257,7 +2379,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::f64); SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64); - + // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); @@ -2474,6 +2596,97 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// Get FFBH node if the incoming op may have been type legalized from a smaller +// type VT. +// Need to match pre-legalized type because the generic legalization inserts the +// add/sub between the select and compare. +static SDValue getFFBH_U32(const TargetLowering &TLI, + SelectionDAG &DAG, SDLoc SL, SDValue Op) { + EVT VT = Op.getValueType(); + EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32) + return SDValue(); + + if (VT != MVT::i32) + Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + if (VT != MVT::i32) + FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + + return FFBH; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. 
+// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. +SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. + return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2498,29 +2711,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - } - - break; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2652,20 +2844,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->isExactlyValue(1.0); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isAllOnesValue(); - } - return false; + return isAllOnesConstant(Op); } bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->getValueAPF().isZero(); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isNullValue(); - } - return false; + return isNullConstant(Op); } SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, @@ -2738,7 +2924,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) + NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) @@ -2893,8 +3079,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) + if (!isNullConstant(Op.getOperand(1))) return SignBits; // TODO: Could probably figure something out with non-0 offsets. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 478b203..3792541 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -54,6 +54,9 @@ private: SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -67,6 +70,9 @@ private: SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -109,6 +115,8 @@ protected: SmallVectorImpl<ISD::InputArg> &OrigIns) const; void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; + void AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const; public: AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -138,6 +146,7 @@ public: bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AS) const override; + bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ 
-149,6 +158,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -165,14 +177,6 @@ public: SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - SDValue CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -216,7 +220,7 @@ public: /// \brief Helper function that returns the byte offset of the given /// type of implicit parameter. - unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; }; @@ -267,7 +271,7 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. + FFBH_U32, // ctlz with -1 if input is zero. MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 15a3d54..a266e71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // TODO: Implement this function - return false; -} bool AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, @@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; } @@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 86d3962..53e8b23 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -103,8 +103,6 @@ public: /// read or write or -1 if indirect addressing is not used by this program. 
int getIndirectIndexEnd(const MachineFunction &MF) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; @@ -147,6 +145,9 @@ public: return get(pseudoToMCOpcode(Opcode)); } + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + //===---------------------------------------------------------------------===// // Pure virtual funtions to be implemented by sub-classes. //===---------------------------------------------------------------------===// @@ -195,6 +196,7 @@ public: }; namespace AMDGPU { + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); } // End namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897..575dfe4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,7 +191,7 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. @@ -242,4 +242,4 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai // Call/Return DAG Nodes //===----------------------------------------------------------------------===// def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 72cab39..2a7ce6a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -204,14 +204,6 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -243,14 +235,6 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -299,16 +283,6 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), return isGlobalStore(dyn_cast<StoreSDNode>(N)); }]>; -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return 
isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast<StoreSDNode>(N)); @@ -385,15 +359,6 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def mskor_flat : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; @@ -514,7 +479,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), + (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) >; @@ -522,7 +487,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ab489cd..1de3546 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. 
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2083146..dfc652f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #endif if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); ++I; } } else { @@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); - CodeStream.flush(); - HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da6..5413717 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,11 +1,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" using namespace llvm; -static const char *const ShaderTypeAttribute = "ShaderType"; - // Pin the vtable to this file. 
void AMDGPUMachineFunction::anchor() {} @@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f5e4694..46fcee8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -37,6 +37,11 @@ public: return ShaderType; } + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + unsigned ScratchSize; bool IsKernel; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args missing or not in the expected order. 
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 57b7a73..87d50d5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); @@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); + Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; @@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; + GlobalVariable *GV = &*I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { return false; } -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { +static VectorType *arrayTypeToVecType(Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } @@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + IRBuilder<> Builder(&I); // First try to replace the alloca with a vector diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index cfd800b..0344834 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { assert(!"Unimplemented"); return BitVector(); } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - virtual unsigned getHWRegIndex(unsigned Reg) const { assert(!"Unimplemented"); return 0; } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 5f32a65..c6af5b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" +#include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -67,26 +70,37 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), + EnableXNACK(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), + FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { initializeSubtargetDependencies(TT, GPU, FS); + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } else { InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 735f01d..d371227 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,4 +1,4 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" -#include 
"R600ISelLowering.h" -#include "AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -72,11 +70,13 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; + bool EnableXNACK; unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; @@ -88,10 +88,10 @@ private: bool CIInsts; bool FeatureDisable; int LDSBankCount; - unsigned IsaVersion; + unsigned IsaVersion; bool EnableHugeScratchBuffer; - AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; @@ -104,7 +104,7 @@ public: StringRef GPU, StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return FrameLowering.get(); } const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); @@ -161,6 +161,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -287,6 +291,10 @@ public: } bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + bool isXNACKEnabled() const { + return EnableXNACK; + } + unsigned getMaxWavesPerCU() const { if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) return 10; @@ -305,6 +313,9 @@ public: return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2297b52..b1be619 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -48,8 +66,12 @@ static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { } static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run 
SI's custom scheduler", + createSIMachineScheduler); static std::string computeDataLayout(const Triple &TT) { std::string Ret = "e-p:32:32"; @@ -72,15 +94,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) @@ -110,7 +130,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -126,8 +152,9 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; }; class R600PassConfig : public AMDGPUPassConfig { @@ -147,6 +174,8 @@ public: : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -156,7 +185,7 @@ public: } // End of anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo( AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); }); @@ -172,6 +201,10 @@ void AMDGPUPassConfig::addIRPasses() { // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + TargetPassConfig::addIRPasses(); } @@ -198,6 +231,11 @@ bool AMDGPUPassConfig::addInstSelector() { return false; } +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. + return false; +} + //===----------------------------------------------------------------------===// // R600 Pass Setup //===----------------------------------------------------------------------===// @@ -238,16 +276,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. 
+ addPass(&AMDGPUAnnotateKernelFeaturesID); + addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(&SIFixSGPRCopiesID); addPass(createSIFoldOperandsPass()); return false; } @@ -259,7 +304,6 @@ void GCNPassConfig::addPreRegAlloc() { // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } @@ -269,16 +313,27 @@ void GCNPassConfig::addPreRegAlloc() { // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. + insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 14792e3..236e3f8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { private: protected: - TargetLoweringObjectFile *TLOF; + std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -52,7 +52,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; + return TLOF.get(); } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. + return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc74..54a003d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. 
+ if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69..976afb0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,11 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index d918ac3..917efd1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -185,7 +185,7 @@ protected: MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; + const R600RegisterInfo *TRI; // PRINT FUNCTIONS /// Print the ordered Blocks. @@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); + (*It)->removeSuccessor(LoopHeader, true); } numLoopcontPatternMatch += NumCont; @@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // If MigrateTrue is true, then TrueBB is the block being "branched into" // and if MigrateFalse is true, then FalseBB is the block being // "branched into" - // + // // Here is the pseudo code for how I think the optimization should work: // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. @@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // the late machine optimization passes, however if we implement // bool TargetRegisterInfo::requiresRegisterScavenging( // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly + // and have it return true, liveness will be tracked correctly // by generic optimization passes. We will also need to make sure that // all of our target-specific passes that run after regalloc and before // the CFGStructurizer track liveness and we will need to modify this pass @@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, ); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - DstMBB->removeSuccessor(SrcMBB); + DstMBB->removeSuccessor(SrcMBB, true); cloneSuccessorList(DstMBB, SrcMBB); removeSuccessor(SrcMBB); @@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (TrueMBB) { MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); + MBB->removeSuccessor(TrueMBB, true); if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); + TrueMBB->removeSuccessor(LandMBB, true); retireBlock(TrueMBB); MLI->removeBlock(TrueMBB); } @@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, insertInstrBefore(I, AMDGPU::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); + MBB->removeSuccessor(FalseMBB, true); if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); + FalseMBB->removeSuccessor(LandMBB, true); retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); + ExitingMBB->removeSuccessor(LandMBB, true); } void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - 
PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); @@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, ); SpliceEnd = SrcMBB->end(); } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } DEBUG( @@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, DEBUG( dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; + << "srcSize = " << SrcMBB->size() << '\n'; ); } @@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); + MBB->removeSuccessor(MBB1, true); } void AMDGPUCFGStructurizer::addDummyExitBlock( diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2018983..d9f753f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -28,7 +28,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -83,6 +85,7 @@ public: unsigned RegNo; int Modifiers; const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; bool IsForcedVOP3; }; @@ -102,7 +105,7 @@ public: } void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } void addRegOrImmOperands(MCInst &Inst, unsigned N) const { @@ -215,6 +218,10 @@ public: (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); } + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + bool isVCSrc32() const { return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); } @@ -251,7 +258,22 @@ public: return EndLoc; } - void print(raw_ostream &OS) const override { } + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc 
Loc, enum ImmTy Type = ImmTyNone, @@ -278,10 +300,12 @@ public: static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, SMLoc E, const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; + Op->Reg.STI = STI; Op->Reg.Modifiers = -1; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; @@ -301,14 +325,32 @@ public: bool isDSOffset01() const; bool isSWaitCnt() const; bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; }; class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + /// @name Auto-generated Match Functions /// { @@ -323,20 +365,34 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); - if (STI.getFeatureBits().none()) { + if (getSTI().getFeatureBits().none()) { // Set default features. 
- STI.ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -420,10 +476,10 @@ struct OptionalOperand { } -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { +static int getRegClass(bool IsVgpr, unsigned RegWidth) { if (IsVgpr) { switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; @@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SReg_128RegClassID; @@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } } -static unsigned getRegForName(const StringRef &RegName) { +static unsigned getRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End RegIndexInClass = RegLo; } else { // SGPR registers are aligned. Max alignment is 4 dwords. 
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); + unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; } } - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) return Match_InvalidOperand; + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + return Match_Success; } @@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); @@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } return Error(ErrorLoc, "invalid operand for instruction"); } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } @@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, Isa.Stepping, "AMD", "AMDGPU"); @@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { @@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + 
case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that + // SI/CI have. + for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return false; + } + return true; } @@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - APInt IntVal32(32, IntVal); - if (IntVal32.getSExtValue() != IntVal) { + if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { Error(S, "invalid immediate: only 32-bit values are legal"); return MatchOperand_ParseFail; } - IntVal = IntVal32.getSExtValue(); if (Negate) IntVal *= -1; Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); @@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), + RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), isForcedVOP3())); if (HasModifiers || Modifiers) { @@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) { } //===----------------------------------------------------------------------===// +// smrd +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isSMRDOffset() const { + + // FIXME: Support 20-bit offsets on VI. We need to to pass subtarget + // information here. + return isImm() && isUInt<8>(getImm()); +} + +bool AMDGPUOperand::isSMRDLiteralOffset() const { + // 32-bit literals are only supported on CI and we only want to use them + // when the offset is > 8-bits. + return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); +} + +//===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { } void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; + + unsigned i = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + if (Desc.getNumDefs() > 0) { + ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1); + } std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td index 2f5fdbe..c543814 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td @@ -8,6 +8,22 @@ //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 def isCIVI : Predicate < @@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; let SubtargetPredicate = isCIVI in { +let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", VOP_F64_F64, ftrunc >; @@ -35,115 +52,282 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", VOP_F64_F64, frint >; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", VOP_F32_F32 >; defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", VOP_F32_F32 >; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; //===----------------------------------------------------------------------===// // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def 
FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 >; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 >; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 +>; +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 >; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm 
FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; } // End SubtargetPredicate = isCIVI +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 
flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// -let Predicates = [HasFlatAddressSpace] in { +let Predicates = [isCIVI] in { -class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, - PatFrag flat_ld> : - Pat <(vt (flat_ld i64:$ptr)), - (Instr_ADDR64 $ptr, 0, 0, 0) +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) >; -def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; -class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : - Pat <(st vt:$value, i64:$ptr), - (Instr $value, $ptr, 0, 0, 0) - >; +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; -def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; -def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; -def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; +def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, flat_store, 
i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; -} // End HasFlatAddressSpace predicate +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; +} // End Predicates = [isCIVI] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index ba4df82..a6c3785 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. +} + class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 7adcd46..2245f14 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, "MEM_RAT "#name, pattern>; +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + def RAT_MSKOR : CF_MEM_RAT <0x11, 0, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), "MSKOR $rw_gpr.XW, $index_gpr", @@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + } // End usesCustomInserter = 1 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> @@ -338,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; let hasSideEffects = 1 in { diff --git 
a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e811d5c..a187de8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. + O << formatHex(static_cast<uint64_t>(Imm)); + } } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } else { unsigned Stream = (SImm16 >> 8) & 0x3; if (Op == 1) - O << "cut"; + O << "cut"; else if (Op == 2) - O << "emit"; + O << "emit"; else if (Op == 3) - O << "emit-cut"; + O << "emit-cut"; O << " stream " << Stream; } O << "), [m0] "; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 14fb511..90541d8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -13,9 +13,7 @@ #ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b..60e8c8f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. 
*Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUELFStreamer.h" +#include "Utils/AMDGPUBaseInfo.h" + +using namespace llvm; + +void AMDGPUELFStreamer::InitSections(bool NoExecStack) { + // Start with the .hsatext section by default. + SwitchSection(AMDGPU::getHSATextSection(getContext())); +} + +MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, + MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new AMDGPUELFStreamer(Context, MAB, OS, Emitter); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h new file mode 100644 index 0000000..488d7e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -0,0 +1,40 @@ +//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a custom MCELFStreamer which allows us to insert some hooks before +// emitting data into an actual object file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCSubtargetInfo; + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, MAB, OS, Emitter) { } + + virtual void InitSections(bool NoExecStac) override; +}; + +MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll); +} // namespace llvm. 
+ +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d6..59a9178 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 028a86d..4bc80a0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,22 +22,21 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; //===--- Global Variable Emission Directives --------------------------===// HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; } + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a5bac51..a546961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -21,12 +21,13 @@ class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value +// a prefix that won't appear in a function name. The default value // for PrivateGlobalPrefix is 'L', so it will consider any function starting // with 'L' as a local symbol. 
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c709741..f704094 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" @@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( return new AMDGPUTargetELFStreamer(S); } +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); @@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } // R600 specific registration diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 09e6cb1..b91134d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" @@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { } +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection()); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we 
can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index d37677c..83bb728 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -27,6 +30,12 @@ public: StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -41,6 +50,12 @@ public: StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -72,6 +87,12 @@ public: void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index e683498..3c1142d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; public: - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) : MCII(mcii), MRI(mri) { } @@ -50,8 +49,8 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; -private: 
+private: void EmitByte(unsigned int byte, raw_ostream &OS) const; void Emit(uint32_t value, raw_ostream &OS) const; @@ -59,7 +58,6 @@ private: unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - }; } // End anonymous namespace @@ -83,7 +81,7 @@ enum FCInstr { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - MCContext &Ctx) { + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeb..9eb3dad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index d9a0723..a1584a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel, def : ProcessorModel<"fiji", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index c8f37f6..bd80bb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -405,8 +405,8 @@ private: if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4e4d554..124a9c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::Source); } +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineFunction * MF = BB->getParent(); @@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } @@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; @@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, @@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, Arg->getOperand(0).getOperand(Element)); } } + break; } case ISD::SELECT_CC: { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 855fa9f..8b6eea1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -922,7 +922,7 @@ bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ + BranchProbability Probability) const{ return true; } @@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return true; } bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) + BranchProbability Probability) const { return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index dee4c2b..e7251c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -174,18 +174,18 @@ namespace llvm { bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; + BranchProbability Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 7beed09..33ef6a4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// -// CND*_INT Pattterns for f32 True / False values +// CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 0c06ccc..5efb3b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; + MachineBasicBlock *MB = &*MBB; PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index deee5bc..2126961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -81,11 +81,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(BI); + int BISlot = getSlot(&*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(BI)) + if (TII->isPredicated(&*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +95,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { + if (isTrans || TII->isTransOnly(&*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -149,7 +149,7 @@ private: public: // Ctor. R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), + : VLIWPacketizerList(MF, MLI, nullptr), TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { @@ -162,14 +162,14 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { + bool isSoloInstruction(const MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); @@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { continue; } - Packetizer.PacketizeMIs(MBB, I, RegionEnd); + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); RegionEnd = I; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9713e60..4f8a129 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the register class of the specified type to use in the /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index ccfbf1b..fa4d24a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 4c32639..aa1e352 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -37,7 +37,8 @@ enum { MIMG = 1 << 18, FLAT = 1 << 19, WQM = 1 << 20, - VGPRSpill = 1 << 21 + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 }; } @@ -136,7 +137,7 @@ namespace SIOutMods { #define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 5fe8d19..636750d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -16,15 +16,9 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 23502b4..f59d994 100644 --- 
a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -82,22 +82,10 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { - -private: +public: static char ID; - const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -105,14 +93,23 @@ public: return "SI Fix SGPR copies"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); } static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { @@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { return false; } -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); - return RC; + // We don't really care about the subregister here. + // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? 
+ MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); } -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + if (!MRI.hasOneUse(DstReg)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) return false; - } - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); - const TargetRegisterClass *SrcRC; + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) return false; - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
+ // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { @@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + I != E; ++I) { MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this. 
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); } + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; @@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } case AMDGPU::REG_SEQUENCE: { if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; + } DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp index 0c54446..8bda283 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. /// /// The main case we need to handle is when a def is used in one side of a /// branch and not another. For example: @@ -42,13 +41,15 @@ /// ENDIF /// %use /// -/// Adding this use will make the def live thoughout the IF branch, which is +/// Adding this use will make the def live throughout the IF branch, which is /// what we want. 
#include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -79,9 +80,13 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -90,7 +95,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). unsigned Def = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } } } } - } - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) + if (MBB->succ_size() < 2) continue; - // We have structured control flow, so number of succesors should be two. 
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); if (!NCD) @@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), *(++NCD->succ_begin())); } - assert(SuccA && SuccB); - for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } // This interval is live in to one successor, but not the other, so // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); + DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); // FIXME: Need to figure out how to update LiveRange here so this pass // will be able to preserve LiveInterval analysis. 
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); } } - return false; + return MadeChange; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c288725..6230d1e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -45,6 +45,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // One of operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. + if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) @@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. 
+ if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + return; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -220,94 +334,50 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; - // FIXME: Fold operands with subregs. if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) + !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; + // Prevent folding operands backwards in the function. For example, + // the COPY opcode must not be replaced by 1 in this example: + // + // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // ... + // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? 
- MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. - if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); } + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..7d20509 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,245 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + if (!ST.hasSGPRInitBug()) { + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. 
+ + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. 
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2db9ff..5448675 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -215,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -261,6 +304,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); } +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -269,7 +347,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. 
@@ -282,51 +360,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // because it has never been validated. return isLegalFlatAddressingMode(AM); } - // fall-through - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. + if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. 
+ if (AM.Scale == 1 && AM.HasBaseReg) return true; - default: // Don't allow n * r - return false; - } + + return false; } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { // Basic, single offset DS instructions allow a 16-bit unsigned immediate @@ -374,7 +452,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. @@ -411,6 +492,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -426,12 +533,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return TII->isInlineConstant(Imm); } -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -439,7 +540,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -455,30 +556,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, unsigned Align = DL.getABITypeAlignment(Ty); - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. 
- assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - SDValue Ops[] = { - DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), - Load.getValue(1) - }; - - return DAG.getMergeValues(Ops, SL); - } - ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile @@ -497,8 +578,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - assert(CallConv == CallingConv::C); + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } + + // FIXME: We currently assume all calling conventions are kernels. SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -508,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); + !Arg.Flags.isByVal() && PSInputNum <= 15) { - if (!Arg.Used) { - // We can savely skip PS inputs + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -530,7 +621,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, - // NOT four or eigth. + // NOT four or eight. Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -549,41 +640,25 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. 
if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -591,6 +666,25 @@ SDValue SITargetLowering::LowerFormalArguments( Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -617,7 +711,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); - const PointerType *ParamTy = + auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { @@ -678,10 +772,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -690,30 +887,105 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector<ISD::OutputArg, 48> Splits; + SmallVector<SDValue, 48> SplitVals; + + // Split vectors into their elements. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 48> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector<SDValue, 48> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. 
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -944,20 +1216,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -977,6 +1237,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -988,7 +1260,20 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc DL(Op); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Should this propagate fast-math-flags? 
+ switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1008,37 +1293,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. + return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1077,6 +1361,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, 
DL, MVT::i32, IJ, @@ -1092,6 +1380,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1152,16 +1453,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1236,8 +1550,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); } return SDValue(); @@ -1274,6 +1590,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); @@ -1379,7 +1697,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1393,6 +1711,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? 
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.5/M_PI, DL, @@ -1821,7 +2140,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -1903,7 +2222,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: @@ -2125,9 +2443,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2169,53 +2492,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. - const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + // Combine the constants and the pointer. 
+ const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; -#endif + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } /// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, @@ -2248,15 +2556,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2274,13 +2573,41 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2301,3 +2628,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index d84c32e..f01b2c0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue 
LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); @@ -76,6 +80,9 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -88,6 +95,13 @@ public: SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; @@ -112,13 +126,10 @@ public: SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 90a37f1..94e6147 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -84,6 +84,9 @@ private: bool LastInstWritesM0; + /// \brief Whether the machine function returns void + bool ReturnsVoid; + /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -91,7 +94,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. 
- RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, @@ -121,9 +125,13 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI insert wait instructions"; + return "SI insert wait instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -138,9 +146,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -151,15 +158,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } } else { // DS Result.Named.LGKM = 1; @@ -173,9 +187,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -196,7 +209,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands. 
- if (TII->isDS(MI.getOpcode())) { + if (TII->isDS(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; @@ -224,18 +237,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -246,10 +254,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Get the hardware counter increments and sum them up Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; Sum += Increment.Array[i]; } @@ -261,7 +272,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // // The temporary workaround is to break the clauses with S_NOP. @@ -270,7 +281,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || (LastOpcodeType == VMEM && Increment.Named.VM)) { // Insert a NOP to break the clause. BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) @@ -278,7 +289,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, LastInstWritesM0 = false; } - if (TII->isSMRD(I->getOpcode())) + if (TII->isSMRD(*I)) LastOpcodeType = SMEM; else if (Increment.Named.VM) LastOpcodeType = VMEM; @@ -290,21 +301,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define if (Op.isDef()) - DefinedRegs[j] = LastIssued; + DefinedRegs[j] = Limit; // and which one we are using if (Op.isUse()) - UsedRegs[j] = LastIssued; + UsedRegs[j] = Limit; } } } @@ -314,7 +325,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, const Counters &Required) { // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + // A function not returning void needs to wait, because other bytecode will + // be appended after it and we don't know what it will be. 
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) return false; // Figure out if the async instructions execute in order @@ -390,12 +403,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { @@ -451,6 +470,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; + ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -474,6 +494,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + + // Functions returning something shouldn't contain S_ENDPGM, because other + // bytecode will be appended after it. + if (!ReturnsVoid) { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + I->eraseFromParent(); + } } return Changes; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 211666a..0e883f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{19} = FLAT; let TSFlags{20} = WQM; let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; - // Most instructions require adjustments after selection to satisfy - // operand requirements. 
- let hasPostISelHook = 1; let SchedRW = [Write32Bit]; } @@ -86,7 +88,6 @@ class Enc64 { } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; -def VOPDstVCC : VOPDstOperand <VCCReg>; let Uses = [EXEC] in { @@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs), ins, asm, pattern> { - let DisableEncoding = "$dst"; let VOPC = 1; let Size = 4; + let Defs = [VCC]; } class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : @@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let isCodeGenOnly = 0; int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. + let hasPostISelHook = 1; } } // End Uses = [EXEC] @@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + let SchedRW = [WriteSALU] in { class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOPCe <op> { - let DisableEncoding = "$dst"; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; let SOPC = 1; let isCodeGenOnly = 0; + let Defs = [SCC]; let UseNamedOperandTable = 1; } @@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : // Vector I/O operations //===----------------------------------------------------------------------===// -let Uses = [EXEC] in { - class DS <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let Uses = [M0]; + let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; + let Uses = [EXEC]; let hasSideEffects = 0; // XXX ???? 
} - - -} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cfd2c42..1e10d25 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, switch (MI->getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; @@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { + + if (isDS(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (OffsetImm) { @@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; @@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return true; } - if (isSMRD(Opc)) { + if (isSMRD(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (!OffsetImm) @@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - // TODO: This needs finer tuning if (NumLoads > 4) return false; - if (isDS(Opc0) && isDS(Opc1)) + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) return true; - if (isSMRD(Opc0) && isSMRD(Opc1)) + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) return true; - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) return true; return false; @@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 + AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; - 
const int16_t *SubIndices; + ArrayRef<int16_t> SubIndices; + bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || @@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - while (unsigned SubIdx = *SubIndices++) { + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); - if (*SubIndices) + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); + + if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } @@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = 
FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } + unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; } - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); } } @@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = 
AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. + unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); - } else { + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) @@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { while (Count > 0) { int Arg; if (Count >= 8) @@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - +/// Commutes the operands in the given instruction. /// The commutable operands are specified by their indices OpIdx0 and OpIdx1. /// /// Do not call this method for a non-commutable instruction or for a /// non-commutable pair of operand indices OpIdx0 and OpIdx1. /// Even though the instruction is commutable, the method may still /// fail to commute the operands; a null pointer is returned in such cases.
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(*MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - MachineOperand &Src0 = MI->getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { } } -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2)); - // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, return false; } -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && "MIa must load from or modify a memory location"); assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && @@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, + // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
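For reference, the overlap helper whose body is truncated by the hunk boundary above presumably completes along these lines (a sketch reconstructed from the visible first statement; two accesses are disjoint exactly when the lower-starting interval ends at or before the higher one begins):

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  // Order the two accesses by start offset.
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  // Disjoint iff the lower access ends at or before the higher one starts.
  return LowOffset + LowWidth <= HighOffset;
}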
- if (isDS(Opc0)) { - if (isDS(Opc1)) + if (isDS(*MIa)) { + if (isDS(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1); + return !isFLAT(*MIb); } - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isSMRD(Opc1); + return !isFLAT(*MIb) && !isSMRD(*MIb); } - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); } - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return false; } +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } - // Make sure the register classes are correct + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " @@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. 
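The pitfall that new check guards against is mechanical: a 'let Uses = [...]' in the .td files replaces the inherited default rather than extending it, so writing 'let Uses = [M0]' where 'let Uses = [M0, EXEC]' was meant silently drops the EXEC read (the SIInstrFormats.td hunks earlier in this commit fix exactly that for DS, MUBUF, MTBUF and MIMG). A condensed restatement of the rule enforced by the code below, with the MachineInstr plumbing stripped out (names illustrative):

// Generic, SALU and SMRD instructions legitimately run without reading the
// exec mask; any other instruction must carry an implicit EXEC use, or the
// use was lost in its .td description.
bool implicitExecUseIsPresent(bool IsGeneric, bool IsSALU, bool IsSMRD,
                              bool HasImplicitExecUse) {
  if (IsGeneric || IsSALU || IsSMRD)
    return true;
  return HasImplicitExecUse;
}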
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + return true; } @@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( return MachineOperand::CreateReg(SubReg, false); } -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { assert(Inst->getNumExplicitOperands() == 3); @@ -1643,6 +1765,45 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + RC = TRI->getSubRegClass(RC, MO.getSubReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1653,7 +1814,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && + if (isVALU(*MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? 
MO->getReg() : (unsigned)AMDGPU::NoRegister; @@ -1670,21 +1831,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? - MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1699,81 +1846,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); + } - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. 
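Schematically, the situation the comment above describes (assuming the usual VOP2 encoding rule that src0 may read the constant bus while src1 must be a VGPR): v_add_i32 v0, vcc, v1, s2 is illegal because the SGPR sits in src1, while the commuted v_add_i32 v0, vcc, s2, v1 is legal. A sketch of the decision the code below makes (helper names are illustrative, not the LLVM API):

// Commute only when it actually buys legality; otherwise fall back to
// copying the offending operand into a VGPR (legalizeOpWithMove).
bool commuteWouldLegalize(bool Src1AlreadyLegal, bool HasImplicitSGPR,
                          bool IsCommutable, bool Src0ValidAsSrc1,
                          bool CommutedOpcodeExists) {
  if (Src1AlreadyLegal)
    return false; // nothing to fix
  if (HasImplicitSGPR || !IsCommutable)
    return false; // a VGPR copy is the only option
  // The old src0 must be acceptable in the src1 slot, and an encoding for
  // the swapped form must exist.
  return Src0ValidAsSrc1 && CommutedOpcodeExists;
}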
+ if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. + if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + MI->setDesc(get(CommutedOpc)); - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; } + + // If we make it this far, then the operand is not legal and we must + // legalize it. 
+ legalizeOpWithMove(MI, Idx); + } +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + // Legalize VOP2 + if (isVOP2(*MI)) { + legalizeOperandsVOP2(MRI, MI); + return; + } + + // Legalize VOP3 + if (isVOP3(*MI)) { + legalizeOperandsVOP3(MRI, MI); + return; } // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { + if (MI->getOpcode() == AMDGPU::PHI) { const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || @@ -1802,26 +2011,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } // Update all the operands so they have the same type. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *InsertBB; - MachineBasicBlock::iterator Insert; - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - InsertBB = MI->getParent(); - Insert = MI; - } else { - // MI is a PHI instruction. - InsertBB = MI->getOperand(i + 1).getMBB(); - Insert = InsertBB->getFirstTerminator(); + + // MI is a PHI instruction. + MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); + + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + Op.setReg(DstReg); + } + } + + // REG_SEQUENCE doesn't really require operand legalization, but if one has a + // VGPR dest type and SGPR sources, insert copies so all operands are + // VGPRs. This seems to help operand folding / the register coalescer. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (RI.hasVGPRs(DstRC)) { + // Update all the operands so they are VGPR register classes. These may + // not be the same register class because REG_SEQUENCE supports mixing + // subregister index types e.g. sub0_sub1 + sub2 + sub3 + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); + if (VRC == OpRC) + continue; + + unsigned DstReg = MRI.createVirtualRegister(VRC); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setIsKill(); } - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), - get(AMDGPU::COPY), DstReg) - .addOperand(MI->getOperand(i)); - MI->getOperand(i).setReg(DstReg); } + + return; } // Legalize INSERT_SUBREG @@ -1858,15 +2094,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } MachineBasicBlock &MBB = *MI->getParent(); - // Extract the ptr from the resource descriptor. 
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + // Extract the ptr from the resource descriptor. + unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -1891,80 +2122,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - + unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + + // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. + assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && + "FIXME: Need to emit flat atomics here"); + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. 
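The VAddr update built above forms a 64-bit pointer add out of two 32-bit halves: V_ADD_I32 produces the low word and leaves the carry in VCC (now an implicit def, hence the dropped explicit VCC operands), and V_ADDC_U32 consumes that carry for the high word. A standalone arithmetic check of the decomposition:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Ptr = 0x00000001fffffff8ull, Off = 0x10;
  uint32_t Lo = (uint32_t)Ptr + (uint32_t)Off;   // V_ADD_I32 (low word)
  uint32_t Carry = Lo < (uint32_t)Ptr;           // carry out, held in VCC
  uint32_t Hi = (uint32_t)(Ptr >> 32) + (uint32_t)(Off >> 32) + Carry; // V_ADDC_U32
  assert((((uint64_t)Hi << 32) | Lo) == Ptr + Off);
  return 0;
}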
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe + + // Atomics with return have an additional tied operand and are + // missing some of the special bits. + MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineInstr *Addr64; + + if (!VDataIn) { + // Regular buffer load / store. + MachineInstrBuilder MIB + = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); + + // Atomics do not have this operand. + if (const MachineOperand *GLC + = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + MIB.addImm(GLC->getImm()); + } + + MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + + if (const MachineOperand *TFE + = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + MIB.addImm(TFE->getImm()); + } + + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = MIB; + } else { + // Atomics with return. + Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + } MI->removeFromParent(); MI = Addr64; - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); } - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - // Update the instruction to use NewVaddr VAddr->setReg(NewVAddr); // Update the instruction to use NewSRsrc @@ -2028,53 +2291,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, .addOperand(*SOff); unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addReg(SOff->getReg(), 0, SOff->getSubReg()) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) .addReg(SBase->getReg(), getKillRegState(IsKill), SBase->getSubReg()) .addReg(OffsetSGPR); } unsigned SubLo, SubHi; + const TargetRegisterClass *NewDstRC; switch (HalfSize) { case 4: SubLo = AMDGPU::sub0; SubHi = AMDGPU::sub1; + NewDstRC = &AMDGPU::VReg_64RegClass; break; case 8: SubLo = AMDGPU::sub0_sub1; SubHi = AMDGPU::sub2_sub3; + NewDstRC = &AMDGPU::VReg_128RegClass; break; case 16: SubLo = AMDGPU::sub0_sub1_sub2_sub3; SubHi = AMDGPU::sub4_sub5_sub6_sub7; + NewDstRC =
&AMDGPU::VReg_256RegClass; break; case 32: SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; break; default: llvm_unreachable("Unhandled HalfSize"); } - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -2118,53 +2392,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); break; } - case 
AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { + case 32: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { + case 64: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } } @@ -2185,51 +2461,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Handle some special cases switch (Opcode) { default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; } break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. - if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst->eraseFromParent(); continue; @@ -2281,6 +2534,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2319,7 +2577,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->addOperand(MachineOperand::CreateImm(0)); } - addDescImplicitUseDef(NewDesc, Inst); + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst->getOperand(2); @@ -2337,27 +2595,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Update the destination register class. 
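The rewrites above for S_AND_B64, S_OR_B64, S_XOR_B64 and S_NOT_B64 now split directly into per-half VALU ops, which is exact because bitwise operations treat each bit position independently; no 64-bit VALU equivalents exist. A quick standalone check of the split:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0x123456789abcdef0ull, B = 0x0fedcba987654321ull;
  uint32_t Lo = (uint32_t)A & (uint32_t)B;                 // V_AND_B32 on sub0
  uint32_t Hi = (uint32_t)(A >> 32) & (uint32_t)(B >> 32); // V_AND_B32 on sub1
  assert((((uint64_t)Hi << 32) | Lo) == (A & B));
  return 0;
}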
- - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); @@ -2366,13 +2606,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Legalize the operands legalizeOperands(Inst); - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2390,6 +2624,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VGPR_32RegClass; } +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, @@ -2414,20 +2672,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp( AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) 
.addImm(AMDGPU::sub0) @@ -2436,10 +2695,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + // We don't need to legalizeOperands here because for a single operand, src0 + // will support any kind of input. + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBinaryOp( @@ -2474,9 +2734,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub0, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0) .addOperand(SrcReg1Sub0); @@ -2486,12 +2747,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp( MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1) .addOperand(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -2502,8 +2763,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp( // Try to legalize the operands in case we need to swap the order to keep it // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + legalizeOperands(LoHalf); + legalizeOperands(HiHalf); + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2532,18 +2796,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); - MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + BuildMI(MBB, MII, DL, InstDesc, MidReg) .addOperand(SrcRegSub0) .addImm(0); - MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + BuildMI(MBB, MII, DL, InstDesc, ResultReg) .addOperand(SrcRegSub1) .addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); - Worklist.push_back(First); - Worklist.push_back(Second); + // We don't need to legalize operands here. src0 for either instruction can be + // an SGPR, and the second input is unused or determined here.
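The splitScalar64BitBinaryOp hunk above works because 64-bit bitwise operations carry nothing between the two halves, so each 32-bit half can be computed independently and the results reassembled with a REG_SEQUENCE. A minimal stand-alone check of that identity for AND; and64_split is a hypothetical helper, not code from the patch:

    #include <cassert>
    #include <cstdint>

    static uint64_t and64_split(uint64_t A, uint64_t B) {
      uint32_t Lo = uint32_t(A) & uint32_t(B);               // V_AND_B32 on sub0
      uint32_t Hi = uint32_t(A >> 32) & uint32_t(B >> 32);   // V_AND_B32 on sub1
      return (uint64_t(Hi) << 32) | Lo;                      // REG_SEQUENCE
    }

    int main() {
      uint64_t A = 0x123456789abcdef0, B = 0x0ff0f00fdeadbeef;
      assert(and64_split(A, B) == (A & B));
    }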
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2587,6 +2852,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } @@ -2605,33 +2871,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); } } +} - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; } } +// Find the one SGPR operand we are allowed to use. unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); + const MCInstrDesc &Desc = MI->getDesc(); // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - + // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use @@ -2639,17 +2925,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. 
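addUsersToMoveToVALUWorklist, introduced above, is the fixpoint step of moveToVALU: once a definition has been rewritten to produce a VGPR, every user that cannot accept a VGPR input must itself be queued for rewriting. A toy worklist over a hypothetical def-use graph, just to show the propagation shape (the Users/CanReadVGPR representation is invented for this sketch and is not the MachineRegisterInfo API):

    #include <cstdio>
    #include <queue>
    #include <set>
    #include <vector>

    static void moveToVALU(int Root,
                           const std::vector<std::vector<int>> &Users,
                           const std::vector<bool> &CanReadVGPR) {
      std::queue<int> Worklist;
      std::set<int> Moved;
      Worklist.push(Root);
      while (!Worklist.empty()) {
        int I = Worklist.front();
        Worklist.pop();
        if (!Moved.insert(I).second)
          continue; // already rewritten
        std::printf("rewriting instruction %d to a VALU op\n", I);
        // Mirrors addUsersToMoveToVALUWorklist: once I's result lives in
        // a VGPR, every user that cannot read a VGPR must move as well.
        for (int U : Users[I])
          if (!CanReadVGPR[U])
            Worklist.push(U);
      }
    }

    int main() {
      std::vector<std::vector<int>> Users = {{1, 2}, {3}, {}, {}};
      std::vector<bool> CanReadVGPR = {false, false, true, false};
      moveToVALU(0, Users, CanReadVGPR); // rewrites 0, 1, 3; 2 is left alone
    }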
- if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -2660,15 +2938,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, break; const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); + if (!MO.isReg()) + continue; - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. @@ -2680,6 +2965,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; @@ -2720,7 +3008,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addReg(IndirectBaseReg) @@ -2791,3 +3079,15 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { return Rsrc23; } + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5053786..cce1ae7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -39,14 +39,11 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -58,13 +55,24 @@ private: void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; - void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; 
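findUsedSGPR above exists because the VALU reads at most one unique SGPR per instruction over the constant bus: when several source operands are SGPRs, one is chosen to stay and the others are copied into VGPRs, and a repeated SGPR only occupies one bus slot, hence the preference for duplicates in the V_FMA_F32 comments above. A toy version of that choice, using an invented register numbering (positive = SGPR, everything else treated as no SGPR), not the real findUsedSGPR logic:

    #include <cstdio>

    static const int NoReg = 0;

    static int pickSGPRToKeep(int Src0, int Src1, int Src2) {
      int S[3] = {Src0 > 0 ? Src0 : NoReg,
                  Src1 > 0 ? Src1 : NoReg,
                  Src2 > 0 ? Src2 : NoReg};
      // A repeated SGPR costs only one constant-bus slot, so keep it.
      if (S[0] != NoReg && (S[0] == S[1] || S[0] == S[2]))
        return S[0];
      if (S[1] != NoReg && S[1] == S[2])
        return S[1];
      // Otherwise keep the first SGPR found; the rest get VGPR copies.
      for (int R : S)
        if (R != NoReg)
          return R;
      return NoReg;
    }

    int main() {
      // V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1 (s0 = 1, s1 = 2 here).
      std::printf("keep SGPR %d\n", pickSGPRToKeep(1, 2, 1));
    }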
+ void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, MachineInstr *MIb) const; unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; +protected: + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -117,17 +125,14 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - bool areMemAccessesTriviallyDisjoint( MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA = nullptr) const override; @@ -137,8 +142,6 @@ public: unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -148,78 +151,154 @@ public: MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + bool isVALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + bool isSOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + bool isSOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + bool isSOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + bool isSOPK(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + bool isSOPP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + bool isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + bool isVOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + bool isVOP3(uint16_t Opcode) const { 
return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + bool isVOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + bool isMUBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MUBUF; } + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + bool isMTBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + bool isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + bool isDS(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + bool isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + bool isWQM(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + bool isVGPRSpill(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } @@ -302,6 +381,26 @@ public: bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; @@ -312,7 +411,8 @@ public: unsigned HalfImmOp, unsigned HalfSGPROp, MachineInstr *&Lo, MachineInstr *&Hi) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. 
This function will also move the users of \p MI to the @@ -341,29 +441,52 @@ public: void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. + LLVM_READONLY MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + LLVM_READONLY const MachineOperand *getNamedOperand(const MachineInstr &MI, unsigned OpName) const { return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); } + /// Get required immediate operand + int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + return MI.getOperand(Idx).getImm(); + } + uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; }; namespace AMDGPU { - + LLVM_READONLY int getVOPe64(uint16_t Opcode); + + LLVM_READONLY int getVOPe32(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); + + LLVM_READONLY int getCommuteOrig(uint16_t Opcode); + + LLVM_READONLY int getAddr64Inst(uint16_t Opcode); + + LLVM_READONLY int getAtomicRetOp(uint16_t Opcode); + + LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8d8110b..8735277 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; +def isCIOnly : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate <"FeatureSeaIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; @@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> { field bits<5> VI = vi; } +// Specify an SMRD opcode for SI and SMEM opcode for VI + +// FIXME: This should really be bits<5> si, Tablegen crashes if +// parameter default value is other parameter with different bit size +class smrd<bits<8> si, bits<8> vi = si> { + field bits<5> SI = si{4-0}; + field bits<8> VI = vi; +} + // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum // in AMDGPUInstrInfo.cpp def SISubtarget { @@ -121,10 +130,49 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> >; //===----------------------------------------------------------------------===// +// PatFrags for FLAT instructions +//===----------------------------------------------------------------------===// + +class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), + (ld node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)) || + 
isGlobalLoad(dyn_cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def flat_load : flat_ld <load>; +def flat_az_extloadi8 : flat_ld <az_extloadi8>; +def flat_sextloadi8 : flat_ld <sextloadi8>; +def flat_az_extloadi16 : flat_ld <az_extloadi16>; +def flat_sextloadi16 : flat_ld <sextloadi16>; + +class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)) || + isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def flat_store: flat_st <store>; +def flat_truncstorei8 : flat_st <truncstorei8>; +def flat_truncstorei16 : flat_st <truncstorei16>; + + +def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(cast<LoadSDNode>(N), -1) && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); +}]>; + +//===----------------------------------------------------------------------===// // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 // to be glued to the memory instructions. //===----------------------------------------------------------------------===// @@ -328,9 +376,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) return true; - } } return false; }]>; @@ -354,6 +402,8 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -393,7 +443,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; class GLCBaseMatchClass <string parser> : AsmOperandClass { let Name = "GLC"#parser; let PredicateMethod = "isImm"; - let ParserMethod = parser; + let ParserMethod = parser; let RenderMethod = "addImmOperands"; } @@ -436,6 +486,17 @@ def ClampMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -510,6 +571,16 @@ def ClampMod : Operand <i1> { let ParserMatchClass = ClampMatchClass; } +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + } // End OperandType = "OPERAND_IMMEDIATE" def VOPDstS64 : VOPDstOperand <SReg_64>; @@ -528,6 +599,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr 
: ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -717,19 +795,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; -} - multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -758,8 +823,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -812,15 +879,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), pattern>; + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; } } @@ -868,35 +940,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, } class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, SMEMe_vi <op, imm>, SIMCInstr<opName, SISubtarget.VI> { let AssemblerPredicates = [isVI]; } -multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. 
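The smrd TableGen class used by SMRD_m above carries two opcode fields because the same logical instruction encodes as a 5-bit SMRD opcode on SI/CI and an 8-bit SMEM opcode on VI. A plain C++ analogue of that dual-encoding table; the opcode values below are illustrative only, not the real encodings:

    #include <cstdint>
    #include <cstdio>

    enum Subtarget { SI, VI };

    struct SMRDEncoding {
      const char *Name;
      uint8_t SIOp; // only 5 bits used, like the si field above
      uint8_t VIOp; // full 8 bits, like the vi field above
    };

    static const SMRDEncoding Encodings[] = {
        {"S_LOAD_DWORD", 0x00, 0x00},
        {"S_LOAD_DWORDX4", 0x02, 0x08}, // hypothetical values
    };

    static uint8_t encodeOpcode(const SMRDEncoding &E, Subtarget ST) {
      return ST == SI ? uint8_t(E.SIOp & 0x1f) : E.VIOp;
    }

    int main() {
      for (const auto &E : Encodings)
        std::printf("%s: SI=0x%02x VI=0x%02x\n", E.Name,
                    encodeOpcode(E, SI), encodeOpcode(E, VI));
    }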
let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } } } -multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { defm _IMM : SMRD_m < op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), + (ins baseClass:$sbase, smrd_offset:$offset), opName#" $dst, $sbase, $offset", [] >; + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + defm _SGPR : SMRD_m < op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), @@ -922,11 +1027,12 @@ def InputModsNoDefault : Operand <i32> { let ParserMatchClass = InputModsMatchClass; } -class getNumSrcArgs<ValueType Src1, ValueType Src2> { +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -934,28 +1040,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - VOPDstOperand<SReg_64>)); // else VT == i1 + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
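The getNumSrcArgs and getVALUDstForVT helpers above are nested !if chains keyed on value types; the same selection is easier to read as ordinary control flow. A sketch in C++, with an invented VT enum standing in for TableGen's ValueType:

    #include <cstdio>

    enum VT { Untyped = 0, I1 = 1, F16 = 16, I32 = 32, F64 = 64 };

    static int numSrcArgs(VT Src0, VT Src1, VT Src2) {
      if (Src0 == Untyped) return 0;
      if (Src1 == Untyped) return 1; // VOP1
      if (Src2 == Untyped) return 2; // VOP2
      return 3;                      // VOP3
    }

    static const char *valuDstClass(VT Dst) {
      switch (Dst) {
      case I32: case F16: return "VGPR_32"; // 16-bit results still use a VGPR_32
      case F64:           return "VReg_64";
      default:            return "SReg_64"; // i1 results live in an SGPR pair
      }
    }

    int main() {
      std::printf("V_ADD_F32: %d srcs, dst %s\n",
                  numSrcArgs(I32, I32, Untyped), valuDstClass(I32));
    }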
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, 0)); @@ -1009,17 +1124,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <int NumSrcArgs> { +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1027,11 +1145,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<NumSrcArgs>.ret, + getAsm32<HasDst, NumSrcArgs>.ret, "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } - class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1047,29 +1164,38 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = (outs DstRC:$dst); + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = getAsm32<NumSrcArgs>.ret; - field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order // for the instruction patterns to work. 
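getAsm32 above builds the operand string from HasDst and NumSrcArgs, which is what lets destination-less encodings such as VOPC share the machinery. The equivalent string assembly in C++, for illustration only:

    #include <cstdio>
    #include <string>

    static std::string asm32(bool HasDst, int NumSrcArgs) {
      std::string Ret = HasDst ? "$dst" : "";
      static const char *Srcs[] = {"$src0", "$src1", "$src2"};
      for (int I = 0; I < NumSrcArgs; ++I) {
        if (!Ret.empty())
          Ret += ", ";
        Ret += Srcs[I];
      }
      return Ret;
    }

    int main() {
      std::printf("VOP1: %s\n", asm32(true, 1).c_str());  // "$dst, $src0"
      // The VOPC profile defined below additionally prepends "vcc, " to
      // show the implicit destination.
      std::printf("VOPC: %s\n", asm32(false, 2).c_str()); // "$src0, $src1"
    }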
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -1087,25 +1213,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); } -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; +} + +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
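The VOP2b profiles above give carry-producing instructions such as V_ADD_I32 a second, scalar result: a carry bit that is implicitly VCC in the 32-bit encoding and an arbitrary SGPR pair ($sdst) in the VOP3 encoding. Per lane, the underlying arithmetic is just an unsigned add with carry-out; a host-side sketch under that reading:

    #include <cassert>
    #include <cstdint>

    static uint32_t addWithCarry(uint32_t A, uint32_t B, bool &CarryOut) {
      uint32_t Sum = A + B;
      CarryOut = Sum < A; // unsigned wraparound means the add carried
      return Sum;
    }

    int main() {
      bool Carry;
      uint32_t S = addWithCarry(0xffffffffu, 1u, Carry);
      assert(S == 0 && Carry); // the carry lands in the "sdst" output
    }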
+ let HasDst32 = 0; } -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$dst, $src0_modifiers, $src1"; } +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); let Asm64 = "$dst, $src0, $src1, $src2"; } @@ -1119,13 +1296,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<2>.ret; - let Asm64 = getAsm64<2, HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} class VOP <string opName> { string OpName = opName; @@ -1165,20 +1389,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; 
} -multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -1202,22 +1428,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; } -multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; } @@ -1250,6 +1478,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : @@ -1295,22 +1526,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, HasMods>; } -// VOP3_m without source modifiers -multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; - } -} - multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { @@ -1335,7 +1550,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1349,7 +1564,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1360,54 +1575,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // No VI instruction. 
This class is for SI only. } -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. +multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; } multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1432,32 +1634,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, } } -multiclass VOP1_Helper <vop1 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - bit HasMods> { +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { - defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + defm _e32 : VOP1_m <op, opName, p, pat32>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { 
- defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + defm _e32 : VOP1SI_m <op, opName, P, []>; defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1467,36 +1665,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, opName, P.HasModifiers>; } -multiclass VOP2_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> { - defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1508,58 +1703,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, opName, revOp, P.HasModifiers>; } -multiclass VOP2b_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } - defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } } multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp, !eq(P.NumSrcArgs, 3) >; // A VOP2 instruction that is VOP3-only on VI. 
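The useSGPRInput case above covers instructions like V_ADDC_U32 that also read a carry bit (VCC in the e32 form, any SGPR pair in the e64 form), which is why the 32-bit encoding lists VCC among its uses. Per lane this amounts to a 32-bit full adder; a host-side sketch (addc is an illustrative name, not the patch's code):

    #include <cassert>
    #include <cstdint>

    static uint32_t addc(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
      uint32_t Sum = A + B + (CarryIn ? 1u : 0u);
      // Carry out iff the true sum exceeded 32 bits; with a carry-in,
      // wrapping back exactly to A also indicates a carry.
      CarryOut = Sum < A || (CarryIn && Sum == A);
      return Sum;
    }

    int main() {
      bool C;
      // The low half of a 64-bit add carries into the high half.
      uint32_t Lo = addc(0xffffffffu, 0x00000001u, false, C);
      assert(Lo == 0 && C);
      uint32_t Hi = addc(0x00000001u, 0x00000000u, C, C);
      assert(Hi == 2 && !C);
    }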
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { + + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; - defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, - revOp, HasMods>; + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { @@ -1583,64 +1775,75 @@ let isCodeGenOnly = 0 in { } // End isCodeGenOnly = 0 } -class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr<opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo <outs, ins, pattern, opName>; - - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isSICI]; +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; } - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isVI]; - } + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; } -multiclass VOPC_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> 
pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>; + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; } // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>, +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, string revOp = opName, - bit DefExec = 0> : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1648,51 +1851,51 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp + DefExec, revOp, P, sched >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName + DefExec, opName, P, sched >; multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; multiclass VOPCX <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1>; + 
: VOPCInst <op, opName, P, cond, revOp, 1, sched>; multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1700,16 +1903,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < @@ -1761,25 +1964,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, - string opName, list<dag> pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 >; -multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; - - class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1925,12 +2116,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, 
opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } } @@ -1939,11 +2132,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, @@ -2214,7 +2409,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; @@ -2233,7 +2428,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, @@ -2245,7 +2440,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2256,6 +2451,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -2368,47 +2565,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, } // End mayLoad = 0, mayStore = 1 } -class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$vdst), - (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { - let data = 0; - let mayLoad = 1; +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. 
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" } -class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : - FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, - glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - name#" $data, $addr"#"$glc"#"$slc"#"$tfe", - []> { +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// - let mayLoad = 0; - let mayStore = 1; +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} - // Encoding - let vdst = 0; +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; } -multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; +} - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT <op, (outs), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $addr, $data"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 0> { - let glc = 0; - let vdst = 0; - } +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} - def _RTN : FLAT <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 1> { - let glc = 1; - } +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; + + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" 
$addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index e0eeea9..89692ab 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; @@ -57,41 +59,39 @@ defm EXP : EXP_m; // SMRD Instructions //===----------------------------------------------------------------------===// -let mayLoad = 1 in { - // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. 
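// A minimal sketch (hypothetical helper, not part of this patch) of the
// invariant stated above: SMRD results are constrained to SGPR_32 precisely
// because that class, unlike SReg_32, excludes M0, so a GPU-hanging write to
// M0 can never be selected.
//
//   static bool isSafeSMRDDst(unsigned Reg) {
//     return AMDGPU::SGPR_32RegClass.contains(Reg); // SGPR_32 excludes M0
//   }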
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; -} // mayLoad = 1 - //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions @@ -123,7 +123,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; @@ -183,10 +183,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []> defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +} // End Uses = [M0] + defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { @@ -354,7 +358,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; // SOPK Instructions //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let 
isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { @@ -438,36 +442,38 @@ def S_BRANCH : SOPP < let isBarrier = 1; } -let DisableEncoding = "$scc" in { +let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; -} // End DisableEncoding = "$scc" +} // End Uses = [SCC] +let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; +} // End Uses = [VCC] -let DisableEncoding = "$exec" in { +let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; -} // End DisableEncoding = "$exec" +} // End Uses = [EXEC] } // End isBranch = 1 @@ -477,11 +483,11 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { + let SchedRW = [WriteBarrier]; let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; + let isConvergent = 1; } def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; @@ -805,9 +811,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; @@ -905,11 +908,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// @@ -951,13 +949,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 
0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -1034,9 +1032,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1155,8 +1156,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" // VOP1 Instructions //===----------------------------------------------------------------------===// -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -1292,7 +1293,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin @@ -1300,6 +1303,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; + +} // End SchedRW = [WriteQuarterRate32] + defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; @@ -1308,24 +1314,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", VOP_I32_F64 >; + +let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", VOP_I32_F32 >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; } + +let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; 
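// Illustrative semantics for the movrel family gated on M0 above and below
// (an assumed model of the relative VGPR indexing, not taken from this patch):
//
//   uint32_t v_movrels_b32(const uint32_t *VGPR, unsigned SrcIdx, uint32_t M0) {
//     return VGPR[SrcIdx + M0]; // relative source read
//   }
//   void v_movreld_b32(uint32_t *VGPR, unsigned DstIdx, uint32_t M0, uint32_t Src) {
//     VGPR[DstIdx + M0] = Src;  // relative destination write
//   }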
defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1343,7 +1358,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; -} // End let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1360,7 +1375,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// -let Uses = [M0] in { +let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP instructions. @@ -1405,16 +1420,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; -} // End Uses = [M0] +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; defm _e64 : VOP3_m < op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, @@ -1500,34 +1513,32 @@ let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions.
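// The VOP2b profiles below give the carry an explicit i1 result instead of an
// implicit VCC def. A sketch of the carry-out rule they encode (assumed model,
// not from this patch):
//
//   uint32_t v_add_i32(uint32_t A, uint32_t B, bool &Carry) {
//     uint64_t R = uint64_t(A) + uint64_t(B);
//     Carry = R > UINT32_MAX; // carry-out, normally materialized in VCC
//     return uint32_t(R);
//   }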
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP_I32_I32_I32, add + VOP2b_I32_I1_I32_I32 >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; -let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +} // End isCommutable = 1 defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, @@ -1575,10 +1586,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp @@ -1704,15 +1715,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", @@ -1735,7 +1746,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -} // let SchedRW = [WriteDouble] +} // let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1756,16 +1767,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; } // let SchedRW = [WriteDouble] -let isCommutable = 1, Uses = [VCC] in { +let isCommutable = 1, Uses = [VCC, EXEC] in { +let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -1774,6 +1790,7 @@ let isCommutable = 1, Uses = [VCC] in { defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; +} let SchedRW = [WriteDouble] in { // v_div_fmas_f64: @@ -1786,7 +1803,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", >; } // End SchedRW = [WriteDouble] -} // End isCommutable = 1 +} // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; @@ -1835,13 +1852,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; } // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 1 in { +let hasSideEffects = 1, SALU = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1921,39 +1938,9 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore<dag outs> : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; @@ -1967,6 +1954,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI < let Constraints = "$src = $dst"; } +// TODO: We can support indirect SGPR access. 
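// The per-width SI_INDIRECT_SRC pseudos below replace the old single pseudo
// that took an unknown:$src; one pseudo per register class lets the machine
// verifier check the vector operand. Intended behaviour as a sketch (assumed
// model; SILowerControlFlowPass::IndirectSrc later expands it with
// V_MOVRELS_B32 after loading $idx into M0):
//
//   uint32_t si_indirect_src(const uint32_t *Vec, uint32_t Idx, uint32_t Off) {
//     return Vec[Idx + Off]; // dst = vec[idx + off]
//   }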
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; @@ -1977,19 +1971,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), + (ins sgpr_class:$src, i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + (ins i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1 } @@ -2003,19 +2002,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } @@ -2030,9 +2035,11 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} } // End Defs = [SCC] @@ -2072,84 +2079,63 @@ def : Pat < // SMRD Patterns //===----------------------------------------------------------------------===// -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt> { - // 1. SI-CI: Offset as 8bit DWORD immediate + // 1. IMM offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) >; - // 2. Offset loaded in an 32bit SGPR + // 2. SGPR offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) >; - // 3. No offset at all def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } } -multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. 
Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isSICI] +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. let AddedComplexity = 100 in { -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isVI] +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; -let Predicates = [isSICI] in { +// 1. Offset as an immediate def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) >; -// 1. Offset as 8bit DWORD immediate +// 2. Offset loaded in a 32-bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) >; -} // End Predicates = [isSICI] +let Predicates = [isCI] in { -// 2.
Offset loaded in an 32bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) >; +} // End Predicates = [isCI] + +} // End let AddedComplexity = 100 + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2161,6 +2147,11 @@ def : Pat < (S_MOV_B32 0), sub1)) >; +def : Pat < + (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (S_ABS_I32 $x) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -2488,6 +2479,11 @@ def : Pat < /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ +//def : Extract_Element<i64, v2i64, 0, sub0_sub1>; +//def : Extract_Element<i64, v2i64, 1, sub2_sub3>; +//def : Extract_Element<f64, v2f64, 0, sub0_sub1>; +//def : Extract_Element<f64, v2f64, 1, sub2_sub3>; + foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) @@ -2568,11 +2564,25 @@ def : BitConvert <v2i32, i64, VReg_64>; def : BitConvert <i64, v2i32, VReg_64>; def : BitConvert <v2f32, i64, VReg_64>; def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; + +def : BitConvert <v2i64, v4i32, SReg_128>; +def : BitConvert <v4i32, v2i64, SReg_128>; + +def : BitConvert <v2f64, v4f32, VReg_128>; +def : BitConvert <v2f64, v4i32, VReg_128>; +def : BitConvert <v4f32, v2f64, VReg_128>; +def : BitConvert <v4i32, v2f64, VReg_128>; + + + + def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, SReg_256>; def : BitConvert <v8i32, v32i8, SReg_256>; @@ -2601,10 +2611,9 @@ def : Pat < // Prevent expanding both fneg and fabs. -// FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f32:$src)), - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ + (S_OR_B32 $src, 0x80000000) /* Set sign bit */ >; // FIXME: Should use S_OR_B32 @@ -2836,10 +2845,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < // -1. For the non-rtn variants, the manual says it does // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max // will always do the increment so I'm assuming it's the same. class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), @@ -2855,9 +2860,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < // 32-bit atomics.
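// A worked model of the wrapping-increment rule quoted above (taken directly
// from that comment, not from vendor documentation):
//
//   uint32_t ds_inc(uint32_t Mem, uint32_t D0) {
//     return (Mem >= D0) ? 0 : Mem + 1; // D0 == UINT32_MAX never wraps early
//   }
//
// Passing the all-ones immediate therefore makes the returning and
// non-returning variants increment identically, which the patterns below
// rely on.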
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_add_local>; + V_MOV_B32_e32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_sub_local>; + V_MOV_B32_e32, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; @@ -2874,9 +2879,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_add_local>; + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_sub_local>; + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; @@ -3019,90 +3024,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // 1. Extract with offset def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) >; // 3. Insert with offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) >; // 4.
Insert without offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; -defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// // Conversion Patterns @@ -3215,12 +3176,12 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; @@ -3301,24 +3262,6 @@ def : Pat < } // End Predicates = [isSI] -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index c319b32..126f624 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,6 +103,10 @@ public: return "SI Lower control flow instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { DebugLoc DL = From.getDebugLoc(); BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); + .addOperand(To); } void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { @@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { // If the exec mask is non-zero, skip the next two instructions BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); + .addImm(3); // Exec mask is zero: Export to NULL target... 
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) @@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { .addReg(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } @@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { .addImm(0); } } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } @@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int .addReg(AMDGPU::VCC_LO); // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); // Update EXEC, save the original EXEC value to VCC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) @@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + .addImm(-7); // Restore EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) .addReg(Reg, RegState::Define) .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI) || TII->isDS(MI)) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) + if (TII->isFLAT(MI)) NeedFlat = true; switch (MI.getOpcode()) { @@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC: + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 67421e2..a2fa5fd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -48,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63..49677fc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,10 +29,118 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), + ReturnsVoid(true), + LDSWaveSpillSize(0), + PSInputEna(0), NumUserSGPRs(0), - LDSWaveSpillSize(0) { } + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = 
FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, @@ -52,8 +160,18 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + + if (LaneVGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + + // When compiling from inside Mesa, the compilation continues. + // Select an arbitrary register to avoid triggering assertions + // during subsequent passes. + LaneVGPR = AMDGPU::VGPR0; + } + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 667da4c..846ee5d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,13 +26,87 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; - bool HasSpilledVGPRs; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order.
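// Each field below records the first SGPR of one preloaded input; the add*
// helpers in SIMachineFunctionInfo.cpp above allocate them contiguously. The
// shared pattern, as a sketch (mirroring addDispatchPtr; the 64-bit case is
// an example, not new API):
//
//   Reg = TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
//                                 &AMDGPU::SReg_64RegClass);
//   NumUserSGPRs += 2; // a 64-bit pointer consumes two user SGPRs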
+ unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + + // Graphics info. + unsigned PSInputAddr; + bool ReturnsVoid; public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputEna; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. + bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: struct SpilledReg { unsigned VGPR; int Lane; @@ -46,16 +120,182 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. 
+ unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+  unsigned getScratchRSrcReg() const {
+    return ScratchRSrcReg;
+  }
+
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const {
+    return ScratchWaveOffsetReg;
+  }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
+
+  bool hasSpilledSGPRs() const {
+    return HasSpilledSGPRs;
+  }
+
+  void setHasSpilledSGPRs(bool Spill = true) {
+    HasSpilledSGPRs = Spill;
+  }
+
+  bool hasSpilledVGPRs() const {
+    return HasSpilledVGPRs;
+  }
+
+  void setHasSpilledVGPRs(bool Spill = true) {
+    HasSpilledVGPRs = Spill;
+  }
+
+  unsigned getPSInputAddr() const {
+    return PSInputAddr;
+  }
+
+  bool isPSInputAllocated(unsigned Index) const {
+    return PSInputAddr & (1 << Index);
+  }
+
+  void markPSInputAllocated(unsigned Index) {
+    PSInputAddr |= 1 << Index;
+  }
+
+  bool returnsVoid() const {
+    return ReturnsVoid;
+  }
+
+  void setIfReturnsVoid(bool Value) {
+    ReturnsVoid = Value;
+  }
 
   unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644
index 0000000..1cfa984
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,1968 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// by GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+//   for all the SGPR load instructions before your current instruction to
+//   have finished.
+// . When accessing the result of a VGPR load instruction, you have to wait
+//   for all the VGPR load instructions previous to the VGPR load instruction
+//   you are interested in to finish.
+// . The lower the register pressure, the better load latencies are hidden.
+//
+// Moreover some specificities (like the fact that a lot of instructions in
+// the shader have few dependencies) make the generic scheduler have some
+// unpredictable behaviours. For example when register pressure becomes high,
+// it can either manage to prevent register pressure from going too high, or
+// it can increase register pressure even more than if it hadn't taken
+// register pressure into account.
+//
+// Also some other bad behaviours are generated, like loading at the beginning
+// of the shader a constant in a VGPR you won't need until the end of the
+// shader.
+//
+// The scheduling problem for SI can be split into three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts if they are of different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks also helps keep register usage
+// under control.
+//
+// First the instructions are put into blocks.
+// We want the blocks to help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, for example when you create a result that can be consumed
+// right away, to be contained in a block. Block inputs and outputs would
+// typically be important results that are needed in several locations of
+// the shader. Since we do want blocks to help hide high latencies, we want
+// the instructions inside the block to have a minimal set of dependencies
+// on high latencies. That will make it easy to pick blocks to hide specific
+// high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+// At that step we take into account only register usage and hiding
+// low latency instructions.
+//
+// Third the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency for the cache (or global mem) data to reach the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, which is why even if the wavefront count is high,
+// we have to try to have as many instructions hiding high latencies as
+// possible.
+// The OpenCL doc gives for example a latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64-byte
+// lines.
+// Constant cache is shared by 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated with the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared by 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
+// same cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling,
+// as we expect the driver to be able to preload the constants soon.
+
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  switch (Reason) {
+  case NoCand:    return "NOCAND";
+  case RegUsage:  return "REGUSAGE";
+  case Latency:   return "LATENCY";
+  case Successor: return "SUCCESSOR";
+  case Depth:     return "DEPTH";
+  case NodeOrder: return "ORDER";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  NodeNum2Index[SU->NodeNum] = SUnits.size();
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+                                          SISchedCandidate &TryCand) {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  if (Cand.SGPRUsage > 60 &&
+      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Schedule low latency instructions as close to the top as possible.
+  // Order of priority is:
+  // . Low latency instructions which do not depend on other low latency
+  //   instructions we haven't waited for
+  // . Other instructions which do not depend on low latency instructions
+  //   we haven't waited for
+  // . Low latencies
+  // . All other instructions
+  // Goal is to get: low latency instructions - independent instructions
+  //   - (possibly some more low latency instructions)
+  //   - instructions that depend on the first low latency instructions.
+  // If the block contains a lot of constant loads, SGPR usage can get quite
+  // high; thus, above the arbitrary limit of 60, we encourage using the
+  // already loaded constants (in order to release some SGPRs) before
+  // loading more.
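+  // For example (hypothetical): if candidate A is an s_load whose address
+  // comes from an earlier s_load we have not yet waited for, and candidate
+  // B is a v_add with no such parent, B is picked first (second category
+  // above), even though A is itself a low latency instruction.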
+  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+              Cand.HasLowLatencyNonWaitedParent,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+                 TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (TryCand.IsLowLatency &&
+      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+  SISchedCandidate TopCand;
+
+  for (SUnit* SU : TopReadySUs) {
+    SISchedCandidate TryCand;
+    std::vector<unsigned> pressure;
+    std::vector<unsigned> MaxPressure;
+    // Predict register usage after this instruction.
+    TryCand.SU = SU;
+    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+    TryCand.HasLowLatencyNonWaitedParent =
+      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+    tryCandidateTopDown(TopCand, TryCand);
+    if (TryCand.Reason != NoCand)
+      TopCand.setBest(TryCand);
+  }
+
+  return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = TopReadySUs[0];
+    ScheduledSUnits.push_back(SU);
+    nodeScheduled(SU);
+  }
+
+  Scheduled = true;
+}
+
+// Returns true if the register was defined between First and Last.
+static bool isDefBetween(unsigned Reg,
+                         SlotIndex First, SlotIndex Last,
+                         const MachineRegisterInfo *MRI,
+                         const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::def_instr_iterator
+       UI = MRI->def_instr_begin(Reg),
+       UE = MRI->def_instr_end(); UI != UE; ++UI) {
+    const MachineInstr* MI = &*UI;
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= First && InstSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                                      MachineBasicBlock::iterator EndBlock) {
+  IntervalPressure Pressure, BotPressure;
+  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineRegisterInfo *MRI = DAG->getMRI();
+  DAG->initRPTracker(TopRPTracker);
+  DAG->initRPTracker(BotRPTracker);
+  DAG->initRPTracker(RPTracker);
+
+  // Goes through all SUs. RPTracker captures what had to be alive for the
+  // SUs to execute, and what is still alive at the end.
+  for (SUnit* SU : ScheduledSUnits) {
+    RPTracker.setPos(SU->getInstr());
+    RPTracker.advance();
+  }
+
+  // Close the RPTracker to finalize live ins/outs.
+  RPTracker.closeRegion();
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Do not track physical registers, because it messes up.
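+  // (Only virtual registers are kept in LiveInRegs/LiveOutRegs below; the
+  // block scheduler applies the same filter in addLiveRegs.)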
+  for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      LiveInRegs.insert(Reg);
+  }
+  LiveOutRegs.clear();
+  // There are several possibilities to distinguish:
+  // 1) Reg is not input to any instruction in the block, but is output of one
+  // 2) 1) + read in the block and not needed after it
+  // 3) 1) + read in the block but needed in another block
+  // 4) Reg is input of an instruction but another block will read it too
+  // 5) Reg is input of an instruction and then rewritten in the block.
+  //    result is not read in the block (implies used in another block)
+  // 6) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block and not needed in another block
+  // 7) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block but also needed in another block
+  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7
+  // We want LiveOutRegs to contain only Regs whose content will be read after
+  // in another block, and whose content was written in the current block,
+  // that is we want it to get 1, 3, 5, 7
+  // Since we made the MIs of a block be packed all together before
+  // scheduling, the LiveIntervals were correct, and the RPTracker was
+  // able to correctly handle 5 vs 6, 2 vs 3.
+  // (Note: This is not sufficient for the RPTracker not to make mistakes
+  // for case 4)
+  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
+  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
+  // The use of isDefBetween removes the case 4.
+  for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+        isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+                     LIS->getInstructionIndex(EndBlock).getRegSlot(),
+                     MRI, LIS)) {
+      LiveOutRegs.insert(Reg);
+    }
+  }
+
+  // Pressure = sum_alive_registers register size
+  // Internally llvm will represent some registers as big 128-bit registers
+  // for example, but they actually correspond to 4 actual 32-bit registers.
+  // Thus Pressure is not equal to num_alive_registers * constant.
+  LiveInPressure = TopPressure.MaxSetPressure;
+  LiveOutPressure = BotPressure.MaxSetPressure;
+
+  // Prepares TopRPTracker for top down scheduling.
+  TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now.
+
+  TopReadySUs.clear();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = pickNode();
+    ScheduledSUnits.push_back(SU);
+    TopRPTracker.setPos(SU->getInstr());
+    TopRPTracker.advance();
+    nodeScheduled(SU);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check everything is right.
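+  // (The NDEBUG-guarded block below asserts that every SU was scheduled and
+  // that none has predecessors left.)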
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size() &&
+         TopReadySUs.empty());
+  for (SUnit* SU : SUnits) {
+    assert(SU->isScheduled &&
+           SU->NumPredsLeft == 0);
+  }
+#endif
+
+  Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+  for (SUnit* SU : SUnits) {
+    SU->isScheduled = false;
+    for (SDep& Succ : SU->Succs) {
+      if (BC->isSUInBlock(Succ.getSUnit(), ID))
+        undoReleaseSucc(SU, &Succ);
+    }
+  }
+  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    ++SuccSU->WeakPredsLeft;
+    return;
+  }
+  ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(DAG);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(nullptr);
+  }
+#endif
+
+  --SuccSU->NumPredsLeft;
+}
+
+/// Release the successors of SU that are inside the block (if InOrOutBlock
+/// is true) or outside it (if InOrOutBlock is false).
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+  for (SDep& Succ : SU->Succs) {
+    SUnit *SuccSU = Succ.getSUnit();
+
+    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+      continue;
+
+    releaseSucc(SU, &Succ);
+    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+      TopReadySUs.push_back(SuccSU);
+  }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+  // SU must be in TopReadySUs.
+  assert(!SU->NumPredsLeft);
+  std::vector<SUnit*>::iterator I =
+    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
+  if (I == TopReadySUs.end()) {
+    dbgs() << "Data Structure Bug in SI Scheduler\n";
+    llvm_unreachable(nullptr);
+  }
+  TopReadySUs.erase(I);
+
+  releaseSuccessors(SU, true);
+  // Scheduling this node will trigger a wait,
+  // thus propagate to other instructions that they do not need to wait either.
+  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+  if (DAG->IsLowLatencySU[SU->NodeNum]) {
+    for (SDep& Succ : SU->Succs) {
+      std::map<unsigned, unsigned>::iterator I =
+        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+      if (I != NodeNum2Index.end())
+        HasLowLatencyNonWaitedParent[I->second] = 1;
+    }
+  }
+  SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+  // We remove links from outside blocks to enable scheduling inside the block.
+  for (SUnit* SU : SUnits) {
+    releaseSuccessors(SU, false);
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      HighLatencyBlock = true;
+  }
+  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// We maintain ascending order of IDs.
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+  unsigned PredID = Pred->getID();
+
+  // Check if not already a predecessor.
+  for (SIScheduleBlock* P : Preds) {
+    if (PredID == P->getID())
+      return;
+  }
+  Preds.push_back(Pred);
+
+#ifndef NDEBUG
+  for (SIScheduleBlock* S : Succs) {
+    if (PredID == S->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+  unsigned SuccID = Succ->getID();
+
+  // Check if not already a successor.
+  for (SIScheduleBlock* S : Succs) {
+    if (SuccID == S->getID())
+      return;
+  }
+  if (Succ->isHighLatencyBlock())
+    ++NumHighLatencySuccessors;
+  Succs.push_back(Succ);
+#ifndef NDEBUG
+  for (SIScheduleBlock* P : Preds) {
+    if (SuccID == P->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+  dbgs() << "Block (" << ID << ")\n";
+  if (!full)
+    return;
+
+  dbgs() << "\nContains High Latency Instruction: "
+         << HighLatencyBlock << '\n';
+  dbgs() << "\nDepends On:\n";
+  for (SIScheduleBlock* P : Preds) {
+    P->printDebug(false);
+  }
+
+  dbgs() << "\nSuccessors:\n";
+  for (SIScheduleBlock* S : Succs) {
+    S->printDebug(false);
+  }
+
+  if (Scheduled) {
+    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+    dbgs() << "LiveIns:\n";
+    for (unsigned Reg : LiveInRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+    dbgs() << "\nLiveOuts:\n";
+    for (unsigned Reg : LiveOutRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+  }
+
+  dbgs() << "\nInstructions:\n";
+  for (SUnit* SU : SUnits) {
+    SU->dump(DAG);
+  }
+
+  dbgs() << "///////////////////////\n";
+}
+
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() {
+}
+
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+    Blocks.find(BlockVariant);
+  if (B == Blocks.end()) {
+    SIScheduleBlocks Res;
+    createBlocksForVariant(BlockVariant);
+    topologicalSort();
+    scheduleInsideBlocks();
+    fillStats();
+    Res.Blocks = CurrentBlocks;
+    Res.TopDownIndex2Block = TopDownIndex2Block;
+    Res.TopDownBlock2Index = TopDownBlock2Index;
+    Blocks[BlockVariant] = Res;
+    return Res;
+  } else {
+    return B->second;
+  }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+  if (SU->NodeNum >= DAG->SUnits.size())
+    return false;
+  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      CurrentColoring[SU->NodeNum] = NextReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned NumHighLatencies = 0;
+  unsigned GroupSize;
+  unsigned Color = NextReservedID;
+  unsigned Count = 0;
+  std::set<unsigned> FormingGroup;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      ++NumHighLatencies;
+  }
+
+  if (NumHighLatencies == 0)
+    return;
+
+  if (NumHighLatencies <= 6)
+    GroupSize = 2;
+  else if (NumHighLatencies <= 12)
+    GroupSize = 3;
+  else
+    GroupSize = 4;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      bool CompatibleGroup = true;
+      unsigned ProposedColor = Color;
+      for (unsigned j : FormingGroup) {
+        // TODO: Currently CompatibleGroup will always be false,
+        // because the graph enforces the load order. This
+        // can be fixed, but as keeping the load order is often
+        // good for performance, breaking it causes a performance hit
+        // (with both the default scheduler and this scheduler).
+        // When this scheduler determines a good load order,
+        // this can be fixed.
+        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+          CompatibleGroup = false;
+      }
+      if (!CompatibleGroup || ++Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        if (!CompatibleGroup) {
+          ProposedColor = Color;
+          FormingGroup.insert(SU->NodeNum);
+        }
+        Count = 0;
+      } else {
+        FormingGroup.insert(SU->NodeNum);
+      }
+      CurrentColoring[SU->NodeNum] = ProposedColor;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  CurrentTopDownReservedDependencyColoring.clear();
+  CurrentBottomUpReservedDependencyColoring.clear();
+
+  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+  // Traverse TopDown, and give different colors to SUs depending
+  // on which combination of High Latencies they depend on.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+    }
+    // Color 0 by default.
+    if (SUColors.empty())
+      continue;
+    // Same color as parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+
+  ColorCombinations.clear();
+
+  // Same as before, but BottomUp.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+    }
+    // Keep color 0.
+    if (SUColors.empty())
+      continue;
+    // Same color as parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+  // Every combination of colors given by the top down
+  // and bottom up reserved node dependency colorings
+  // gets its own color.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    std::pair<unsigned, unsigned> SUColors;
+
+    // High latency instructions: already given.
+    if (CurrentColoring[SU->NodeNum])
+      continue;
+
+    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+      ColorCombinations.find(SUColors);
+    if (Pos != ColorCombinations.end()) {
+      CurrentColoring[SU->NodeNum] = Pos->second;
+    } else {
+      CurrentColoring[SU->NodeNum] = NextNonReservedID;
+      ColorCombinations[SUColors] = NextNonReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::vector<int> PendingColoring = CurrentColoring;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+    std::set<unsigned> SUColorsPending;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentColoring[Succ->NodeNum]);
+      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+      PendingColoring[SU->NodeNum] = *SUColors.begin();
+    else // TODO: Attribute new colors depending on color
+         // combination of children.
+ PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set<unsigned> SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned, unsigned> ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + 
unsigned color = CurrentColoring[SU->NodeNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (ColorCount[color] > 1)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() != color) {
+      --ColorCount[color];
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+      ++ColorCount[*SUColors.begin()];
+    }
+  }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+  // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+  unsigned DAGSize = DAG->SUnits.size();
+  int GroupID = NextNonReservedID++;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    bool hasSuccessor = false;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      hasSuccessor = true;
+    }
+    if (!hasSuccessor)
+      CurrentColoring[SU->NodeNum] = GroupID;
+  }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned,unsigned> RealID;
+
+  CurrentBlocks.clear();
+  CurrentColoring.clear();
+  CurrentColoring.resize(DAGSize, 0);
+  Node2CurrentBlock.clear();
+
+  // Restore links a previous scheduling variant may have overridden.
+  DAG->restoreSULinksLeft();
+
+  NextReservedID = 1;
+  NextNonReservedID = DAGSize + 1;
+
+  DEBUG(dbgs() << "Coloring the graph\n");
+
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+    colorHighLatenciesGroups();
+  else
+    colorHighLatenciesAlone();
+  colorComputeReservedDependencies();
+  colorAccordingToReservedDependencies();
+  colorEndsAccordingToDependencies();
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+    colorForceConsecutiveOrderInGroup();
+  regroupNoUserInstructions();
+  colorMergeConstantLoadsNextGroup();
+  colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Put SUs of the same color into the same block.
+  Node2CurrentBlock.resize(DAGSize, -1);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned Color = CurrentColoring[SU->NodeNum];
+    if (RealID.find(Color) == RealID.end()) {
+      int ID = CurrentBlocks.size();
+      BlockPtrs.push_back(
+        make_unique<SIScheduleBlock>(DAG, this, ID));
+      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+      RealID[Color] = ID;
+    }
+    CurrentBlocks[RealID[Color]]->addUnit(SU);
+    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+  }
+
+  // Build dependencies between blocks.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    int SUID = Node2CurrentBlock[i];
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+    }
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+    }
+  }
+
+  // Free roots and leaves of all blocks to enable scheduling inside them.
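+  // (finalizeUnits() calls releaseSuccessors(SU, false), which drops the
+  // dependency counts of edges leaving the block, so that scheduling inside
+  // a block only sees intra-block dependencies.)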
+  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->finalizeUnits();
+  }
+  DEBUG(
+    dbgs() << "Blocks created:\n\n";
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// Two functions taken from CodeGen/MachineScheduler.cpp
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, and finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+  unsigned DAGSize = CurrentBlocks.size();
+  std::vector<int> WorkList;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+
+  WorkList.reserve(DAGSize);
+  TopDownIndex2Block.resize(DAGSize);
+  TopDownBlock2Index.resize(DAGSize);
+  BottomUpIndex2Block.resize(DAGSize);
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    unsigned Degree = Block->getSuccs().size();
+    TopDownBlock2Index[i] = Degree;
+    if (Degree == 0) {
+      WorkList.push_back(i);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    int i = WorkList.back();
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    WorkList.pop_back();
+    TopDownBlock2Index[i] = --Id;
+    TopDownIndex2Block[Id] = i;
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      if (!--TopDownBlock2Index[Pred->getID()])
+        WorkList.push_back(Pred->getID());
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+             "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+                                         TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+  // We first compute a valid schedule, such that each Block corresponds
+  // to a contiguous range of instructions.
+  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->fastSchedule();
+  }
+
+  // Note: the following code, and the part restoring the previous position,
+  // is by far the most expensive operation of the Scheduler.
+
+  // Do not update CurrentTop.
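+  // (We advance a separate iterator, CurrentTopFastSched, so that
+  // DAG->getCurrentTop() still points at the start of the region when the
+  // original order is restored further down.)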
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs) {
+      MachineInstr *MI = SU->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        // Note: Moving all instructions and calling handleMove every time
+        // is the most CPU intensive operation of the scheduler.
+        // It would gain a lot if there were a way to recompute the
+        // LiveIntervals for the entire scheduling region.
+        DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now we have Block of SUs == Block of MI.
+  // We do the final schedule for the instructions inside the block.
+  // The property that all the SUs of the Block are grouped together as MI
+  // is used for correct reg usage tracking.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore old ordering (which prevents a LIS->handleMove bug).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+      DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+    }
+  }
+
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getPreds().size() == 0)
+      Block->Depth = 0;
+    else {
+      unsigned Depth = 0;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        if (Depth < Pred->Depth + 1)
+          Depth = Pred->Depth + 1;
+      }
+      Block->Depth = Depth;
+    }
+  }
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = BottomUpIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getSuccs().size() == 0)
+      Block->Height = 0;
+    else {
+      unsigned Height = 0;
+      for (SIScheduleBlock *Succ : Block->getSuccs()) {
+        if (Height < Succ->Height + 1)
+          Height = Succ->Height + 1;
+      }
+      Block->Height = Height;
+    }
+  }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                          SISchedulerBlockSchedulerVariant Variant,
+                          SIScheduleBlocks BlocksStruct) :
+  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+  // Fill the usage of every output.
+  // Warning: while by construction we always have a link between two blocks
+  // when one needs a result from the other, the number of users of an output
+  // is not simply the number of child blocks taking the same virtual
+  // register as input, because the register coalescer can merge registers.
+  // Here is an example. A produces x and y. B consumes x and produces x'.
+  // C consumes x' and y. The register coalescer may have attributed the same
+  // virtual register to x and x'.
+  // To count accurately, we do a topological sort. In case the register is
+  // found for several parents, we increment the usage of the one with the
+  // highest topological index.
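+  // In the example above: if x and x' were coalesced into one virtual
+  // register, C appears to consume it from both A and B; the usage is then
+  // credited to B only, since B has the higher topological index (B depends
+  // on A).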
+ LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map<unsigned, unsigned>::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set<unsigned> InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. 
+  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency))
+    return true;
+  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+                                          TryCand, Cand, Depth))
+    return true;
+  if (tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor))
+    return true;
+  return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage))
+    return true;
+  if (tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor))
+    return true;
+  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+    return true;
+  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage))
+    return true;
+  return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, else we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+           << (Cand.IsHighLatency ?
"yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map<unsigned, unsigned>::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair<unsigned, unsigned> RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector<int> +SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs) { + std::vector<int> DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. 
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector<SIScheduleBlock*> ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + SITII = static_cast<const SIInstrInfo*>(TII); + SITRI = static_cast<const SIRegisterInfo*>(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector<int> TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// 
This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads if it makes sense, etc.
+void SIScheduleDAGMI::moveLowLatencies() {
+  unsigned DAGSize = SUnits.size();
+  int LastLowLatencyUser = -1;
+  int LastLowLatencyPos = -1;
+
+  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+        IsLowLatencyUser = true;
+      }
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    // Also move the COPY instructions that the low latency
+    // instructions depend on.
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      bool CopyForLowLat = false;
+      for (SDep& SuccDep : SU->Succs) {
+        SUnit *Succ = SuccDep.getSUnit();
+        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+          CopyForLowLat = true;
+        }
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnits[i].isScheduled = false;
+    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+  }
+}
+
+// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
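+//
+// Illustrative example (hypothetical register set, not taken from a real
+// run): a single VReg_64 virtual register contributes weight 2 to the
+// VGPR_32 pressure set, so with Regs holding only that register,
+//   unsigned VGPRs, SGPRs;
+//   fillVgprSgprCost(Regs.begin(), Regs.end(), VGPRs, SGPRs);
+// would report VGPRs == 2 and SGPRs == 0.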
+template<typename _Iterator> void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector<SUnit*, 8> TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. 
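+  // (On GCN a wave can use at most 256 VGPRs, and the number of waves in
+  // flight per SIMD is roughly 256 / VGPRs-per-wave, so at 200 VGPRs only
+  // a single wave fits and any further growth risks spilling.)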
+ if (Best.MaxVGPRUsage > 200) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h new file mode 100644 index 0000000..b270136 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector<SUnit*> SUnits; + std::map<unsigned, unsigned> NodeNum2Index; + std::vector<SUnit*> TopReadySUs; + std::vector<SUnit*> ScheduledSUnits; + + /// The top of the unscheduled zone. 
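+  /// (TopPressure and TopRPTracker mirror the top-down tracker of
+  /// ScheduleDAGMILive: they follow the boundary between the
+  /// already-scheduled and the still-unscheduled instructions of the block.)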
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  // Pressure: number of registers of a given class needed to hold the live
+  // virtual and physical registers.
+  // We care only about SGPR_32 and VGPR_32, and track only virtual registers.
+  // Pressure of additional registers required inside the block.
+  std::vector<unsigned> InternalAdditionnalPressure;
+  // Pressure of input and output registers
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  // Registers required by the block (inputs), and the registers it defines
+  // for later blocks (outputs).
+  // We track only virtual registers.
+  // Note that some registers are not 32 bits,
+  // and thus the pressure is not equal
+  // to the number of live registers.
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+  unsigned NumHighLatencySuccessors;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs(), NumHighLatencySuccessors(0) {};
+
+  ~SIScheduleBlock() {};
+
+  unsigned getID() const { return ID; }
+
+  /// Functions for Block construction.
+  void addUnit(SUnit *SU);
+
+  // When all SUs have been added.
+  void finalizeUnits();
+
+  // Add a block predecessor, i.e. a block containing an instruction
+  // predecessor of one of our SUs.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+  const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+  unsigned Height;  // Maximum top-down path length to a block without outputs
+  unsigned Depth;   // Maximum bottom-up path length to a block without inputs
+
+  unsigned getNumHighLatencySuccessors() const {
+    return NumHighLatencySuccessors;
+  }
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximate.
+  // Ideally it should take into account that some instructions
+  // (rcp, etc.) are 4 times slower.
+  int getCost() { return SUnits.size(); }
+
+  // The block Predecessors and Successors must all be registered
+  // before fastSchedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize reg pressure and
+  // low latencies, and will fill liveins and liveouts.
+  // Needs all MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling; the grouping is just used
+  // to allow correct tracking of the live registers.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+
+  // Needs the block to have been scheduled first.
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+ SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector<SIScheduleBlock*> Blocks; + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; + std::map<SISchedulerBlockCreatorVariant, + SIScheduleBlocks> Blocks; + std::vector<SIScheduleBlock*> CurrentBlocks; + std::vector<int> Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; + std::vector<int> BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector<int> CurrentColoring; + std::vector<int> CurrentTopDownReservedDependencyColoring; + std::vector<int> CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 
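+  // For example (illustrative SU numbering): a group {SU0, SU1, SU4, SU5}
+  // would be split into {SU0, SU1} and {SU4, SU5}, since SU2 and SU3
+  // belong to other groups.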
+  void colorForceConsecutiveOrderInGroup();
+
+  // Merge constant loads whose users all sit in a single other group into
+  // that group.
+  // (TODO: else if all their users depend on the same group, put them there)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge SUs whose users all sit in a single other group into that group.
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge SUs whose users all sit in a single other group into that group,
+  // but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge SUs whose users all sit in a single other group into that group,
+  // but only if the group contains no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divide blocks that are too large.
+  // Implementation idea: assign new colors depending on the top-down and
+  // bottom-up links to other blocks.
+  void cutHugeBlocks();
+
+  // Put in one group all instructions with no users in this scheduling region
+  // (we want these groups to be at the end).
+  void regroupNoUserInstructions();
+
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+  void topologicalSort();
+
+  void scheduleInsideBlocks();
+
+  void fillStats();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatencyRegUsage,
+  BlockRegUsageLatency,
+  BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Number of schedulable, not yet scheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+  int LastPosWaitedHighLatency;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently only an approximation.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() {};
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; };
+
+  unsigned getVGPRUsage() { return maxVregUsage; };
+  unsigned getSGPRUsage() { return maxSregUsage; };
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best Block candidate.
+    SIScheduleBlock *Block;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumSuccessors;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+    unsigned Height;
+
+    SIBlockSchedCandidate()
+      : Block(nullptr) {}
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
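+    // Typical use (a sketch of the intended protocol): fill in a TryCand,
+    // let tryCandidateLatency() or tryCandidateRegUsage() pick a Reason,
+    // then keep the winner:
+    //   if (tryCandidateLatency(Cand, TryCand))
+    //     Cand.setBest(TryCand);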
+ void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set<unsigned> &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector<unsigned> SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector<SUnit> SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector<unsigned> ScheduledSUnits; + std::vector<unsigned> ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template<typename _Iterator> void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set<unsigned> getInRegs() { + std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector<unsigned> IsLowLatencySU; + std::vector<unsigned> LowLatencyOffset; + std::vector<unsigned> IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. 
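+  // e.g. TopDownIndex2SU[0] is the NodeNum of the SU that comes first in
+  // the top-down order; BottomUpIndex2SU is simply that order reversed.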
+ std::vector<int> TopDownIndex2SU; + std::vector<int> BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index 2cd600d..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. - if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. 
- ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 
e9e8412..609f5e7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -24,7 +23,20 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { MCRegAliasIterator R(Reg, this, true); @@ -33,6 +45,42 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + // Leave space for flat_scr, xnack_mask, vcc, and alignment + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and + // 100/101 for vcc. This is the next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -42,17 +90,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - reserveRegisterTuples(Reserved, AMDGPU::VGPR254); - reserveRegisterTuples(Reserved, AMDGPU::VGPR255); + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/XNACK_MASK/FLAT_SCR. 
+ // + // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose + // SGPRs when the XNACK feature is not used. This is currently not done + // because the code that counts SGPRs cannot account for such holes. + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); + reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; for (unsigned i = Limit; i < NumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); @@ -60,34 +121,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) + if (Sets[i] == (int)Idx) return Limit; } } @@ -174,17 +258,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } } @@ -217,17 +301,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -247,11 +329,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) @@ -263,16 +340,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: only do this when it is needed switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); break; case AMDGPUSubtarget::SEA_ISLANDS: break; default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. - TII->insertNOPs(MI, 4); + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. + TII->insertWaitStates(MI, 5); } MI->eraseFromParent(); @@ -322,22 +400,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. 
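+// One possible shape for such a cache (a sketch, not part of this change):
+//   DenseMap<unsigned, const TargetRegisterClass *> PhysRegClassCache;
+//   auto It = PhysRegClassCache.find(Reg);
+//   if (It != PhysRegClassCache.end())
+//     return It->second;
+//   // ...otherwise fall back to the linear search below and memoize.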
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - static const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, @@ -359,33 +431,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); + switch (RC->getSize()) { + case 4: + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; + case 8: + return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; + case 12: + return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; + case 16: + return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 32: + return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return &AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; + switch (SRC->getSize()) { + case 4: + return &AMDGPU::VGPR_32RegClass; + case 8: + return &AMDGPU::VReg_64RegClass; + case 12: + return &AMDGPU::VReg_96RegClass; + case 16: + return &AMDGPU::VReg_128RegClass; + case 32: + return &AMDGPU::VReg_256RegClass; + case 64: + return &AMDGPU::VReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -393,15 +477,65 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( if (SubIdx == AMDGPU::NoSubRegister) return RC; - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. + // We can assume that each lane corresponds to one 32-bit register. 
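+  // e.g. for AMDGPU::sub0_sub1 the lane mask has two bits set, so Count
+  // below is 2 and we return the matching 64-bit register class.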
+  unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
   if (isSGPRClass(RC)) {
-    return &AMDGPU::SGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::SGPR_32RegClass;
+    case 2:
+      return &AMDGPU::SReg_64RegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   } else {
-    return &AMDGPU::VGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::VGPR_32RegClass;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    case 8:
+      return &AMDGPU::VReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   }
 }
 
+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want
+  // to stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so
+  // we only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  // => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
@@ -462,30 +596,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should
+// be easy to detect from the intrinsics used. Scratch setup is harder to know.
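+//
+// Minimal usage sketch (hypothetical caller code):
+//   unsigned TgidX =
+//     TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
+// TgidX then names the system SGPR holding the X workgroup id; the call
+// asserts if that input was not set up for the function.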
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); @@ -496,12 +647,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, // AMDGPU::NoRegister. unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7da6de2..9410e20 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,17 +18,30 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + void reserveRegisterTuples(BitVector &, unsigned Reg) const; public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. 
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, @@ -40,10 +53,6 @@ public: unsigned FIOperandNum, RegScavenger *RS) const override; - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. @@ -52,23 +61,30 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - return !hasVGPRs(RC); } /// \returns true if this class ID contains only SGPR registers bool isSGPRClassID(unsigned RCID) const { - if (static_cast<int>(RCID) == -1) - return false; - return isSGPRClass(getRegClass(RCID)); } + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// returns true if this is a pseudoregister class combination of VGPRs and + /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on + /// them. + static bool isPseudoRegClass(const TargetRegisterClass *RC) { + return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; @@ -79,6 +95,11 @@ public: const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const override; + /// \p Channel This is the register channel (e.g. a value from 0-16), not the /// SubReg index. /// \returns The sub-register of Reg that is in Channel. @@ -91,19 +112,25 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + + // VGPRS: + FIRST_VGPR_VALUE = 15, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. 
@@ -122,6 +149,9 @@ public: unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, unsigned Value, diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2a9017f..bfaf937 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,10 +10,13 @@ //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// - -class SIReg <string n, bits<16> encoding = 0> : Register<n> { +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let HWEncoding = encoding; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; } // Special Registers @@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = encoding; } +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. 
+ +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + // SGPR registers -foreach Index = 0-101 in { +foreach Index = 0-103 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; } @@ -65,25 +81,27 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +// TODO: Do we need to set DwarfRegAlias on register tuples? + // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 101))>; + (add (sequence "SGPR%u", 0, 103))>; // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], - [(add (decimate (trunc SGPR_32, 101), 2)), + [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (decimate (trunc SGPR_32, 99), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; // SGPR 256-bit registers def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate (trunc SGPR_32, 95), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], // SGPR 512-bit registers def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (decimate (trunc SGPR_32, 87), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -// Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { - let CopyCost = -1; // Theoretically it is possible to read from SCC, - // but it should never be necessary. 
-} - -def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; -def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; - // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, - (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, EXEC, FLAT_SCR) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { + // Requires 2 s_mov_b64 to copy + let CopyCost = 2; +} -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { + // Requires 4 s_mov_b64 to copy + let CopyCost = 4; +} -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { + // Requires 8 s_mov_b64 to copy + let CopyCost = 8; +} // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { + // Requires 2 v_mov_b32 to copy + let CopyCost = 2; +} -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { let Size = 96; + + // Requires 3 v_mov_b32 to copy + let CopyCost = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { + // Requires 4 v_mov_b32 to copy + let CopyCost = 4; +} -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { + let CopyCost = 8; +} -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { + let CopyCost = 16; +} def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; @@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> { def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { + let CopyCost = 2; +} def VSrc_32 : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; @@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> { let OperandType = "OPERAND_REG_INLINE_C"; let ParserMatchClass = RegImmMatcher<"VCSrc64">; } + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_64 : RegisterOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"SCSrc64">; +} diff --git 
a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td index 9b1f676..cd77e51 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -17,16 +17,28 @@ def WriteLDS : SchedWrite; def WriteSALU : SchedWrite; def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; +def WriteBarrier : SchedWrite; // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; -def WriteDouble : SchedWrite; +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// Half rate f64 instruction (same as v_add_f64). def WriteDoubleAdd : SchedWrite; +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions)? + def SIFullSpeedModel : SchedMachineModel; def SIQuarterSpeedModel : SchedMachineModel; @@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> : // The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be acurate. +// guide. They may not be accurate. // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { @@ -64,8 +76,10 @@ multiclass SICommonWriteRes { def : HWWriteRes<WriteSALU, [HWSALU], 1>; def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<Write64Bit, 2>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 5d00bdd..4f0913f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, if (!MRI.isSSA()) return; - assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || - TII->isVOPC(MI.getOpcode())); + assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); const SIRegisterInfo &TRI = TII->getRegisterInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } +// Copy a register MachineOperand, preserving all flags but marking the copy +// implicit. +static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) { + assert(!Orig.isImplicit()); + return MachineOperand::CreateReg(Orig.getReg(), + Orig.isDef(), + true, + Orig.isKill(), + Orig.isDead(), + Orig.isUndef(), + Orig.isEarlyClobber(), + Orig.getSubReg(), + Orig.isDebug(), + Orig.isInternalRead()); +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (TII->isVOPC(Op32)) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - // VOPC instructions can only write to the VCC register.
We can't - // force them to use VCC here, because the register allocator has - // trouble with sequences like this, which cause the allocator to run - // out of registers if vreg0 and vreg1 belong to the VCCReg register - // class: - // vreg0 = VOPC; - // vreg1 = VOPC; - // S_AND_B64 vreg0, vreg1 + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we we will run @@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - // dst - Inst32.addOperand(MI.getOperand(0)); + // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // For VOPC instructions, this is replaced by an implicit def of vcc. + int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + if (Op32DstIdx != -1) { + // dst + Inst32.addOperand(MI.getOperand(0)); + } else { + assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + "Unexpected case"); + } + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); @@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Inst32.addOperand(*Src1); const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2) - Inst32.addOperand(*Src2); + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) { + int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); + if (Op32Src2Idx != -1) { + Inst32.addOperand(*Src2); + } else { + // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is + // replaced with an implicit read of vcc. 
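The comment above talks about hinting rather than constraining the result to VCC. The hint API it refers to is MachineRegisterInfo::setRegAllocationHint; a minimal sketch of such a hint, with the surrounding pass wiring omitted and AMDGPU::VCC taken from the generated register enum:

  #include "llvm/CodeGen/MachineRegisterInfo.h"

  // Ask the allocator to prefer VCC for this virtual register without
  // making VCC a hard constraint; the shrink pass re-runs after RA and
  // only shrinks the instruction if the hint was honored.
  static void hintVCC(llvm::MachineRegisterInfo &MRI, unsigned VReg) {
    MRI.setRegAllocationHint(VReg, /*Type=*/0, llvm::AMDGPU::VCC);
  }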
+ assert(Src2->getReg() == AMDGPU::VCC && + "Unexpected missing register operand"); + Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + } + } ++NumInstructionsShrunk; MI.eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index 591ce85..d36c5d2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -22,6 +22,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) + if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) return false; visit(F); @@ -104,6 +98,9 @@ void SITypeRewriter::visitCallInst(CallInst &I) { SmallVector <Type*, 8> Types; bool NeedToReplace = false; Function *F = I.getCalledFunction(); + if (!F) + return; + std::string Name = F->getName(); for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b76b400..3b4c235 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -7,12 +7,23 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + namespace llvm { namespace AMDGPU { @@ -56,5 +67,98 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } +MCSection *getHSATextSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_EXECINSTR | + ELF::SHF_AMDGPU_HSA_AGENT | + ELF::SHF_AMDGPU_HSA_CODE); +} + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == 
AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + +static unsigned getIntegerAttribute(const Function &F, const char *Name, + unsigned Default) { + Attribute A = F.getFnAttribute(Name); + unsigned Result = Default; + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, Result)) { + LLVMContext &Ctx = F.getContext(); + Ctx.emitError("can't parse shader type"); + } + } + return Result; +} + +unsigned getShaderType(const Function &F) { + return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +} + +unsigned getInitialPSInputAddr(const Function &F) { + return getIntegerAttribute(F, "InitialPSInputAddr", 0); +} + +bool isSI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; +} + +bool isCI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands]; +} + +bool isVI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; +} + +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + + switch(Reg) { + default: break; + case AMDGPU::FLAT_SCR: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi; + + case AMDGPU::FLAT_SCR_LO: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi; + + case AMDGPU::FLAT_SCR_HI: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi; + } + return Reg; +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f57028c..57cbe1b5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -15,6 +15,11 @@ namespace llvm { class FeatureBitset; +class Function; +class GlobalValue; +class MCContext; +class MCSection; +class MCSubtargetInfo; namespace AMDGPU { @@ -27,6 +32,29 @@ struct IsaVersion { IsaVersion getIsaVersion(const FeatureBitset &Features); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); +MCSection *getHSATextSection(MCContext &Ctx); + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + +unsigned getShaderType(const Function &F); +unsigned getInitialPSInputAddr(const Function &F); + + +bool isSI(const MCSubtargetInfo &STI); +bool isCI(const MCSubtargetInfo &STI); +bool isVI(const MCSubtargetInfo &STI); + +/// If \p Reg is a pseudo reg, return the correct hardware register given +/// \p STI otherwise return \p Reg. 
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td index aca4673..1a7801c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; } // End isCommutable = 1 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; -// Aliases to simplify matching of floating-pint instructions that are VOP2 on -// SI and VOP3 on VI. +// Aliases to simplify matching of floating-point instructions that +// are VOP2 on SI and VOP3 on VI. class SI2_VI3Alias <string name, Instruction inst> : InstAlias < name#" $dst, $src0, $src1", @@ -89,12 +89,18 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - //===----------------------------------------------------------------------===// -// SMEM Patterns +// SMEM Instructions //===----------------------------------------------------------------------===// +def S_DCACHE_WB : SMEM_Inval <0x21, + "s_dcache_wb", int_amdgcn_s_dcache_wb>; + +def S_DCACHE_WB_VOL : SMEM_Inval <0x23, + "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; + +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI + let Predicates = [isVI] in { // 1. Offset as 20bit DWORD immediate @@ -103,46 +109,4 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; -// Patterns for global loads with no offset -class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) ->; - -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; - -class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) ->; - -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; - -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr, vt:$data)), - (inst $addr, $data, 0, 0) ->; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def 
: FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - - } // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index 9550a3a..cd7540e 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -35,7 +35,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); -FunctionPass *createARMGlobalBaseRegPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index ef609a6..c171656 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -17,6 +17,17 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// +// ARM Helper classes. +// + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +class Architecture<string fname, string aname, list<SubtargetFeature> features > + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +//===----------------------------------------------------------------------===// // ARM Subtarget state. // @@ -51,8 +62,11 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", [FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict VFP3 to 16 double registers">; + "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", @@ -119,9 +133,9 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true", "Has return address stack">; -/// Some M architectures don't have the DSP extension (v7E-M vs. v7M) -def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true", - "Supports v7 DSP instructions in Thumb2">; +/// DSP extension. +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in ARM and/or Thumb2">; // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", @@ -150,11 +164,28 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; -// ARM ISAs. 
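The ISA feature definitions that follow rely on SubtargetFeature implication lists: enabling one feature transitively enables everything it implies, so v8.2a pulls in v8.1a, which pulls in v8, and so on. A self-contained sketch of that closure, in plain C++ rather than TableGen:

  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  using FeatureGraph = std::map<std::string, std::vector<std::string>>;

  // Enable F and, recursively, every feature it implies.
  static void enableFeature(const FeatureGraph &Implies, const std::string &F,
                            std::set<std::string> &Enabled) {
    if (!Enabled.insert(F).second)
      return; // already enabled
    auto It = Implies.find(F);
    if (It == Implies.end())
      return;
    for (const auto &Dep : It->second)
      enableFeature(Implies, Dep, Enabled);
  }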
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable as " + "GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for 32-bit " + "imms">; + + +//===----------------------------------------------------------------------===// +// ARM ISAs. +// + def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", @@ -180,302 +211,452 @@ def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", [HasV6T2Ops, FeaturePerfMon]>; def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", - [HasV7Ops, FeatureVirtualization, - FeatureMP]>; + [HasV7Ops]>; def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", - [HasV8Ops, FeatureAClass, FeatureCRC]>; + [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// -// ARM Processors supported. +// ARM Processor subtarget features. // -include "ARMSchedule.td" - -// ARM processor families. def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5", - "Cortex-A5 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureTrustZone, FeatureMP]>; + "Cortex-A5 ARM processors", []>; def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7", - "Cortex-A7 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureVFP4, FeatureMP, - FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureVirtualization]>; + "Cortex-A7 ARM processors", []>; def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", - "Cortex-A8 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureTrustZone]>; + "Cortex-A8 ARM processors", []>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", - "Cortex-A9 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR, - FeatureTrustZone]>; -def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", - "Swift ARM processors", - [FeatureNEONForFP, FeatureT2XtPk, - FeatureVFP4, FeatureMP, FeatureHWDiv, - FeatureHWDivARM, FeatureAvoidPartialCPSR, - FeatureAvoidMOVsShOp, - FeatureHasSlowFPVMLx, FeatureTrustZone]>; + "Cortex-A9 ARM processors", []>; def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12", - "Cortex-A12 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureVFP4, - FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureVirtualization, - FeatureTrustZone]>; - - -// FIXME: It has not been determined if A15 has these features.
-def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", - "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureVFP4, - FeatureMP, FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureTrustZone, FeatureVirtualization]>; - + "Cortex-A12 ARM processors", []>; +def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", + "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", - "Cortex-A17 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureVFP4, - FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureVirtualization, - FeatureTrustZone]>; - + "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; - + "Cortex-A53 ARM processors", []>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; + "Cortex-A57 ARM processors", []>; +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", []>; -def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", - "Cortex-R4 ARM processors", - [FeatureHWDiv, - FeatureAvoidPartialCPSR, - FeatureDSPThumb2, FeatureT2XtPk, - HasV7Ops, FeatureDB, FeatureHasRAS, - FeatureRClass]>; +def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", + "Qualcomm ARM processors", []>; +def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", + "Swift ARM processors", []>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", []>; + +def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", + "Cortex-R4 ARM processors", []>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", - "Cortex-R5 ARM processors", - [FeatureSlowFPBrcc, - FeatureHWDiv, FeatureHWDivARM, - FeatureHasSlowFPVMLx, - FeatureAvoidPartialCPSR, - FeatureT2XtPk]>; - -// FIXME: krait has currently the same features as A9 -// plus VFP4 and hardware division features. -def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", - "Qualcomm ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR, - FeatureTrustZone, - FeatureVFP4, - FeatureHWDiv, - FeatureHWDivARM]>; + "Cortex-R5 ARM processors", []>; +def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", + "Cortex-R7 ARM processors", []>; -class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; +//===----------------------------------------------------------------------===// +// ARM schedules. 
+// + +include "ARMSchedule.td" + + +//===----------------------------------------------------------------------===// +// ARM architectures +// + +def ARMv2 : Architecture<"armv2", "ARMv2", []>; + +def ARMv2a : Architecture<"armv2a", "ARMv2a", []>; + +def ARMv3 : Architecture<"armv3", "ARMv3", []>; + +def ARMv3m : Architecture<"armv3m", "ARMv3m", []>; + +def ARMv4 : Architecture<"armv4", "ARMv4", []>; + +def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>; + +def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>; + +def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>; + +def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>; + +def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>; + +def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops, + FeatureDSP]>; + +def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>; + +def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps, + FeatureTrustZone]>; + +def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, + FeatureNEON, + FeatureDB, + FeatureDSP, + FeatureAClass]>; + +def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, + FeatureDB, + FeatureDSP, + FeatureHWDiv, + FeatureRClass]>; + +def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass]>; + +def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass, + FeatureDSP, + FeatureT2XtPk]>; + +def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +// Aliases +def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; +def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; +def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>; +def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>; +def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; +def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; + + +//===----------------------------------------------------------------------===// +// ARM processors +// + +// Dummy CPU, used to target architectures +def : ProcNoItin<"generic", []>; + +def : ProcNoItin<"arm8", [ARMv4]>; +def : ProcNoItin<"arm810", [ARMv4]>; +def : ProcNoItin<"strongarm", [ARMv4]>; +def : ProcNoItin<"strongarm110", [ARMv4]>; +def : ProcNoItin<"strongarm1100", [ARMv4]>; +def : ProcNoItin<"strongarm1110", [ARMv4]>; + +def : ProcNoItin<"arm7tdmi", [ARMv4t]>; +def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>; +def : ProcNoItin<"arm710t", [ARMv4t]>; +def : ProcNoItin<"arm720t", [ARMv4t]>; +def : ProcNoItin<"arm9", [ARMv4t]>; +def : ProcNoItin<"arm9tdmi", [ARMv4t]>; +def : ProcNoItin<"arm920", [ARMv4t]>; +def : 
ProcNoItin<"arm920t", [ARMv4t]>; +def : ProcNoItin<"arm922t", [ARMv4t]>; +def : ProcNoItin<"arm940t", [ARMv4t]>; +def : ProcNoItin<"ep9312", [ARMv4t]>; + +def : ProcNoItin<"arm10tdmi", [ARMv5t]>; +def : ProcNoItin<"arm1020t", [ARMv5t]>; + +def : ProcNoItin<"arm9e", [ARMv5te]>; +def : ProcNoItin<"arm926ej-s", [ARMv5te]>; +def : ProcNoItin<"arm946e-s", [ARMv5te]>; +def : ProcNoItin<"arm966e-s", [ARMv5te]>; +def : ProcNoItin<"arm968e-s", [ARMv5te]>; +def : ProcNoItin<"arm10e", [ARMv5te]>; +def : ProcNoItin<"arm1020e", [ARMv5te]>; +def : ProcNoItin<"arm1022e", [ARMv5te]>; +def : ProcNoItin<"xscale", [ARMv5te]>; +def : ProcNoItin<"iwmmxt", [ARMv5te]>; + +def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>; + +def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>; +def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; -// V4 Processors. -def : ProcNoItin<"generic", []>; -def : ProcNoItin<"arm8", []>; -def : ProcNoItin<"arm810", []>; -def : ProcNoItin<"strongarm", []>; -def : ProcNoItin<"strongarm110", []>; -def : ProcNoItin<"strongarm1100", []>; -def : ProcNoItin<"strongarm1110", []>; - -// V4T Processors. -def : ProcNoItin<"arm7tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm7tdmi-s", [HasV4TOps]>; -def : ProcNoItin<"arm710t", [HasV4TOps]>; -def : ProcNoItin<"arm720t", [HasV4TOps]>; -def : ProcNoItin<"arm9", [HasV4TOps]>; -def : ProcNoItin<"arm9tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm920", [HasV4TOps]>; -def : ProcNoItin<"arm920t", [HasV4TOps]>; -def : ProcNoItin<"arm922t", [HasV4TOps]>; -def : ProcNoItin<"arm940t", [HasV4TOps]>; -def : ProcNoItin<"ep9312", [HasV4TOps]>; - -// V5T Processors. -def : ProcNoItin<"arm10tdmi", [HasV5TOps]>; -def : ProcNoItin<"arm1020t", [HasV5TOps]>; - -// V5TE Processors. -def : ProcNoItin<"arm9e", [HasV5TEOps]>; -def : ProcNoItin<"arm926ej-s", [HasV5TEOps]>; -def : ProcNoItin<"arm946e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm966e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm968e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm10e", [HasV5TEOps]>; -def : ProcNoItin<"arm1020e", [HasV5TEOps]>; -def : ProcNoItin<"arm1022e", [HasV5TEOps]>; -def : ProcNoItin<"xscale", [HasV5TEOps]>; -def : ProcNoItin<"iwmmxt", [HasV5TEOps]>; - -// V6 Processors. -def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6M Processors. -def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m0plus", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m1", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"sc000", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; - -// V6K Processors. 
-def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; -def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"mpcore", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6T2 Processors. -def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops, - FeatureDSPThumb2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, - FeatureHasSlowFPVMLx, - FeatureDSPThumb2]>; - -// V7a Processors. // FIXME: A5 has currently the same Schedule model as A8 -def : ProcessorModel<"cortex-a5", CortexA8Model, - [ProcA5, HasV7Ops, FeatureNEON, FeatureDB, - FeatureVFP4, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; -def : ProcessorModel<"cortex-a7", CortexA8Model, - [ProcA7, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a8", CortexA8Model, - [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a9", CortexA9Model, - [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, FeatureMP, - FeatureAClass]>; +def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4]>; + +def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureVirtualization]>; + +def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk]>; + +def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureMP]>; // FIXME: A12 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a12", CortexA9Model, - [ProcA12, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; - -// FIXME: A15 has currently the same ProcessorModel as A9. -def : ProcessorModel<"cortex-a15", CortexA9Model, - [ProcA15, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization, + FeatureMP]>; + +// FIXME: A15 has currently the same Schedule model as A9. 
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, + FeatureHasRAS, + FeatureTrustZone, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: A17 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a17", CortexA9Model, - [ProcA17, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; +def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, + FeatureHasRAS, + FeatureTrustZone, + FeatureMP, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: krait has currently the same Schedule model as A9 -def : ProcessorModel<"krait", CortexA9Model, - [ProcKrait, HasV7Ops, - FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +// FIXME: krait has currently the same features as A9 plus VFP4 and hardware +// division features. +def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, + FeatureHasRAS, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM]>; + +def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx]>; // FIXME: R4 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4", CortexA8Model, - [ProcR4]>; +def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R4F has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4f", CortexA8Model, - [ProcR4, - FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVFP3, FeatureD16]>; +def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVFP3, + FeatureD16, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R5 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r5", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, - FeatureD16, FeatureRClass]>; +def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, + FeatureHasRAS, + FeatureVFP3, + FeatureD16, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. -def : ProcessorModel<"cortex-r7", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, FeatureVFPOnlySP, - FeatureD16, FeatureMP, FeatureRClass]>; - -// V7M Processors. -def : ProcNoItin<"cortex-m3", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; -def : ProcNoItin<"sc300", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; - -// V7EM Processors. 
-def : ProcNoItin<"cortex-m4", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureVFP4, - FeatureVFPOnlySP, FeatureD16, - FeatureMClass]>; -def : ProcNoItin<"cortex-m7", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureFPARMv8, - FeatureD16, FeatureMClass]>; - - -// Swift uArch Processors. -def : ProcessorModel<"swift", SwiftModel, - [ProcSwift, HasV7Ops, FeatureNEON, - FeatureDB, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; - -// V8 Processors -def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcNoItin<"cortex-a72", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; +def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, + FeatureHasRAS, + FeatureVFP3, + FeatureVFPOnlySP, + FeatureD16, + FeatureFP16, + FeatureMP, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +def : ProcNoItin<"cortex-m3", [ARMv7m]>; +def : ProcNoItin<"sc300", [ARMv7m]>; + +def : ProcNoItin<"cortex-m4", [ARMv7em, + FeatureVFP4, + FeatureVFPOnlySP, + FeatureD16]>; + +def : ProcNoItin<"cortex-m7", [ARMv7em, + FeatureFPARMv8, + FeatureD16]>; + + +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; // Cyclone is very similar to swift -def : ProcessorModel<"cyclone", SwiftModel, - [ProcSwift, HasV8Ops, HasV7Ops, - FeatureCrypto, FeatureFPARMv8, - FeatureDB,FeatureDSPThumb2, - FeatureHasRAS, FeatureZCZeroing]>; +def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx, + FeatureCrypto, + FeatureZCZeroing]>; + +def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; //===----------------------------------------------------------------------===// // Register File Description @@ -504,8 +685,15 @@ def ARMAsmWriter : AsmWriter { bit isMCAsmWriter = 1; } +def ARMAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "ARM"; + string BreakCharacters = "."; +} + def ARM : Target { // Pull in Instruction Info: let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; + let AssemblyParserVariants = [ARMAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 738dded..206db96 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -60,7 +60,7 @@ using namespace llvm; 
ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - InConstantPool(false) {} + InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -80,8 +80,8 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->EmitLabel(CurrentFnSym); } -void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType()); +void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { + uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType()); assert(Size && "C++ constructor pointer had zero size!"); const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts()); @@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<ARMSubtarget>(); SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. + unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; if (Subtarget->isTargetCOFF()) { - bool Internal = MF.getFunction()->hasInternalLinkage(); + bool Internal = F->hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? 
COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; @@ -198,22 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, MCSymbol *ARMAsmPrinter:: GetARMJTIPICJumpTableLabel(unsigned uid) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid; return OutContext.getOrCreateSymbol(Name); } - -MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const { - const DataLayout *DL = TM.getDataLayout(); - SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH" - << getFunctionNumber(); - return OutContext.getOrCreateSymbol(Name); -} - bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -515,6 +535,17 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + + if (OptimizationGoals > 0 && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI())) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -532,7 +563,7 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, if (Subtarget->hasV8Ops()) return ARMBuildAttrs::v8; else if (Subtarget->hasV7Ops()) { - if (Subtarget->isMClass() && Subtarget->hasThumb2DSP()) + if (Subtarget->isMClass() && Subtarget->hasDSP()) return ARMBuildAttrs::v7E_M; return ARMBuildAttrs::v7; } else if (Subtarget->hasV6T2Ops()) @@ -587,7 +618,7 @@ void ARMAsmPrinter::emitAttributes() { // We consider krait as a "cortex-a9" + hwdiv CPU // Enable hwdiv through ".arch_extension idiv" if (STI.hasDivide() || STI.hasDivideInARMMode()) - ATS.emitArchExtension(ARM::AEK_HWDIV); + ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM); } else ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); } @@ -807,8 +838,6 @@ void ARMAsmPrinter::emitAttributes() { else if (STI.hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); - - ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -828,8 +857,7 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF; case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF; - case ARMCP::GOT: return MCSymbolRefExpr::VK_GOT; - case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_GOTOFF; + case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL; } llvm_unreachable("Invalid ARMCPModifier!"); } @@ -875,8 +903,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, void ARMAsmPrinter:: EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { - const DataLayout *DL = TM.getDataLayout(); - int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType()); + const DataLayout &DL 
= getDataLayout(); + int Size = DL.getTypeAllocSize(MCPV->getType()); ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); @@ -909,10 +937,9 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { OutContext); if (ACPV->getPCAdjustment()) { - MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - ACPV->getLabelId(), - OutContext); + MCSymbol *PCLabel = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + ACPV->getLabelId(), OutContext); const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext); PCRelExpr = MCBinaryExpr::createAdd(PCRelExpr, @@ -1136,6 +1163,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Offset = 0; break; case ARM::ADDri: + case ARM::t2ADDri: Offset = -MI->getOperand(2).getImm(); break; case ARM::SUBri: @@ -1198,7 +1226,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // If we just ended a constant pool, mark it as such. if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { @@ -1355,9 +1383,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(2).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1388,9 +1416,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(3).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1414,10 +1442,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) @@ -1436,10 +1463,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr) @@ -1468,10 +1494,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // a PC-relative address at the ldr instruction. 
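The repetitive DataLayout edits running through this file are one mechanical change: AsmPrinter::getDataLayout() hands back a reference, so the former pointer dereferences become member accesses. A minimal sketch of the new shape, assuming an AsmPrinter is available as in the surrounding code (the "PC" suffix here is illustrative, not a name from the patch):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/CodeGen/AsmPrinter.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/raw_ostream.h"

  static void appendPrivateLabel(const llvm::AsmPrinter &AP,
                                 llvm::SmallString<60> &Name) {
    const llvm::DataLayout &DL = AP.getDataLayout(); // reference, not pointer
    llvm::raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "PC";
  }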
// Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the load unsigned Opcode; @@ -1519,7 +1544,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(DL, MCPE.Val.ConstVal); return; } case ARM::JUMPTABLE_ADDRS: @@ -1653,12 +1678,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // adds $val, #7 // str $val, [$src, #4] // movs r0, #0 - // b 1f + // b LSJLJEH // movs r0, #1 - // 1: + // LSJLJEH: unsigned SrcReg = MI->getOperand(0).getReg(); unsigned ValReg = MI->getOperand(1).getReg(); - MCSymbol *Label = GetARMSJLJEHLabel(); + MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) .addReg(ValReg) diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h index 3d25121..ed7be2d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// labels used for ARMv4t thumb code to make register indirect calls. SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads; + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + public: explicit ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); @@ -84,7 +89,7 @@ public: void EmitFunctionEntryLabel() override; void EmitStartOfAsmFile(Module &M) override; void EmitEndOfAsmFile(Module &M) override; - void EmitXXStructor(const Constant *CV) override; + void EmitXXStructor(const DataLayout &DL, const Constant *CV) override; // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. 
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); @@ -119,8 +124,6 @@ private: MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const; - MCSymbol *GetARMSJLJEHLabel() const; - MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags); public: diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9f43e73..49f3288 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -97,7 +97,7 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) Subtarget(STI) { for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); } @@ -440,7 +440,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { if (MI->isBundle()) { - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { int PIdx = I->findFirstPredOperandIdx(); @@ -518,7 +518,7 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, static bool isCPSRDefined(const MachineInstr *MI) { for (const auto &MO : MI->operands()) - if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef()) + if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) return true; return false; } @@ -647,7 +647,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { assert(!I->isBundle() && "No nested bundle!"); @@ -853,11 +853,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1043,12 +1041,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1224,6 +1219,60 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, 
FrameIndex); } +/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD +/// depending on whether the result is used. +void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const { + bool isThumb1 = Subtarget.isThumb1Only(); + bool isThumb2 = Subtarget.isThumb2(); + const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo(); + + MachineInstr *MI = MBBI; + DebugLoc dl = MI->getDebugLoc(); + MachineBasicBlock *BB = MI->getParent(); + + MachineInstrBuilder LDM, STM; + if (isThumb1 || !MI->getOperand(1).isDead()) { + LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD + : isThumb1 ? ARM::tLDMIA_UPD + : ARM::LDMIA_UPD)) + .addOperand(MI->getOperand(1)); + } else { + LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA)); + } + + if (isThumb1 || !MI->getOperand(0).isDead()) { + STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD + : isThumb1 ? ARM::tSTMIA_UPD + : ARM::STMIA_UPD)) + .addOperand(MI->getOperand(0)); + } else { + STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA)); + } + + AddDefaultPred(LDM.addOperand(MI->getOperand(3))); + AddDefaultPred(STM.addOperand(MI->getOperand(2))); + + // Sort the scratch registers into ascending order. + const TargetRegisterInfo &TRI = getRegisterInfo(); + llvm::SmallVector<unsigned, 6> ScratchRegs; + for(unsigned I = 5; I < MI->getNumOperands(); ++I) + ScratchRegs.push_back(MI->getOperand(I).getReg()); + std::sort(ScratchRegs.begin(), ScratchRegs.end(), + [&TRI](const unsigned &Reg1, + const unsigned &Reg2) -> bool { + return TRI.getEncodingValue(Reg1) < + TRI.getEncodingValue(Reg2); + }); + + for (const auto &Reg : ScratchRegs) { + LDM.addReg(Reg, RegState::Define); + STM.addReg(Reg, RegState::Kill); + } + + BB->erase(MBBI); +} + + bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MI->getParent()->getParent(); @@ -1237,6 +1286,11 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return true; } + if (MI->getOpcode() == ARM::MEMCPY) { + expandMEMCPY(MI); + return true; + } + // This hook gets to expand COPY instructions before they become // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be @@ -1325,9 +1379,9 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { // instructions, so that's probably OK, but is PIC always correct when // we get here? if (ACPV->isGlobalValue()) - NewCPV = ARMConstantPoolConstant:: - Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, - ARMCP::CPValue, 4); + NewCPV = ARMConstantPoolConstant::Create( + cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue, + 4, ACPV->getModifier(), ACPV->mustAddCurrentAddress()); else if (ACPV->isExtSymbol()) NewCPV = ARMConstantPoolSymbol:: Create(MF.getFunction()->getContext(), @@ -1645,16 +1699,14 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, bool ARMBaseInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { if (!NumCycles) return false; // If we are optimizing for size, see if the branch in the predecessor can be // lowered to cbn?z by the constant island lowering pass, and return false if // so. This results in a shorter instruction sequence.
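A note on the ordering step in the expandMEMCPY hunk above: LDM/STM encode their register list as a bitmask and transfer registers in ascending numeric order, so the scratch registers have to be appended to both lists already sorted by hardware encoding. A standalone sketch of that step, where getEncodingValue is a toy stand-in for TargetRegisterInfo::getEncodingValue:

#include <algorithm>
#include <vector>

static unsigned getEncodingValue(unsigned Reg) { return Reg & 0xf; } // toy

// Sort a candidate LDM/STM register list into ascending encoding order.
void sortRegList(std::vector<unsigned> &Regs) {
  std::sort(Regs.begin(), Regs.end(), [](unsigned R1, unsigned R2) {
    return getEncodingValue(R1) < getEncodingValue(R2);
  });
}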
- const Function *F = MBB.getParent()->getFunction(); - if (F->hasFnAttribute(Attribute::OptimizeForSize) || - F->hasFnAttribute(Attribute::MinSize)) { + if (MBB.getParent()->getFunction()->optForSize()) { MachineBasicBlock *Pred = *MBB.pred_begin(); if (!Pred->empty()) { MachineInstr *LastMI = &*Pred->rbegin(); @@ -1677,12 +1729,14 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } // Attempt to estimate the relative costs of predication versus branching. - unsigned UnpredCost = Probability.getNumerator() * NumCycles; - UnpredCost /= Probability.getDenominator(); - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; - - return (NumCycles + ExtraPredCycles) <= UnpredCost; + // Here we scale up each component of UnpredCost to avoid precision issues when + // scaling NumCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor); + UnpredCost += ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + + return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost; } bool ARMBaseInstrInfo:: @@ -1690,23 +1744,22 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned TCycles, unsigned TExtra, MachineBasicBlock &FMBB, unsigned FCycles, unsigned FExtra, - const BranchProbability &Probability) const { + BranchProbability Probability) const { if (!TCycles || !FCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - unsigned TUnpredCost = Probability.getNumerator() * TCycles; - TUnpredCost /= Probability.getDenominator(); - - uint32_t Comp = Probability.getDenominator() - Probability.getNumerator(); - unsigned FUnpredCost = Comp * FCycles; - FUnpredCost /= Probability.getDenominator(); - + // Here we scale up each component of UnpredCost to avoid precision issues when + // scaling TCycles/FCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = + Probability.getCompl().scale(FCycles * ScalingUpFactor); unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; - return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost; + return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; } bool @@ -1744,9 +1797,10 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) { llvm_unreachable("Unknown unconditional branch opcode!"); } -/// commuteInstruction - Handle commutable instructions. -MachineInstr * -ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case ARM::MOVCCr: case ARM::t2MOVCCr: { @@ -1756,7 +1810,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); if (!MI) return nullptr; // After swapping the MOVCC operands, also invert the condition.
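To see why the ScalingUpFactor in the two isProfitableToIfCvt hunks above matters, here is an illustrative, self-contained program (not the LLVM API). scale() mimics BranchProbability::scale: it multiplies in 64-bit arithmetic and truncates once, instead of truncating every intermediate term to whole cycles the way the old code did. The input values are a hypothetical borderline case chosen so that the decision flips:

#include <cstdint>
#include <iostream>

static unsigned scale(uint64_t Num, uint64_t Denom, uint64_t Value) {
  return static_cast<unsigned>(Value * Num / Denom);
}

int main() {
  // 2-cycle block, 2 extra predication cycles, branch probability 3/4,
  // misprediction penalty of 15 cycles (all illustrative numbers).
  const unsigned NumCycles = 2, ExtraPredCycles = 2, Penalty = 15;
  const uint64_t Num = 3, Denom = 4;

  // Old formulation: each term is truncated to whole cycles before summing.
  unsigned OldCost = scale(Num, Denom, NumCycles) + 1 + Penalty / 10; // 1+1+1
  bool OldProfitable = (NumCycles + ExtraPredCycles) <= OldCost;      // 4 <= 3

  // New formulation: every component works in 1/1024ths of a cycle.
  const unsigned K = 1024;
  unsigned NewCost = scale(Num, Denom, NumCycles * K) + K + Penalty * K / 10;
  bool NewProfitable = (NumCycles + ExtraPredCycles) * K <= NewCost; // 4096 <= 4096

  std::cout << OldProfitable << ' ' << NewProfitable << '\n'; // prints: 0 1
}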
@@ -1765,7 +1819,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { return MI; } } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and @@ -1975,21 +2029,12 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } -static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI, - MachineInstr *MI) { - for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true); - Subreg.isValid(); ++Subreg) - if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) != - MachineBasicBlock::LQR_Dead) - return true; - return false; -} bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineFunction &MF, MachineInstr *MI, unsigned NumBytes) { // This optimisation potentially adds lots of load and store // micro-operations, it's only really a great benefit to code-size. - if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize)) + if (!MF.getFunction()->optForMinSize()) return false; // If only one register is pushed/popped, LLVM can use an LDR/STR @@ -2058,11 +2103,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // registers live within the function we might clobber a return value // register; the other way a register can be live here is if it's // callee-saved. - // TODO: Currently, computeRegisterLiveness() does not report "live" if a - // sub reg is live. When computeRegisterLiveness() works for sub reg, it - // can replace isAnySubRegLive(). if (isCalleeSavedRegister(CurReg, CSRegs) || - isAnySubRegLive(CurReg, TRI, MI)) { + MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) != + MachineBasicBlock::LQR_Dead) { // VFP pops don't allow holes in the register list, so any skip is fatal // for our transformation. GPR pops do, so we should just keep looking. if (IsVFPPushPop) @@ -3381,7 +3424,7 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, assert(Idx != -1 && "Cannot find bundled definition!"); DefIdx = Idx; - return II; + return &*II; } static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, @@ -3389,7 +3432,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, unsigned &UseIdx, unsigned &Dist) { Dist = 0; - MachineBasicBlock::const_instr_iterator II = MI; ++II; + MachineBasicBlock::const_instr_iterator II = ++MI->getIterator(); assert(II->isInsideBundle() && "Empty bundle?"); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); @@ -3410,7 +3453,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, } UseIdx = Idx; - return II; + return &*II; } /// Return the number of cycles to add to (or subtract from) the static @@ -3652,6 +3695,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI->getParent()->getParent(); + // FIXME: Use Function::optForSize(). if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) --Latency; } @@ -3931,11 +3975,11 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, // other passes may query the latency of a bundled instruction. 
if (MI->isBundle()) { unsigned Latency = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { if (I->getOpcode() != ARM::t2IT) - Latency += getInstrLatency(ItinData, I, PredCost); + Latency += getInstrLatency(ItinData, &*I, PredCost); } return Latency; } @@ -4054,8 +4098,8 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 4, 4); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); AddDefaultPred(MIB); } diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b4706e3..d80c494 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -86,6 +86,18 @@ protected: RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const override; + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// a non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands; a null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; @@ -188,9 +200,6 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const override; - MachineInstr *commuteInstruction(MachineInstr*, - bool=false) const override; - const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI) const; @@ -224,15 +233,15 @@ public: bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return NumCycles == 1; } @@ -343,6 +352,8 @@ private: virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI, Reloc::Model RM) const = 0; + void expandMEMCPY(MachineBasicBlock::iterator) const; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards.
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e7d5be77..a520770 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -87,9 +87,22 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } + if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<ARMFunctionInfo>()->isSplitCSR() + ? CSR_iOS_CXX_TLS_PE_SaveList + : CSR_iOS_CXX_TLS_SaveList; return RegList; } +const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<ARMFunctionInfo>()->isSplitCSR()) + return CSR_iOS_CXX_TLS_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -97,6 +110,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; + if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS) + return CSR_iOS_CXX_TLS_RegMask; return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } @@ -106,6 +121,14 @@ ARMBaseRegisterInfo::getNoPreservedMask() const { } const uint32_t * +ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const { + assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() && + "only know about special TLS call on Darwin"); + return CSR_iOS_TLSCall_RegMask; +} + + +const uint32_t * ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); @@ -225,7 +248,8 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const { + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); @@ -338,7 +362,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or // 3. There are VLAs in the function and the base pointer is disabled.
- if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; if (AFI->isThumb1OnlyFunction()) return false; @@ -356,18 +380,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: -needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARMFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - -bool ARMBaseRegisterInfo:: cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index fdc1ef9..6a9a45a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -62,6 +62,12 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { switch (Reg) { case D15: case D14: case D13: case D12: case D11: case D10: case D9: case D8: + case D7: case D6: case D5: case D4: + case D3: case D2: case D1: case D0: + case D31: case D30: case D29: case D28: + case D27: case D26: case D25: case D24: + case D23: case D22: case D21: case D20: + case D19: case D18: case D17: case D16: return true; default: return false; @@ -92,9 +98,12 @@ protected: public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i32 first argument if the calling convention @@ -126,15 +135,15 @@ public: ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const override; + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; void updateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h index d687568..a731d00 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } -static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, 
ARM::R3 }; -static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA @@ -199,9 +199,11 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. - unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U); + auto &DL = State.getMachineFunction().getDataLayout(); + unsigned StackAlign = DL.getStackAlignment(); + unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; switch (LocVT.SimpleTy) { case MVT::i32: { RegList = RRegList; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 27cf06b..847ef87 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -125,6 +125,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[ CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, + CCIfType<[v2f64], CCIfAlign<"16", + CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>> ]>; @@ -223,6 +225,21 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; +def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, + (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// C++ TLS access function saves all registers except SP. Try to match +// the order of CSRs in CSR_iOS. +def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>; + +// CSRs that are handled explicitly via copies. +def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>; + // The "interrupt" attribute is used to generate code that is acceptable in // exception-handlers of various kinds. 
It makes us use a different return // instruction (handled elsewhere) and affects which registers we must return to diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f4ec8c6..55c1684 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -340,12 +340,12 @@ namespace { /// verify - check BBOffsets, BBSizes, alignment of islands void ARMConstantIslands::verify() { #ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned MBBId = MBB->getNumber(); - assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); - } + assert(std::is_sorted(MF->begin(), MF->end(), + [this](const MachineBasicBlock &LHS, + const MachineBasicBlock &RHS) { + return BBInfo[LHS.getNumber()].postOffset() < + BBInfo[RHS.getNumber()].postOffset(); + })); DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { CPUser &U = CPUsers[i]; @@ -542,7 +542,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -589,6 +589,8 @@ void ARMConstantIslands::doInitialJumpTablePlacement( MachineBasicBlock *LastCorrectlyNumberedBB = nullptr; for (MachineBasicBlock &MBB : *MF) { auto MI = MBB.getLastNonDebugInstr(); + if (MI == MBB.end()) + continue; unsigned JTOpcode; switch (MI->getOpcode()) { @@ -639,12 +641,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement( /// into the block immediately after it. bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end()) return false; @@ -722,15 +724,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // has any inline assembly in it. If so, we have to be conservative about // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); // The known bits of the entry block offset are determined by the function // alignment. BBInfo.front().KnownBits = MF->getAlignment(); // Compute block offsets and known bits. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -968,7 +970,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -1088,7 +1090,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; + MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; @@ -1350,7 +1352,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1503,8 +1505,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. DEBUG(dbgs() << "No water found\n"); @@ -1515,7 +1516,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1532,7 +1533,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1553,7 +1554,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1732,7 +1733,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. 
to BB#" @@ -2058,9 +2059,9 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, /// \brief Returns whether CPEMI is the first instruction in the block /// immediately following JTMI (assumed to be a TBB or TBH terminator). If so, /// we can switch the first register to PC and usually remove the address -/// calculation that preceeded it. +/// calculation that preceded it. static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) { - MachineFunction::iterator MBB = JTMI->getParent(); + MachineFunction::iterator MBB = JTMI->getParent()->getIterator(); MachineFunction *MF = MBB->getParent(); ++MBB; @@ -2235,7 +2236,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; SmallVector<MachineOperand, 4> CondPrior; - MachineFunction::iterator BBi = BB; + MachineFunction::iterator BBi = BB->getIterator(); MachineFunction::iterator OldPrior = std::prev(BBi); // If the block terminator isn't analyzable, don't try to move the block @@ -2258,7 +2259,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Create a new MBB for the code after the jump BB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(JTBB->getBasicBlock()); - MachineFunction::iterator MBBI = JTBB; ++MBBI; + MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); // Add an unconditional branch from NewBB to BB. @@ -2273,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp index 7d41c69..c9849b2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -52,8 +52,7 @@ const char *ARMConstantPoolValue::getModifierText() const { // strings if that's legal. case ARMCP::no_modifier: return "none"; case ARMCP::TLSGD: return "tlsgd"; - case ARMCP::GOT: return "GOT"; - case ARMCP::GOTOFF: return "GOTOFF"; + case ARMCP::GOT_PREL: return "GOT_PREL"; case ARMCP::GOTTPOFF: return "gottpoff"; case ARMCP::TPOFF: return "tpoff"; } diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 36f63e2..6b18a4e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -39,8 +39,7 @@ namespace ARMCP { enum ARMCPModifier { no_modifier, TLSGD, - GOT, - GOTOFF, + GOT_PREL, GOTTPOFF, TPOFF }; @@ -103,8 +102,6 @@ public: bool isLSDA() const { return Kind == ARMCP::CPLSDA; } bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } - unsigned getRelocationInfo() const override { return 2; } - int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 4438f50..56f3498 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -330,22 +330,19 @@ static const NEONLdStTableEntry NEONLdStTable[] = { /// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON /// load or store pseudo instruction. 
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { - const unsigned NumEntries = array_lengthof(NEONLdStTable); - #ifndef NDEBUG // Make sure the table is sorted. static bool TableChecked = false; if (!TableChecked) { - for (unsigned i = 0; i != NumEntries-1; ++i) - assert(NEONLdStTable[i] < NEONLdStTable[i+1] && - "NEONLdStTable is not sorted!"); + assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && + "NEONLdStTable is not sorted!"); TableChecked = true; } #endif - const NEONLdStTableEntry *I = - std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); - if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + auto I = std::lower_bound(std::begin(NEONLdStTable), + std::end(NEONLdStTable), Opcode); + if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode) return I; return nullptr; } @@ -734,7 +731,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16.addImm(Pred).addReg(PredReg); if (RequiresBundling) - finalizeBundle(MBB, &*LO16, &*MBBI); + finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); @@ -747,6 +744,55 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, switch (Opcode) { default: return false; + + case ARM::TCRETURNdi: + case ARM::TCRETURNri: { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( + MBB.getParent()->getSubtarget().getInstrInfo()); + + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = + STI->isThumb() + ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) + : ARM::TAILJMPd; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + + // Add the default predicate in Thumb mode. + if (STI->isThumb()) + MIB.addImm(ARMCC::AL).addReg(0); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, + TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + return true; + } case ARM::VMOVScc: case ARM::VMOVDcc: { unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index fdd0763..ff2fcfa 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -578,7 +578,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. 
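The LookupNEONLdSt hunk above is the common sorted-static-table idiom: assert sortedness once with std::is_sorted (debug builds only), then binary-search with std::lower_bound. A minimal sketch of the pattern, with illustrative names (Entry, Table, lookup are not LLVM's):

#include <algorithm>
#include <cassert>
#include <iterator>

struct Entry {
  unsigned PseudoOpc; // sort key
  unsigned RealOpc;
  bool operator<(const Entry &E) const { return PseudoOpc < E.PseudoOpc; }
  bool operator<(unsigned Opc) const { return PseudoOpc < Opc; }
};

static const Entry Table[] = {{10, 100}, {20, 200}, {30, 300}};

static const Entry *lookup(unsigned Opcode) {
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         "table is not sorted!");
  auto I = std::lower_bound(std::begin(Table), std::end(Table), Opcode);
  return (I != std::end(Table) && I->PseudoOpc == Opcode) ? I : nullptr;
}

int main() { return lookup(20) && !lookup(25) ? 0 : 1; }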
- if (VT != MVT::i32) return 0; + if (VT != MVT::i32 || GV->isThreadLocal()) return 0; Reloc::Model RelocM = TM.getRelocationModel(); bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); @@ -922,12 +922,9 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; int Offset = Addr.Offset; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), - Flags, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI); @@ -1278,8 +1275,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1303,8 +1299,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1341,8 +1336,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -1355,8 +1349,8 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { TII.get(Opc)).addReg(AddrReg)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1860,8 +1854,9 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } else { + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } case CallingConv::ARM_AAPCS_VFP: if (!isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); @@ -2088,6 +2083,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -2944,48 +2942,51 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolConstant *CPV = - ARMConstantPoolConstant::Create(GV, UseGOTOFF ? 
ARMCP::GOTOFF : ARMCP::GOT); - unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + LLVMContext *Context = &MF->getFunction()->getContext(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); + + unsigned ConstAlign = + MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); + unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + + unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) + .addConstantPoolIndex(Idx); + if (Opc == ARM::LDRcp) + MIB.addImm(0); + AddDefaultPred(MIB); - unsigned Opc; - unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT)); - // Load value. - if (isThumb2) { - DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM::t2LDRpci), DestReg1) - .addConstantPoolIndex(Idx)); - Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs; - } else { - // The extra immediate is for addrmode2. - DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(ARM::LDRcp), DestReg1) - .addConstantPoolIndex(Idx).addImm(0)); - Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs; - } + // Fix the address by adding pc. + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? 
ARM::PICLDR + : ARM::PICADD; + DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(TempReg) + .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) + AddDefaultPred(MIB); - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - if (GlobalBaseReg == 0) { - GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT)); - AFI->setGlobalBaseReg(GlobalBaseReg); + if (UseGOT_PREL && Subtarget->isThumb()) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::t2LDRi12), NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); } - - unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT)); - DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0); - DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1); - GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), DestReg2) - .addReg(DestReg1) - .addReg(GlobalBaseReg); - if (!UseGOTOFF) - MIB.addImm(0); - AddOptionalDefs(MIB); - - return DestReg2; + return DestReg; } bool ARMFastISel::fastLowerArguments() { @@ -3038,7 +3039,7 @@ bool ARMFastISel::fastLowerArguments() { } - static const uint16_t GPRArgRegs[] = { + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -3055,7 +3056,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(I, ResultReg); + updateValueMap(&*I, ResultReg); } return true; diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 6744000..c5990bb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCContext.h" @@ -58,7 +59,7 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); // iOS requires FP not to be clobbered for backtracing purpose. - if (STI.isTargetIOS()) + if (STI.isTargetIOS() || STI.isTargetWatchOS()) return true; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -288,7 +289,6 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -305,7 +305,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. 
+ DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); // Determine the size of each callee-save spill area and record which frame @@ -489,7 +493,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Adjust SP after all the callee-save spills. - if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) + if (AFI->getNumAlignedDPRCS2Regs() == 0 && + tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes); else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, @@ -689,60 +694,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, AFI->setShouldRestoreSPFromFP(true); } -// Resolve TCReturn pseudo-instruction -void ARMFrameLowering::fixTCReturn(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl = MBBI->getDebugLoc(); - const ARMBaseInstrInfo &TII = - *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - - if (!(RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri)) - return; - - // Tail call return: adjust the stack pointer and jump to callee. - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - - // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi) { - unsigned TCOpcode = STI.isThumb() ? - (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : - ARM::TAILJMPd; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); - if (JumpTarget.isGlobal()) - MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - else { - assert(JumpTarget.isSymbol()); - MIB.addExternalSymbol(JumpTarget.getSymbolName(), - JumpTarget.getTargetFlags()); - } - - // Add the default predicate in Thumb mode. - if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0); - } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). - addReg(JumpTarget.getReg(), RegState::Kill); - } - - MachineInstr *NewMI = std::prev(MBBI); - for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); - - // Delete the pseudo instruction TCRETURN. - MBB.erase(MBBI); - MBBI = NewMI; -} - void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); @@ -758,10 +711,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) { - fixTCReturn(MF, MBB); + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) return; - } + + // First put ourselves on the first (from the top) terminator instruction. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) @@ -840,8 +795,6 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } - fixTCReturn(MF, MBB); - if (ArgRegsSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); } @@ -932,12 +885,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return Offset; } -int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -950,7 +897,6 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; - if (MI != MBB.end()) DL = MI->getDebugLoc(); SmallVector<std::pair<unsigned,bool>, 4> Regs; unsigned i = CSI.size(); @@ -1008,7 +954,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, // Put any subsequent vpush instructions before this one: they will refer to // higher register numbers so need to be pushed first in order to preserve // monotonicity. - --MI; + if (MI != MBB.begin()) + --MI; } } @@ -1022,12 +969,20 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); - unsigned RetOpcode = MI->getOpcode(); - bool isTailCall = (RetOpcode == ARM::TCRETURNdi || - RetOpcode == ARM::TCRETURNri); - bool isInterrupt = - RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + DebugLoc DL; + bool isTailCall = false; + bool isInterrupt = false; + bool isTrap = false; + if (MBB.end() != MI) { + DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri); + isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + isTrap = + RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || + RetOpcode == ARM::tTRAP; + } SmallVector<unsigned, 4> Regs; unsigned i = CSI.size(); @@ -1043,11 +998,14 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && - STI.hasV5TOps()) { - Reg = ARM::PC; - LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + !isTrap && STI.hasV5TOps()) { + if (MBB.succ_empty()) { + Reg = ARM::PC; + DeleteRet = true; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + } else + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; // Fold the return instruction into the LDM. - DeleteRet = true; } // If NoGap is true, pop consecutive registers and then leave the rest @@ -1068,7 +1026,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); - if (DeleteRet) { + if (DeleteRet && MI != MBB.end()) { MIB.copyImplicitOps(&*MI); MI->eraseFromParent(); } @@ -1095,7 +1053,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, // Put any subsequent vpop instructions after this one: they will refer to // higher register numbers so need to be popped afterwards. 
- ++MI; + if (MI != MBB.end()) + ++MI; } } @@ -1109,7 +1068,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo &MFI = *MF.getFrameInfo(); @@ -1118,7 +1077,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // slot offsets can be wrong. The offset for d8 will always be correct. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned DNum = CSI[i].getReg() - ARM::D8; - if (DNum >= 8) + if (DNum > NumAlignedDPRCS2Regs - 1) continue; int FI = CSI[i].getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered @@ -1269,7 +1228,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Find the frame index assigned to d8. @@ -1654,13 +1613,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // FIXME: We could add logic to be more precise about negative offsets // and which instructions will need a scratch register for them. Is it // worth the effort and added fragility? - bool BigStack = - (RS && - (MFI->estimateStackSize(MF) + - ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= - estimateRSStackSizeLimit(MF, this))) - || MFI->hasVarSizedObjects() - || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); + bool BigStack = (RS && (MFI->estimateStackSize(MF) + + ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >= + estimateRSStackSizeLimit(MF, this))) || + MFI->hasVarSizedObjects() || + (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { @@ -1698,8 +1655,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (CS1Spilled && !UnspilledCS1GPRs.empty()) { for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { unsigned Reg = UnspilledCS1GPRs[i]; - // Don't spill high register if the function is thumb + // Don't spill a high register if the function is Thumb. In the case of + // Windows on ARM, accept R11 (the frame pointer). if (!AFI->isThumbFunction() || + (STI.isTargetWindows() && Reg == ARM::R11) || isARMLowRegister(Reg) || Reg == ARM::LR) { SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) @@ -1784,8 +1743,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary.
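For reference, the open-coded round-up that the next hunk replaces with alignSPAdjust() computes the smallest multiple of Align that is not less than Amount; a minimal standalone sketch (roundUpToAlignment is an illustrative name):

#include <cassert>

unsigned roundUpToAlignment(unsigned Amount, unsigned Align) {
  return (Amount + Align - 1) / Align * Align;
}

int main() {
  assert(roundUpToAlignment(13, 8) == 16); // rounds up
  assert(roundUpToAlignment(16, 8) == 16); // already aligned, unchanged
}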
- unsigned Align = getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; + Amount = alignSPAdjust(Amount); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && @@ -1885,7 +1843,6 @@ void ARMFrameLowering::adjustForSegmentedStacks( if (!ST->isTargetAndroid() && !ST->isTargetLinux()) report_fatal_error("Segmented stacks not supported on this platform."); - assert(&PrologueMBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); MCContext &Context = MMI.getContext(); @@ -1913,21 +1870,48 @@ void ARMFrameLowering::adjustForSegmentedStacks( MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; ++i) { - AllocMBB->addLiveIn(*i); - GetMBB->addLiveIn(*i); - McrMBB->addLiveIn(*i); - PrevStackMBB->addLiveIn(*i); - PostStackMBB->addLiveIn(*i); + // Grab everything that reaches PrologueMBB to update their liveness as well. + SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion; + SmallVector<MachineBasicBlock *, 2> WalkList; + WalkList.push_back(&PrologueMBB); + + do { + MachineBasicBlock *CurMBB = WalkList.pop_back_val(); + for (MachineBasicBlock *PredBB : CurMBB->predecessors()) { + if (BeforePrologueRegion.insert(PredBB).second) + WalkList.push_back(PredBB); + } + } while (!WalkList.empty()); + + // The order in that list is important. + // The blocks will all be inserted before PrologueMBB using that order. + // Therefore the block that should appear first in the CFG should appear + // first in the list. + MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, + PostStackMBB}; + + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); + + for (const auto &LI : PrologueMBB.liveins()) { + for (MachineBasicBlock *PredBB : BeforePrologueRegion) + PredBB->addLiveIn(LI); + } + + // Remove the newly added blocks from the list, since we know + // we do not have to do the following updates for them. + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } - MF.push_front(PostStackMBB); - MF.push_front(AllocMBB); - MF.push_front(GetMBB); - MF.push_front(McrMBB); - MF.push_front(PrevStackMBB); + for (MachineBasicBlock *MBB : BeforePrologueRegion) { + // Make sure the LiveIns are still sorted and unique. + MBB->sortUniqueLiveIns(); + // Replace the edges to PrologueMBB with edges to the sequences + // we are about to add. + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + } // The required stack size that is aligned to the ARM constant criterion.
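The adjustForSegmentedStacks hunk above drops the shrink-wrapping assert and so can no longer assume PrologueMBB is the function entry; the do/while walk collects every block that can reach PrologueMBB so that live-ins and CFG edges can be fixed up for all of them. The same worklist idiom as a standalone sketch (Node stands in for MachineBasicBlock):

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Preds; // CFG predecessors
};

// Collect every node from which Start is reachable, visiting each at most once.
std::set<Node *> nodesReaching(Node *Start) {
  std::set<Node *> Region;
  std::vector<Node *> WorkList{Start};
  do {
    Node *Cur = WorkList.back();
    WorkList.pop_back();
    for (Node *Pred : Cur->Preds)
      if (Region.insert(Pred).second) // true only on first insertion
        WorkList.push_back(Pred);
  } while (!WorkList.empty());
  return Region; // like BeforePrologueRegion, excludes Start itself
}

int main() {
  Node A, B;
  B.Preds = {&A};
  return nodesReaching(&B).count(&A) == 1 ? 0 : 1;
}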
AlignedStackSize = alignToARMConstant(StackSize); @@ -1991,7 +1975,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create( MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0); MachineConstantPool *MCP = MF.getConstantPool(); - unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment()); + unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4); // ldr SR0, [pc, offset(STACK_LIMIT)] AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0) diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h index 6fdc5ef..66f4dfb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -31,8 +31,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void fixTCReturn(MachineFunction &MF, MachineBasicBlock &MBB) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -52,7 +50,6 @@ public: unsigned &FrameReg) const override; int ResolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, int SPAdj) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -60,6 +57,11 @@ public: void adjustForSegmentedStacks(MachineFunction &MF, MachineBasicBlock &MBB) const override; + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index b110628..dfbb969 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -160,11 +160,6 @@ public: // Thumb Addressing Modes: bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset, - unsigned Scale); - bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset); bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, @@ -176,8 +171,6 @@ public: bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: - bool SelectT2ShifterOperandReg(SDValue N, - SDValue &BaseReg, SDValue &Opc); bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); @@ -278,6 +271,22 @@ private: // Get the alignment operand for a NEON VLD or VST instruction. SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, bool is64BitVector); + + /// Returns the number of instructions required to materialize the given + /// constant in a register, or 3 if a literal pool load is needed. 
+ unsigned ConstantMaterializationCost(unsigned Val) const; + + /// Checks if N is a multiplication by a constant where we can extract out a + /// power of two from the constant so that it can be used in a shift, but only + /// if it simplifies the materialization of the constant. Returns true if it + /// is, and assigns to PowerOfTwo the power of two that should be extracted + /// out and to NewMulConst the new constant to be multiplied by. + bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, + unsigned &PowerOfTwo, SDValue &NewMulConst) const; + + /// Replace N with M in CurDAG, in a way that also ensures that M gets + /// selected when N would have been selected. + void replaceDAGValue(const SDValue &N, SDValue M); }; } @@ -334,7 +343,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { bool isThumb2 = Subtarget->isThumb(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (N->getOpcode() != ISD::ADD) continue; @@ -388,7 +397,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { SDValue CPTmp1; SDValue CPTmp2; if (isThumb2) { - if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1)) + if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1)) continue; } else { if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) || @@ -471,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } +unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { + if (Subtarget->isThumb()) { + if (Val <= 255) return 1; // MOV + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (~Val <= 255) return 2; // MOV + MVN + if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL + } else { + if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV + if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs + } + if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT + return 3; // Literal pool load +} + +bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, + unsigned MaxShift, + unsigned &PowerOfTwo, + SDValue &NewMulConst) const { + assert(N.getOpcode() == ISD::MUL); + assert(MaxShift > 0); + + // If the multiply is used in more than one place then changing the constant + // will make other uses incorrect, so don't. + if (!N.hasOneUse()) return false; + // Check if the multiply is by a constant + ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!MulConst) return false; + // If the constant is used in more than one place then modifying it will mean + // we need to materialize two constants instead of one, which is a bad idea. 
+ if (!MulConst->hasOneUse()) return false; + unsigned MulConstVal = MulConst->getZExtValue(); + if (MulConstVal == 0) return false; + + // Find the largest power of 2 that MulConstVal is a multiple of + PowerOfTwo = MaxShift; + while ((MulConstVal % (1 << PowerOfTwo)) != 0) { + --PowerOfTwo; + if (PowerOfTwo == 0) return false; + } + + // Only optimise if the new cost is better + unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); + NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); + unsigned OldCost = ConstantMaterializationCost(MulConstVal); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + return NewCost < OldCost; +} + +void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); + CurDAG->ReplaceAllUsesWith(N, M); +} + bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, SDValue &BaseReg, SDValue &Opc, @@ -478,6 +542,24 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, if (DisableShifterOp) return false; + // If N is a multiply-by-constant and it's profitable to extract a shift and + // use it in a shifted operand do so. + if (N.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { + BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, + N.getOperand(0), NewMulConst) + .getNode()), + 0); + replaceDAGValue(N.getOperand(1), NewMulConst); + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, + PowerOfTwo), + SDLoc(N), MVT::i32); + return true; + } + } + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); // Don't match base register only case. That is matched to a separate @@ -540,7 +622,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else Base = N; @@ -662,6 +745,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, } } + // If Offset is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. 
+ if (Offset.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { + replaceDAGValue(Offset.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + ShOpcVal = ARM_AM::lsl; + } + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), SDLoc(N), MVT::i32); return true; @@ -707,7 +802,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -973,7 +1069,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -1086,77 +1183,14 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, } bool -ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base, - SDValue &Offset, unsigned Scale) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. - } - - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - - // Thumb does not have [sp, r] address mode. - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) - return false; - - // FIXME: Why do we explicitly check for a match here and then return false? - // Presumably to allow something else to match, but shouldn't this be - // documented? - int RHSC; - if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) - return false; - - Base = N.getOperand(0); - Offset = N.getOperand(1); - return true; -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 1); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 2); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 4); -} - -bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. 
- } - if (!CurDAG->isBaseWithConstantOffset(N)) { - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + if (N.getOpcode() == ISD::ADD) { + return false; // We want to select register offset instead + } else if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { Base = N; @@ -1166,23 +1200,6 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) { - ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0)); - ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); - unsigned LHSC = LHS ? LHS->getZExtValue() : 0; - unsigned RHSC = RHS ? RHS->getZExtValue() : 0; - - // Thumb does not have [sp, #imm5] address mode for non-zero imm5. - if (LHSC != 0 || RHSC != 0) return false; - - Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; - } - // If the RHS is + imm5 * scale, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { @@ -1191,9 +1208,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; + // Offset is too large, so use register offset instead. + return false; } bool @@ -1263,28 +1279,6 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, //===----------------------------------------------------------------------===// -bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, - SDValue &Opc) { - if (DisableShifterOp) - return false; - - ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); - - // Don't match base register only case. That is matched to a separate - // lower complexity pattern with explicit register operand. - if (ShOpcVal == ARM_AM::no_shift) return false; - - BaseReg = N.getOperand(0); - unsigned ShImmVal = 0; - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - ShImmVal = RHS->getZExtValue() & 31; - Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N)); - return true; - } - - return false; -} - bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. @@ -1302,7 +1296,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. @@ -1425,6 +1420,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, } } + // If OffReg is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. 
+ if (OffReg.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { + replaceDAGValue(OffReg.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + } + } + ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32); return true; @@ -2503,25 +2509,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); - bool UseCP = true; - if (Subtarget->useMovt(*MF)) - // Thumb2-aware targets have the MOVT instruction, so all immediates can - // be done with MOV + MOVT, at worst. - UseCP = false; - else { - if (Subtarget->isThumb()) { - UseCP = (Val > 255 && // MOV - ~Val > 255 && // MOV + MVN - !ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } else - UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV - ARM_AM::getSOImmVal(~Val) == -1 && // MVN - !ARM_AM::isSOImmTwoPartVal(Val) && // two instrs. - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } - - if (UseCP) { + // If we can't materialize the constant we need to use a literal pool + if (ConstantMaterializationCost(Val) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -3376,7 +3365,7 @@ static void getIntOperandsFromRegisterString(StringRef RegString, SelectionDAG *CurDAG, SDLoc DL, std::vector<SDValue>& Ops) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() > 1) { bool AllIntFields = true; @@ -3461,9 +3450,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) { // The flags here are common to those allowed for apsr in the A class cores and // those allowed for the special registers in the M class cores. Returns a // value representing which flags were present, -1 if invalid. -static inline int getMClassFlagsMask(StringRef Flags) { +static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) { if (Flags.empty()) - return 0x3; + return 0x2 | (int)hasDSP; return StringSwitch<int>(Flags) .Case("g", 0x1) @@ -3492,7 +3481,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, } // We know we are now handling a write so need to get the mask for the flags. - int Mask = getMClassFlagsMask(Flags); + int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP()); // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values // shouldn't have flags present. @@ -3501,7 +3490,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, // The _g and _nzcvqg versions are only valid if the DSP extension is // available. - if (!Subtarget->hasThumb2DSP() && (Mask & 0x2)) + if (!Subtarget->hasDSP() && (Mask & 0x1)) return -1; // The register was valid so need to put the mask in the correct place @@ -3523,7 +3512,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { // The flags permitted for apsr are the same flags that are allowed in // M class registers. We get the flag value and then shift the flags into // the correct place to combine with the mask. 
- Mask = getMClassFlagsMask(Flags); + Mask = getMClassFlagsMask(Flags, true); if (Mask == -1) return -1; return Mask << 2; @@ -3742,7 +3731,7 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ } SmallVector<StringRef, 5> Fields; - StringRef(SpecialReg).split(Fields, "_", 1, false); + StringRef(SpecialReg).split(Fields, '_', 1, false); std::string Reg = Fields[0].str(); StringRef Flags = Fields.size() == 2 ? Fields[1] : ""; @@ -3943,6 +3932,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, // be an immediate and not a memory constraint. // Fallthrough. case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: case InlineAsm::Constraint_Um: case InlineAsm::Constraint_Un: diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8cc06df..37c0795 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -142,6 +142,11 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + + if (!VT.isFloatingPoint() && + VT != MVT::v2i64 && VT != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -166,77 +171,78 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { - // Single-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); - setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); - setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); - setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); - - // Double-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); - setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); - setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); - setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); - - // Single-precision comparisons. - setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); - setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); - setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); - setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); - setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); - setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); - setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); - setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); - - // Double-precision comparisons. 
- setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); - setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); - setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); - setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); - setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); - setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); - setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); - setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); - - // Floating-point to integer conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); - - // Conversions between floating types. - setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, + { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, + { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, + { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, + { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, + { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, + { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, + { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, + { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, + { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, + { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, + { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, + { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + + // Conversions between floating types. + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. 
+ // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } - // Integer to floating-point conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - // FIXME: There appears to be some naming inconsistency in ARM libgcc: - // e.g., __floatunsidf vs. __floatunssidfvfp. - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + // Set the correct calling convention for ARMv7k WatchOS. It's just + // AAPCS_VFP for functions as simple as libcalls. + if (Subtarget->isTargetWatchOS()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) + setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } } @@ -245,8 +251,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() && - !Subtarget->isTargetWindows()) { + // RTLIB + if (Subtarget->isAAPCS_ABI() && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; @@ -334,12 +342,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - - // Memory operations - // RTABI chapter 4.3.4 - { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { @@ -348,6 +350,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } + + // EABI dependent RTLIB + if (TM.Options.EABIVersion == EABI::EABI4 || + TM.Options.EABIVersion == EABI::EABI5) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } MemOpsLibraryCalls[] = { + // Memory operations + // RTABI chapter 4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : MemOpsLibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) 
+ setCmpLibcallCC(LC.Op, LC.Cond); + } + } } if (Subtarget->isTargetWindows()) { @@ -364,6 +390,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { @@ -373,8 +403,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. - if (Subtarget->getTargetTriple().isiOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { + if (Subtarget->isTargetWatchOS() || + (Subtarget->isTargetIOS() && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -392,6 +423,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } + // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have + // a __gnu_ prefix (which is the default). + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); + setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -579,7 +618,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); @@ -605,7 +643,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADDC); if (Subtarget->isFPOnlySP()) { - // When targetting a floating-point unit with only single-precision + // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. @@ -689,7 +727,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() - || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP())) + || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -706,8 +744,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // ARM does not have ROTL. 
- setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) @@ -717,7 +762,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // @llvm.readcyclecounter requires the Performance Monitors extension. + // Default to the 0 expansion on unsupported platforms. + // FIXME: Technically there are older ARM CPUs that have + // implementation-specific ways of obtaining this information. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) @@ -726,15 +776,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { // These are expanded into libcalls if the cpu doesn't have HW divider. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, LibCall); + setOperationAction(ISD::UDIV, MVT::i32, LibCall); } - // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) - if (Subtarget->isTargetAEABI()) { + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); @@ -762,7 +814,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); - setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); @@ -776,13 +827,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget->isTargetMachO()) { - // Non-MachO platforms may return values in these registers via the - // personality function. - setExceptionPointerRegister(ARM::R0); - setExceptionSelectorRegister(ARM::R1); - } - if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else @@ -849,11 +893,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->isTargetDarwin()) { - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); - } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -912,7 +956,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->getTargetTriple().isiOS()) { + if (Subtarget->isTargetWatchOS()) { + setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); + } + if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); @@ -928,6 +976,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -935,8 +990,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } + + if (Subtarget->hasNEON()) { + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + } + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); @@ -959,11 +1028,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 
4 : 2; + MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. @@ -1054,8 +1123,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1069,7 +1136,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; - case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -1082,6 +1150,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; @@ -1133,14 +1202,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; - case ARMISD::FMAX: return "ARMISD::FMAX"; - case ARMISD::FMIN: return "ARMISD::FMIN"; - case ARMISD::VMAXNM: return "ARMISD::VMAX"; - case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -1319,6 +1385,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, else return CallingConv::ARM_AAPCS; case CallingConv::Fast: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; @@ -1449,9 +1516,10 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, @@ -1734,9 +1802,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 
const char *Sym = S->getSymbol(); @@ -1748,9 +1817,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); @@ -1768,7 +1838,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, true, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); @@ -1781,7 +1852,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; @@ -1804,9 +1876,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { @@ -1821,7 +1894,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1831,8 +1903,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRAS() && - // Emit regular call when code size is the priority - !HasMinSizeAttr) + // Emit regular call when code size is the priority + !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -2014,6 +2086,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + assert(Subtarget->supportsTailCall()); + // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. 
@@ -2033,26 +2107,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: - // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. - // - // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take - // LR. This means if we need to reload LR, it takes an extra instructions, - // which outweighs the value of the tail call; but here we don't know yet - // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in - // emitEpilogue if LR is used. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - if (Subtarget->isThumb1Only()) - return false; - // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls @@ -2294,6 +2348,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (ARM::GPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else if (ARM::DPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } // Update chain and glue. RetOps[0] = Chain; @@ -2400,7 +2467,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - return !Subtarget->isThumb1Only(); + return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, @@ -2467,15 +2534,82 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = + DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); if (RelocM == Reloc::Static) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address for Darwin, and return an +/// SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. 
+/// +/// The general system is that each __thread variable has a [3 x i32] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first word, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "r0". +/// +/// Since this descriptor may be in a different unit, in general access must +/// proceed along the usual ARM rules. A common sequence to produce is: +/// +/// movw rT1, :lower16:_var$non_lazy_ptr +/// movt rT1, :upper16:_var$non_lazy_ptr +/// ldr r0, [rT1] +/// ldr rT2, [r0] +/// blx rT2 +/// [...address now in r0...] +SDValue +ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + SDLoc DL(Op); + + // First step is to get the address of the actua global symbol. This is where + // the TLS descriptor lives. + SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i32, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, true, true, 4); + Chain = FuncTLVGet.getValue(1); + + MachineFunction &F = DAG.getMachineFunction(); + MachineFrameInfo *MFI = F.getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be + // silly). + auto TRI = + getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo(); + auto ARI = static_cast<const ARMRegisterInfo *>(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: r0 takes the address of the descriptor, and + // returns the address of the variable in this thread. 
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); + Chain = + DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); +} + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, @@ -2491,9 +2625,10 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Argument = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2543,17 +2678,19 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2561,9 +2698,10 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } // The address of the thread local variable is the add of the thread @@ -2573,10 +2711,14 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerGlobalTLSAddressDarwin(Op, DAG); + // TODO: implement the "local dynamic" model - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); + assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2597,22 +2739,31 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); if 
(getTargetMachine().getRelocationModel() == Reloc::PIC_) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, - UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue Chain = Result.getValue(1); - SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); - Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); - if (!UseGOTOFF) + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - MachinePointerInfo::getGOT(), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } @@ -2628,9 +2779,10 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + return DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } @@ -2654,7 +2806,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -2680,32 +2833,11 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } -SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && - "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc dl(Op); - unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; - ARMConstantPoolValue *CPV = - ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", - ARMPCLabelIndex, PCAdj); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); -} - SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -2722,6 +2854,13 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -2732,7 +2871,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_rbit: { assert(Op.getOperand(1).getValueType() == MVT::i32 && "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); + return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2752,10 +2891,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2770,6 +2909,36 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vminnm: + case Intrinsic::arm_neon_vmaxnm: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) + ? ISD::FMINNUM : ISD::FMAXNUM; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminu: + case Intrinsic::arm_neon_vmaxu: { + if (Op.getValueType().isFloatingPoint()) + return SDValue(); + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) + ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vmins: + case Intrinsic::arm_neon_vmaxs: { + // v{min,max}s is overloaded between signed integers and floats. + if (!Op.getValueType().isFloatingPoint()) { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? 
ISD::FMINNAN : ISD::FMAXNAN; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2870,9 +3039,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); @@ -3056,9 +3226,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3139,9 +3310,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); - int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, VA.getLocMemOffset(), - Flags.getByValSize()); + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, + VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { @@ -3151,9 +3322,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad( + VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0)); } lastInsIndex = index; } @@ -3188,13 +3360,9 @@ static bool isFloatingPointZero(SDValue Op) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). SDValue BitcastOp = Op->getOperand(0); - if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { - SDValue MoveOp = BitcastOp->getOperand(0); - if (MoveOp->getOpcode() == ISD::TargetConstant && - cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) { - return true; - } - } + if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(BitcastOp->getOperand(0))) + return true; } return false; } @@ -3559,113 +3727,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { - // We can use VMAXNM/VMINNM for a compare followed by a select with the - // same operands, as follows: - // c = fcmp [?gt, ?ge, ?lt, ?le] a, b - // select c, a, b - // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'. 
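
The block being deleted here reasoned carefully about IEEE-754 maxNum/minNum semantics. A minimal scalar sketch (standalone, function names hypothetical, not part of the patch) of the distinction it had to respect: std::fmax implements maxNum and returns the number when exactly one operand is NaN, while the compare-and-select form falls through to the "false" operand whenever a NaN makes the comparison fail.

#include <cmath>

float selectOGT(float a, float b) { return a > b ? a : b; } // the select form
float maxnm(float a, float b) { return std::fmax(a, b); }   // VMAXNM semantics

// selectOGT(NaN, 1.0f) == 1.0f and maxnm(NaN, 1.0f) == 1.0f, but
// selectOGT(1.0f, NaN) == NaN while maxnm(1.0f, NaN) == 1.0f, so the rewrite
// is only sound when the operand the select might keep is known never to be
// NaN -- exactly the isKnownNeverNaN checks in the code that follows.
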
- bool swapSides = false; - if (!getTargetMachine().Options.NoNaNsFPMath) { - // transformability may depend on which way around we compare - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETOLT: - case ISD::SETOLE: - // the non-NaN should be RHS - swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS); - break; - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETULT: - case ISD::SETULE: - // the non-NaN should be LHS - swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS); - break; - } - } - swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal); - if (swapSides) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(LHS, RHS); - } - if (LHS == TrueVal && RHS == FalseVal) { - bool canTransform = true; - // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here - if (!getTargetMachine().Options.UnsafeFPMath && - !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { - const ConstantFPSDNode *Zero; - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETUGT: - case ISD::SETGT: - // RHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - !Zero->isNegative(); - break; - case ISD::SETOGE: - case ISD::SETUGE: - case ISD::SETGE: - // LHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - !Zero->isNegative(); - break; - case ISD::SETOLT: - case ISD::SETULT: - case ISD::SETLT: - // RHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - Zero->isNegative(); - break; - case ISD::SETOLE: - case ISD::SETULE: - case ISD::SETLE: - // LHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - Zero->isNegative(); - break; - } - } - if (canTransform) { - // Note: If one of the elements in a pair is a number and the other - // element is NaN, the corresponding result element is the number. - // This is consistent with the IEEE 754-2008 standard. - // Therefore, a > b ? 
a : b <=> vmax(a,b), if b is constant and a is NaN - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETUGT: - case ISD::SETUGE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETGT: - case ISD::SETGE: - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETOLT: - case ISD::SETOLE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - case ISD::SETULT: - case ISD::SETULE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETLT: - case ISD::SETLE: - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - } - } - } - bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -3890,16 +3951,18 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { - Addr = DAG.getLoad(PTy, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } @@ -3936,7 +3999,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -3988,7 +4051,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4153,6 +4216,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. +/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \p return The node that would replace \p BT, if the combine +/// is possible. +static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. 
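
Concretely, the remapping this combine performs further down is plain lane arithmetic; a minimal standalone sketch (helper name hypothetical, not from the patch):

#include <cstdint>
#include <optional>

// Lane Index of a vNi64 source covers lanes [Index*DstNumElt,
// (Index+1)*DstNumElt) of the bitcast vector, so the replacement
// EXTRACT_SUBVECTOR starts at Index * DstNumElt -- the NewIndex computation
// in the function body.
std::optional<uint32_t> newSubvectorIndex(uint64_t Index, unsigned DstNumElt) {
  uint64_t NewIndex = Index * DstNumElt;
  if (NewIndex > UINT32_MAX) // mirrors the "fits into i32" bail-out
    return std::nullopt;
  return static_cast<uint32_t>(NewIndex);
}

// e.g. with a v8i8 destination (DstNumElt == 8), extracting lane 1 of a
// v2i64 source becomes an extract of lanes [8, 16) of the v16i8 bitcast.
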
+ // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4172,6 +4285,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, @@ -4383,7 +4501,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, if (!ST->hasV6T2Ops()) return SDValue(); - SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } @@ -4544,8 +4662,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isa<ConstantSDNode>(N->getOperand(1)) || - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) + if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. @@ -5036,18 +5153,56 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) { return VT == MVT::v8i8 && M.size() == 8; } +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. 
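
To make the two accepted mask shapes concrete, here is a standalone sketch (a simplified mirror of the new isVTRNMask below, with std::vector in place of ArrayRef; negative entries mean undef and match anything):

#include <vector>

bool looksLikeVTRNMask(const std::vector<int> &M, unsigned NumElts,
                       unsigned &WhichResult) {
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    // Double-length masks fix WhichResult per half; otherwise M[0] decides.
    WhichResult =
        (M.size() == NumElts * 2) ? i / NumElts : (M[i] == 0 ? 0u : 1u);
    for (unsigned j = 0; j < NumElts; j += 2)
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != j + WhichResult) ||
          (M[i + j + 1] >= 0 &&
           (unsigned)M[i + j + 1] != j + NumElts + WhichResult))
        return false;
  }
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

// For v4i32: {0, 4, 2, 6} matches with WhichResult == 0, {1, 5, 3, 7} with
// WhichResult == 1, and the combined form {0, 4, 2, 6, 1, 5, 3, 7} matches
// both halves and reports WhichResult == 0.
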
+// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the input vector then we need to check the + // upper and lower parts of the mask with a matching value for WhichResult + // FIXME: A mask with only even values will be rejected in case the first + // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only + // M[0] is used to determine WhichResult + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } @@ -5060,28 +5215,55 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } +// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking +// that the mask elements are either all even and in steps of size 2 or all odd +// and in steps of size 2. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with +// respect the how results are returned. static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) continue; // ignore UNDEF indices - if ((unsigned) M[i] != 2 * i + WhichResult) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; ++j) { + if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) + return false; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5097,18 +5279,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ if (EltSz == 64) return false; - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned) MIdx != Idx) - return false; - Idx += 2; + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + unsigned Half = NumElts / 2; + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += Half) { + unsigned Idx = WhichResult; + for (unsigned k = 0; k < Half; ++k) { + int MIdx = M[i + j + k]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5116,21 +5307,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return true; } +// Checks whether the shuffle mask represents a vector zip (VZIP) by checking +// that pairs of elements of the shufflemask represent the same index in each +// vector incrementing sequentially through the vectors. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with respect the how results +// are returned. static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5147,15 +5354,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5329,16 +5544,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. + ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(Value->getOperand(1))) { + (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { - ConstantSDNode *constIndex; - constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)); - assert(constIndex && "The index is not a constant!"); unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, @@ -5437,14 +5650,35 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector<SDValue, 2> SourceVecs; - SmallVector<unsigned, 2> MinElts; - SmallVector<unsigned, 2> MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -5453,127 +5687,166 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // A shuffle can only come from building a vector from various // elements of other vectors. 
return SDValue(); - } else if (V.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // This code doesn't know how to handle shuffles where the vector - // element types do not match (this happens because type legalization - // promotes the return type of EXTRACT_VECTOR_ELT). - // FIXME: It might be appropriate to extend this code to handle - // mismatched types. + } else if (!isa<ConstantSDNode>(V.getOperand(1))) { + // Furthermore, shuffles require a constant mask, whereas extractelts + // accept variable indices. return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - // If the element number isn't a constant, we can't effectively - // analyze what's going on. - if (!isa<ConstantSDNode>(V.getOperand(1))) - return SDValue(); - unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = {0, 0}; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. 
- for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && - "unexpected vector sizes in ReconstructShuffle"); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], dl, - MVT::i32)); - } - } - - SmallVector<int, 8> Mask; - - for (unsigned i = 0; i < NumElts; ++i) { + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Src.MinElt, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. 
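
The invariant this bookkeeping maintains is worth making concrete; a tiny sketch (function name hypothetical) of the lane mapping used when the mask is finally built:

// After padding, extracting, or bitcasting a source, element EltNo of the
// original vector lives at this lane of ShuffleVec -- the same arithmetic as
// the ExtractBase computation below.
int shuffleLane(int EltNo, int WindowScale, int WindowBase) {
  return EltNo * WindowScale + WindowBase;
}

// e.g. a v2i64 source bitcast into a v8i16-typed shuffle has WindowScale == 4
// (one i64 lane spans four i16 lanes), so element 1 maps to lane 4 when
// WindowBase == 0.
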
+ for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i) - .getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only @@ -6235,6 +6508,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? 
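
A scalar sketch (illustrative only; assumes a nonzero divisor and ignores the INT8_MIN / -1 corner) of the strategy the comments below describe: widen the narrow integers to float, divide there, and truncate back. The actual vector lowering replaces the true division with a NEON reciprocal estimate plus a refinement step.

#include <cstdint>

int8_t sdivViaFloat(int8_t a, int8_t b) {
  float xf = static_cast<float>(a); // vcvt_f32_s32(vmovl_s16(a.lo))
  float yf = static_cast<float>(b); // vcvt_f32_s32(vmovl_s16(b.lo))
  float q = xf / yf;                // recip-estimate + refine in the real code
  return static_cast<int8_t>(q);    // truncating convert back to integer
}
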
+ // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -6265,6 +6540,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -6337,6 +6614,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); @@ -6445,45 +6723,56 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); - - // Create stack object for sret. + Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); auto &DL = DAG.getDataLayout(); - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); - int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = SRet; - Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isSRet = true; - Args.push_back(Entry); + bool ShouldUseSRet = Subtarget->isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); + + ArgListEntry Entry; + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); - const char *LibcallName = (ArgVT == MVT::f64) - ? "__sincos_stret" : "__sincosf_stret"; + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = + (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args), 0) - .setDiscardResult(); - + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + if (!ShouldUseSRet) + return CallResult.first; + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo(), false, false, false, 0); @@ -6498,6 +6787,85 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { LoadSin.getValue(0), LoadCos.getValue(0)); } +SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, + bool Signed, + SDValue &Chain) const { + EVT VT = Op.getValueType(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + const char *Name = nullptr; + if (Signed) + Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; + else + Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; + + SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); + + ARMTargetLowering::ArgListTy Args; + + for (auto AI : {1, 0}) { + ArgListEntry Arg; + Arg.Node = Op.getOperand(AI); + Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Args.push_back(Arg); + } + + CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), + ES, std::move(Args), 0); + + return LowerCallTo(CLI).first; +} + +SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + assert(Op.getValueType() == MVT::i32 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, + DAG.getEntryNode(), Op.getOperand(1)); + + return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); +} + +void ARMTargetLowering::ExpandDIV_Windows( + SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const { + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + assert(Op.getValueType() == MVT::i64 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(1, dl, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); + + SDValue DBZCHK = + DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); + + SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); + + SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); + SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, + DAG.getConstant(32, dl, TLI.getPointerTy(DL))); + Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); + + Results.push_back(Lower); + Results.push_back(Upper); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast<AtomicSDNode>(Op)->getOrdering() <= 
Monotonic) @@ -6513,36 +6881,22 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); - SDValue Cycles32, OutChain; - - if (Subtarget->hasPerfMon()) { - // Under Power Management extensions, the cycle-count is: - // mrc p15, #0, <Rt>, c9, c13, #0 - SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) - }; - - Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(MVT::i32, MVT::Other), Ops); - OutChain = Cycles32.getValue(1); - } else { - // Intrinsic is defined to return 0 on unsupported platforms. Technically - // there are older ARM CPUs that have implementation-specific ways of - // obtaining this information (FIXME!). - Cycles32 = DAG.getConstant(0, DL, MVT::i32); - OutChain = DAG.getEntryNode(); - } - + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, <Rt>, c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) + }; - SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, - Cycles32, DAG.getConstant(0, DL, MVT::i32)); - Results.push_back(Cycles64); - Results.push_back(OutChain); + SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(Cycles32.getValue(1)); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -6576,15 +6930,17 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SREM: return LowerREM(Op.getNode(), DAG); + case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); @@ -6622,13 +6978,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ARMISD::WIN__DBZCHK: return SDValue(); } } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
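
One detail of the ExpandDIV_Windows hunk above deserves a note: the {1, 0} loop builds the libcall argument list divisor-first, and the divide-by-zero check ORs the two 32-bit halves of the i64 divisor so a single WIN__DBZCHK can test the whole value. A standalone sketch of that zero test (helper name hypothetical):

#include <cstdint>

// Zero iff the full 64-bit divisor is zero -- the value fed to WIN__DBZCHK.
bool divisorIsZero64(uint64_t Divisor) {
  uint32_t Lo = static_cast<uint32_t>(Divisor);
  uint32_t Hi = static_cast<uint32_t>(Divisor >> 32);
  return (Lo | Hi) == 0;
}
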
void ARMTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>&Results, + SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { @@ -6644,9 +7001,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; + case ISD::SREM: + case ISD::UREM: + Res = LowerREM(N, DAG); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::UDIV: + case ISD::SDIV: + assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); + return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, + Results); } if (Res.getNode()) Results.push_back(Res); @@ -6683,12 +7049,12 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = - MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(), - MachineMemOperand::MOLoad, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -6792,7 +7158,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineModuleInfo &MMI = MF->getMMI(); for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { - if (!BB->isLandingPad()) continue; + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. @@ -6807,7 +7173,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, for (SmallVectorImpl<unsigned>::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(BB); + CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; @@ -6840,7 +7206,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); - DispatchBB->setIsLandingPad(); + DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; @@ -6864,10 +7230,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // context. 
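
A simplified sketch (types reduced to standard containers, names hypothetical) of the bookkeeping EmitSjLjDispatchBlock performs above: each EH pad's EH_LABEL carries call-site numbers, and the pass records which landing pads belong to each number so the dispatch block can branch on the call-site value stored in the function context.

#include <algorithm>
#include <map>
#include <vector>

struct Block; // stand-in for MachineBasicBlock

void recordCallSites(std::map<unsigned, std::vector<Block *>> &CallSiteNumToLPad,
                     unsigned &MaxCSNum, Block *LPad,
                     const std::vector<unsigned> &CallSiteIdxs) {
  for (unsigned CSI : CallSiteIdxs) {
    CallSiteNumToLPad[CSI].push_back(LPad); // one call site may map to many pads
    MaxCSNum = std::max(MaxCSNum, CSI);
  }
}
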
SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); - MachineMemOperand *FIMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad | - MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -6982,9 +7347,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -7066,9 +7430,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) @@ -7109,13 +7472,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); - if (SMBB->isLandingPad()) { + if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); + BB->normalizeSuccProbs(); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from @@ -7157,7 +7521,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // landing pad now. for (SmallVectorImpl<MachineBasicBlock*>::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsLandingPad(false); + (*I)->setIsEHPad(false); // The instruction is gone now. MI->eraseFromParent(); @@ -7280,8 +7644,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Otherwise, we will generate unrolled scalar copies. 
const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned src = MI->getOperand(1).getReg(); @@ -7574,6 +7937,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, } MachineBasicBlock * +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); + MF->push_back(ContBB); + ContBB->splice(ContBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + MBB->addSuccessor(ContBB); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); + MBB->addSuccessor(TrapBB); + + BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) + .addReg(MI->getOperand(0).getReg()) + .addMBB(TrapBB); + + MI->eraseFromParent(); + return ContBB; +} + +MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); @@ -7643,8 +8032,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -7741,6 +8129,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; @@ -7759,8 +8150,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator BBI = BB; - ++BBI; + MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); @@ -7824,11 +8214,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); + case ARM::WIN__DBZCHK: + return EmitLowered__dbzchk(MI, BB); + } +} + +/// \brief Attaches vregs to MEMCPY that it will use as scratch registers +/// when it is expanded into LDM/STM. This is done as a post-isel lowering +/// instead of as a custom inserter because we need the use list from the SDNode. +static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, + MachineInstr *MI, const SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstrBuilder MIB(*MF, MI); + + // If the new dst/src is unused mark it as dead. 
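
Looking back at the EmitLowered__dbzchk hunk above: the expansion is a compare-against-zero with a trapping slow path. A C-level sketch (the __builtin_trap here stands in for the `udf #249` the trap block emits, the immediate Windows uses to signal integer division by zero):

#include <cstdint>

int32_t checkedSDiv(int32_t N, int32_t D) {
  if (D == 0)         // tCBZ on the divisor, branching to TrapBB
    __builtin_trap(); // Clang/GCC builtin; `udf #249` in the real TrapBB
  return N / D;       // performed by the __rt_sdiv call in the real lowering
}
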
+ if (!Node->hasAnyUseOfValue(0)) { + MI->getOperand(0).setIsDead(true); + } + if (!Node->hasAnyUseOfValue(1)) { + MI->getOperand(1).setIsDead(true); + } + + // The MEMCPY both defines and kills the scratch registers. + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MEMCPY) { + attachMEMCPYScratchRegs(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional @@ -7898,10 +8323,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); - if (!C) - return false; - return AllOnes ? C->isAllOnesValue() : C->isNullValue(); + return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. @@ -8723,12 +9145,88 @@ static SDValue PerformXORCombine(SDNode *N, return SDValue(); } -/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff -/// the bits being cleared by the AND are not demanded by the BFI. +// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, +// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and +// their position in "to" (Rd). +static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { + assert(N->getOpcode() == ARMISD::BFI); + + SDValue From = N->getOperand(1); + ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); + FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); + + // If the Base came from a SHR #C, we can deduce that it is really testing bit + // #C in the base of the SHR. + if (From->getOpcode() == ISD::SRL && + isa<ConstantSDNode>(From->getOperand(1))) { + APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); + assert(Shift.getLimitedValue() < 32 && "Shift too large!"); + FromMask <<= Shift.getLimitedValue(31); + From = From->getOperand(0); + } + + return From; +} + +// If A and B contain one contiguous set of bits, does A | B == A . B? +// +// Neither A nor B must be zero. +static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { + unsigned LastActiveBitInA = A.countTrailingZeros(); + unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; + return LastActiveBitInA - 1 == FirstActiveBitInB; +} + +static SDValue FindBFIToCombineWith(SDNode *N) { + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. + APInt ToMask, FromMask; + SDValue From = ParseBFI(N, ToMask, FromMask); + SDValue To = N->getOperand(0); + + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). 
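
A worked example (values illustrative) of the masks ParseBFI above produces: for a BFI writing an 8-bit field at bit 8, the instruction's keep-mask operand is 0xFFFF00FF, so ToMask = ~0xFFFF00FF = 0x0000FF00 and FromMask = the low popcount(ToMask) bits = 0x000000FF. A standalone mirror (name hypothetical) of the adjacency test BitsProperlyConcatenate performs on such masks:

#include <cstdint>

// Two nonzero masks concatenate properly when the lowest set bit of A sits
// directly above the highest set bit of B, so together they cover a single
// contiguous run of bits. Uses GCC/Clang bit-scan builtins.
bool properlyConcatenates(uint32_t A, uint32_t B) {
  int LastActiveBitInA = __builtin_ctz(A);       // A.countTrailingZeros()
  int FirstActiveBitInB = 31 - __builtin_clz(B); // index of B's highest bit
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

// properlyConcatenates(0x0000FF00, 0x000000FF) == true: combined they form
// the contiguous field 0x0000FFFF, so the two BFIs can be merged into one.
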
+ SDValue V = To; + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } + + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); + + // Are the new bits contiguous when combined with the old bits? + if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; + + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } + + return SDValue(); +} + static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { + // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff + // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); if (!N11C) return SDValue(); @@ -8744,6 +9242,38 @@ static SDValue PerformBFICombine(SDNode *N, return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); + + // We've found a BFI. + APInt ToMask1, FromMask1; + SDValue From1 = ParseBFI(N, ToMask1, FromMask1); + + APInt ToMask2, FromMask2; + SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); + assert(From1 == From2); + (void)From2; + + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. + APInt NewFromMask = FromMask1 | FromMask2; + APInt NewToMask = ToMask1 | ToMask2; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (NewFromMask[0] == 0) + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -9521,32 +10051,6 @@ static SDValue PerformSTORECombine(SDNode *N, return SDValue(); } -// isConstVecPow2 - Return true if each vector element is a power of 2, all -// elements are the same constant, C, and Log2(C) ranges from 1 to 32. -static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) -{ - integerPart cN; - integerPart c0 = 0; - for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements(); - I != E; I++) { - ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I)); - if (!C) - return false; - - bool isExact; - APFloat APF = C->getValueAPF(); - if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact) - != APFloat::opOK || !isExact) - return false; - - c0 = (I == 0) ? 
cN : c0; - if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32) - return false; - } - C = c0; - return true; -} - /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. @@ -9556,30 +10060,25 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 -static SDValue PerformVCVTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); - if (!Subtarget->hasNEON() || !Op.getValueType().isVector() || - Op.getOpcode() != ISD::FMUL) + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); - uint64_t C; - SDValue N0 = Op->getOperand(0); SDValue ConstVec = Op->getOperand(1); - bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || - NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions @@ -9587,16 +10086,22 @@ static SDValue PerformVCVTCombine(SDNode *N, return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, - NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, - DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - N0, - DAG.getConstant(Log2_64(C), dl, MVT::i32)); + SDValue FixConv = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), + DAG.getConstant(C, dl, MVT::i32)); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; @@ -9611,38 +10116,44 @@ static SDValue PerformVCVTCombine(SDNode *N, /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 -static SDValue PerformVDIVCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); - - if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || + if (!N->getValueType(0).isVector() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); - uint64_t C; SDValue ConstVec = N->getOperand(1); - bool isSigned = OpOpcode == ISD::SINT_TO_FP; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + uint32_t IntBits = IntTy.getSizeInBits(); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); - unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); @@ -9652,7 +10163,7 @@ static SDValue PerformVDIVCombine(SDNode *N, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32)); + ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9680,7 +10191,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? 
Cnt-1 : Cnt) < ElementBits); @@ -9695,12 +10206,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + return true; + } + return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. @@ -9939,89 +10454,123 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match f32 max/min patterns to use NEON vmax/vmin instructions. -static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, - // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is - // a NaN; only do the transformation when it matches that behavior. - - // For now only do this when using NEON for FP operations; if using VFP, it - // is not obvious that the benefit outweighs the cost of switching to the - // NEON pipeline. - if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || - N->getValueType(0) != MVT::f32) +static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, + APInt &KnownOne) { + if (Op.getOpcode() == ARMISD::BFI) { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + APInt Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } + if (Op.getOpcode() == ARMISD::CMOV) { + APInt KZ2(KnownZero.getBitWidth(), 0); + APInt KO2(KnownOne.getBitWidth(), 0); + computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); + computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); + + KnownZero &= KZ2; + KnownOne &= KO2; + return; + } + return DAG.computeKnownBits(Op, KnownZero, KnownOne); +} + +SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { + // If we have a CMOV, OR and AND combination such as: + // if (x & CN) + // y |= CM; + // + // And: + // * CN is a single bit; + // * All bits covered by CM are known zero in y + // + // Then we can convert this into a sequence of BFI instructions. This will + // always be a win if CM is a single bit, will always be no worse than the + // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is + // three bits (due to the extra IT instruction). + + SDValue Op0 = CMOV->getOperand(0); + SDValue Op1 = CMOV->getOperand(1); + auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); + auto CC = CCNode->getAPIntValue().getLimitedValue(); + SDValue CmpZ = CMOV->getOperand(4); + + // The compare must be against zero. 
+ if (!isNullConstant(CmpZ->getOperand(1))) return SDValue(); - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode = 0; - bool IsReversed; - if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { - IsReversed = false; // x CC y ? x : y - } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { - IsReversed = true ; // x CC y ? y : x - } else { + assert(CmpZ->getOpcode() == ARMISD::CMPZ); + SDValue And = CmpZ->getOperand(0); + if (And->getOpcode() != ISD::AND) return SDValue(); + ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); + if (!AndC || !AndC->getAPIntValue().isPowerOf2()) + return SDValue(); + SDValue X = And->getOperand(0); + + if (CC == ARMCC::EQ) { + // We're performing an "equal to zero" compare. Swap the operands so we + // canonicalize on a "not equal to zero" compare. + std::swap(Op0, Op1); + } else { + assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } + + if (Op1->getOpcode() != ISD::OR) + return SDValue(); - bool IsUnordered; - switch (CC) { - default: break; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin - // will return -0, so vmin can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; - break; + ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); + if (!OrC) + return SDValue(); + SDValue Y = Op1->getOperand(0); - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax - // will return +0, so vmax can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; - break; - } + if (Op0 != Y) + return SDValue(); + + // Now, is it profitable to continue? + APInt OrCI = OrC->getAPIntValue(); + unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; + if (OrCI.countPopulation() > Heuristic) + return SDValue(); - if (!Opcode) + // Lastly, can we determine that the bits defined by OrCI + // are zero in Y? 
+ APInt KnownZero, KnownOne; + computeKnownBits(DAG, Y, KnownZero, KnownOne); + if ((OrCI & KnownZero) != OrCI) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); + + // OK, we can do the combine. + SDValue V = Y; + SDLoc dl(X); + EVT VT = X.getValueType(); + unsigned BitInX = AndC->getAPIntValue().logBase2(); + + if (BitInX != 0) { + // We must shift X first. + X = DAG.getNode(ISD::SRL, dl, VT, X, + DAG.getConstant(BitInX, dl, VT)); + } + + for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); + BitInY < NumActiveBits; ++BitInY) { + if (OrCI[BitInY] == 0) + continue; + APInt Mask(VT.getSizeInBits(), 0); + Mask.setBit(BitInY); + V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, + // Confusingly, the operand is an *inverted* mask. + DAG.getConstant(~Mask, dl, VT)); + } + + return V; } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. @@ -10042,6 +10591,13 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { ARMCC::CondCodes CC = (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + // BFI is only available on V6T2+. + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { + SDValue R = PerformCMOVToBFICombine(N, DAG); + if (R) + return R; + } + // Simplify // mov r1, r0 // cmp r1, x @@ -10108,8 +10664,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget); - case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget); + case ISD::FP_TO_UINT: + return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FDIV: + return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -10117,7 +10675,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: @@ -10932,7 +11489,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'J': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is @@ -11001,7 +11558,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'M': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. 
if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) @@ -11043,37 +11600,61 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); - unsigned Opcode = Op->getOpcode(); - assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && - "Invalid opcode for Div/Rem lowering"); - bool isSigned = (Opcode == ISD::SDIVREM); - EVT VT = Op->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - +static RTLIB::Libcall getDivRemLibcall( + const SDNode *N, MVT::SimpleValueType SVT) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemLibcall"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; - switch (VT.getSimpleVT().SimpleTy) { + switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } + return LC; +} - SDValue InChain = DAG.getEntryNode(); - +static TargetLowering::ArgListTy getDivRemArgList( + const SDNode *N, LLVMContext *Context) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemArgList"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op->getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op->getOperand(i); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + EVT ArgVT = N->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*Context); + Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; Entry.isZExt = !isSigned; Args.push_back(Entry); } + return Args; +} + +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), + VT.getSimpleVT().SimpleTy); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), + DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); @@ -11090,6 +11671,47 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { return CallInfo.first; } +// Lowers REM using divmod helpers +// see RTABI section 4.2/4.3 +SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + // Build return types (div and rem) + 
std::vector<Type*> RetTyParams; + Type *RetTyElement; + + switch (N->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; + case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; + case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; + case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; + } + + RetTyParams.push_back(RetTyElement); + RetTyParams.push_back(RetTyElement); + ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); + Type *RetTy = StructType::get(*DAG.getContext(), ret); + + RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). + SimpleTy); + SDValue InChain = DAG.getEntryNode(); + TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); + bool isSigned = N->getOpcode() == ISD::SREM; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + // Lower call + CallLoweringInfo CLI(DAG); + CLI.setChain(InChain) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // Return second (rem) result operand (first contains div) + SDNode *ResNode = CallResult.first.getNode(); + assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); + return ResNode->getOperand(1); +} + SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); @@ -11124,8 +11746,8 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -11137,8 +11759,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } bool @@ -11186,7 +11808,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -11212,7 +11834,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -11295,8 +11917,6 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } - Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11392,19 +12012,26 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) -bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64) && !Subtarget->isMClass(); + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return (Size <= (Subtarget->isMClass() ? 32U : 64U)) - ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; +} + +bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; } // This has so far only been implemented for MachO. @@ -11419,7 +12046,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; // Floating point values and vector values map to the same register file. - // Therefore, althought we could do a store extract of a vector type, this is + // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. 
if (VectorTy->isFPOrFPVectorTy()) @@ -11441,6 +12068,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } +bool ARMTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget->hasV6T2Ops(); +} + +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget->hasV6T2Ops(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11477,6 +12112,14 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + if (!Subtarget->hasV7Ops()) + return; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); +} + Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -11534,12 +12177,12 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't - // support i64/f64 element). - if ((VecSize != 64 && VecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vldN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to @@ -11552,9 +12195,6 @@ bool ARMTargetLowering::lowerInterleavedLoad( Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); - IRBuilder<> Builder(LI); SmallVector<Value *, 2> Ops; @@ -11562,6 +12202,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded @@ -11624,12 +12267,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal sub vector types and vector types of i64/f64 element (vstN - // doesn't support i64/f64 element). - if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vstN doesn't support i64/f64 elements). 
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || + EltIs64Bits) return false; Value *Op0 = SVI->getOperand(0); @@ -11650,17 +12294,18 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, NumSubElts); } - static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], SubVecTy); - + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; SmallVector<Value *, 6> Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( @@ -11681,14 +12326,14 @@ enum HABaseType { static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { - if (const StructType *ST = dyn_cast<StructType>(Ty)) { + if (auto *ST = dyn_cast<StructType>(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } - } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; @@ -11703,7 +12348,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return false; Members = 1; Base = HA_DOUBLE; - } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) { + } else if (auto *VT = dyn_cast<VectorType>(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: @@ -11747,3 +12392,63 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } + +unsigned ARMTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; +} + +unsigned ARMTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; +} + +void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in ARMFunctionInfo. 
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void ARMTargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (ARM::GPRRegClass.contains(*I)) + RC = &ARM::GPRRegClass; + else if (ARM::DPRRegClass.contains(*I)) + RC = &ARM::DPRRegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index efc9020..96b56c3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -63,8 +63,6 @@ namespace llvm { BCC_i64, - RBIT, // ARM bitreverse instruction - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. @@ -79,6 +77,7 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. TC_RETURN, // Tail call return pseudo. @@ -91,6 +90,7 @@ namespace llvm { PRELOAD, // Preload WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. @@ -172,12 +172,6 @@ namespace llvm { // BUILD_VECTOR for this purpose. BUILD_VECTOR, - // Floating-point max and min: - FMAX, - FMIN, - VMAXNM, - VMINNM, - // Bit-field insert BFI, @@ -189,6 +183,10 @@ namespace llvm { // Vector bitwise select VBSL, + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. 
+ MEMCPY, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, @@ -260,6 +258,7 @@ namespace llvm { SDNode *Node) const override; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; @@ -348,6 +347,8 @@ namespace llvm { getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; else if (ConstraintCode.size() == 2) { if (ConstraintCode[0] == 'U') { switch(ConstraintCode[1]) { @@ -420,13 +421,24 @@ namespace llvm { bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; - bool hasLoadLinkedStoreConditional() const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const override; Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, @@ -441,16 +453,21 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -496,6 +513,7 @@ namespace llvm { ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -508,6 +526,7 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; + SDValue LowerGlobalTLSAddressDarwin(SDValue Op, 
SelectionDAG &DAG) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; @@ -526,6 +545,12 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; + void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const; + SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, + SDValue &Chain) const; + SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -555,6 +580,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -635,6 +669,8 @@ namespace llvm { MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 84f95be..cf973d6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -51,7 +51,8 @@ void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { switch (Opc) { - default: break; + default: + break; case ARM::LDR_PRE_IMM: case ARM::LDR_PRE_REG: case ARM::LDR_POST_IMM: @@ -124,82 +125,10 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(), Flag, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); AddDefaultPred(MIB); } - -namespace { - /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC - /// global base register for ARM ELF. - struct ARMCGBR : public MachineFunctionPass { - static char ID; - ARMCGBR() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (AFI->getGlobalBaseReg() == 0) - return false; - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(MF.getSubtarget()); - // Don't do this for Thumb1. 
- if (STI.isThumb1Only()) - return false; - - const TargetMachine &TM = MF.getTarget(); - if (TM.getRelocationModel() != Reloc::PIC_) - return false; - - LLVMContext *Context = &MF.getFunction()->getContext(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - unsigned PCAdj = STI.isThumb() ? 4 : 8; - ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( - *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj); - - unsigned Align = TM.getDataLayout()->getPrefTypeAlignment( - Type::getInt32PtrTy(*Context)); - unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align); - - MachineBasicBlock &FirstMBB = MF.front(); - MachineBasicBlock::iterator MBBI = FirstMBB.begin(); - DebugLoc DL = FirstMBB.findDebugLoc(MBBI); - unsigned TempReg = - MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); - unsigned Opc = STI.isThumb2() ? ARM::t2LDRpci : ARM::LDRcp; - const TargetInstrInfo &TII = *STI.getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL, - TII.get(Opc), TempReg) - .addConstantPoolIndex(Idx); - if (Opc == ARM::LDRcp) - MIB.addImm(0); - AddDefaultPred(MIB); - - // Fix the GOT address by adding pc. - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - Opc = STI.isThumb2() ? ARM::tPICADD : ARM::PICADD; - MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg) - .addReg(TempReg) - .addImm(ARMPCLabelIndex); - if (Opc == ARM::PICADD) - AddDefaultPred(MIB); - - return true; - } - - const char *getPassName() const override { - return "ARM PIC Global Base Reg Initialization"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - }; -} - -char ARMCGBR::ID = 0; -FunctionPass* -llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index 9f5bde3..c446ba3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -59,6 +59,7 @@ def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; +def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -70,8 +71,11 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; -def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; -def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; + +def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, @@ -163,21 +167,23 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain, SDNPSideEffect]>; +def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH", + SDT_ARMEH_SJLJ_SetupDispatch, + [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain, SDNPSideEffect]>; def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, 
SDNPMayLoad, SDNPMayStore]>; -def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; - def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; -def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; -def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; +def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. @@ -209,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -228,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, @@ -236,9 +246,8 @@ def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, AssemblerPredicate<"FeatureT2XtPk", "pack/extract">; -def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, - AssemblerPredicate<"FeatureDSPThumb2", - "thumb2-dsp">; +def HasDSP : Predicate<"Subtarget->hasDSP()">, + AssemblerPredicate<"FeatureDSP", "dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, AssemblerPredicate<"FeatureDB", "data-barriers">; @@ -2322,6 +2331,7 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", let Inst{23-4} = 0b01100000000000000111; let Inst{3-0} = opt; } +def : MnemonicAlias<"smi", "smc">; // Supervisor Call (Software Interrupt) let isCall = 1, Uses = [SP] in { @@ -3671,10 +3681,10 @@ def USAT16 : AI<(outs GPRnopc:$Rd), let Inst{3-0} = Rn; } -def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos), - (SSAT imm:$pos, GPRnopc:$a, 0)>; -def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos), - (USAT imm:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, 0)>; //===----------------------------------------------------------------------===// // Bitwise Instructions. 
@@ -4186,7 +4196,7 @@ def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, + [(set GPR:$Rd, (bitreverse GPR:$Rm))]>, Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]>; @@ -4578,6 +4588,19 @@ let usesCustomInserter = 1 in { [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in { + // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs... + // Copies N registers worth of memory from address %src to address %dst + // and returns the incremented addresses. N scratch register will + // be attached for the copy to use. + def MEMCPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), + (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, + (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; }]>; @@ -4705,7 +4728,7 @@ def STLEXD : AIstlex<0b01, (outs GPR:$Rd), def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", [(int_arm_clrex)]>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6K]> { let Inst{31-0} = 0b11110101011111111111000000011111; } @@ -5242,6 +5265,12 @@ def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; +def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; +let usesCustomInserter = 1, Defs = [CPSR] in + def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary, + [(win__dbzchk GPR:$divisor)]>; + //===----------------------------------------------------------------------===// // TLS Instructions // @@ -5301,6 +5330,10 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), Requires<[IsARM]>; } +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in +def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary, + [(ARMeh_sjlj_setup_dispatch)]>; + // eh.sjlj.dispatchsetup pseudo-instruction. // This pseudo is used for both ARM and Thumb. Any differences are handled when // the pseudo is expanded (which happens before any passes that need the @@ -5365,6 +5398,27 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), Requires<[IsARM, UseMovt]>; } // isReMaterializable +// The many different faces of TLS access. 
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst), + (MOVi32imm tglobaltlsaddr :$dst)>, + Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapper tglobaltlsaddr:$src), + (LDRLIT_ga_abs tglobaltlsaddr:$src)>, + Requires<[IsARM, DontUseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsARM, DontUseMovt]>; +let AddedComplexity = 10 in +def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), + (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>, + Requires<[IsARM, UseMovt]>; + + // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, @@ -5622,16 +5676,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Same for AND <--> BIC def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", - (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", - (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", - (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rdn, $imm", - (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Likewise, "add Rd, mod_imm_neg" -> sub diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index f035d61..defef4e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -587,11 +587,6 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; -def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; -def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; - def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; @@ -2465,17 +2460,17 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Same as above, but not predicated. 
-class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; -class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -3255,6 +3250,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } // 128-bit vector types. def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, @@ -3275,6 +3277,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } } @@ -4110,6 +4119,12 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", v2f32, v2f32, fadd, 1>; def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", v4f32, v4f32, fadd, 1>; +def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16", + v4f16, v4f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16", + v8f16, v8f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; // VADDL : Vector Add Long (Q = D + D) defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, "vaddl", "s", add, sext, 1>; @@ -4165,10 +4180,21 @@ def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", v4f32, v4f32, fmul, 1>; +def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16", + v4f16, v4f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16", + v8f16, v8f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>; def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>; +def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; +def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16, + v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; def : Pat<(v8i16 (mul (v8i16 QPR:$src1), (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), @@ -4277,6 +4303,12 @@ def VMLAfd : 
N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", + v4f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", + v8f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4285,6 +4317,12 @@ def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", v4f32, v2f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16", + v8f16, v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4495,6 +4533,12 @@ def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4503,6 +4547,12 @@ def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", v4f32, v2f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16", + v8f16, v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4570,6 +4620,13 @@ def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16", + v8f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", @@ -4578,6 +4635,12 @@ def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", 
"f16", + v4f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; +def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), @@ -4602,6 +4665,12 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", v2f32, v2f32, fsub, 0>; def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", v4f32, v4f32, fsub, 0>; +def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16", + v4f16, v4f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; +def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16", + v8f16, v8f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; // VSUBL : Vector Subtract Long (Q = D - D) defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, "vsubl", "s", sub, sext, 0>; @@ -4646,6 +4715,12 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, NEONvceq, 1>; def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; +def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", @@ -4660,6 +4735,12 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; +def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", @@ -4677,6 +4758,12 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, NEONvcgt, 0>; def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; +def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", @@ -4686,36 +4773,68 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", +def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", "f32", v2i32, v2f32, int_arm_neon_vacge, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", +def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", "f32", v4i32, v4f32, int_arm_neon_vacge, 0>; +def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f16", v4i16, v4f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f16", v8i16, v8f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; // VACGT : Vector Absolute 
Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", +def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", +def VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>; +def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +} def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +} // Vector Bitwise Operations. 
@@ -5007,6 +5126,12 @@ def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; +def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, @@ -5014,6 +5139,29 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, "vabdl", "u", int_arm_neon_vabdu, zext, 1>; +def abd_shr : + PatFrag<(ops node:$in1, node:$in2, node:$shift), + (NEONvshrs (sub (zext node:$in1), + (zext node:$in2)), (i32 $shift))>; + +def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))), + (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)), + (zext (v8i8 DPR:$opB))), + (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))), + (VABDLuv8i16 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)), + (v4i32 (add (sub (zext (v4i16 DPR:$opA)), + (zext (v4i16 DPR:$opB))), + (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))), + (VABDLuv4i32 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), + (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)), + (zext (v2i32 DPR:$opB))), + (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))), + (VABDLuv2i64 DPR:$opA, DPR:$opB)>; + // VABA : Vector Absolute Difference and Accumulate defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, "vaba", "s", int_arm_neon_vabds, add>; @@ -5031,53 +5179,85 @@ defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, // VMAX : Vector Maximum defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "s", int_arm_neon_vmaxs, 1>; + "vmax", "s", smax, 1>; defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "u", int_arm_neon_vmaxu, 1>; + "vmax", "u", umax, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, int_arm_neon_vmaxs, 1>; + v2f32, v2f32, fmaxnan, 1>; def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, int_arm_neon_vmaxs, 1>; + v4f32, v4f32, fmaxnan, 1>; +def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f16", + v4f16, v4f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f16", + v8f16, v8f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMAXNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMAXNMND : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v2f32, v2f32, int_arm_neon_vmaxnm, 1>, + v2f32, v2f32, fmaxnum, 1>, Requires<[HasV8, HasNEON]>; - def VMAXNMNQ : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v4f32, v4f32, int_arm_neon_vmaxnm, 1>, + v4f32, v4f32, 
fmaxnum, 1>, Requires<[HasV8, HasNEON]>; + def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v4f16, v4f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v8f16, v8f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // VMIN : Vector Minimum defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "s", int_arm_neon_vmins, 1>; + "vmin", "s", smin, 1>; defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "u", int_arm_neon_vminu, 1>; + "vmin", "u", umin, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, int_arm_neon_vmins, 1>; + v2f32, v2f32, fminnan, 1>; def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, int_arm_neon_vmins, 1>; + v4f32, v4f32, fminnan, 1>; +def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f16", + v4f16, v4f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f16", + v8f16, v8f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMINNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v2f32, v2f32, int_arm_neon_vminnm, 1>, + v2f32, v2f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; - def VMINNMNQ : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v4f32, v4f32, int_arm_neon_vminnm, 1>, + v4f32, v4f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; + def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v4f16, v4f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v8f16, v8f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // Vector Pairwise Operations. 
@@ -5095,6 +5275,10 @@ def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VPBIND, "vpadd", "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f16", + v4f16, v4f16, int_arm_neon_vpadd, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -5123,6 +5307,9 @@ def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPMIN : Vector Pairwise Minimum def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", @@ -5139,6 +5326,9 @@ def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>, + Requires<[HasNEON, HasFullFP16]>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -5155,6 +5345,14 @@ def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, IIC_VUNAQ, "vrecpe", "f32", v4f32, v4f32, int_arm_neon_vrecpe>; +def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f16", + v4f16, v4f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f16", + v8f16, v8f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; // VRECPS : Vector Reciprocal Step def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, @@ -5163,6 +5361,14 @@ def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; +def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f16", + v4f16, v4f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f16", + v8f16, v8f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTE : Vector Reciprocal Square Root Estimate def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, @@ -5177,6 +5383,14 @@ def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, IIC_VUNAQ, "vrsqrte", "f32", v4f32, v4f32, int_arm_neon_vrsqrte>; +def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f16", + v4f16, v4f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f16", + v8f16, v8f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTS : Vector Reciprocal Square Root Step def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, @@ -5185,6 +5399,14 @@ def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 
1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f16", + v4f16, v4f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f16", + v8f16, v8f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; // Vector Shifts. @@ -5336,6 +5558,14 @@ def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0, def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs", "f32", v4f32, v4f32, fabs>; +def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v4f16, v4f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; +def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v8f16, v8f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))), (v2i32 (bitconvert (v8i8 (add DPR:$src, @@ -5398,6 +5628,16 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, "vneg", "f32", "$Vd, $Vm", "", [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; +def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f16", "$Vd, $Vm", "", + [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; +def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f16", "$Vd, $Vm", "", + [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; @@ -5449,7 +5689,10 @@ def : NEONInstAlias<"vmov${p} $Vd, $Vm", // VMOV : Vector Move (Immediate) -let isReMaterializable = 1 in { +// Although VMOVs are not strictly speaking cheap, they are as expensive +// as their copies counterpart (VORR), so we should prefer rematerialization +// over splitting when it applies. +let isReMaterializable = 1, isAsCheapAsAMove=1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", @@ -5504,7 +5747,7 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; -} // isReMaterializable +} // isReMaterializable, isAsCheapAsAMove // Add support for bytes replication feature, so it could be GAS compatible. // E.g. 
instructions below: @@ -5868,18 +6111,56 @@ def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", v4f32, v4i32, uint_to_fp>; +def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v4i16, v4f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v4i16, v4f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v4f16, v4i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v4f16, v4i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + +def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v8i16, v8f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v8i16, v8f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v8f16, v8i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v8f16, v8i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + // VCVT{A, N, P, M} multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS, SDPatternOperator IntU> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>; - def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>; - def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>; - def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>; + def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v4i16, v4f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v8i16, v8f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v4i16, v4f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v8i16, v8f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } } @@ -5898,6 +6179,16 @@ def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxu>; +def 
VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v4f16, v4i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v4f16, v4i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } let DecoderMethod = "DecodeVCVTQ" in { @@ -5909,6 +6200,16 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v8f16, v8i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v8f16, v8i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", @@ -5929,6 +6230,24 @@ def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0", + (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0", + (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0", + (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0", + (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0", + (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0", + (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0", + (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0", + (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>; + // VCVT : Vector Convert Between Half-Precision and Single-Precision. 
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, @@ -6182,22 +6501,40 @@ def VTBX4Pseudo // VRINT : Vector Rounding multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary, + def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } - def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary, + def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } + def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v4f16, v4f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } + def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v8f16, v8f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } } def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"), - (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>; + (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>; def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"), - (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>; + (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>; + let Predicates = [HasNEON, HasFullFP16] in { + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"), + (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>; + } } defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>; @@ -6343,8 +6680,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; -def : N3VSPat<NEONfmax, VMAXfd>; -def : N3VSPat<NEONfmin, VMINfd>; +def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>; +def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>; def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; def : NVCVTIFPat<sint_to_fp, VCVTs2fd>; @@ -7704,6 +8041,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm", (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm", (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm", + (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm", (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7719,6 +8059,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm", (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm", (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm", + (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VCLT (register) is an assembler alias for VCGT w/ the operands reversed. // D-register versions. 
@@ -7736,6 +8079,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm", (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm", (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm", + (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm", (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7751,6 +8097,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm", + (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index 40414da..5b1f9a0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -591,6 +591,34 @@ def tTRAP : TI<(outs), (ins), IIC_Br, // Load Store Instructions. // +// PC-relative loads need to be matched first as constant pool accesses need to +// always be PC-relative. We do this using AddedComplexity, as the pattern is +// simpler than the patterns of the other load instructions. +let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// SP-relative loads should be matched before standard immediate-offset loads as +// it means we avoid having to move SP to another register. +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // Loads: reg/reg and reg/imm5 let canFoldAsLoad = 1, isReMaterializable = 1 in multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -598,16 +626,20 @@ multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs tGPR:$Rt), (ins AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; + // Immediate-offset loads should be matched before register-offset loads as + // when the offset is a constant it's simpler to first check if it fits in the + // immediate offset field then fall back to register-offset if it doesn't. def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 1 /* Load */, (outs tGPR:$Rt), (ins AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; + // Register-offset loads are matched last. 
+ def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs tGPR:$Rt), (ins AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; } // Stores: reg/reg and reg/imm5 multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -615,32 +647,32 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs), (ins tGPR:$Rt, AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 0 /* Store */, (outs), (ins tGPR:$Rt, AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(opnode tGPR:$Rt, AddrMode_i:$addr)]>; + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs), (ins tGPR:$Rt, AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; } // A8.6.57 & A8.6.60 -defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4, +defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iLoad_r, IIC_iLoad_i, "ldr", UnOpFrag<(load node:$Src)>>; // A8.6.64 & A8.6.61 -defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1, +defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; // A8.6.76 & A8.6.73 -defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2, +defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; @@ -659,58 +691,36 @@ def tLDRSH : // A8.6.84 "ldrsh", "\t$Rt, $addr", [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>; -let canFoldAsLoad = 1 in -def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}> { - // A6.2 & A8.6.59 +def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, + "str", "\t$Rt, $addr", + [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; - let Inst{7-0} = addr; + let Inst{7-0} = addr; } // A8.6.194 & A8.6.192 -defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, +defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iStore_r, IIC_iStore_i, "str", BinOpFrag<(store node:$LHS, node:$RHS)>>; // A8.6.197 & A8.6.195 -defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, +defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; // A8.6.207 & A8.6.205 -defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, +defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; -def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, - "str", "\t$Rt, $addr", - 
[(store tGPR:$Rt, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} - //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // @@ -730,6 +740,7 @@ def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), // Writeback version is just a pseudo, as there's no encoding difference. // Writeback happens iff the base register is not in the destination register // list. +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def tLDMIA_UPD : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, "$Rn = $wb", IIC_iLoad_mu>, @@ -1328,16 +1339,16 @@ def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), (tSUBrr tGPR:$lhs, tGPR:$rhs)>; // Bswap 16 with load/store -def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)), - (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>; def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), (tREV16 (tLDRHi t_addrmode_is2:$addr))>; -def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), - t_addrmode_rrs2:$addr), - (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>; +def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)), + (tREV16 (tLDRHr t_addrmode_rr:$addr))>; def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), t_addrmode_is2:$addr), (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; +def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), + t_addrmode_rr:$addr), + (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>; // ConstantPool def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; @@ -1355,6 +1366,14 @@ def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src), (ARMWrapper tglobaladdr:$src))]>, Requires<[IsThumb, DontUseMovt]>; +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; +def : Pat<(ARMWrapper tglobaltlsaddr:$addr), + (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; + // JumpTable def : T1Pat<(ARMWrapperJT tjumptable:$dst), @@ -1372,10 +1391,10 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, Requires<[IsThumb, HasV5T]>; // zextload i1 -> zextload i8 -def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), - (tLDRBr t_addrmode_rrs1:$addr)>; def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_rr:$addr), + (tLDRBr t_addrmode_rr:$addr)>; // extload from the stack -> word load from the stack, as it avoids having to // materialize the base in a separate register. 
This only works when a word @@ -1389,61 +1408,61 @@ def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, Requires<[IsThumb, IsThumb1Only, IsLE]>; // extload -> zextload -def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>; // If it's impossible to use [r,r] address mode for sextload, select to // ldr{b|h} + sxt{b|h} instead. def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tSXTB (tLDRBi t_addrmode_is1:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>, +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tSXTB (tLDRBr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tSXTH (tLDRHi t_addrmode_is2:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>, +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tSXTH (tLDRHr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>; def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>; +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>; def : T1Pat<(atomic_load_8 t_addrmode_is1:$src), (tLDRBi t_addrmode_is1:$src)>; -def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src), - (tLDRBr t_addrmode_rrs1:$src)>; +def : T1Pat<(atomic_load_8 t_addrmode_rr:$src), + (tLDRBr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_16 t_addrmode_is2:$src), (tLDRHi t_addrmode_is2:$src)>; -def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src), - (tLDRHr t_addrmode_rrs2:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_rr:$src), + (tLDRHr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_32 t_addrmode_is4:$src), (tLDRi t_addrmode_is4:$src)>; -def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src), - (tLDRr t_addrmode_rrs4:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_rr:$src), + (tLDRr t_addrmode_rr:$src)>; def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val), (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>; -def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val), - (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>; 
+def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val), (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>; -def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val), - (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val), (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>; -def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val), - (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>; // Large immediate handling. diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index aba8a7b..f42f456 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -43,7 +43,7 @@ def t2_shift_imm : Operand<i32> { // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. def t2_so_reg : Operand<i32>, // reg imm - ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", + ComplexPattern<i32, 2, "SelectShiftImmShifterOperand", [shl,srl,sra,rotr]> { let EncoderMethod = "getT2SORegOpValue"; let PrintMethod = "printT2SOOperand"; @@ -1554,19 +1554,21 @@ def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants -// For disassembly only. +let mayLoad = 1 in def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> { let DecoderMethod = "DecodeT2LDRDPreInstruction"; } +let mayLoad = 1 in def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm", "$addr.base = $wb", []>; +let mayStore = 1 in def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", @@ -1574,6 +1576,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), let DecoderMethod = "DecodeT2STRDPreInstruction"; } +let mayStore = 1 in def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr, t2am_imm8s4_offset:$imm), @@ -2100,7 +2103,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-24} = 0b010; let Inst{23} = 0b1; @@ -2117,7 +2120,7 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, dag iops = (ins rGPR:$Rn, rGPR:$Rm), string asm = "\t$Rd, $Rn, $Rm"> : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0101; let Inst{22-20} = op22_20; @@ -2215,13 +2218,13 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let 
Inst{15-12} = 0b1111; } def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Signed/Unsigned saturate. class T2SatI<dag oops, dag iops, InstrItinClass itin, @@ -2254,7 +2257,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2278,7 +2281,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; @@ -2288,8 +2291,8 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), let Inst{5-4} = 0b00; } -def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>; -def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>; //===----------------------------------------------------------------------===// // Shift and rotate Instructions. @@ -2605,7 +2608,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; } // hasSideEffects // Rounding variants of the below included for disassembly only @@ -2614,7 +2617,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2624,7 +2627,7 @@ def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2636,7 +2639,7 @@ def t2SMMLA : T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2646,7 +2649,7 @@ def t2SMMLA : T2FourReg< def t2SMMLAR: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2657,7 +2660,7 @@ def t2SMMLS: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, 
UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2667,7 +2670,7 @@ def t2SMMLS: T2FourReg< def t2SMMLSR:T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2679,7 +2682,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2692,7 +2695,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2705,7 +2708,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2718,7 +2721,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2730,7 +2733,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2742,7 +2745,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2760,7 +2763,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2773,7 +2776,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2786,7 +2789,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 
0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2799,7 +2802,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2811,7 +2814,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2823,7 +2826,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2839,79 +2842,79 @@ defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD def t2SMUAD: T2ThreeReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUADX:T2ThreeReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSD: T2ThreeReg_mac< 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSDX:T2ThreeReg_mac< 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMLAD : T2FourReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, 
HasDSP]>; def t2SMLADX : T2FourReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; //===----------------------------------------------------------------------===// // Division Instructions. @@ -2961,7 +2964,7 @@ def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>, + [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>, Sched<[WriteALU]>; def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, @@ -3872,6 +3875,13 @@ def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), } +def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst), + (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; +def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst), + (t2MOVi32imm tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; + // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index e83f8c8..63e7940 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -20,7 +20,6 @@ def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; - //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -93,7 +92,7 @@ def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), IIC_fpLoad32, "vldr", "\t$Sd, $addr", - [(set SPR:$Sd, (load addrmode5:$addr))]> { + [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. 
let D = VFPNeonDomain; @@ -107,7 +106,7 @@ def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), IIC_fpStore32, "vstr", "\t$Sd, $addr", - [(store SPR:$Sd, addrmode5:$addr)]> { + [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; @@ -393,8 +392,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>; -defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>; +defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; +defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -541,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // FIXME: Verify encoding after integrated assembler is working. def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), @@ -922,6 +925,22 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let isRegSequence = 1; } +// Hoist an fabs or a fneg of a value coming from integer registers +// and do the fabs/fneg on the integer value. This is never a lose +// and could enable the conversion to float to be removed completely. 
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsARM, HasV6T2]>; +def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsThumb2, HasV6T2]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsARM]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsThumb2]>; + let hasSideEffects = 0 in def VMOVSRR : AVConv5I<0b11000100, 0b1010, (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), @@ -1003,7 +1022,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1021,7 +1040,7 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (load addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, @@ -1035,7 +1054,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1053,7 +1072,7 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (load addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; // FP -> Int: @@ -1106,7 +1125,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; } @@ -1124,7 +1143,8 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, @@ -1138,7 +1158,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; } @@ -1156,7 +1176,8 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), 
addrmode5:$ptr)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 265b86f..6e7e47b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -60,17 +60,24 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); +namespace llvm { +void initializeARMLoadStoreOptPass(PassRegistry &); +} + +#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass" + namespace { /// Post- register allocation pass the combine load / store instructions to /// form ldm / stm instructions. struct ARMLoadStoreOpt : public MachineFunctionPass { static char ID; - ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; const ARMSubtarget *STI; const TargetLowering *TL; ARMFunctionInfo *AFI; @@ -84,7 +91,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM load / store optimization pass"; + return ARM_LOAD_STORE_OPT_NAME; } private: @@ -118,6 +125,7 @@ namespace { }; SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; + SmallVector<MachineInstr*,4> MergeBaseCandidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator Before); @@ -140,12 +148,16 @@ namespace { MachineBasicBlock::iterator &MBBI); bool MergeBaseUpdateLoadStore(MachineInstr *MI); bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + bool CombineMovBx(MachineBasicBlock &MBB); }; char ARMLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false) + static bool definesCPSR(const MachineInstr *MI) { for (const auto &MO : MI->operands()) { if (!MO.isReg()) @@ -619,9 +631,10 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, unsigned NewBase; if (isi32Load(Opcode)) { - // If it is a load, then just use one of the destination register to - // use as the new base. + // If it is a load, then just use one of the destination registers + // as the new base. Will no longer be writeback in Thumb1. NewBase = Regs[NumRegs-1].first; + Writeback = false; } else { // Find a free register that we can use as scratch register. moveLiveRegsBefore(MBB, InsertBefore); @@ -725,9 +738,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, MachineInstrBuilder MIB; if (Writeback) { - if (Opcode == ARM::tLDMIA) + assert(isThumb1 && "expected Writeback only inThumb1"); + if (Opcode == ARM::tLDMIA) { + assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs"); // Update tLDMIA with writeback if necessary. 
Opcode = ARM::tLDMIA_UPD; + } MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); @@ -784,6 +800,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { SmallVector<std::pair<unsigned, bool>, 8> Regs; SmallVector<unsigned, 4> ImpDefs; DenseSet<unsigned> KilledRegs; + DenseSet<unsigned> UsedRegs; // Determine list of registers and list of implicit super-register defs. for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); @@ -792,6 +809,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (IsKill) KilledRegs.insert(Reg); Regs.push_back(std::make_pair(Reg, IsKill)); + UsedRegs.insert(Reg); if (IsLoad) { // Collect any implicit defs of super-registers, after merging we can't @@ -881,7 +899,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { for (MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.isKill()) continue; - if (KilledRegs.count(MO.getReg())) + if (UsedRegs.count(MO.getReg())) MO.setIsKill(false); } } @@ -995,76 +1013,6 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } while (SIndex < EIndex); } -static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tSUBi8: - case ARM::t2SUBri: - case ARM::SUBri: - CheckCPSRDef = true; - break; - case ARM::tSUBspi: - break; - } - - // Make sure the offset fits in 8 bits. - if (Bytes == 0 || (Limit && Bytes >= Limit)) - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || - MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - -static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tADDi8: - case ARM::t2ADDri: - case ARM::ADDri: - CheckCPSRDef = true; - break; - case ARM::tADDspi: - break; - } - - if (Bytes == 0 || (Limit && Bytes >= Limit)) - // Make sure the offset fits in 8 bits. - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tADDspi || - MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1132,6 +1080,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags +/// generated by the instruction are possibly read as well. 
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg) { + bool CheckCPSRDef; + int Scale; + switch (MI.getOpcode()) { + case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; + case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; + case ARM::t2SUBri: + case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; + case ARM::t2ADDri: + case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; + case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; + case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; + default: return 0; + } + + unsigned MIPredReg; + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg || + getInstrPredicate(&MI, MIPredReg) != Pred || + MIPredReg != PredReg) + return 0; + + if (CheckCPSRDef && definesCPSR(&MI)) + return 0; + return MI.getOperand(2).getImm() * Scale; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. +static MachineBasicBlock::iterator +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) + --PrevMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (NextMBBI == EndMBBI) + return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + /// Fold proceeding/trailing inc/dec of base register into the /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// @@ -1151,7 +1168,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); unsigned Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); @@ -1163,49 +1179,24 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (MI->getOperand(i).getReg() == Base) return false; - bool DoMerge = false; - ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); - - // Try merging with the previous instruction. 
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::db; - DoMerge = true; - } else if (Mode == ARM_AM::ib && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::da; - DoMerge = true; - } - if (DoMerge) - MBB.erase(PrevMBBI); - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && - isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } - if (DoMerge) - MBB.erase(NextMBBI); + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + if (Mode == ARM_AM::ia && Offset == -Bytes) { + Mode = ARM_AM::db; + } else if (Mode == ARM_AM::ib && Offset == -Bytes) { + Mode = ARM_AM::da; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && + ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) + return false; } - - if (!DoMerge) - return false; + MBB.erase(MergeInstr); unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1283,7 +1274,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || @@ -1295,7 +1285,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; - bool isLd = isLoadSingle(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1303,55 +1292,31 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - bool DoMerge = false; - ARM_AM::AddrOpc AddSub = ARM_AM::add; - unsigned NewOpc = 0; - // AM2 - 12 bits, thumb2 - 8 bits. - unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); - - // Try merging with the previous instruction. 
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (!isAM5 && - isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(PrevMBBI); - } - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (!isAM5 && - isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(NextMBBI); - } + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + unsigned NewOpc; + if (!isAM5 && Offset == Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (Offset == -Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (!isAM5 && Offset == -Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else + return false; } + MBB.erase(MergeInstr); - if (!DoMerge) - return false; + ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + bool isLd = isLoadSingle(Opcode); if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the @@ -1368,18 +1333,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) @@ -1391,13 +1354,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. 
if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) @@ -1409,46 +1371,75 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { return true; } -/// Returns true if instruction is a memory operation that this pass is capable -/// of operating on. -static bool isMemoryOp(const MachineInstr *MI) { - // When no memory operands are present, conservatively assume unaligned, - // volatile, unfoldable. - if (!MI->hasOneMemOperand()) +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) return false; - const MachineMemOperand *MMO = *MI->memoperands_begin(); - - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) return false; - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); - // str <undef> could probably be eliminated entirely, but for now we just want - // to avoid making a mess of it. - // FIXME: Use str <undef> as a wildcard to enable better stm folding. 
- if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && - MI->getOperand(0).isUndef()) - return false; + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); - // Likewise don't mess with references to undefined addresses. - if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && - MI->getOperand(1).isUndef()) - return false; + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - unsigned Opcode = MI->getOpcode(); + MBB.erase(MBBI); + return true; +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; case ARM::VLDRS: case ARM::VSTRS: - return MI->getOperand(1).isReg(); case ARM::VLDRD: case ARM::VSTRD: - return MI->getOperand(1).isReg(); case ARM::LDRi12: case ARM::STRi12: case ARM::tLDRi: @@ -1459,9 +1450,40 @@ static bool isMemoryOp(const MachineInstr *MI) { case ARM::t2LDRi12: case ARM::t2STRi8: case ARM::t2STRi12: - return MI->getOperand(1).isReg(); + break; + default: + return false; } - return false; + if (!MI.getOperand(1).isReg()) + return false; + + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand &MMO = **MI.memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO.isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO.getAlignment() < 4) + return false; + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. 
+ if (MI.getOperand(1).isUndef()) + return false; + + return true; } static void InsertLDR_STR(MachineBasicBlock &MBB, @@ -1616,6 +1638,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned Position = 0; assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); LiveRegsValid = false; for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); @@ -1626,7 +1649,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { continue; ++Position; - if (isMemoryOp(MBBI)) { + if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); @@ -1694,8 +1717,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MBBI = I; --Position; // Fallthrough to look into existing chain. - } else if (MBBI->isDebugValue()) + } else if (MBBI->isDebugValue()) { continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); + } + // If we are here then the chain is broken; Extract candidates for a merge. if (MemOps.size() > 0) { @@ -1726,7 +1756,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (Merged) { Changed = true; unsigned Opcode = Merged->getOpcode(); - if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { @@ -1741,6 +1773,10 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. + for (MachineInstr *MI : MergeBaseCandidates) + MergeBaseUpdateLSDouble(*MI); + MergeBaseCandidates.clear(); return Changed; } @@ -1765,7 +1801,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET || MBBI->getOpcode() == ARM::MOVPCLR)) { - MachineInstr *PrevMI = std::prev(MBBI); + MachineBasicBlock::iterator PrevI = std::prev(MBBI); + // Ignore any DBG_VALUE instructions. 
+ while (PrevI->isDebugValue() && PrevI != MBB.begin()) + --PrevI; + MachineInstr *PrevMI = PrevI; unsigned Opcode = PrevMI->getOpcode(); if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || @@ -1786,6 +1826,30 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { return false; } +bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.begin() || MBBI == MBB.end() || + MBBI->getOpcode() != ARM::tBX_RET) + return false; + + MachineBasicBlock::iterator Prev = MBBI; + --Prev; + if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR)) + return false; + + for (auto Use : Prev->uses()) + if (Use.isKill()) { + AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX)) + .addReg(Use.getReg(), RegState::Kill)) + .copyImplicitOps(&*MBBI); + MBB.erase(MBBI); + MBB.erase(Prev); + return true; + } + + llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?"); +} + bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); @@ -1793,7 +1857,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { AFI = Fn.getInfo<ARMFunctionInfo>(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); - MRI = &Fn.getRegInfo(); + RegClassInfoValid = false; isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; @@ -1805,18 +1869,29 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); + if (isThumb1) + Modified |= CombineMovBx(MBB); } Allocator.DestroyAll(); return Modified; } +namespace llvm { +void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +} + +#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \ + "ARM pre- register allocation load / store optimization pass" + namespace { /// Pre- register allocation pass that move load / stores from consecutive /// locations close to make it more likely they will be combined later. 
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ static char ID; - ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const DataLayout *TD; const TargetInstrInfo *TII; @@ -1828,7 +1903,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM pre- register allocation load / store optimization pass"; + return ARM_PREALLOC_LOAD_STORE_OPT_NAME; } private: @@ -1847,8 +1922,11 @@ namespace { char ARMPreAllocLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) + bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TD = Fn.getTarget().getDataLayout(); + TD = &Fn.getDataLayout(); STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); @@ -1856,9 +1934,8 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) - Modified |= RescheduleLoadStoreInstrs(MFI); + for (MachineBasicBlock &MFI : Fn) + Modified |= RescheduleLoadStoreInstrs(&MFI); return Modified; } @@ -1909,23 +1986,6 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, return AddedRegPressure.size() <= MemRegs.size() * 2; } - -/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI. -static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, - MachineInstr *Op1) { - assert(MI->memoperands_empty() && "expected a new machineinstr"); - size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) - + (Op1->memoperands_end() - Op1->memoperands_begin()); - - MachineFunction *MF = MI->getParent()->getParent(); - MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); - MachineSDNode::mmo_iterator MemEnd = - std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); - MemEnd = - std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); - MI->setMemRefs(MemBegin, MemEnd); -} - bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, @@ -2119,7 +2179,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumLDRDFormed; } else { @@ -2133,7 +2193,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumSTRDFormed; } @@ -2187,7 +2247,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!MI->isDebugValue()) MI2LocMap[MI] = ++Loc; - if (!isMemoryOp(MI)) + if (!isMemoryOp(*MI)) continue; unsigned PredReg = 0; if (getInstrPredicate(MI, PredReg) != ARMCC::AL) @@ -2275,3 +2335,4 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { return new ARMPreAllocLoadStoreOpt(); return new ARMLoadStoreOpt(); } + diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp 
b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index f5250ff..71ad7a4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===// +//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===// // // The LLVM Compiler Infrastructure // @@ -21,4 +21,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), - GlobalBaseReg(0) {} + IsSplitCSR(false) {} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 14dd9ef..68f9aec 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===// +//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -52,7 +52,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { unsigned ReturnRegsCount; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by @@ -110,11 +110,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// pass. DenseMap<unsigned, unsigned> CPEClones; - /// GlobalBaseReg - keeps track of the virtual register initialized for - /// use as the global base register. This is used for PIC in some PIC - /// relocation models. - unsigned GlobalBaseReg; - /// ArgumentStackSize - amount of bytes on stack consumed by the arguments /// being passed on the stack unsigned ArgumentStackSize; @@ -123,6 +118,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// coalesced weights. DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. 
+ bool IsSplitCSR; + public: ARMFunctionInfo() : isThumb(false), @@ -133,7 +132,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} + VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -204,8 +203,8 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } - unsigned getGlobalBaseReg() const { return GlobalBaseReg; } - void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td index 45cc9ea..02cbfb1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { } // Scalar single precision floating point register class.. -// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to -// avoid partial-write dependencies on D registers (S registers are -// renamed as portions of D registers). -def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate - (sequence "S%u", 0, 31), 2), - (sequence "S%u", 0, 31))>; +// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack +// to avoid partial-write dependencies on D or Q (depending on platform) +// registers (S registers are renamed as portions of D/Q registers). +def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate SPR, 2), SPR), + (add (decimate SPR, 4), + (decimate SPR, 2), + (decimate (rotl SPR, 1), 4), + (decimate (rotl SPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; +} // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations @@ -281,25 +288,29 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>; // class. // ARM requires only word alignment for double. It's more performant if it // is double-word alignment though. -def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (sequence "D%u", 0, 31)> { - // Allocate non-VFP2 registers D16-D31 first. - let AltOrders = [(rotl DPR, 16)]; - let AltOrderSelect = [{ return 1; }]; + // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on + // Darwin platforms. + let AltOrders = [(rotl DPR, 16), + (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; } // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). 
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 16)>; // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations -def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 8)>; // Generic 128-bit vector register class. -def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (sequence "Q%u", 0, 15)> { // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8)]; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td index b03d5ff..3ad7730 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td @@ -37,1050 +37,13 @@ def SW_FDIV : FuncUnit; // FIXME: Add preload instruction when it is documented. // FIXME: Model non-pipelined nature of FP div / sqrt unit. -def SwiftItineraries : ProcessorItineraries< - [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [ - // - // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3]>, - InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_LS]>], - [5]>, - // - // MVN instructions - InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // No operand cycles - InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>]>, - // - // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - 
InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Bitwise Instructions that produce a result - InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Unary Instructions that produce a result - - // CLZ, RBIT, etc. - InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - - // BFC, BFI, UBFX, SBFX - InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1]>, - - // - // Zero and sign extension instructions - InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1, 1]>, - // - // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Test instructions - InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Move instructions, conditional - // FIXME: Correctly model the extra input dep on the destination. 
- InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - - // Integer multiply pipeline - // - InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 3>, - InstrStage<1, [SW_ALU0]>], - [5, 5, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 6, 1, 1]>, - // - // Integer divide - InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 0>, - InstrStage<14, [SW_IDIV]>], - [14, 1, 1]>, - - // Integer load pipeline - // FIXME: The timings are some rough approximations - // - // Immediate offset - InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Register offset - InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - 
InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - // - // Load multiple, def is the 5th operand. - // FIXME: This assumes 3 to 4 registers. - InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - - // - // Load multiple + update, defs are the 1st and 5th operands. - InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Load multiple plus branch - InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Pop, def is the 3rd operand. - InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - // - // Pop + branch, def is the 3rd operand. - InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - - // - // iLoadi + iALUr for t2LDRpci_pic. 
- InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [4, 1]>, - - // Integer store pipeline - /// - // Immediate offset - InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Register offset - InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - // - // Store multiple - InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [], [], -1>, // dynamic uops - // - // Store multiple + update - InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - 
InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [2], [], -1>, // dynamic uops - - // - // Preload - InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>, - - // Branch - // - // no delay slots, so the latency of a branch is unimportant - InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>, - - // FP Special Register to Integer Register File Move - InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // Single-precision FP Unary - // - // Most floating-point moves get issued on ALU0. - InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-precision FP Unary - InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - - // - // Single-precision FP Compare - InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Double-precision FP Compare - InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Single to Double FP Convert - InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double to Single FP Convert - InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single to Half FP Convert - InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 4>, - InstrStage<1, [SW_ALU1]>], - [6, 1]>, - // - // Half to Single FP Convert - InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Single-Precision FP Convert - InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Double-Precision FP Convert - InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Single-precision FP ALU - InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-precision FP ALU - InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Single-precision FP Multiply - InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-precision FP Multiply - InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 1]>, - // - // Single-precision FP MAC - InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision FP MAC - InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, 
- InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision Fused FP MAC - InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision Fused FP MAC - InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision FP DIV - InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1, 1]>, - // - // Double-precision FP DIV - InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - // - // Single-precision FP SQRT - InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1]>, - // - // Double-precision FP SQRT - InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - - // - // Integer to Single-precision Move - InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Single-precision FP Load - InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Double-precision FP Load - InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // FP Load Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 4], [], -1>, // dynamic uops - // - // FP Load Multiple + update - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 4], [], -1>, // dynamic uops - // - // Single-precision FP Store - InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Double-precision FP Store - InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // FP Store Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1], [], -1>, // dynamic uops - // - // FP Store Multiple + update - // FIXME: Assumes a single Q register. 
- InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1], [], -1>, // dynamic uops - // NEON - // - // Double-register Integer Unary - InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer Unary - InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Q-Unary - InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer CountQ-Unary - InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Binary - InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Binary - InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Subtract - InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Subtract - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift - InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Shift - InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Count - InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Count - InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Quad-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Pair Add Long - 
InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Pair Add Long - InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - - // - // Move - InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Move Immediate - InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2]>, - // - // Double-register Permute Move - InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Quad-register Permute Move - InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Integer to Single-precision Move - InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Integer to Lane Move - // FIXME: I think this is correct, but it is not clear from the tuning guide. 
- InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - - // - // Vector narrow move - InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Double-register FP Unary - // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Quad-register FP Unary - // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-register FP Binary - // FIXME: We're using this itin for many instructions. - InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // VPADD, etc. - InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register FP VMUL - InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register FP Binary - InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register FP VMUL - InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register FP Multiply-Accumulate - InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register FP Multiply-Accumulate - InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Fused FP Multiply-Accumulate - InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register Fused FP Multiply-Accumulate - InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Reciprocal Step - InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register Reciprocal Step - InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Permute - // FIXME: The latencies are unclear from the documentation. - InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute - // FIXME: The latencies are unclear from the documentation.
- InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute (3 cycle issue on A9) - InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - - // - // Double-register VEXT - InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // Quad-register VEXT - InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // VTB - InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]>, - // - // VTBX - InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]> -]>; - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simple machine model which -// will replace itineraries. - // Swift machine model for scheduling and other instruction cost heuristics. def SwiftModel : SchedMachineModel { let IssueWidth = 3; // 3 micro-ops are dispatched per cycle. let MicroOpBufferSize = 45; // Based on NEON renamed registers. let LoadLatency = 3; let MispredictPenalty = 14; // A branch direction mispredict. - - let Itineraries = SwiftItineraries; + let CompleteModel = 0; // FIXME: Remove if all instructions are covered. } // Swift predicates.
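The SchedMachineModel above supersedes the per-opcode itineraries deleted in this hunk: instead of walking InstrStage lists, passes read coarse whole-core parameters through llvm::MCSchedModel. A minimal, hypothetical sketch of how such a model is consumed (the helper name preferPreRAScheduler is invented for illustration; the isOutOfOrder() check it mirrors is the one this patch adds to ARMSubtarget.cpp further down):

#include "llvm/MC/MCSchedule.h"

// Hypothetical helper mirroring the checks this patch adds to
// ARMSubtarget::enableMachineScheduler()/enablePostRAScheduler().
static bool preferPreRAScheduler(const llvm::MCSchedModel &SM) {
  // For SwiftModel above: IssueWidth = 3, MicroOpBufferSize = 45 (so
  // isOutOfOrder() is true), LoadLatency = 3, MispredictPenalty = 14.
  return SM.isOutOfOrder() && SM.IssueWidth > 1;
}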
@@ -1558,6 +521,13 @@ let SchedModel = SwiftModel in { (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD", "PUSH", "tPUSH")>; + // LDRLIT pseudo-instructions; they expand to LDR + PICADD + def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU], + (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>; + // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR + def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle], + (instregex "LDRLIT_ga_pcrel_ldr")>; + // 4.2.26 Branch def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; } def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; } diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 6cafbbb..6fded9c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -160,41 +160,39 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; + const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, dl, MVT::i32)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to + // VLDM/VSTM and make this code emit it when appropriate. This would reduce + // pressure on the general-purpose registers. However, this seems harder to map + // onto the register allocator's view of the world. - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, dl, MVT::i32)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // The number of MEMCPY pseudo-instructions to emit. We use up to + // MaxLoadsInLDM registers per memcpy, which will get lowered into ldm/stm + // later on. This is a lower bound on the number of MEMCPY operations we must + // emit. + unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; + + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + for (unsigned I = 0; I != NumMEMCPYs; ++I) { + // Evenly distribute registers among MEMCPY operations to reduce register + // pressure.
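// Worked example of this distribution (illustrative numbers, not from the
// patch): with NumMemOps = 10 and MaxLoadsInLDM = 6, NumMEMCPYs =
// (10 + 5) / 6 = 2, and the loop below emits 10*1/2 = 5 registers, then
// 10*2/2 - 5 = 5 registers, i.e. a balanced 5 + 5 split where the deleted
// greedy loop above would have produced 6 + 4.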
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; + unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; + + Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); + + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); - EmittedNumMemOps += i; + EmittedNumMemOps = NextEmittedNumMemOps; } if (BytesLeft == 0) diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index 002c3e9..bb6ae28 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" @@ -40,37 +41,9 @@ using namespace llvm; #include "ARMGenSubtargetInfo.inc" static cl::opt<bool> -ReserveR9("arm-reserve-r9", cl::Hidden, - cl::desc("Reserve R9, making it unavailable as GPR")); - -static cl::opt<bool> -ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden); - -static cl::opt<bool> UseFusedMulOps("arm-use-mulops", cl::init(true), cl::Hidden); -namespace { -enum AlignMode { - DefaultAlign, - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(DefaultAlign), - cl::values( - clEnumValN(DefaultAlign, "arm-default-align", - "Generate unaligned accesses only on hardware/OS " - "combinations that are known to support them"), - clEnumValN(StrictAlign, "arm-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - enum ITMode { DefaultIT, RestrictedIT, @@ -88,6 +61,12 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), "Allow IT blocks based on ARMv7"), clEnumValEnd)); +/// ForceFastISel - Use the fast-isel, even for subtargets where it is not +/// currently supported (for testing only). +static cl::opt<bool> +ForceFastISel("arm-force-fast-isel", + cl::init(false), cl::Hidden); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -110,8 +89,8 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), - TargetTriple(TT), Options(TM.Options), TM(TM), + ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU), + IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. 
@@ -133,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -147,10 +127,11 @@ void ARMSubtarget::initializeEnvironment() { UseSoftFloat = false; HasThumb2 = false; NoARM = false; - IsR9Reserved = ReserveR9; - UseMovt = false; + ReserveR9 = false; + NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; @@ -168,20 +149,36 @@ void ARMSubtarget::initializeEnvironment() { HasCrypto = false; HasCRC = false; HasZeroCycleZeroing = false; - AllowsUnalignedMem = false; - Thumb2DSP = false; + StrictAlign = false; + HasDSP = false; UseNaClTrap = false; GenLongCalls = false; UnsafeFPMath = false; + + // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this + // directly from it, but we can try to make sure they're consistent when both + // are available. + UseSjLjEH = isTargetDarwin() && !isTargetWatchOS(); + assert((!TM.getMCAsmInfo() || + (TM.getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::SjLj) == UseSjLjEH) && + "inconsistent sjlj choice between CodeGen and MC"); } void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPUString.empty()) { - if (isTargetDarwin() && TargetTriple.getArchName().endswith("v7s")) - // Default to the Swift CPU when targeting armv7s/thumbv7s. - CPUString = "swift"; - else - CPUString = "generic"; + CPUString = "generic"; + + if (isTargetDarwin()) { + StringRef ArchName = TargetTriple.getArchName(); + if (ArchName.endswith("v7s")) + // Default to the Swift CPU when targeting armv7s/thumbv7s. + CPUString = "swift"; + else if (ArchName.endswith("v7k")) + // Default to the Cortex-A7 CPU when targeting armv7k/thumbv7k. + // ARMv7k does not use SjLj exception handling. + CPUString = "cortex-a7"; + } } // Insert the architecture feature derived from the target triple into the @@ -212,44 +209,31 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isAAPCS_ABI()) stackAlignment = 8; - if (isTargetNaCl()) + if (isTargetNaCl() || isAAPCS16_ABI()) stackAlignment = 16; - UseMovt = hasV6T2Ops() && ArmUseMOVT; - - if (isTargetMachO()) { - IsR9Reserved = ReserveR9 || !HasV6Ops; - SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0); - } else { - IsR9Reserved = ReserveR9; - SupportsTailCall = !isThumb1Only(); - } - - if (Align == DefaultAlign) { - // Assume pre-ARMv6 doesn't support unaligned accesses. - // - // ARMv6 may or may not support unaligned accesses depending on the - // SCTLR.U bit, which is architecture-specific. We assume ARMv6 - // Darwin and NetBSD targets support unaligned accesses, and others don't. - // - // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit - // which raises an alignment fault on unaligned accesses. Linux - // defaults this bit to 0 and handles it as a system-wide (not - // per-process) setting. It is therefore safe to assume that ARMv7+ - // Linux targets support unaligned accesses. The same goes for NaCl. - // - // The above behavior is consistent with GCC.
- AllowsUnalignedMem = - (hasV7Ops() && (isTargetLinux() || isTargetNaCl() || - isTargetNetBSD())) || - (hasV6Ops() && (isTargetMachO() || isTargetNetBSD())); - } else { - AllowsUnalignedMem = !(Align == StrictAlign); - } - - // No v6M core supports unaligned memory access (v6M ARM ARM A3.2) - if (isV6M()) - AllowsUnalignedMem = false; + // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: + // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as + // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation + // support in the assembler and linker to be used. This would need to be + // fixed to fully support tail calls in Thumb1. + // + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instruction, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently handle this + // case. + + SupportsTailCall = !isThumb1Only(); + + if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) + SupportsTailCall = false; switch (IT) { case DefaultIT: @@ -276,9 +260,15 @@ bool ARMSubtarget::isAPCS_ABI() const { } bool ARMSubtarget::isAAPCS_ABI() const { assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); - return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS; + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS || + TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; +} +bool ARMSubtarget::isAAPCS16_ABI() const { + assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; } + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. bool ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, @@ -321,11 +311,23 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { } bool ARMSubtarget::hasSinCos() const { - return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0); + return isTargetWatchOS() || + (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0)); +} + +bool ARMSubtarget::enableMachineScheduler() const { + // Enable the MachineScheduler before register allocation for out-of-order + // architectures where we do not use the PostRA scheduler anymore (for now + // restricted to swift). + return getSchedModel().isOutOfOrder() && isSwift(); } // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { + // No need for PostRA scheduling on out of order CPUs (for now restricted to + // swift). + if (getSchedModel().isOutOfOrder() && isSwift()) + return false; return (!isThumb() || hasThumb2()); } @@ -333,15 +335,30 @@ bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier() && !isThumb1Only(); } +bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { + // For general targets, the prologue can grow when VFPs are allocated with + // stride 4 (more vpush instructions). But WatchOS uses a compact unwind + // format, which is more important to get right.
+ return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize()); +} + bool ARMSubtarget::useMovt(const MachineFunction &MF) const { // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit // immediates as it is inherently position independent, and may be out of // range otherwise. - return UseMovt && (isTargetWindows() || - !MF.getFunction()->hasFnAttribute(Attribute::MinSize)); + return !NoMovt && hasV6T2Ops() && + (isTargetWindows() || !MF.getFunction()->optForMinSize()); } bool ARMSubtarget::useFastISel() const { + // Enable fast-isel for any target, for testing only. + if (ForceFastISel) + return true; + + // Limit fast-isel to the targets that are or have been tested. + if (!hasV6Ops()) + return false; + // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) || diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index dd101df..4d54e57 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -43,11 +43,17 @@ class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, - CortexA17, CortexR4, CortexR4F, CortexR5, Swift, CortexA53, CortexA57, Krait, + CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, + CortexA57, CortexA72, Krait, Swift, ExynosM1 }; enum ARMProcClassEnum { None, AClass, RClass, MClass }; + enum ARMArchEnum { + ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, + ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a + }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. ARMProcFamilyEnum ARMProcFamily; @@ -55,6 +61,9 @@ protected: /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass. ARMProcClassEnum ARMProcClass; + /// ARMArch - ARM architecture + ARMArchEnum ARMArch; + /// HasV4TOps, HasV5TOps, HasV5TEOps, /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops - /// Specify whether target support specific ARM ISA variants. @@ -68,6 +77,7 @@ protected: bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -109,22 +119,24 @@ protected: /// NoARM - True if subtarget does not support ARM mode execution. bool NoARM; - /// IsR9Reserved - True if R9 is a not available as general purpose register. - bool IsR9Reserved; + /// ReserveR9 - True if R9 is not available as a general purpose register. + bool ReserveR9; - /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit - /// imms (including global addresses). - bool UseMovt; + /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of + /// 32-bit imms (including global addresses). + bool NoMovt; /// SupportsTailCall - True if the OS supports tail call. The dynamic linker /// must be able to synthesize call stubs for interworking between ARM and /// Thumb. 
bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -190,18 +202,18 @@ protected: /// particularly effective at zeroing a VFP register. bool HasZeroCycleZeroing; - /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory + /// StrictAlign - If true, the subtarget disallows unaligned memory /// accesses for some types. For details, see /// ARMTargetLowering::allowsMisalignedMemoryAccesses(). - bool AllowsUnalignedMem; + bool StrictAlign; /// RestrictIT - If true, the subtarget disallows generation of deprecated IT /// blocks to conform to ARMv8 rule. bool RestrictIT; - /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith - /// and such) instructions in Thumb2 code. - bool Thumb2DSP; + /// HasDSP - If true, the subtarget supports the DSP (saturating arith + /// and such) instructions. + bool HasDSP; /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; @@ -212,6 +224,9 @@ protected: /// Target machine allowed unsafe FP math (such as use of NEON fp) bool UnsafeFPMath; + /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). + bool UseSjLjEH; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -297,6 +312,7 @@ public: bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -343,17 +359,20 @@ public: bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRAS() const { return HasRAS; } bool hasMPExtension() const { return HasMPExtension; } - bool hasThumb2DSP() const { return Thumb2DSP; } + bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } + bool useSjLjEH() const { return UseSjLjEH; } bool genLongCalls() const { return GenLongCalls; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } + bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } @@ -375,6 +394,11 @@ public: TargetTriple.getEnvironment() == Triple::EABIHF) && !isTargetDarwin() && !isTargetWindows(); } + bool isTargetGNUAEABI() const { + return (TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF) && + !isTargetDarwin() && !isTargetWindows(); + } // ARM Targets that support EHABI exception handling standard // Darwin uses SjLj. Other targets might need more checks. 
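The new isTargetGNUAEABI() predicate above feeds the EABI-version defaulting added to the ARMBaseTargetMachine constructor further down. A rough sketch of the resulting mapping (triples chosen for illustration; EABI::GNU and EABI::EABI5 are the TargetOptions values this patch selects between):

#include "llvm/ADT/Triple.h"

// armv7--linux-gnueabihf -> GNUEABIHF environment -> isTargetGNUAEABI() is
// true, so the constructor defaults Options.EABIVersion to EABI::GNU; a bare
// EABI triple such as armv7--none-eabi falls back to EABI::EABI5 instead.
llvm::Triple T("armv7--linux-gnueabihf");
bool IsGNUEnv = T.getEnvironment() == llvm::Triple::GNUEABIHF; // true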
@@ -383,7 +407,7 @@ public: TargetTriple.getEnvironment() == Triple::GNUEABI || TargetTriple.getEnvironment() == Triple::EABIHF || TargetTriple.getEnvironment() == Triple::GNUEABIHF || - TargetTriple.getEnvironment() == Triple::Android) && + isTargetAndroid()) && !isTargetDarwin() && !isTargetWindows(); } @@ -391,14 +415,13 @@ public: // FIXME: this is invalid for WindowsCE return TargetTriple.getEnvironment() == Triple::GNUEABIHF || TargetTriple.getEnvironment() == Triple::EABIHF || - isTargetWindows(); - } - bool isTargetAndroid() const { - return TargetTriple.getEnvironment() == Triple::Android; + isTargetWindows() || isAAPCS16_ABI(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isAPCS_ABI() const; bool isAAPCS_ABI() const; + bool isAAPCS16_ABI() const; bool useSoftFloat() const { return UseSoftFloat; } bool isThumb() const { return InThumbMode; } @@ -409,17 +432,17 @@ public: bool isRClass() const { return ARMProcClass == RClass; } bool isAClass() const { return ARMProcClass == AClass; } - bool isV6M() const { - return isThumb1Only() && isMClass(); + bool isR9Reserved() const { + return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9; } - bool isR9Reserved() const { return IsR9Reserved; } + bool useStride4VFPs(const MachineFunction &MF) const; bool useMovt(const MachineFunction &MF) const; bool supportsTailCall() const { return SupportsTailCall; } - bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + bool allowsUnalignedMem() const { return !StrictAlign; } bool restrictIT() const { return RestrictIT; } @@ -433,6 +456,9 @@ public: /// compiler runtime or math libraries. bool hasSinCos() const; + /// Returns true if machine scheduler should be enabled. + bool enableMachineScheduler() const override; + /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 93495d6..fca1901 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -66,7 +66,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static ARMBaseTargetMachine::ARMABI computeTargetABI(const Triple &TT, StringRef CPU, const TargetOptions &Options) { - if (Options.MCOptions.getABIName().startswith("aapcs")) + if (Options.MCOptions.getABIName() == "aapcs16") + return ARMBaseTargetMachine::ARM_ABI_AAPCS16; + else if (Options.MCOptions.getABIName().startswith("aapcs")) return ARMBaseTargetMachine::ARM_ABI_AAPCS; else if (Options.MCOptions.getABIName().startswith("apcs")) return ARMBaseTargetMachine::ARM_ABI_APCS; @@ -83,6 +85,8 @@ computeTargetABI(const Triple &TT, StringRef CPU, (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || CPU.startswith("cortex-m")) { TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + } else if (TT.isWatchOS()) { + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; } else { TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; } @@ -106,7 +110,7 @@ computeTargetABI(const Triple &TT, StringRef CPU, if (TT.isOSNetBSD()) TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; else - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; break; } } @@ -145,7 +149,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // to 64. We always try to give them natural alignment.
if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS) Ret += "-v64:32:64-v128:32:128"; - else + else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-v128:64:128"; // Try to align aggregates to 32 bits (the default is 64 bits, which has no @@ -157,7 +161,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit // aligned everywhere else. - if (TT.isOSNaCl()) + if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-S128"; else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) Ret += "-S64"; @@ -184,6 +188,15 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft; + + // Default to triple-appropriate EABI + if (Options.EABIVersion == EABI::Default || + Options.EABIVersion == EABI::Unknown) { + if (Subtarget.isTargetGNUAEABI()) + this->Options.EABIVersion = EABI::GNU; + else + this->Options.EABIVersion = EABI::EABI5; + } } ARMBaseTargetMachine::~ARMBaseTargetMachine() {} @@ -225,12 +238,12 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { } TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(ARMTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(ARMTTIImpl(this, F)); + }); } - -void ARMTargetMachine::anchor() { } +void ARMTargetMachine::anchor() {} ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -244,7 +257,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, "support ARM mode execution!"); } -void ARMLETargetMachine::anchor() { } +void ARMLETargetMachine::anchor() {} ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -253,7 +266,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ARMBETargetMachine::anchor() { } +void ARMBETargetMachine::anchor() {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -262,7 +275,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} -void ThumbTargetMachine::anchor() { } +void ThumbTargetMachine::anchor() {} ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -273,7 +286,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -void ThumbLETargetMachine::anchor() { } +void ThumbLETargetMachine::anchor() {} ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -282,7 +295,7 @@ ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ThumbBETargetMachine::anchor() { } +void ThumbBETargetMachine::anchor() {} ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -348,7 +361,13 @@ bool ARMPassConfig::addPreISel() { // tricky when doing code gen per function. 
bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && (EnableGlobalMerge == cl::BOU_UNSET); - addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize)); + // Merging of extern globals is enabled by default on non-Mach-O as we + // expect it to be generally either beneficial or harmless. On Mach-O it + // is disabled as we emit the .subsections_via_symbols directive which + // means that merging extern globals is not safe. + bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); + addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize, + MergeExternalByDefault)); } return false; @@ -356,9 +375,6 @@ bool ARMPassConfig::addPreISel() { bool ARMPassConfig::addInstSelector() { addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); - - if (TM->getTargetTriple().isOSBinFormatELF() && TM->Options.EnableFastISel) - addPass(createARMGlobalBaseRegPass()); return false; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h index 8c98e08..8ad1f3d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -26,7 +26,8 @@ public: enum ARMABI { ARM_ABI_UNKNOWN, ARM_ABI_APCS, - ARM_ABI_AAPCS // ARM EABI + ARM_ABI_AAPCS, // ARM EABI + ARM_ABI_AAPCS16 } TargetABI; protected: diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2f194cf..c152011 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -15,7 +15,7 @@ using namespace llvm; #define DEBUG_TYPE "armtti" -unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); @@ -47,12 +47,12 @@ unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 3; } -unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // Single to/from double precision conversions. - static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = { + static const CostTblEntry NEONFltDblTbl[] = { // Vector fptrunc/fpext conversions. { ISD::FP_ROUND, MVT::v2f64, 2 }, { ISD::FP_EXTEND, MVT::v2f32, 2 }, @@ -61,10 +61,9 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); - if (Idx != -1) - return LT.first * NEONFltDblTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) + return LT.first * Entry->Cost; } EVT SrcTy = TLI->getValueType(DL, Src); @@ -76,8 +75,7 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. 
- static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorConversionTbl[] = { + static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, @@ -153,15 +151,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isVector() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar float to integer conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONFloatConversionTbl[] = { + static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, @@ -184,15 +181,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } }; if (SrcTy.isFloatingPoint() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONFloatConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer to float conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, @@ -216,15 +212,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer conversion costs. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - ARMIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, @@ -236,17 +231,17 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger()) { - int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return ARMIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { +int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { // Penalize inserting into a D-subregister. We end up with a three times // lower estimated throughput on Swift.
if (ST->isSwift() && @@ -255,28 +250,30 @@ unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, ValTy->getScalarSizeInBits() <= 32) return 3; - // Cross-class copies are expensive on many microarchitectures, - // so assume they are expensive by default. if ((Opcode == Instruction::InsertElement || - Opcode == Instruction::ExtractElement) && - ValTy->getVectorElementType()->isIntegerTy()) - return 3; + Opcode == Instruction::ExtractElement)) { + // Cross-class copies are expensive on many microarchitectures, + // so assume they are expensive by default. + if (ValTy->getVectorElementType()->isIntegerTy()) + return 3; + + // Even if it's not a cross-class copy, this likely leads to mixing + // of NEON and VFP code and should therefore be penalized. + if (ValTy->isVectorTy() && + ValTy->getScalarSizeInBits() <= 32) + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + } return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a vector select gets lowered to vbsl. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { // Lowering of some vector selects is currently far from perfect. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 }, + static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } @@ -285,21 +282,20 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, - SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); return LT.first; } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -314,7 +310,7 @@ unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { +int ARMTTIImpl::getFPOpCost(Type *Ty) { // Use similar logic that's in ARMISelLowering: // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access // to VFP.
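// Worked example, not part of the patch: the cost expressions in the
// NEONVectorSelectTbl entries above are left unevaluated in the source,
// presumably so the per-instruction breakdown stays readable. The v4i1/v4i64
// entry, for instance, works out to 4*4 + 1*2 + 1 == 19 estimated
// instructions. A compile-time restatement of that arithmetic:
static_assert(4 * 4 + 1 * 2 + 1 == 19, "v4i1/v4i64 select cost breakdown");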
@@ -333,14 +329,14 @@ unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Expensive; } -unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only handle costs of reverse and alternate shuffles for now. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = { + static const CostTblEntry NEONShuffleTbl[] = { // Reverse shuffles cost one instruction if we are shuffling within a // double word (vrev) or two if we shuffle a quad word (vrev, vext). {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, @@ -353,16 +349,16 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - return LT.first * NEONShuffleTbl[Idx].Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } if (Kind == TTI::SK_Alternate) { - static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = { + static const CostTblEntry NEONAltShuffleTbl[] = { // Alt shuffle cost table for ARM. Cost is the number of instructions // required to create the shuffled vector. @@ -379,27 +375,26 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = - CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - return LT.first * NEONAltShuffleTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned ARMTTIImpl::getArithmeticInstrCost( +int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); const unsigned FunctionCallDivCost = 20; const unsigned ReciprocalDivCost = 10; - static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = { + static const CostTblEntry CostTbl[] = { // Division. // These costs are somewhat random. Choose a cost of 20 to indicate that // vectorizing division (added function call) is going to be very expensive. @@ -440,16 +435,12 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( // Multiplication.
}; - int Idx = -1; - if (ST->hasNEON()) - Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second); - - if (Idx != -1) - return LT.first * CostTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) + return LT.first * Entry->Cost; - unsigned Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. @@ -465,10 +456,9 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( return Cost; } -unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { @@ -479,21 +469,21 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace) { +int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. 
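// Illustrative sketch, not part of the patch: the legality test below,
// restated as standalone arithmetic. Note the hunk above also switches from
// getTypeAllocSizeInBits to getTypeSizeInBits, so alloc padding no longer
// counts toward the subvector size. The helper name is hypothetical:
static bool isVLDNLegalShape(unsigned NumElts, unsigned EltBits,
                             unsigned Factor) {
  if (Factor < 2 || NumElts % Factor != 0)
    return false;
  unsigned SubVecBits = (NumElts / Factor) * EltBits;
  // e.g. v16i8 with Factor == 2 -> v8i8 subvector = 64 bits (supported);
  //      v8i8  with Factor == 2 -> v4i8 subvector = 32 bits (fall back).
  return SubVecBits == 64 || SubVecBits == 128;
}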
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 84f256f..7d8d238 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -41,7 +41,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const ARMTargetLowering *getTLI() const { return TLI; } public: - explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F) + explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,11 +52,13 @@ public: : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} + bool enableInterleavedAccessVectorization() { return true; } + /// \name Scalar TTI Implementations /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); /// @} @@ -92,34 +94,31 @@ public: return 1; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, bool IsComplex); - unsigned getFPOpCost(Type *Ty); + int getFPOpCost(Type *Ty); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index cf6b892..c69a741 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -129,7 +129,6 @@ public: }; class ARMAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; const MCRegisterInfo *MRI; UnwindContext UC; @@ -247,48 +246,49 @@ class ARMAsmParser : public MCTargetAsmParser { OperandVector &Operands); bool isThumb() const { // FIXME: Can tablegen auto-generate this? 
- return STI.getFeatureBits()[ARM::ModeThumb]; + return getSTI().getFeatureBits()[ARM::ModeThumb]; } bool isThumbOne() const { - return isThumb() && !STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool isThumbTwo() const { - return isThumb() && STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool hasThumb() const { - return STI.getFeatureBits()[ARM::HasV4TOps]; + return getSTI().getFeatureBits()[ARM::HasV4TOps]; } bool hasV6Ops() const { - return STI.getFeatureBits()[ARM::HasV6Ops]; + return getSTI().getFeatureBits()[ARM::HasV6Ops]; } bool hasV6MOps() const { - return STI.getFeatureBits()[ARM::HasV6MOps]; + return getSTI().getFeatureBits()[ARM::HasV6MOps]; } bool hasV7Ops() const { - return STI.getFeatureBits()[ARM::HasV7Ops]; + return getSTI().getFeatureBits()[ARM::HasV7Ops]; } bool hasV8Ops() const { - return STI.getFeatureBits()[ARM::HasV8Ops]; + return getSTI().getFeatureBits()[ARM::HasV8Ops]; } bool hasARM() const { - return !STI.getFeatureBits()[ARM::FeatureNoARM]; + return !getSTI().getFeatureBits()[ARM::FeatureNoARM]; } - bool hasThumb2DSP() const { - return STI.getFeatureBits()[ARM::FeatureDSPThumb2]; + bool hasDSP() const { + return getSTI().getFeatureBits()[ARM::FeatureDSP]; } bool hasD16() const { - return STI.getFeatureBits()[ARM::FeatureD16]; + return getSTI().getFeatureBits()[ARM::FeatureD16]; } bool hasV8_1aOps() const { - return STI.getFeatureBits()[ARM::HasV8_1aOps]; + return getSTI().getFeatureBits()[ARM::HasV8_1aOps]; } void SwitchMode() { + MCSubtargetInfo &STI = copySTI(); uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); setAvailableFeatures(FB); } bool isMClass() const { - return STI.getFeatureBits()[ARM::FeatureMClass]; + return getSTI().getFeatureBits()[ARM::FeatureMClass]; } /// @name Auto-generated Match Functions @@ -343,14 +343,15 @@ public: Match_RequiresNotITBlock, Match_RequiresV6, Match_RequiresThumb2, + Match_RequiresV8, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "ARMGenAsmMatcher.inc" }; - ARMAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : STI(STI), MII(MII), UC(Parser) { + : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) { MCAsmParserExtension::Initialize(Parser); // Cache the MCRegisterInfo. 
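// Simplified model, not the real MC API: the parser hunks above drop the
// mutable "MCSubtargetInfo &STI" member. Plain feature queries now go through
// getSTI(), while anything that mutates features (SwitchMode(), the
// .cpu/.fpu/.arch/.arch_extension directives) first calls copySTI() so the
// parser works on a private copy rather than on state shared with other
// consumers. A rough sketch of that copy-on-write discipline (Subtarget is a
// placeholder type):
#include <memory>

struct Subtarget { unsigned long long FeatureBits = 0; };

class SubtargetHandle {
  const Subtarget *Shared = nullptr;        // shared, treated as read-only
  std::unique_ptr<Subtarget> Owned;         // private copy, made on demand
public:
  explicit SubtargetHandle(const Subtarget &S) : Shared(&S) {}
  const Subtarget &get() const {            // analogous to getSTI()
    return Owned ? *Owned : *Shared;
  }
  Subtarget &copy() {                       // analogous to copySTI()
    if (!Owned)
      Owned.reset(new Subtarget(*Shared));  // clone before first mutation
    return *Owned;
  }
};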
@@ -564,87 +565,6 @@ class ARMOperand : public MCParsedAsmOperand { public: ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} - ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_CondCode: - CC = o.CC; - break; - case k_ITCondMask: - ITMask = o.ITMask; - break; - case k_Token: - Tok = o.Tok; - break; - case k_CCOut: - case k_Register: - Reg = o.Reg; - break; - case k_RegisterList: - case k_DPRRegisterList: - case k_SPRRegisterList: - Registers = o.Registers; - break; - case k_VectorList: - case k_VectorListAllLanes: - case k_VectorListIndexed: - VectorList = o.VectorList; - break; - case k_CoprocNum: - case k_CoprocReg: - Cop = o.Cop; - break; - case k_CoprocOption: - CoprocOption = o.CoprocOption; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_MemBarrierOpt: - MBOpt = o.MBOpt; - break; - case k_InstSyncBarrierOpt: - ISBOpt = o.ISBOpt; - case k_Memory: - Memory = o.Memory; - break; - case k_PostIndexRegister: - PostIdxReg = o.PostIdxReg; - break; - case k_MSRMask: - MMask = o.MMask; - break; - case k_BankedReg: - BankedReg = o.BankedReg; - break; - case k_ProcIFlags: - IFlags = o.IFlags; - break; - case k_ShifterImmediate: - ShifterImm = o.ShifterImm; - break; - case k_ShiftedRegister: - RegShiftedReg = o.RegShiftedReg; - break; - case k_ShiftedImmediate: - RegShiftedImm = o.RegShiftedImm; - break; - case k_RotateImmediate: - RotImm = o.RotImm; - break; - case k_ModifiedImmediate: - ModImm = o.ModImm; - break; - case k_BitfieldDescriptor: - Bitfield = o.Bitfield; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - } - } /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } @@ -4054,7 +3974,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { if (FlagsVal == ~0U) return MatchOperand_NoMatch; - if (!hasThumb2DSP() && (FlagsVal & 0x400)) + if (!hasDSP() && (FlagsVal & 0x400)) // The _g and _nzcvqg versions are only valid if the DSP extension is // available. return MatchOperand_NoMatch; @@ -5202,6 +5122,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // FALLTHROUGH } case AsmToken::Colon: { + S = Parser.getTok().getLoc(); // ":lower16:" and ":upper16:" expression prefixes // FIXME: Check it's an expression prefix, // e.g. (FOO - :lower16:BAR) isn't legal. @@ -5220,8 +5141,9 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return false; } case AsmToken::Equal: { + S = Parser.getTok().getLoc(); if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. 
ldr r0, =val) - return Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return Error(S, "unexpected token in operand"); Parser.Lex(); // Eat '=' const MCExpr *SubExprVal; @@ -5229,7 +5151,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal); + const MCExpr *CPLoc = + getTargetStreamer().addConstantPoolEntry(SubExprVal, S); Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E)); return false; } @@ -5682,9 +5605,11 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") { + (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) { if (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32") + (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16")) RegIdx = 4; if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() && @@ -8610,18 +8535,29 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR && inITBlock()) return Match_RequiresNotITBlock; + } else if (isThumbOne()) { + // Some high-register supporting Thumb1 encodings only allow both registers + // to be from r0-r7 when in Thumb2. + if (Opc == ARM::tADDhirr && !hasV6MOps() && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) + return Match_RequiresThumb2; + // Others only require ARMv6 or later. + else if (Opc == ARM::tMOVr && !hasV6Ops() && + isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg())) + return Match_RequiresV6; } - // Some high-register supporting Thumb1 encodings only allow both registers - // to be from r0-r7 when in Thumb2. - else if (Opc == ARM::tADDhirr && isThumbOne() && !hasV6MOps() && - isARMLowRegister(Inst.getOperand(1).getReg()) && - isARMLowRegister(Inst.getOperand(2).getReg())) - return Match_RequiresThumb2; - // Others only require ARMv6 or later. 
- else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() && - isARMLowRegister(Inst.getOperand(0).getReg()) && - isARMLowRegister(Inst.getOperand(1).getReg())) - return Match_RequiresV6; + + for (unsigned I = 0; I < MCID.NumOperands; ++I) + if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) { + // rGPRRegClass excludes PC, and also excluded SP before ARMv8 + if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops()) + return Match_RequiresV8; + else if (Inst.getOperand(I).getReg() == ARM::PC) + return Match_InvalidOperand; + } + return Match_Success; } @@ -8680,7 +8616,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { assert(ErrorInfo && "Unknown missing feature!"); @@ -8720,6 +8656,8 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_RequiresV8: + return Error(IDLoc, "instruction variant requires ARMv8 or later"); case Match_ImmRange0_15: { SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; @@ -8868,7 +8806,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { return false; } - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -9098,7 +9036,7 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { bool ARMAsmParser::parseDirectiveArch(SMLoc L) { StringRef Arch = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(L, "Unknown arch name"); @@ -9106,7 +9044,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) { } Triple T; - STI.setDefaultFeatures(T.getARMCPUForArch(Arch)); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str()); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); getTargetStreamer().emitArch(ID); @@ -9233,12 +9172,13 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { // FIXME: This is using table-gen data, but should be moved to // ARMTargetParser once that is table-gen'd. 
- if (!STI.isCPUStringValid(CPU)) { + if (!getSTI().isCPUStringValid(CPU)) { Error(L, "Unknown CPU name"); return false; } - STI.setDefaultFeatures(CPU); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); return false; @@ -9249,13 +9189,14 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { SMLoc FPUNameLoc = getTok().getLoc(); StringRef FPU = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseFPU(FPU); + unsigned ID = ARM::parseFPU(FPU); std::vector<const char *> Features; - if (!ARMTargetParser::getFPUFeatures(ID, Features)) { + if (!ARM::getFPUFeatures(ID, Features)) { Error(FPUNameLoc, "Unknown FPU name"); return false; } + MCSubtargetInfo &STI = copySTI(); for (auto Feature : Features) STI.ApplyFeatureFlag(Feature); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -9895,7 +9836,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) { SMLoc ArchLoc = Parser.getTok().getLoc(); getLexer().Lex(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(ArchLoc, "unknown architecture '" + Arch + "'"); @@ -9976,22 +9917,22 @@ extern "C" void LLVMInitializeARMAsmParser() { // when we start to table-generate them, and we can use the ARM // flags below, that were generated by table-gen. static const struct { - const ARM::ArchExtKind Kind; - const unsigned ArchCheck; + const unsigned Kind; + const uint64_t ArchCheck; const FeatureBitset Features; } Extensions[] = { { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, { ARM::AEK_CRYPTO, Feature_HasV8, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, - { ARM::AEK_HWDIV, Feature_HasV7 | Feature_IsNotMClass, + { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, - // FIXME: Also available in ARMv6-K - { ARM::AEK_SEC, Feature_HasV7, {ARM::FeatureTrustZone} }, + { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. { ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, @@ -10020,7 +9961,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { EnableFeature = false; Name = Name.substr(2); } - unsigned FeatureKind = ARMTargetParser::parseArchExt(Name); + unsigned FeatureKind = ARM::parseArchExt(Name); if (FeatureKind == ARM::AEK_INVALID) Error(ExtLoc, "unknown architectural extension: " + Name); @@ -10037,6 +9978,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { return false; } + MCSubtargetInfo &STI = copySTI(); FeatureBitset ToggleFeatures = EnableFeature ? 
(~STI.getFeatureBits() & Extension.Features) : ( STI.getFeatureBits() & Extension.Features); @@ -10078,6 +10020,10 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, "expression value must be representable in 32 bits"); } break; + case MCK_rGPR: + if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP) + return Match_Success; + break; case MCK_GPRPair: if (Op.isReg() && MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg())) diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 097ec04..e63defe 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -59,7 +59,7 @@ namespace { } // Called when decoding an IT instruction. Sets the IT state for the following - // instructions that form the IT block. Firstcond and Mask correspond to the + // instructions that form the IT block. Firstcond and Mask correspond to the // fields in the IT instruction encoding. void setITState(char Firstcond, char Mask) { // (3 - the number of trailing zeros) is the number of then / else. @@ -459,21 +459,18 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // VFP and NEON instructions, similarly, are shared between ARM // and Thumb modes. - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -485,7 +482,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -497,7 +493,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -509,7 +504,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -517,7 +511,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -525,7 +518,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -718,7 +710,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, STI); if (Result) { @@ -729,7 +720,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -763,7 +753,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, uint32_t Insn32 = (Bytes[3] << 8)
| (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16); - MI.clear(); Result = decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -774,7 +763,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -784,7 +772,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -794,7 +781,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -803,7 +789,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -814,7 +799,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { - MI.clear(); uint32_t NEONLdStInsn = Insn32; NEONLdStInsn &= 0xF0FFFFFF; NEONLdStInsn |= 0x04000000; @@ -828,7 +812,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { - MI.clear(); uint32_t NEONDataInsn = Insn32; NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -841,7 +824,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONCryptoInsn = Insn32; NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -853,7 +835,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONv8Insn = Insn32; NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, @@ -864,7 +845,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -902,7 +882,7 @@ static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - + if (RegNo == 15) S = MCDisassembler::SoftFail; @@ -986,8 +966,13 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - if (RegNo == 13 || RegNo == 15) + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15) S = MCDisassembler::SoftFail; + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); return S; } @@ -1147,7 +1132,7 @@ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, unsigned imm = fieldFromInstruction(Val, 7, 5); // Register-immediate - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + 
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; ARM_AM::ShiftOpc Shift = ARM_AM::lsl; @@ -1658,7 +1643,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, case ARM::STRD_POST: if (P == 0 && W == 1) S = MCDisassembler::SoftFail; - + if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2)) S = MCDisassembler::SoftFail; if (type && Rm == 15) @@ -4131,7 +4116,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, // indicates the move for the GE{3:0} bits, the mask{0} bit can be set // only if the processor includes the DSP extension. if (Mask == 0 || (Mask != 2 && ValLow > 3) || - (!(FeatureBits[ARM::FeatureDSPThumb2]) && (Mask & 1))) + (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1))) S = MCDisassembler::SoftFail; } } @@ -5065,6 +5050,10 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5075,10 +5064,35 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv2f32 is ambiguous with these decodings. - if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv2f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv2f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv1i64); + } else { + Inst.setOpcode(ARM::VMOVv8i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5095,6 +5109,10 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5105,10 +5123,35 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv4f32 is ambiguous with these decodings. 
- if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv4f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv4f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv2i64); + } else { + Inst.setOpcode(ARM::VMOVv16i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5132,7 +5175,7 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, unsigned Rm = fieldFromInstruction(Val, 0, 4); Rm |= (fieldFromInstruction(Val, 23, 1) << 4); unsigned Cond = fieldFromInstruction(Val, 28, 4); - + if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt) S = MCDisassembler::SoftFail; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 0bff521..33fc85a 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -804,7 +805,7 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, unsigned Opcode = MI->getOpcode(); // For writes, handle extended mask bits if the DSP extension is present. 
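// Worked detail, not part of the patch: both the parser change earlier
// (hasThumb2DSP() -> hasDSP()) and the printer hunk below hinge on the same
// encoding bit. Bit 0x400 in the special-register value selects the "_g"
// forms (apsr_g, apsr_nzcvqg, ...) that write the GE[3:0] bits, and those
// forms exist only when the DSP extension is present. Restated (helper name
// is illustrative):
static bool maskNeedsDSP(unsigned SYSm) {
  return (SYSm & 0x400) != 0;   // the "_g" bit tested in these hunks
}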
- if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSPThumb2]) { + if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) { switch (SYSm) { case 0x400: O << "apsr_g"; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 3927c9f..52f7115 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class ARMInstPrinter : public MCInstPrinter { public: ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1114635..fa52c93 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -25,13 +25,17 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MachO.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -180,9 +184,8 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const { return false; } -bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { +const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const { switch ((unsigned)Fixup.getKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the @@ -192,7 +195,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 2046 || Offset < -2048; + if (Offset > 2046 || Offset < -2048) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_arm_thumb_bcc: { // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the @@ -202,23 +207,40 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 254 || Offset < -256; + if (Offset > 254 || Offset < -256) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: { // If the immediate is negative, greater than 1020, or not a multiple // of four, the wide version of the instruction must be used. int64_t Offset = int64_t(Value) - 4; - return Offset > 1020 || Offset < 0 || Offset & 3; + if (Offset & 3) + return "misaligned pc-relative fixup value"; + else if (Offset > 1020 || Offset < 0) + return "out of range pc-relative fixup value"; + break; } - case ARM::fixup_arm_thumb_cb: + case ARM::fixup_arm_thumb_cb: { // If we have a Thumb CBZ or CBNZ instruction and its target is the next // instruction, it is actually out of range for the instruction.
// It will be changed to a NOP. int64_t Offset = (Value & ~1); - return Offset == 2; + if (Offset == 2) + return "will be converted to nop"; + break; } - llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!"); + default: + llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!"); + } + return nullptr; +} + +bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + return reasonForFixupRelaxation(Fixup, Value); } void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { @@ -317,9 +339,10 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - bool IsPCRel, MCContext *Ctx, - bool IsLittleEndian) { +unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + bool IsPCRel, MCContext *Ctx, + bool IsLittleEndian, + bool IsResolved) const { unsigned Kind = Fixup.getKind(); switch (Kind) { default: @@ -372,8 +395,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; isAdd = false; } - if (Ctx && Value >= 4096) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 4096) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, @@ -383,8 +408,6 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return Value; } - case ARM::fixup_thumb_adr_pcrel_10: - return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_adr_pcrel_12: { // ARM PC-relative values are offset by 8. Value -= 8; @@ -393,8 +416,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; opc = 2; // 0b0010 } - if (Ctx && ARM_AM::getSOImmVal(Value) == -1) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } // Encode the immediate and shift the opcode into place. return ARM_AM::getSOImmVal(Value) | (opc << 21); } @@ -517,21 +542,44 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, ((uint16_t)imm10LBits) << 1); return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); } + case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: - // Offset by 4, and don't encode the low two bits. Two bytes of that - // 'off by 4' is implicitly handled by the half-word ordering of the - // Thumb encoding, so we only need to adjust by 2 here. - return ((Value - 2) >> 2) & 0xff; + // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we + // could have an error on our hands. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + // Offset by 4, and don't encode the low two bits. + return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_thumb_cb: { // Offset by 4 and don't encode the lower bit, which is always 0. + // FIXME: diagnose if no Thumb2 uint32_t Binary = (Value - 4) >> 1; return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); } case ARM::fixup_arm_thumb_br: // Offset by 4 and don't encode the lower bit, which is always 0. 
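// Worked example, not part of the patch (helper name illustrative): the
// "offset by 4" in these Thumb fixup cases is the PC bias. When the branch
// executes, PC reads as the instruction address + 4, so the encoder stores
// (Value - 4) and then drops bit 0, which is always zero. For a tB target
// 260 bytes ahead:
//   Offset = 260 - 4 = 256; encoded field = 256 >> 1 = 128
// which fits the 11-bit field; offsets outside roughly +/-2 KiB are what
// reasonForFixupRelaxation() above reports as "out of range".
static unsigned encodeThumbBrField(unsigned long long Value) {
  return ((Value - 4) >> 1) & 0x7ff;   // same arithmetic as the case below
}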
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0x7ff; case ARM::fixup_arm_thumb_bcc: // Offset by 4 and don't encode the lower bit, which is always 0. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0xff; case ARM::fixup_arm_pcrel_10_unscaled: { Value = Value - 8; // ARM fixups offset by an additional word and don't @@ -542,8 +590,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, isAdd = false; } // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } @@ -561,8 +611,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } // These values don't encode the low two bits since they're always zero. Value >>= 2; - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords @@ -582,6 +634,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, const MCValue &Target, uint64_t &Value, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); + const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; // Some fixups to thumb function symbols need the low bit (thumb bit) // twiddled. if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && @@ -590,18 +643,21 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { - if (A) { - const MCSymbol &Sym = A->getSymbol(); - if (Asm.isThumbFunc(&Sym)) + if (Sym) { + if (Asm.isThumbFunc(Sym)) Value |= 1; } } - // For Thumb1 BL instruction, it is possible to be a long jump between - // the basic blocks of the same function. Thus, we would like to resolve - // the offset when the destination has the same MCFragment. - if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { - const MCSymbol &Sym = A->getSymbol(); - IsResolved = (Sym.getFragment() == DF); + if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + assert(Sym && "How did we resolve this?"); + + // If the symbol is external the linker will handle it. + // FIXME: Should we handle it as an optimization? + + // If the symbol is out of range, produce a relocation and hope the + // linker can handle it. GNU AS produces an error in this case. 
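// Worked equation, flagged as an interpretation rather than something the
// patch states: the 0x400004 bound below appears to be the +/-4 MiB reach of
// the 22-bit Thumb BL halfword offset plus the 4-byte PC bias, i.e.
//   4 * 1024 * 1024 + 4 == 0x400004.
// Targets at or beyond that distance (or external symbols) are left
// unresolved so a relocation is emitted and the linker decides, instead of
// the assembler erroring out the way GNU as does.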
+ if (Sym->isExternal() || Value >= 0x400004) + IsResolved = false; } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination @@ -616,7 +672,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // the instruction. This allows adjustFixupValue() to issue a diagnostic // if the value is invalid. (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(), - IsLittleEndian); + IsLittleEndian, IsResolved); } /// getFixupKindNumBytes - The number of bytes the fixup may change. @@ -719,7 +775,8 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian); + Value = + adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. @@ -743,6 +800,249 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + UNWIND_ARM_MODE_MASK = 0x0F000000, + UNWIND_ARM_MODE_FRAME = 0x01000000, + UNWIND_ARM_MODE_FRAME_D = 0x02000000, + UNWIND_ARM_MODE_DWARF = 0x04000000, + + UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000, + + UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001, + UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002, + UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004, + + UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008, + UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010, + UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020, + UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040, + UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080, + + UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00, + + UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF +}; + +} // end CU namespace + +/// Generate compact unwind encoding for the function based on the CFI +/// instructions. If the CFI instructions describe a frame that cannot be +/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which +/// tells the runtime to fall back and unwind using DWARF. +uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n"); + // Only armv7k uses CFI-based unwinding. + if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K) + return 0; + // No .cfi directives means no frame. + if (Instrs.empty()) + return 0; + // Start off assuming CFA is at SP+0. + int CFARegister = ARM::SP; + int CFARegisterOffset = 0; + // Mark savable registers as initially unsaved + DenseMap<unsigned, int> RegOffsets; + int FloatRegCount = 0; + // Process each .cfi directive and build up compact unwind info.
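// Worked example, not part of the patch, assuming the usual armv7k prologue.
// For a standard frame:
//     push {r7, lr}        ->  .cfi_def_cfa_offset 8
//                              .cfi_offset lr, -4
//                              .cfi_offset r7, -8
//     mov  r7, sp          ->  .cfi_def_cfa_register r7
// the loop below finishes with CFARegister == r7 and CFARegisterOffset == 8,
// so StackAdjust == 8 - 8 == 0, and the lr/r7 checks that follow
// (lr at -4 - StackAdjust, r7 at -8 - StackAdjust) accept the frame as
// UNWIND_ARM_MODE_FRAME.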
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + int Reg; + const MCCFIInstruction &Inst = Instrs[i]; + switch (Inst.getOperation()) { + case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa + CFARegisterOffset = -Inst.getOffset(); + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset + CFARegisterOffset = -Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpOffset: // DW_CFA_offset + Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RegOffsets[Reg] = Inst.getOffset(); + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + RegOffsets[Reg] = Inst.getOffset(); + ++FloatRegCount; + } else { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << ".cfi_offset on unknown register=" + << Inst.getRegister() << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + break; + case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc + // Ignore + break; + default: + // Directive not convertible to compact unwind, bail out. + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "CFI directive not compatible with compact " "unwind encoding, opcode=" << Inst.getOperation() + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + break; + } + } + + // If no frame set up, return no unwind info. + if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0)) + return 0; + + // Verify standard frame (lr/r7) was used. + if (CFARegister != ARM::R7) { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " + << CFARegister + << " instead of r7\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + int StackAdjust = CFARegisterOffset - 8; + if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "LR not saved as standard frame, StackAdjust=" + << StackAdjust + << ", CFARegisterOffset=" << CFARegisterOffset + << ", lr save at offset=" << RegOffsets[14] << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "r7 not saved as standard frame\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME; + + // If var-args are used, there may be a stack adjust required. + switch (StackAdjust) { + case 0: + break; + case 4: + CompactUnwindEncoding |= 0x00400000; + break; + case 8: + CompactUnwindEncoding |= 0x00800000; + break; + case 12: + CompactUnwindEncoding |= 0x00C00000; + break; + default: + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() + << ".cfi_def_cfa stack adjust (" + << StackAdjust << ") out of range\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // If r6 is saved, it must be right below r7.
+ static struct { + unsigned Reg; + unsigned Encoding; + } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6}, + {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5}, + {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4}, + {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12}, + {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11}, + {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10}, + {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9}, + {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}}; + + int CurOffset = -8 - StackAdjust; + for (auto CSReg : GPRCSRegs) { + auto Offset = RegOffsets.find(CSReg.Reg); + if (Offset == RegOffsets.end()) + continue; + + int RegOffset = Offset->second; + if (RegOffset != CurOffset - 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at " + << RegOffset << " but only supported at " + << CurOffset << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CompactUnwindEncoding |= CSReg.Encoding; + CurOffset -= 4; + } + + // If no floats saved, we are done. + if (FloatRegCount == 0) + return CompactUnwindEncoding; + + // Switch mode to include D register saving. + CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK; + CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D; + + // FIXME: supporting more than 4 saved D-registers compactly would be trivial, + // but needs coordination with the linker and libunwind. + if (FloatRegCount > 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "unsupported number of D registers saved (" + << FloatRegCount << ")\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // Floating point registers must either be saved sequentially, or we defer to + // DWARF. No gaps allowed here so check that each saved d-register is + // precisely where it should be. + static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 }; + for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) { + auto Offset = RegOffsets.find(FPRCSRegs[Idx]); + if (Offset == RegOffsets.end()) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " not saved\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } else if (Offset->second != CurOffset - 8) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " saved at " << Offset->second + << ", expected at " << CurOffset - 8 + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CurOffset -= 8; + } + + return CompactUnwindEncoding | ((FloatRegCount - 1) << 8); +} + +static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { + unsigned AK = ARM::parseArch(Arch); + switch (AK) { + default: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV4T: + return MachO::CPU_SUBTYPE_ARM_V4T; + case ARM::AK_ARMV5T: + case ARM::AK_ARMV5TE: + case ARM::AK_ARMV5TEJ: + return MachO::CPU_SUBTYPE_ARM_V5; + case ARM::AK_ARMV6: + case ARM::AK_ARMV6K: + return MachO::CPU_SUBTYPE_ARM_V6; + case ARM::AK_ARMV7A: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV7S: + return MachO::CPU_SUBTYPE_ARM_V7S; + case ARM::AK_ARMV7K: + return MachO::CPU_SUBTYPE_ARM_V7K; + case ARM::AK_ARMV6M: + return MachO::CPU_SUBTYPE_ARM_V6M; + case ARM::AK_ARMV7M: + return MachO::CPU_SUBTYPE_ARM_V7M; + case ARM::AK_ARMV7EM: + return MachO::CPU_SUBTYPE_ARM_V7EM; + } +} + MCAsmBackend *llvm::createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, StringRef CPU, @@ -751,19 +1051,8 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, default: 
llvm_unreachable("unsupported object format"); case Triple::MachO: { - MachO::CPUSubTypeARM CS = - StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName()) - .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T) - .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ) - .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6) - .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M) - .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM) - .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K) - .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M) - .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S) - .Default(MachO::CPU_SUBTYPE_ARM_V7); - - return new ARMAsmBackendDarwin(T, TheTriple, CS); + MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); + return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS); } case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 6b4abd5..28a6213 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -45,6 +45,10 @@ public: const MCValue &Target, uint64_t &Value, bool &IsResolved) override; + unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + MCContext *Ctx, bool IsLittleEndian, + bool IsResolved) const; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; @@ -52,6 +56,9 @@ public: bool mayNeedRelaxation(const MCInst &Inst) const override; + const char *reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index a6206e3..995dd0f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -16,11 +16,12 @@ using namespace llvm; namespace { class ARMAsmBackendDarwin : public ARMAsmBackend { + const MCRegisterInfo &MRI; public: const MachO::CPUSubTypeARM Subtype; ARMAsmBackendDarwin(const Target &T, const Triple &TT, - MachO::CPUSubTypeARM st) - : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) { + const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) + : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) { HasDataInCodeSupport = true; } @@ -28,6 +29,9 @@ public: return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM, Subtype); } + + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override; }; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 804d353..52eba8be 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -95,7 +95,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_ARM_TLS_IE32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; } @@ -192,7 +192,7 @@ unsigned 
ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_ARM_GOTOFF32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; case MCSymbolRefExpr::VK_ARM_TARGET1: diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index d17fdb9..57577dc 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitAttribute(unsigned Attribute, unsigned Value) override; void emitTextAttribute(unsigned Attribute, StringRef String) override; void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, - StringRef StrinValue) override; + StringRef StringValue) override; void emitArch(unsigned Arch) override; void emitArchExtension(unsigned ArchExt) override; void emitObjectArch(unsigned Arch) override; @@ -195,16 +195,16 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute, OS << "\n"; } void ARMTargetAsmStreamer::emitArch(unsigned Arch) { - OS << "\t.arch\t" << ARMTargetParser::getArchName(Arch) << "\n"; + OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n"; } void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) { - OS << "\t.arch_extension\t" << ARMTargetParser::getArchExtName(ArchExt) << "\n"; + OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n"; } void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) { - OS << "\t.object_arch\t" << ARMTargetParser::getArchName(Arch) << '\n'; + OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n'; } void ARMTargetAsmStreamer::emitFPU(unsigned FPU) { - OS << "\t.fpu\t" << ARMTargetParser::getFPUName(FPU) << "\n"; + OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n"; } void ARMTargetAsmStreamer::finishAttributeSection() { } @@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, class ARMTargetELFStreamer : public ARMTargetStreamer { private: // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them + // their string/numeric value, so we can later emit them // in declaration order, keeping all in the same vector struct AttributeItem { enum { @@ -254,7 +254,7 @@ private: } Type; unsigned Tag; unsigned IntValue; - StringRef StringValue; + std::string StringValue; static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { // The conformance tag must be emitted first when serialised @@ -388,6 +388,9 @@ private: size_t calculateContentSize() const; + // Reset state between object emissions + void reset() override; + public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID), @@ -415,7 +418,7 @@ public: MCCodeEmitter *Emitter, bool IsThumb) : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { - Reset(); + EHReset(); } ~ARMELFStreamer() {} @@ -507,14 +510,15 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. 
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) - if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) - getContext().reportFatalError(Loc, "relocated expression must be 32-bit"); + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) { + getContext().reportError(Loc, "relocated expression must be 32-bit"); + return; + } EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } void EmitAssemblerFlag(MCAssemblerFlag Flag) override { @@ -578,7 +582,10 @@ private: } // Helper functions for ARM exception handling directives - void Reset(); + void EHReset(); + + // Reset state between object emissions + void reset() override; void EmitPersonalityFixup(StringRef Name); void FlushPendingOffset(); @@ -684,16 +691,16 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { using namespace ARMBuildAttrs; setAttributeItem(CPU_name, - ARMTargetParser::getCPUAttr(Arch), + ARM::getCPUAttr(Arch), false); if (EmittedArch == ARM::AK_INVALID) setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(Arch), + ARM::getArchAttr(Arch), false); else setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(EmittedArch), + ARM::getArchAttr(EmittedArch), false); switch (Arch) { @@ -702,7 +709,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV3: case ARM::AK_ARMV3M: case ARM::AK_ARMV4: - case ARM::AK_ARMV5: setAttributeItem(ARM_ISA_use, Allowed, false); break; @@ -710,7 +716,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV5T: case ARM::AK_ARMV5TE: case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); break; @@ -721,8 +726,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { break; case ARM::AK_ARMV6K: - case ARM::AK_ARMV6Z: - case ARM::AK_ARMV6ZK: + case ARM::AK_ARMV6KZ: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); setAttributeItem(Virtualization_use, AllowTZ, false); @@ -732,10 +736,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { setAttributeItem(THUMB_ISA_use, Allowed, false); break; - case ARM::AK_ARMV7: - setAttributeItem(THUMB_ISA_use, AllowThumb32, false); - break; - case ARM::AK_ARMV7A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); @@ -755,6 +755,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -1045,6 +1046,8 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { getStreamer().emitInst(Inst, Suffix); } +void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } + void ARMELFStreamer::FinishImpl() { MCTargetStreamer &TS = *getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); @@ -1053,6 +1056,18 @@ void ARMELFStreamer::FinishImpl() { MCELFStreamer::FinishImpl(); } +void ARMELFStreamer::reset() { + MCTargetStreamer &TS = *getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + ATS.reset(); + MappingSymbolCounter = 0; 
+ MCELFStreamer::reset();
+ // MCELFStreamer clears the assembler's e_flags. However, for
+ // ARM we manually set the ABI version on streamer creation, so
+ // do the same here.
+ getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+}
+
 inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
 unsigned Type,
 unsigned Flags,
@@ -1084,19 +1099,14 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
 }
 inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.extab",
- ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getDataRel(),
- FnStart);
+ SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC,
+ SectionKind::getData(), FnStart);
 }
 inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.exidx",
- ELF::SHT_ARM_EXIDX,
+ SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX,
 ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
- SectionKind::getDataRel(),
- FnStart);
+ SectionKind::getData(), FnStart);
 }
 void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
 MCDataFragment *Frag = getOrCreateDataFragment();
@@ -1104,7 +1114,7 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
 Kind));
 }
-void ARMELFStreamer::Reset() {
+void ARMELFStreamer::EHReset() {
 ExTab = nullptr;
 FnStart = nullptr;
 Personality = nullptr;
@@ -1174,7 +1184,7 @@ void ARMELFStreamer::emitFnEnd() {
 SwitchSection(&FnStart->getSection());
 // Clean exception handling frame information
- Reset();
+ EHReset();
 }
 void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 1ac0815..bda37f6 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -33,7 +33,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
 SupportsDebugInformation = true;
 // Exceptions handling
- ExceptionsType = ExceptionHandling::SjLj;
+ ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS()
+ ?
ExceptionHandling::SjLj + : ExceptionHandling::DwarfCFI; UseIntegratedAssembler = true; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index 99a5fff..5e54816 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -19,34 +19,37 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); - }; - - class ARMELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit ARMELFMCAsmInfo(const Triple &TT); - - void setUseIntegratedAssembler(bool Value) override; - }; - - class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoMicrosoft(); - }; - - class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoGNU(); - }; +class Triple; + +class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); +}; + +class ARMELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit ARMELFMCAsmInfo(const Triple &TT); + + void setUseIntegratedAssembler(bool Value) override; +}; + +class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoMicrosoft(); +}; + +class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoGNU(); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index 9146d4d..75dde80 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -63,8 +63,8 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS ARMMCExprs at the moment. 
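(The ARMMCTargetDesc.cpp hunk below drops the hand-written sub-arch switch in ParseARMTriple in favour of the llvm/Support/TargetParser.h lookups. A minimal sketch of the new derivation, using only the parseArch/getArchName calls visible in the hunk; buildArchFeature is a hypothetical helper, not part of the patch:)

// Sketch only -- mirrors the logic added to ARM_MC::ParseARMTriple below.
#include <string>
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/TargetParser.h"

static std::string buildArchFeature(llvm::StringRef ArchName,
                                    llvm::StringRef CPU) {
  // parseArch maps e.g. "armv6m" to ARM::AK_ARMV6M; unknown names yield
  // AK_INVALID, and a specific CPU overrides the arch-derived features.
  unsigned ArchID = llvm::ARM::parseArch(ArchName);
  if (ArchID == llvm::ARM::AK_INVALID || !(CPU.empty() || CPU == "generic"))
    return std::string();
  // The canonical architecture name becomes a single "+<arch>" subtarget
  // feature string, e.g. "+armv6m".
  return ("+" + llvm::ARM::getArchName(ArchID)).str();
}

(For a generic CPU this replaces the per-sub-arch feature lists of the deleted switch with one arch-named feature, leaving exact feature selection to the subtarget.)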
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 21c9fc1..8c8c249 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -134,101 +135,11 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { bool isThumb = TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb; - bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; - switch (TT.getSubArch()) { - default: - llvm_unreachable("invalid sub-architecture for ARM"); - case Triple::ARMSubArch_v8: - if (NoCPU) - // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC - ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8"; - break; - case Triple::ARMSubArch_v8_1a: - if (NoCPU) - // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a - ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8.1a"; - break; - case Triple::ARMSubArch_v7m: - isThumb = true; - if (NoCPU) - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7em: - if (NoCPU) - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7s: - if (NoCPU) - // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS - // Swift - ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7: - // v7 CPUs have lots of different feature sets. If no CPU is specified, - // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return - // the "minimum" feature set and use CPU string to figure out the exact - // features. - if (NoCPU) - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; - else - // Use CPU to figure out the exact features. 
- ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v6t2: - ARMArchFeature = "+v6t2"; - break; - case Triple::ARMSubArch_v6k: - ARMArchFeature = "+v6k"; - break; - case Triple::ARMSubArch_v6m: - isThumb = true; - if (NoCPU) - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6m,+noarm,+mclass"; - else - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v6: - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v5te: - ARMArchFeature = "+v5te"; - break; - case Triple::ARMSubArch_v5: - ARMArchFeature = "+v5t"; - break; - case Triple::ARMSubArch_v4t: - ARMArchFeature = "+v4t"; - break; - case Triple::NoSubArch: - break; - } + + unsigned ArchID = ARM::parseArch(TT.getArchName()); + if (ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic")) + ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str(); if (isThumb) { if (ARMArchFeature.empty()) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index fd30623..c2bbc8e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -86,7 +86,8 @@ MCAsmBackend *createThumbBEAsmBackend(const Target &T, // object file. MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll); + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); /// Construct an ELF Mach-O object writer. MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 95d7ea7..cfd504e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -150,10 +150,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint32_t Value2 = 0; @@ -163,10 +165,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_HALF_SECTDIFF; @@ -251,10 +255,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. 
const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -265,10 +271,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_SECTDIFF; @@ -346,13 +354,15 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Log2Size; unsigned RelocType = MachO::ARM_RELOC_VANILLA; - if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) + if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) { // If we failed to get fixup kind info, it's because there's no legal // relocation type for the fixup kind. This happens when it's a fixup that's // expected to always be resolvable at assembly time and not have any // relocations needed. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation on symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation on symbol"); + return; + } // If this is a difference or a defined symbol plus an offset, then we need a // scattered relocation entry. Differences always require scattered diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b680db5..c0d10c8 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -27,8 +27,8 @@ ARMTargetStreamer::~ARMTargetStreamer() {} // The constant pool handling is shared by all ARMTargetStreamer // implementations. -const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) { - return ConstantPools->addEntry(Streamer, Expr, 4); +const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, 4, Loc); } void ARMTargetStreamer::emitCurrentConstantPool() { @@ -38,6 +38,9 @@ void ARMTargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); } +// reset() - Reset any state +void ARMTargetStreamer::reset() {} + // The remaining callbacks should be handled separately by each // streamer. 
void ARMTargetStreamer::emitFnStart() {} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b993b1b..83fa084 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -37,11 +37,11 @@ void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { } } -MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); +MCStreamer *llvm::createARMWinCOFFStreamer( + MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) { + auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 3b4358b..93e0ac4 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -13,6 +13,7 @@ #include "Thumb1FrameLowering.h" #include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -84,7 +85,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -100,7 +100,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, assert(NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -168,8 +172,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { ++MBBI; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); } // Determine starting offsets of spill areas. @@ -232,11 +234,10 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } } - // Adjust FP so it point to the stack slot that contains the previous FP. 
if (HasFP) { - FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) - + GPRCS1Size + ArgRegsSaveSize; + FramePtrOffsetInBlock += + MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize; AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4) .setMIFlags(MachineInstr::FrameSetup)); @@ -321,11 +322,8 @@ static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ThumbRegisterInfo *RegInfo = @@ -377,9 +375,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, ARM::SP) .addReg(FramePtr)); } else { - if (MBBI->getOpcode() == ARM::tBX_RET && - &MBB.front() != MBBI && - std::prev(MBBI)->getOpcode() == ARM::tPOP) { + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) { MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes)) emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); @@ -388,66 +385,189 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, } } - bool IsV4PopReturn = false; - for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) + if (needPopSpecialFixUp(MF)) { + bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true); + (void)Done; + assert(Done && "Emission of the special fixup failed!?"); + } +} + +bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + if (!needPopSpecialFixUp(*MBB.getParent())) + return true; + + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false); +} + +bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { + ARMFunctionInfo *AFI = + const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>(); + if (AFI->getArgRegsSaveSize()) + return true; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. + for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps(); - - // Unlike T2 and ARM mode, the T1 pop instruction cannot restore - // to LR, and we can't pop the value directly to the PC since - // we need to update the SP after popping the value. So instead - // we have to emit: - // POP {r3} - // ADD sp, #offset - // BX r3 - // If this would clobber a return value, then generate this sequence instead: - // MOV ip, r3 - // POP {r3} - // ADD sp, #offset - // MOV lr, r3 - // MOV r3, ip - // BX lr - if (ArgRegsSaveSize || IsV4PopReturn) { - // Get the last instruction, tBX_RET - MBBI = MBB.getLastNonDebugInstr(); - assert (MBBI->getOpcode() == ARM::tBX_RET); - DebugLoc dl = MBBI->getDebugLoc(); - - if (AFI->getReturnRegsCount() <= 3) { - // Epilogue: pop saved LR to R3 and branch off it. 
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); - - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); - - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX)) - .addReg(ARM::R3, RegState::Kill); - AddDefaultPred(MIB); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction - MBB.erase(MBBI); - } else { - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::R12, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + return true; + + return false; +} + +bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, + bool DoIt) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore + // LR in the PC. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. + // Otherwise, we need a temporary register to pop the value + // and copy that value into LR. + auto MBBI = MBB.getFirstTerminator(); + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + auto MBBI_prev = MBBI; + MBBI_prev--; + assert(MBBI_prev->getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET. + else + CanRestoreDirectly = false; + } + } - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) + return true; + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef())) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). + MBB.erase(MBBI); + return true; + } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::LR, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + // Look for a temporary register to use. + // First, compute the liveness information. + LivePhysRegs UsedRegs(STI.getRegisterInfo()); + UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true); + // The semantic of pristines changed recently and now, + // the callee-saved registers that are touched in the function + // are not part of the pristines set anymore. + // Add those callee-saved now. + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSRegs[i]; ++i) + UsedRegs.addReg(CSRegs[i]); + + DebugLoc dl = DebugLoc(); + if (MBBI != MBB.end()) { + dl = MBBI->getDebugLoc(); + auto InstUpToMBBI = MBB.end(); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. 
+ // We want to have the liveness right before MBBI.
+ UsedRegs.stepBackward(*--InstUpToMBBI);
+ }
 
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::R3, RegState::Define)
- .addReg(ARM::R12, RegState::Kill));
- // Keep the tBX_RET instruction
+ // Look for a register that can be directly used in the POP.
+ unsigned PopReg = 0;
+ // And some temporary register, just in case.
+ unsigned TemporaryReg = 0;
+ BitVector PopFriendly =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+ assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+ // Rebuild the GPRs from the high registers because they are removed
+ // from the GPR reg class for thumb1.
+ BitVector GPRsNoLRSP =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+ GPRsNoLRSP |= PopFriendly;
+ GPRsNoLRSP.reset(ARM::LR);
+ GPRsNoLRSP.reset(ARM::SP);
+ GPRsNoLRSP.reset(ARM::PC);
+ for (int Register = GPRsNoLRSP.find_first(); Register != -1;
+ Register = GPRsNoLRSP.find_next(Register)) {
+ if (!UsedRegs.contains(Register)) {
+ // Remember the first pop-friendly register and exit.
+ if (PopFriendly.test(Register)) {
+ PopReg = Register;
+ TemporaryReg = 0;
+ break;
+ }
+ // Otherwise, remember that the register will be available to
+ // save a pop-friendly register.
+ TemporaryReg = Register;
 }
 }
+
+ if (!DoIt && !PopReg && !TemporaryReg)
+ return false;
+
+ assert((PopReg || TemporaryReg) && "Cannot get LR");
+
+ if (TemporaryReg) {
+ assert(!PopReg && "Unnecessary MOV is about to be inserted");
+ PopReg = PopFriendly.find_first();
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(TemporaryReg, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+ }
+
+ if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
+ // We couldn't use the direct restoration above, so
+ // perform the opposite conversion: tPOP_RET to tPOP.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+ bool Popped = false;
+ for (auto MO: MBBI->operands())
+ if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+ MO.getReg() != ARM::PC) {
+ MIB.addOperand(MO);
+ if (!MO.isImplicit())
+ Popped = true;
+ }
+ // Is there anything left to pop?
+ if (!Popped)
+ MBB.erase(MIB.getInstr());
+ // Erase the old instruction.
+ MBB.erase(MBBI);
+ MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+ }
+
+ assert(PopReg && "Do not know how to get LR");
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ .addReg(PopReg, RegState::Define);
+
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+
+ if (TemporaryReg)
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(PopReg, RegState::Define)
+ .addReg(TemporaryReg, RegState::Kill));
+
+ return true;
 }
 
 bool Thumb1FrameLowering::
@@ -461,8 +581,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 DebugLoc DL;
 const TargetInstrInfo &TII = *STI.getInstrInfo();
 
- if (MI != MBB.end()) DL = MI->getDebugLoc();
-
 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
 AddDefaultPred(MIB);
 for (unsigned i = CSI.size(); i != 0; --i) {
@@ -501,31 +619,38 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 const TargetInstrInfo &TII = *STI.getInstrInfo();
 bool isVarArg = AFI->getArgRegsSaveSize() > 0;
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI != MBB.end() ?
MI->getDebugLoc() : DebugLoc();
 MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
 AddDefaultPred(MIB);
 
- bool NumRegs = false;
+ bool NeedsPop = false;
 for (unsigned i = CSI.size(); i != 0; --i) {
 unsigned Reg = CSI[i-1].getReg();
 if (Reg == ARM::LR) {
- // Special epilogue for vararg functions. See emitEpilogue
- if (isVarArg)
- continue;
- // ARMv4T requires BX, see emitEpilogue
- if (STI.hasV4TOps() && !STI.hasV5TOps())
+ if (MBB.succ_empty()) {
+ // Special epilogue for vararg functions. See emitEpilogue
+ if (isVarArg)
+ continue;
+ // ARMv4T requires BX, see emitEpilogue
+ if (!STI.hasV5TOps())
+ continue;
+ Reg = ARM::PC;
+ (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+ if (MI != MBB.end())
+ MIB.copyImplicitOps(&*MI);
+ MI = MBB.erase(MI);
+ } else
+ // LR may only be popped into PC, as part of return sequence.
+ // If this isn't the return sequence, we'll need emitPopSpecialFixUp
+ // to restore LR the hard way.
 continue;
- Reg = ARM::PC;
- (*MIB).setDesc(TII.get(ARM::tPOP_RET));
- MIB.copyImplicitOps(&*MI);
- MI = MBB.erase(MI);
 }
 MIB.addReg(Reg, getDefRegState(true));
- NumRegs = true;
+ NeedsPop = true;
 }
 
 // It's illegal to emit pop instruction without operands.
- if (NumRegs)
+ if (NeedsPop)
 MBB.insert(MI, &*MIB);
 else
 MF.DeleteMachineInstr(MIB);
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
index 31d5732..27faac6 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -45,6 +45,47 @@ public:
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 MachineBasicBlock::iterator MI) const override;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return false;
+ }
+
+private:
+ /// Check if the frame lowering of \p MF needs a special fixup
+ /// code sequence for the epilogue.
+ /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+ /// to LR, and we can't pop the value directly to the PC when
+ /// we need to update the SP after popping the value. So instead
+ /// we have to emit:
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// BX r3
+ /// If this would clobber a return value, then generate this sequence instead:
+ /// MOV ip, r3
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// MOV lr, r3
+ /// MOV r3, ip
+ /// BX lr
+ bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+ /// Emit the special fixup code sequence for the epilogue.
+ /// \see needPopSpecialFixUp for more details.
+ /// \p DoIt tells this method whether or not to actually insert
+ /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+ /// \p MBB is left untouched.
+ /// \returns For \p DoIt == true: True when the emission succeeded,
+ /// false otherwise. For \p DoIt == false: True when the emission
+ /// would have been possible, false otherwise.
+ bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 216e776..530e1d3 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -84,11 +84,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); @@ -112,11 +110,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 68736bc..bf0498d 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -256,8 +256,8 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); // Finalize the bundle. 
- MachineBasicBlock::instr_iterator LI = LastITMI; - finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI)); + finalizeBundle(MBB, InsertPos.getInstrIterator(), + ++LastITMI->getIterator()); Modified = true; ++NumITs; diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index dc74f4e..4da769f 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -131,11 +131,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || @@ -171,11 +169,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index d9ab824..bcd0e57 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -125,7 +125,10 @@ namespace { { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, - // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. + // tSTMIA_UPD is a change in semantics which can only be used if the base + // register is killed. This difference is correctly handled elsewhere. 
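+ // (tSTMIA_UPD also writes the incremented base address back to the base
+ // register; that writeback is only safe because the kill check performed
+ // in ReduceLoadStore guarantees the original base value is dead.)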
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -210,12 +213,12 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { unsigned FromOpc = ReduceTable[i].WideOpc; if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); } } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; @@ -435,6 +438,14 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. + if (!MI->getOperand(0).isKill()) + return false; + + break; + } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -492,6 +503,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. + if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); @@ -633,10 +650,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. return false; unsigned Reg0 = MI->getOperand(0).getReg(); @@ -660,11 +676,13 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, } } else if (Reg0 != Reg1) { // Try to commute the operands to make it a 2-address instruction. - unsigned CommOpIdx1, CommOpIdx2; + unsigned CommOpIdx1 = 1; + unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex; if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || - CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; - MachineInstr *CommutedMI = TII->commuteInstruction(MI); + MachineInstr *CommutedMI = + TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } @@ -750,10 +768,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. 
return false; unsigned Limit = ~0U; @@ -1012,9 +1029,9 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); - // Optimizing / minimizing size? - OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); - MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + // Optimizing / minimizing size? Minimizing size implies optimizing for size. + OptimizeSize = MF.getFunction()->optForSize(); + MinimizeSize = MF.getFunction()->optForMinSize(); BlockInfo.clear(); BlockInfo.resize(MF.getNumBlockIDs()); diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h new file mode 100644 index 0000000..4c1667e --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.h @@ -0,0 +1,54 @@ +//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AVR back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_H +#define LLVM_AVR_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +class AVRTargetMachine; +class FunctionPass; + +FunctionPass *createAVRISelDag(AVRTargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAVRExpandPseudoPass(); +FunctionPass *createAVRFrameAnalyzerPass(); +FunctionPass *createAVRDynAllocaSRPass(); +FunctionPass *createAVRBranchSelectionPass(); + +/** + * Contains the AVR backend. + */ +namespace AVR { + +enum AddressSpace { DataMemory, ProgramMemory }; + +template <typename T> bool isProgramMemoryAddress(T *V) { + return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory; +} + +inline bool isProgramMemoryAccess(MemSDNode const *N) { + auto V = N->getMemOperand()->getValue(); + + return (V != nullptr) ? isProgramMemoryAddress(V) : false; +} + +} // end of namespace AVR + +} // end namespace llvm + +#endif // LLVM_AVR_H diff --git a/contrib/llvm/lib/Target/AVR/AVR.td b/contrib/llvm/lib/Target/AVR/AVR.td new file mode 100644 index 0000000..9e80717 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.td @@ -0,0 +1,563 @@ +//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// This is the top level entry point for the AVR target. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===---------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===---------------------------------------------------------------------===// +// AVR Subtarget Features. 
+//===---------------------------------------------------------------------===//
+
+// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details
+// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
+// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
+// avr2 (with SRAM) adds the rest of the variants.
+// :TODO: s/AVRTiny/Tiny
+
+
+// A feature set aggregates features, grouping them. We don't want to create a
+// new member in AVRSubtarget (to store a value) for each set because we do not
+// care if the set is supported, only the subfeatures inside the set. We fix
+// this by simply setting the same dummy member for all feature sets, which is
+// then ignored.
+class FeatureSet<string name, string desc, list<SubtargetFeature> i>
+ : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
+
+// A family of microcontrollers, defining a set of supported features.
+class Family<string name, list<SubtargetFeature> i>
+ : FeatureSet<name, !strconcat("The device is a part of the ",
+ name, " family"), i>;
+
+// The device has SRAM, and supports the bare minimum of
+// SRAM-relevant instructions.
+//
+// These are:
+// LD - all 9 variants
+// ST - all 9 variants
+// LDD - two variants for Y and Z
+// STD - two variants for Y and Z
+// `LDS Rd, K`
+// `STS k, Rr`
+// `PUSH`/`POP`
+def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
+ "The device has random access memory">;
+
+// The device supports the `JMP k` and `CALL k` instructions.
+def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+ "The device supports the `JMP` and "
+ "`CALL` instructions">;
+
+
+// The device supports the indirect branches `IJMP` and `ICALL`.
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+ "true",
+ "The device supports `IJMP`/`ICALL` "
+ "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+ "true", "The device supports the "
+ "`EIJMP`/`EICALL` instructions">;
+
+// The device supports the `ADIW Rd, K` and `SBIW Rd, K` instructions.
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+ "true", "Enable 16-bit register-immediate "
+ "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
+ "true", "The device has an 8-bit "
+ "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+ "The device supports the 16-bit MOVW "
+ "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+ "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+]` instruction.
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+ "The device supports the `LPM Rd, Z[+]` "
+ "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+ "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+ "The device supports the `ELPM Rd, Z[+]` "
+ "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true", + "The device supports the `SPM` instruction">; + +// The device supports the `SPM Z+` instruction. +def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true", + "The device supports the `SPM Z+` " + "instruction">; + +// The device supports the `DES k` instruction. +def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true", + "The device supports the `DES k` encryption " + "instruction">; + +// The device supports the Read-Write-Modify instructions +// XCH, LAS, LAC, and LAT. +def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true", + "The device supports the read-write-modify " + "instructions: XCH, LAS, LAC, LAT">; + +// The device supports the `[F]MUL[S][U]` family of instructions. +def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication", + "true", "The device supports the " + "multiplication instructions">; + +// The device supports the `BREAK` instruction. +def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true", + "The device supports the `BREAK` debugging " + "instruction">; + +// The device has instruction encodings specific to the Tiny core. +def FeatureTinyEncoding : SubtargetFeature<"tinyencoding", + "m_hasTinyEncoding", "true", + "The device has Tiny core specific " + "instruction encodings">; + +class ELFArch<string name> : SubtargetFeature<"", "ELFArch", + !strconcat("ELF::",name), "">; + +// ELF e_flags architecture values +def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">; +def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">; +def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">; +def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">; +def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">; +def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">; +def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">; +def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">; +def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">; +def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">; +def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">; +def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">; +def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">; +def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">; +def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">; +def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">; +def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">; +def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">; + +//===---------------------------------------------------------------------===// +// AVR Families +//===---------------------------------------------------------------------===// + +// The device has at least the bare minimum that **every** single AVR +// device should have. 
+def FamilyAVR0 : Family<"avr0", []>;
+
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+
+def FamilyAVR2 : Family<"avr2",
+                        [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
+                         FeatureSRAM]>;
+
+def FamilyAVR25 : Family<"avr25",
+                         [FamilyAVR2, FeatureMOVW, FeatureLPMX,
+                          FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR3 : Family<"avr3",
+                        [FamilyAVR2, FeatureJMPCALL]>;
+
+def FamilyAVR31 : Family<"avr31",
+                         [FamilyAVR3, FeatureELPM]>;
+
+def FamilyAVR35 : Family<"avr35",
+                         [FamilyAVR3, FeatureMOVW, FeatureLPMX,
+                          FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR4 : Family<"avr4",
+                        [FamilyAVR2, FeatureMultiplication,
+                         FeatureMOVW, FeatureLPMX, FeatureSPM,
+                         FeatureBREAK]>;
+
+def FamilyAVR5 : Family<"avr5",
+                        [FamilyAVR3, FeatureMultiplication,
+                         FeatureMOVW, FeatureLPMX, FeatureSPM,
+                         FeatureBREAK]>;
+
+def FamilyAVR51 : Family<"avr51",
+                         [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+
+def FamilyAVR6 : Family<"avr6",
+                        [FamilyAVR51]>;
+
+def FamilyAVRTiny : Family<"avrtiny",
+                           [FamilyAVR0, FeatureBREAK, FeatureSRAM,
+                            FeatureTinyEncoding]>;
+
+def FamilyXMEGA : Family<"xmega",
+                         [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
+                          FeatureDES]>;
+
+def FamilyXMEGAU : Family<"xmegau",
+                          [FamilyXMEGA, FeatureRMW]>;
+
+def FeatureSetSpecial : FeatureSet<"special",
+                                   "Enable use of the entire instruction "
+                                   "set - used for debugging",
+                                   [FeatureSRAM, FeatureJMPCALL,
+                                    FeatureIJMPCALL, FeatureEIJMPCALL,
+                                    FeatureADDSUBIW, FeatureMOVW,
+                                    FeatureLPM, FeatureLPMX, FeatureELPM,
+                                    FeatureELPMX, FeatureSPM, FeatureSPMX,
+                                    FeatureDES, FeatureRMW,
+                                    FeatureMultiplication, FeatureBREAK]>;
+
+//===---------------------------------------------------------------------===//
+// AVR microcontrollers supported.
+//===---------------------------------------------------------------------===//
+
+class Device<string Name, Family Fam, ELFArch Arch,
+             list<SubtargetFeature> ExtraFeatures = []>
+  : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+
+// Generic MCUs
+// Note that several versions of GCC have strange ELF architecture
+// settings for backwards compatibility - see `gas/config/tc-avr.c`
+// in AVR binutils. We do not replicate this.
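+//
+// Editor's sketch (hypothetical MCU name, not defined by this patch): a
+// device offering more than its family guarantees lists the extras in the
+// fourth parameter of `Device`, e.g.
+//
+//   def : Device<"examplemcu", FamilyAVR2, ELFArchAVR25,
+//                [FeatureMOVW, FeatureLPMX]>;
+//
+// which is the same pattern the at86rf401 definition below uses.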
+def : Device<"avr1", FamilyAVR1, ELFArchAVR1>; +def : Device<"avr2", FamilyAVR2, ELFArchAVR2>; +def : Device<"avr25", FamilyAVR25, ELFArchAVR25>; +def : Device<"avr3", FamilyAVR3, ELFArchAVR3>; +def : Device<"avr31", FamilyAVR31, ELFArchAVR31>; +def : Device<"avr35", FamilyAVR35, ELFArchAVR35>; +def : Device<"avr4", FamilyAVR4, ELFArchAVR4>; +def : Device<"avr5", FamilyAVR5, ELFArchAVR5>; +def : Device<"avr51", FamilyAVR51, ELFArchAVR51>; +def : Device<"avr6", FamilyAVR6, ELFArchAVR6>; +def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>; +def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>; +def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>; + +// Specific MCUs +def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>; +def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>; +def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>; +def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, + [FeatureMOVW, FeatureLPMX]>; +def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>; +def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>; +def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>; +def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>; +def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>; +def : 
Device<"at43usb320", FamilyAVR31, ELFArchAVR31>; +def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>; +def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong +def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega161", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega163", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>; +def : 
Device<"atmega325p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>; +def : Device<"at94k", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>; +def : Device<"m3000", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128a", 
FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>; +def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>; +def : 
Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>; + +//===---------------------------------------------------------------------===// +// Register File Description +//===---------------------------------------------------------------------===// + +include "AVRRegisterInfo.td" + +//===---------------------------------------------------------------------===// +// Instruction Descriptions +//===---------------------------------------------------------------------===// + +//include "AVRInstrInfo.td" + +//def AVRInstrInfo : InstrInfo; + +//===---------------------------------------------------------------------===// +// Calling Conventions +//===---------------------------------------------------------------------===// + +include "AVRCallingConv.td" + +//===---------------------------------------------------------------------===// +// Assembly Printers +//===---------------------------------------------------------------------===// + +// def AVRAsmWriter : AsmWriter { +// string AsmWriterClassName = "InstPrinter"; +// bit isMCAsmWriter = 1; +// } + +//===---------------------------------------------------------------------===// +// Assembly Parsers +//===---------------------------------------------------------------------===// + +// def AVRAsmParser : AsmParser { +// let ShouldEmitMatchRegisterName = 1; +// let ShouldEmitMatchRegisterAltName = 1; +// } + +// def AVRAsmParserVariant : AsmParserVariant { +// int Variant = 0; +// +// // Recognize hard coded registers. +// string RegisterPrefix = "$"; +// } + +//===---------------------------------------------------------------------===// +// Target Declaration +//===---------------------------------------------------------------------===// + +def AVR : Target { +// let InstructionSet = AVRInstrInfo; +// let AssemblyWriters = [AVRAsmWriter]; +// +// let AssemblyParsers = [AVRAsmParser]; +// let AssemblyParserVariants = [AVRAsmParserVariant]; +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRCallingConv.td b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td new file mode 100644 index 0000000..d8cb3fe --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td @@ -0,0 +1,65 @@ +//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for AVR architecture. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AVR Return Value Calling Convention +//===----------------------------------------------------------------------===// + +def RetCC_AVR : CallingConv +<[ + // i8 is returned in R24. + CCIfType<[i8], CCAssignToReg<[R24]>>, + + // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18. + CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>> +]>; + +// Special return value calling convention for runtime functions. 
+def RetCC_AVR_RT : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+  CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The calling conventions are implemented in custom C++ code.
+
+// Calling convention for variadic functions.
+def ArgCC_AVR_Vararg : CallingConv
+<[
+  // i16 arguments are always passed on the stack with an alignment of 1.
+  CCAssignToStack<2, 1>
+]>;
+
+// Special argument calling convention for
+// multiplication runtime functions.
+def ArgCC_AVR_RT_MUL : CallingConv
+<[
+  CCIfType<[i16], CCAssignToReg<[R27R26,R19R18]>>
+]>;
+
+// Special argument calling convention for
+// division runtime functions.
+def ArgCC_AVR_RT_DIV : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
+  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
+def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
diff --git a/contrib/llvm/lib/Target/AVR/AVRConfig.h b/contrib/llvm/lib/Target/AVR/AVRConfig.h
new file mode 100644
index 0000000..65588bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRConfig.h
@@ -0,0 +1,15 @@
+//===-- AVRConfig.h - AVR Backend Configuration Header ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_CONFIG_H
+#define LLVM_AVR_CONFIG_H
+
+#define LLVM_AVR_GCC_COMPAT
+
+#endif // LLVM_AVR_CONFIG_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
new file mode 100644
index 0000000..6571d5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -0,0 +1,73 @@
+//===-- AVRMachineFunctionInfo.h - AVR machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AVR-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
+#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
+
+#include "AVRConfig.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/**
+ * Contains AVR-specific information for each MachineFunction.
+ */
+class AVRMachineFunctionInfo : public MachineFunctionInfo {
+  /// Indicates if a register has been spilled by the register
+  /// allocator.
+  bool HasSpills;
+
+  /// Indicates if there are any fixed size allocas present.
+  /// Note that if there are only variable sized allocas this is set to false.
+  bool HasAllocas;
+
+  /// Indicates if arguments passed using the stack are being
+  /// used inside the function.
+  bool HasStackArgs;
+
+  /// Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize; + + /// FrameIndex for start of varargs area. + int VarArgsFrameIndex; + +public: + AVRMachineFunctionInfo() + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + explicit AVRMachineFunctionInfo(MachineFunction &MF) + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + bool getHasSpills() const { return HasSpills; } + void setHasSpills(bool B) { HasSpills = B; } + + bool getHasAllocas() const { return HasAllocas; } + void setHasAllocas(bool B) { HasAllocas = B; } + + bool getHasStackArgs() const { return HasStackArgs; } + void setHasStackArgs(bool B) { HasStackArgs = B; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } +}; + +} // end llvm namespace + +#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td new file mode 100644 index 0000000..32650fc --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -0,0 +1,216 @@ +//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the AVR register file +//===----------------------------------------------------------------------===// + +// 8-bit General purpose register definition. +class AVRReg<bits<16> num, + string name, + list<Register> subregs = [], + list<string> altNames = []> + : RegisterWithSubRegs<name, subregs> +{ + field bits<16> Num = num; + + let HWEncoding = num; + let Namespace = "AVR"; + let SubRegs = subregs; + let AltNames = altNames; +} + +// Subregister indices. 
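+// Each 16-bit pair defined below is covered by two 8-bit subregisters:
+// sub_lo selects bits 0-7, sub_hi bits 8-15. Editor's sketch of how backend
+// code consumes these indices (standard MCRegisterInfo API, not code from
+// this patch):
+//
+//   unsigned Lo = TRI.getSubReg(AVR::R25R24, AVR::sub_lo); // AVR::R24
+//   unsigned Hi = TRI.getSubReg(AVR::R25R24, AVR::sub_hi); // AVR::R25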
+let Namespace = "AVR" in +{ + def sub_lo : SubRegIndex<8>; + def sub_hi : SubRegIndex<8, 8>; +} + +let Namespace = "AVR" in { + def ptr : RegAltNameIndex; +} + + +//===----------------------------------------------------------------------===// +// 8-bit general purpose registers +//===----------------------------------------------------------------------===// + +def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>; +def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>; +def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>; +def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>; +def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>; +def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>; +def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>; +def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>; +def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>; +def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>; +def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>; +def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>; +def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>; +def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>; +def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>; +def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>; +def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>; +def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>; +def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>; +def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>; +def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>; +def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>; +def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>; +def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>; +def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>; +def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>; +def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>; +def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>; +def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>; +def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>; +def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>; +def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>; + +let SubRegIndices = [sub_lo, sub_hi], +CoveredBySubRegs = 1 in +{ + // 16 bit GPR pairs. + def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>; + + // The pointer registers (X,Y,Z) are a special case because they + // are printed as a `high:low` pair when a DREG is expected, + // but printed using `X`, `Y`, `Z` when a pointer register is expected. 
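+  // For example (editor's note), the Z pair would print as `r31:r30` where
+  // a 16-bit pair operand is expected, but as plain `Z` in a memory operand
+  // such as `ld r24, Z`; the `ptr` alt-name index declared earlier carries
+  // those short names.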
+  let RegAltNameIndices = [ptr] in {
+    def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+    def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+    def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+  }
+  def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+  def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+  def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+  def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+  def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+  def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+  def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+  def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+  def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+  def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+  def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+  def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+  def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+//:TODO: use proper set instructions instead of always using "add"
+
+// Main 8-bit register class.
+def GPR8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and argument registers.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+    R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// Simple lower registers r0..r15
+def GPR8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// 8-bit register class for instructions which take immediates.
+def LD8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and arguments.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16
+  )>;
+
+// Simple lower registers r16..r23
+def LD8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R23, R22, R21, R20, R19, R18, R17, R16
+  )>;
+
+// Main 16-bit pair register class.
+def DREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16, R15R14, R13R12, R11R10,
+    R9R8, R7R6, R5R4, R3R2, R1R0
+  )>;
+
+// 16-bit register class for immediate instructions.
+def DLDREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16
+  )>;
+
+// 16-bit register class for the adiw/sbiw instructions.
+def IWREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28
+  )>;
+
+// 16-bit register class for the ld and st instructions.
+// AKA X, Y, and Z.
+def PTRREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    add R27R26, // X
+        R29R28, // Y
+        R31R30  // Z
+  ), ptr>;
+
+// 16-bit register class for the ldd and std instructions.
+// AKA Y and Z.
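+// X is deliberately absent: the AVR displacement forms `LDD`/`STD` exist
+// only for Y and Z, e.g. `ldd r24, Y+2` is encodable but there is no
+// `ldd r24, X+2`.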
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, + ( + add R31R30, R29R28 + ), ptr>; + +// We have a bunch of instructions with an explicit Z register argument. We +// model this using a register class containing only the Z register. +// :TODO: Rename to 'ZREG'. +def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>; + +// Register class used for the stack read pseudo instruction. +def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>; + +//:TODO: if we remove this we get an error in tablegen +//:TODO: this is just a hack, remove it once add16 works! +// Status register. +def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>; +def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)> +{ + let CopyCost = -1; // Don't allow copying of status registers +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h new file mode 100644 index 0000000..ee832ad --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h @@ -0,0 +1,29 @@ +//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AVR subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_SELECTION_DAG_INFO_H +#define LLVM_AVR_SELECTION_DAG_INFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { +/** + * Holds information about the AVR instruction selection DAG. + */ +class AVRSelectionDAGInfo : public TargetSelectionDAGInfo { +public: +}; + +} // end namespace llvm + +#endif // LLVM_AVR_SELECTION_DAG_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp new file mode 100644 index 0000000..a91dce8 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -0,0 +1,4 @@ + +extern "C" void LLVMInitializeAVRTarget() { + +} diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp new file mode 100644 index 0000000..85f03e8 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp @@ -0,0 +1,40 @@ +//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetObjectFile.h" + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +#include "AVR.h" + +namespace llvm { +void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + Base::Initialize(Ctx, TM); + ProgmemDataSection = + Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); +} + +MCSection * +AVRTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Global values in flash memory are placed in the progmem.data section + // unless they already have a user assigned section. 
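+  // (Editor's note: the backend models flash as a separate pointer address
+  // space, so a global along the lines of
+  //   @table = addrspace(1) constant [2 x i8] c"\01\02"
+  // is what isProgramMemoryAddress() is expected to match here; the exact
+  // address-space number is an assumption, not spelled out in this patch.)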
+ if (AVR::isProgramMemoryAddress(GV) && !GV->hasSection()) + return ProgmemDataSection; + + // Otherwise, we work the same way as ELF. + return Base::SelectSectionForGlobal(GV, Kind, Mang, TM); +} +} // end of namespace llvm diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h new file mode 100644 index 0000000..bdda35b --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H +#define LLVM_AVR_TARGET_OBJECT_FILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { +/** + * Lowering for an AVR ELF32 object file. + */ +class AVRTargetObjectFile : public TargetLoweringObjectFileELF { + typedef TargetLoweringObjectFileELF Base; + +public: + void Initialize(MCContext &ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; + +private: + MCSection *ProgmemDataSection; +}; + +} // end namespace llvm + +#endif // LLVM_AVR_TARGET_OBJECT_FILE_H diff --git a/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp new file mode 100644 index 0000000..c0e0d20 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp @@ -0,0 +1,25 @@ +//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +namespace llvm { +Target TheAVRTarget; +} + +extern "C" void LLVMInitializeAVRTargetInfo() { + llvm::RegisterTarget<llvm::Triple::avr> X( + llvm::TheAVRTarget, "avr", "Atmel AVR Microcontroller"); +} + +// FIXME: Temporary stub - this function must be defined for linking +// to succeed. Remove once this function is properly implemented. +extern "C" void LLVMInitializeAVRTargetMC() { +} diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td index a4ce90a..8493b0f 100644 --- a/contrib/llvm/lib/Target/BPF/BPF.td +++ b/contrib/llvm/lib/Target/BPF/BPF.td @@ -25,7 +25,14 @@ def BPFInstPrinter : AsmWriter { bit isMCAsmWriter = 1; } +def BPFAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "BPF"; + string BreakCharacters = "."; +} + def BPF : Target { let InstructionSet = BPFInstrInfo; let AssemblyWriters = [BPFInstPrinter]; + let AssemblyParserVariants = [BPFAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp index 7341828..6a5b37e 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -547,8 +547,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. 
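   //
   // Editor's sketch of the CFG this inserter builds, following the usual
   // select-lowering pattern seen in other backends:
   //
   //     thisMBB
   //     |     \
   //     |   copy0MBB
   //     |     /
   //     sinkMBB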
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = BB;
-  ++I;
+  MachineFunction::iterator I = ++BB->getIterator();
 
   // ThisMBB:
   // ...
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
index adcaff6..4276d08 100644
--- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -17,8 +17,6 @@
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
-class MCOperand;
-
 class BPFInstPrinter : public MCInstPrinter {
 public:
   BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 36f9926..8c358ca 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -68,16 +68,23 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
     assert(Value == 0);
-    return;
-  }
-  assert(Fixup.getKind() == FK_PCRel_2);
-  Value = (uint16_t)((Value - 8) / 8);
-  if (IsLittleEndian) {
-    Data[Fixup.getOffset() + 2] = Value & 0xFF;
-    Data[Fixup.getOffset() + 3] = Value >> 8;
+  } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
+    unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
+
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Idx = IsLittleEndian ? i : Size - i - 1;
+      Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
+    }
   } else {
-    Data[Fixup.getOffset() + 2] = Value >> 8;
-    Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    assert(Fixup.getKind() == FK_PCRel_2);
+    Value = (uint16_t)((Value - 8) / 8);
+    if (IsLittleEndian) {
+      Data[Fixup.getOffset() + 2] = Value & 0xFF;
+      Data[Fixup.getOffset() + 3] = Value >> 8;
+    } else {
+      Data[Fixup.getOffset() + 2] = Value >> 8;
+      Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    }
   }
 }
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 05ba618..87cdd5e 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -44,6 +44,10 @@ unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
     return ELF::R_X86_64_64;
   case FK_SecRel_4:
     return ELF::R_X86_64_PC32;
+  case FK_Data_8:
+    return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+  case FK_Data_4:
+    return IsPCRel ?
ELF::R_X86_64_PC32 : ELF::R_X86_64_32; } } diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index d63bbf4..1f440fe 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -34,6 +34,8 @@ public: UsesELFSectionDirectiveForBSS = true; HasSingleParameterDotFile = false; HasDotTypeDotSizeDirective = false; + + SupportsDebugInformation = true; } }; } diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp index 272688e..5ea6551 100644 --- a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp +++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp @@ -551,7 +551,8 @@ void CppWriter::printAttributes(const AttributeSet &PAL, void CppWriter::printType(Type* Ty) { // We don't print definitions for primitive types if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || - Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy()) + Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy() || + Ty->isTokenTy()) return; // If we already defined this type, we don't need to define it again. @@ -1355,23 +1356,18 @@ void CppWriter::printInstruction(const Instruction *I, } case Instruction::GetElementPtr: { const GetElementPtrInst* gep = cast<GetElementPtrInst>(I); - if (gep->getNumOperands() <= 2) { - Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" - << opNames[0]; - if (gep->getNumOperands() == 2) - Out << ", " << opNames[1]; - } else { - Out << "std::vector<Value*> " << iName << "_indices;"; - nl(Out); - for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { - Out << iName << "_indices.push_back(" - << opNames[i] << ");"; - nl(Out); + Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" + << getCppName(gep->getSourceElementType()) << ", " << opNames[0] << ", {"; + in(); + for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { + if (i != 1) { + Out << ", "; } - Out << "Instruction* " << iName << " = GetElementPtrInst::Create(" - << opNames[0] << ", " << iName << "_indices"; + nl(Out); + Out << opNames[i]; } - Out << ", \""; + out(); + nl(Out) << "}, \""; printEscapedString(gep->getName()); Out << "\", " << bbname << ");"; break; @@ -1803,13 +1799,12 @@ void CppWriter::printFunctionBody(const Function *F) { << "->arg_begin();"; nl(Out); } - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - Out << "Value* " << getCppName(AI) << " = args++;"; + for (const Argument &AI : F->args()) { + Out << "Value* " << getCppName(&AI) << " = args++;"; nl(Out); - if (AI->hasName()) { - Out << getCppName(AI) << "->setName(\""; - printEscapedString(AI->getName()); + if (AI.hasName()) { + Out << getCppName(&AI) << "->setName(\""; + printEscapedString(AI.getName()); Out << "\");"; nl(Out); } @@ -1818,29 +1813,25 @@ void CppWriter::printFunctionBody(const Function *F) { // Create all the basic blocks nl(Out); - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); + for (const BasicBlock &BI : *F) { + std::string bbname(getCppName(&BI)); Out << "BasicBlock* " << bbname << " = BasicBlock::Create(mod->getContext(), \""; - if (BI->hasName()) - printEscapedString(BI->getName()); - Out << "\"," << getCppName(BI->getParent()) << ",0);"; + if (BI.hasName()) + printEscapedString(BI.getName()); + Out << "\"," << getCppName(BI.getParent()) << ",0);"; nl(Out); } 
  // Output all of its basic blocks... for the function
-  for (Function::const_iterator BI = F->begin(), BE = F->end();
-       BI != BE; ++BI) {
-    std::string bbname(getCppName(BI));
-    nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+  for (const BasicBlock &BI : *F) {
+    std::string bbname(getCppName(&BI));
+    nl(Out) << "// Block " << BI.getName() << " (" << bbname << ")";
     nl(Out);
 
     // Output all of the instructions in the basic block...
-    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
-         I != E; ++I) {
-      printInstruction(I,bbname);
-    }
+    for (const Instruction &I : BI)
+      printInstruction(&I, bbname);
   }
 
   // Loop over the ForwardRefs and resolve them now that all instructions
@@ -1883,7 +1874,7 @@ void CppWriter::printInline(const std::string& fname,
   printFunctionUses(F);
   printFunctionBody(F);
   is_inline = false;
-  Out << "return " << getCppName(F->begin()) << ";";
+  Out << "return " << getCppName(&F->front()) << ";";
   nl(Out) << "}";
   nl(Out);
 }
@@ -1896,17 +1887,14 @@ void CppWriter::printModuleBody() {
   // Functions can call each other and global variables can reference them so
   // define all the functions first before emitting their function bodies.
   nl(Out) << "// Function Declarations"; nl(Out);
-  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
-       I != E; ++I)
-    printFunctionHead(I);
+  for (const Function &I : *TheModule)
+    printFunctionHead(&I);
 
   // Process the global variable declarations. We can't initialize them until
   // after the constants are printed so just print a header for each global
   nl(Out) << "// Global Variable Declarations\n"; nl(Out);
-  for (Module::const_global_iterator I = TheModule->global_begin(),
-       E = TheModule->global_end(); I != E; ++I) {
-    printVariableHead(I);
-  }
+  for (const GlobalVariable &I : TheModule->globals())
+    printVariableHead(&I);
 
   // Print out all the constant definitions. Constants don't recurse except
   // through GlobalValues. All GlobalValues have been declared at this point
@@ -1918,21 +1906,18 @@ void CppWriter::printModuleBody() {
   // been emitted. These definitions just couple the gvars with their constant
   // initializers.
   nl(Out) << "// Global Variable Definitions"; nl(Out);
-  for (Module::const_global_iterator I = TheModule->global_begin(),
-       E = TheModule->global_end(); I != E; ++I) {
-    printVariableBody(I);
-  }
+  for (const GlobalVariable &I : TheModule->globals())
+    printVariableBody(&I);
 
   // Finally, we can safely put out all of the function bodies.
   nl(Out) << "// Function Definitions"; nl(Out);
-  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
-       I != E; ++I) {
-    if (!I->isDeclaration()) {
-      nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+  for (const Function &I : *TheModule) {
+    if (!I.isDeclaration()) {
+      nl(Out) << "// Function: " << I.getName() << " (" << getCppName(&I)
               << ")";
       nl(Out) << "{";
      nl(Out,1);
-      printFunctionBody(I);
+      printFunctionBody(&I);
       nl(Out,-1) << "}";
       nl(Out);
     }
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
new file mode 100644
index 0000000..a8622a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -0,0 +1,2152 @@
+//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcasmparser"
+
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCExpr.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
+#include "MCTargetDesc/HexagonShuffler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableFutureRegs("mfuture-regs",
+                                      cl::desc("Enable future registers"));
+
+static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
+cl::desc("Warn for missing parentheses around predicate registers"),
+cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
+cl::desc("Error for missing parentheses around predicate registers"),
+cl::init(false));
+static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
+cl::desc("Warn for mismatching a signed and unsigned value"),
+cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
+cl::desc("Warn for register names that aren't contiguous"),
+cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
+cl::desc("Error for register names that aren't contiguous"),
+cl::init(false));
+
+
+namespace {
+struct HexagonOperand;
+
+class HexagonAsmParser : public MCTargetAsmParser {
+
+  HexagonTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+    return static_cast<HexagonTargetStreamer &>(TS);
+  }
+
+  MCAsmParser &Parser;
+  MCAssembler *Assembler;
+  MCInstrInfo const &MCII;
+  MCInst MCB;
+  bool InBrackets;
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAssembler *getAssembler() const { return Assembler; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool equalIsAsmAssignment() override { return false; }
+  bool isLabel(AsmToken &Token) override;
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  bool ParseDirectiveFalign(unsigned Size, SMLoc L);
+
+  virtual bool ParseRegister(unsigned &RegNo,
+                             SMLoc &StartLoc,
+                             SMLoc &EndLoc) override;
+  bool ParseDirectiveSubsection(SMLoc L);
+  bool ParseDirectiveValue(unsigned Size, SMLoc L);
+  bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+  bool RegisterMatchesArch(unsigned MatchNum) const;
+
+  bool matchBundleOptions();
+  bool
handleNoncontigiousRegister(bool Contigious, SMLoc &Loc); + bool finishBundle(SMLoc IDLoc, MCStreamer &Out); + void canonicalizeImmediates(MCInst &MCI); + bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc, + OperandVector &InstOperands, uint64_t &ErrorInfo, + bool MatchingInlineAsm, bool &MustExtend); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; + void OutOfRange(SMLoc IDLoc, long long Val, long long Max); + int processInstruction(MCInst &Inst, OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend); + + // Check if we have an assembler and, if so, set the ELF e_header flags. + void chksetELFHeaderEFlags(unsigned flags) { + if (getAssembler()) + getAssembler()->setELFHeaderEFlags(flags); + } + +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "HexagonGenAsmMatcher.inc" + + /// } + +public: + HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, _STI), Parser(_Parser), + MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) { + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + + MCAsmParserExtension::Initialize(_Parser); + + Assembler = nullptr; + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + Assembler = &MES->getAssembler(); + } + } + + bool mustExtend(OperandVector &Operands); + bool splitIdentifier(OperandVector &Operands); + bool parseOperand(OperandVector &Operands); + bool parseInstruction(OperandVector &Operands); + bool implicitExpressionLocation(OperandVector &Operands); + bool parseExpressionOrOperand(OperandVector &Operands); + bool parseExpression(MCExpr const *& Expr); + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override + { + llvm_unreachable("Unimplemented"); + } + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + AsmToken ID, OperandVector &Operands) override; + + virtual bool ParseDirective(AsmToken DirectiveID) override; +}; + +/// HexagonOperand - Instances of this class represent a parsed Hexagon machine +/// instruction. +struct HexagonOperand : public MCParsedAsmOperand { + enum KindTy { Token, Immediate, Register } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokTy { + const char *Data; + unsigned Length; + }; + + struct RegTy { + unsigned RegNum; + }; + + struct ImmTy { + const MCExpr *Val; + bool MustExtend; + }; + + struct InstTy { + OperandVector *SubInsts; + }; + + union { + struct TokTy Tok; + struct RegTy Reg; + struct ImmTy Imm; + }; + + HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + +public: + HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case Register: + Reg = o.Reg; + break; + case Immediate: + Imm = o.Imm; + break; + case Token: + Tok = o.Tok; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + + /// getEndLoc - Get the location of the last token of this operand. 
+  SMLoc getEndLoc() const { return EndLoc; }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  bool isToken() const { return Kind == Token; }
+  bool isImm() const { return Kind == Immediate; }
+  bool isMem() const { llvm_unreachable("No isMem"); }
+  bool isReg() const { return Kind == Register; }
+
+  bool CheckImmRange(int immBits, int zeroBits, bool isSigned,
+                     bool isRelocatable, bool Extendable) const {
+    if (Kind == Immediate) {
+      const MCExpr *myMCExpr = getImm();
+      if (Imm.MustExtend && !Extendable)
+        return false;
+      int64_t Res;
+      if (myMCExpr->evaluateAsAbsolute(Res)) {
+        int bits = immBits + zeroBits;
+        // The field is 'bits' wide in total: 'immBits' significant bits
+        // scaled by 2^zeroBits, so the low 'zeroBits' bits of the value
+        // must be zero.
+        if (Res & ((1 << zeroBits) - 1))
+          return false;
+        if (isSigned) {
+          if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1)))
+            return true;
+        } else {
+          if (bits == 64)
+            return true;
+          if (Res >= 0)
+            return ((uint64_t)Res < (uint64_t)(1ULL << bits)) ? true : false;
+          else {
+            const int64_t high_bit_set = 1ULL << 63;
+            const uint64_t mask = (high_bit_set >> (63 - bits));
+            return (((uint64_t)Res & mask) == mask) ? true : false;
+          }
+        }
+      } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable)
+        return true;
+      else if (myMCExpr->getKind() == MCExpr::Binary ||
+               myMCExpr->getKind() == MCExpr::Unary)
+        return true;
+    }
+    return false;
+  }
+
+  bool isf32Ext() const { return false; }
+  bool iss32Imm() const { return CheckImmRange(32, 0, true, true, false); }
+  bool iss8Imm() const { return CheckImmRange(8, 0, true, false, false); }
+  bool iss8Imm64() const { return CheckImmRange(8, 0, true, true, false); }
+  bool iss7Imm() const { return CheckImmRange(7, 0, true, false, false); }
+  bool iss6Imm() const { return CheckImmRange(6, 0, true, false, false); }
+  bool iss4Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
+  bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
+  bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
+  bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
+  bool iss3Imm() const { return CheckImmRange(3, 0, true, false, false); }
+
+  bool isu64Imm() const { return CheckImmRange(64, 0, false, true, true); }
+  bool isu32Imm() const { return CheckImmRange(32, 0, false, true, false); }
+  bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
+  bool isu16Imm() const { return CheckImmRange(16, 0, false, true, false); }
+  bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
+  bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
+  bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
+  bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
+  bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
+  bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+  bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+  bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+  bool isu6_3Imm() const { return CheckImmRange(6, 3, false,
false, false); } + bool isu10Imm() const { return CheckImmRange(10, 0, false, false, false); } + bool isu9Imm() const { return CheckImmRange(9, 0, false, false, false); } + bool isu8Imm() const { return CheckImmRange(8, 0, false, false, false); } + bool isu7Imm() const { return CheckImmRange(7, 0, false, false, false); } + bool isu6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isu5Imm() const { return CheckImmRange(5, 0, false, false, false); } + bool isu4Imm() const { return CheckImmRange(4, 0, false, false, false); } + bool isu3Imm() const { return CheckImmRange(3, 0, false, false, false); } + bool isu2Imm() const { return CheckImmRange(2, 0, false, false, false); } + bool isu1Imm() const { return CheckImmRange(1, 0, false, false, false); } + + bool ism6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isn8Imm() const { return CheckImmRange(8, 0, false, false, false); } + + bool iss16Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); } + bool iss12Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); } + bool iss10Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); } + bool iss9Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); } + bool iss8Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); } + bool iss7Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); } + bool iss6Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); } + bool iss11_0Ext() const { + return CheckImmRange(11 + 26, 0, true, true, true); + } + bool iss11_1Ext() const { + return CheckImmRange(11 + 26, 1, true, true, true); + } + bool iss11_2Ext() const { + return CheckImmRange(11 + 26, 2, true, true, true); + } + bool iss11_3Ext() const { + return CheckImmRange(11 + 26, 3, true, true, true); + } + + bool isu6Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu7Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); } + bool isu8Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); } + bool isu9Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); } + bool isu10Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); } + bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); } + bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); } + bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); } + bool isu32MustExt() const { return isImm() && Imm.MustExtend; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createExpr(getImm())); + } + + void addSignedImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + MCExpr const *Expr = getImm(); + int64_t Value; + if (!Expr->evaluateAsAbsolute(Value)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + int64_t Extended = SignExtend64 (Value, 32); + if ((Extended < 0) == (Value < 0)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + // Flip bit 33 to signal signed unsigned mismatch + Extended ^= 0x100000000; + Inst.addOperand(MCOperand::createImm(Extended)); + } + + void addf32ExtOperands(MCInst &Inst, unsigned N) 
const { + addImmOperands(Inst, N); + } + + void adds32ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8Imm64Operands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_0ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_1ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_2ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu64ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu26_6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu11_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu5ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu4ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void addm6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addn8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds16ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds12ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds10ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + 
void adds9ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_0ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_1ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_2ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_3ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu6ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32MustExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + virtual void print(raw_ostream &OS) const; + + static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) { + HexagonOperand *Op = new HexagonOperand(Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Immediate); + Op->Imm.Val = Val; + Op->Imm.MustExtend = false; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } +}; + +} // end anonymous namespace. 
+ +void HexagonOperand::print(raw_ostream &OS) const { + switch (Kind) { + case Immediate: + getImm()->print(OS, nullptr); + break; + case Register: + OS << "<register R"; + OS << getReg() << ">"; + break; + case Token: + OS << "'" << getToken() << "'"; + break; + } +} + +/// @name Auto-generated Match Functions +static unsigned MatchRegisterName(StringRef Name); + +bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) { + DEBUG(dbgs() << "Bundle:"); + DEBUG(MCB.dump_pretty(dbgs())); + DEBUG(dbgs() << "--\n"); + + // Check the bundle for errors. + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI); + + bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(), + getContext(), MCB, + &Check); + + while (Check.getNextErrInfo() == true) { + unsigned Reg = Check.getErrRegister(); + Twine R(RI->getName(Reg)); + + uint64_t Err = Check.getError(); + if (Err != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err) + Error(IDLoc, + "unconditional branch cannot precede another branch in packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err || + HexagonMCErrInfo::CHECK_ERROR_NEWV & Err) + Error(IDLoc, "register `" + R + + "' used with `.new' " + "but not validly modified in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err) + Error(IDLoc, "register `" + R + "' modified more than once"); + + if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err) + Error(IDLoc, "cannot write to read-only register `" + R + "'"); + + if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err) + Error(IDLoc, "loop-setup and some branch instructions " + "cannot be in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) { + Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? 
'0' : '1'); + Error(IDLoc, "packet marked with `:endloop" + N + "' " + + "cannot contain instructions that modify register " + + "`" + R + "'"); + } + + if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err) + Error(IDLoc, + "instruction cannot appear in packet with other instructions"); + + if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err) + Error(IDLoc, "too many slots used in packet"); + + if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) { + uint64_t Erm = Check.getShuffleError(); + + if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm) + Error(IDLoc, "invalid instruction packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm) + Error(IDLoc, "invalid instruction packet: too many stores"); + else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm) + Error(IDLoc, "invalid instruction packet: too many loads"); + else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm) + Error(IDLoc, "too many branches in packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm) + Error(IDLoc, "invalid instruction packet: out of slots"); + else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm) + Error(IDLoc, "invalid instruction packet: slot error"); + else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm) + Error(IDLoc, "v60 packet violation"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm) + Error(IDLoc, "slot 0 instruction does not allow slot 1 store"); + else + Error(IDLoc, "unknown error in instruction packet"); + } + } + + unsigned Warn = Check.getWarning(); + if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn) + Warning(IDLoc, "register `" + R + "' used with `.cur' " + "but not used in the same packet"); + else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn) + Warning(IDLoc, "register `" + R + "' used with `.tmp' " + "but not used in the same packet"); + } + } + + if (CheckOk) { + MCB.setLoc(IDLoc); + if (HexagonMCInstrInfo::bundleSize(MCB) == 0) { + assert(!HexagonMCInstrInfo::isInnerLoop(MCB)); + assert(!HexagonMCInstrInfo::isOuterLoop(MCB)); + // Empty packets are valid yet aren't emitted + return false; + } + Out.EmitInstruction(MCB, getSTI()); + } else { + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) { + Error(IDLoc, "invalid instruction packet: out of slots"); + return true; // Error + } + } + + return false; // No error +} + +bool HexagonAsmParser::matchBundleOptions() { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + if (!Parser.getTok().is(AsmToken::Colon)) + return false; + Lexer.Lex(); + StringRef Option = Parser.getTok().getString(); + if (Option.compare_lower("endloop0") == 0) + HexagonMCInstrInfo::setInnerLoop(MCB); + else if (Option.compare_lower("endloop1") == 0) + HexagonMCInstrInfo::setOuterLoop(MCB); + else if (Option.compare_lower("mem_noshuf") == 0) + HexagonMCInstrInfo::setMemReorderDisabled(MCB); + else if (Option.compare_lower("mem_shuf") == 0) + HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB); + else + return true; + Lexer.Lex(); + } +} + +// For instruction aliases, immediates are generated rather than +// MCConstantExpr. Convert them for uniform MCExpr. 
+// Also check for signed/unsigned mismatches and warn +void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) { + MCInst NewInst; + NewInst.setOpcode(MCI.getOpcode()); + for (MCOperand &I : MCI) + if (I.isImm()) { + int64_t Value (I.getImm()); + if ((Value & 0x100000000) != (Value & 0x80000000)) { + // Detect flipped bit 33 wrt bit 32 and signal warning + Value ^= 0x100000000; + if (WarnSignedMismatch) + Warning (MCI.getLoc(), "Signed/Unsigned mismatch"); + } + NewInst.addOperand(MCOperand::createExpr( + MCConstantExpr::create(Value, getContext()))); + } + else + NewInst.addOperand(I); + MCI = NewInst; +} + +bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc, + OperandVector &InstOperands, + uint64_t &ErrorInfo, + bool MatchingInlineAsm, + bool &MustExtend) { + // Perform matching with tablegen asmmatcher generated function + int result = + MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm); + if (result == Match_Success) { + MCI.setLoc(IDLoc); + MustExtend = mustExtend(InstOperands); + canonicalizeImmediates(MCI); + result = processInstruction(MCI, InstOperands, IDLoc, MustExtend); + + DEBUG(dbgs() << "Insn:"); + DEBUG(MCI.dump_pretty(dbgs())); + DEBUG(dbgs() << "\n\n"); + + MCI.setLoc(IDLoc); + } + + // Create instruction operand for bundle instruction + // Break this into a separate function Code here is less readable + // Think about how to get an instruction error to report correctly. + // SMLoc will return the "{" + switch (result) { + default: + break; + case Match_Success: + return false; + case Match_MissingFeature: + return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction"); + case Match_InvalidOperand: + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= InstOperands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get())) + ->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + llvm_unreachable("Implement any new match types added!"); +} + +bool HexagonAsmParser::mustExtend(OperandVector &Operands) { + unsigned Count = 0; + for (std::unique_ptr<MCParsedAsmOperand> &i : Operands) + if (i->isImm()) + if (static_cast<HexagonOperand *>(i.get())->Imm.MustExtend) + ++Count; + // Multiple extenders should have been filtered by iss9Ext et. al. 
+ assert(Count < 2 && "Multiple extenders"); + return Count == 1; +} + +bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + if (!InBrackets) { + MCB.clear(); + MCB.addOperand(MCOperand::createImm(0)); + } + HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]); + if (FirstOperand.isToken() && FirstOperand.getToken() == "{") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (InBrackets) { + getParser().Error(IDLoc, "Already in a packet"); + return true; + } + InBrackets = true; + return false; + } + if (FirstOperand.isToken() && FirstOperand.getToken() == "}") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (!InBrackets) { + getParser().Error(IDLoc, "Not in a packet"); + return true; + } + InBrackets = false; + if (matchBundleOptions()) + return true; + return finishBundle(IDLoc, Out); + } + MCInst *SubInst = new (getParser().getContext()) MCInst; + bool MustExtend = false; + if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo, + MatchingInlineAsm, MustExtend)) + return true; + HexagonMCInstrInfo::extendIfNeeded( + getParser().getContext(), MCII, MCB, *SubInst, + HexagonMCInstrInfo::isExtended(MCII, *SubInst) || MustExtend); + MCB.addOperand(MCOperand::createInst(SubInst)); + if (!InBrackets) + return finishBundle(IDLoc, Out); + return false; +} + +/// ParseDirective parses the Hexagon specific directives +bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte")) + return ParseDirectiveValue(4, DirectiveID.getLoc()); + if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" || + IDVal.lower() == ".half") + return ParseDirectiveValue(2, DirectiveID.getLoc()); + if (IDVal.lower() == ".falign") + return ParseDirectiveFalign(256, DirectiveID.getLoc()); + if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon")) + return ParseDirectiveComm(true, DirectiveID.getLoc()); + if ((IDVal.lower() == ".comm") || (IDVal.lower() == ".common")) + return ParseDirectiveComm(false, DirectiveID.getLoc()); + if (IDVal.lower() == ".subsection") + return ParseDirectiveSubsection(DirectiveID.getLoc()); + + return true; +} +bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) { + const MCExpr *Subsection = 0; + int64_t Res; + + assert((getLexer().isNot(AsmToken::EndOfStatement)) && + "Invalid subsection directive"); + getParser().parseExpression(Subsection); + + if (!Subsection->evaluateAsAbsolute(Res)) + return Error(L, "Cannot evaluate subsection number"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + // 0-8192 is the hard-coded range in MCObjectStreamper.cpp, this keeps the + // negative subsections together and in the same order but at the opposite + // end of the section. Only legacy hexagon-gcc created assembly code + // used negative subsections. 
+ if ((Res < 0) && (Res > -8193)) + Subsection = MCConstantExpr::create(8192 + Res, this->getContext()); + + getStreamer().SubSection(Subsection); + return false; +} + +/// ::= .falign [expression] +bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) { + + int64_t MaxBytesToFill = 15; + + // if there is an arguement + if (getLexer().isNot(AsmToken::EndOfStatement)) { + const MCExpr *Value; + SMLoc ExprLoc = L; + + // Make sure we have a number (false is returned if expression is a number) + if (getParser().parseExpression(Value) == false) { + // Make sure this is a number that is in range + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue)) + return Error(ExprLoc, "literal value out of range (256) for falign"); + MaxBytesToFill = IntValue; + Lex(); + } else { + return Error(ExprLoc, "not a valid expression for falign directive"); + } + } + + getTargetStreamer().emitFAlign(16, MaxBytesToFill); + Lex(); + + return false; +} + +/// ::= .word [ expression (, expression)* ] +bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + + for (;;) { + const MCExpr *Value; + SMLoc ExprLoc = L; + if (getParser().parseExpression(Value)) + return true; + + // Special case constant expressions to match code generator. + if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else + getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + } + } + + Lex(); + return false; +} + +// This is largely a copy of AsmParser's ParseDirectiveComm extended to +// accept a 3rd argument, AccessAlignment which indicates the smallest +// memory access made to the symbol, expressed in bytes. If no +// AccessAlignment is specified it defaults to the Alignment Value. +// Hexagon's .lcomm: +// .lcomm Symbol, Length, Alignment, AccessAlignment +bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) { + // FIXME: need better way to detect if AsmStreamer (upstream removed + // getKind()) + if (getStreamer().hasRawTextSupport()) + return true; // Only object file output requires special treatment. + + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + // Handle the identifier as the key symbol. 
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return true; + + int64_t ByteAlignment = 1; + SMLoc ByteAlignmentLoc; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + ByteAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(ByteAlignment)) + return true; + if (!isPowerOf2_64(ByteAlignment)) + return Error(ByteAlignmentLoc, "alignment must be a power of 2"); + } + + int64_t AccessAlignment = 0; + if (getLexer().is(AsmToken::Comma)) { + // The optional access argument specifies the size of the smallest memory + // access to be made to the symbol, expressed in bytes. + SMLoc AccessAlignmentLoc; + Lex(); + AccessAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(AccessAlignment)) + return true; + + if (!isPowerOf2_64(AccessAlignment)) + return Error(AccessAlignmentLoc, "access alignment must be a power of 2"); + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.comm' or '.lcomm' directive"); + + Lex(); + + // NOTE: a size of zero for a .comm should create a undefined symbol + // but a size of .lcomm creates a bss symbol of size zero. + if (Size < 0) + return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't " + "be less than zero"); + + // NOTE: The alignment in the directive is a power of 2 value, the assembler + // may internally end up wanting an alignment in bytes. + // FIXME: Diagnose overflow. + if (ByteAlignment < 0) + return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive " + "alignment, can't be less than zero"); + + if (!Sym->isUndefined()) + return Error(Loc, "invalid symbol redefinition"); + + HexagonMCELFStreamer &HexagonELFStreamer = + static_cast<HexagonMCELFStreamer &>(getStreamer()); + if (IsLocal) { + HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; + } + + HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; +} + +// validate register against architecture +bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const { + return true; +} + +// extern "C" void LLVMInitializeHexagonAsmLexer(); + +/// Force static initialization. 
+extern "C" void LLVMInitializeHexagonAsmParser() { + RegisterMCAsmParser<HexagonAsmParser> X(TheHexagonTarget); +} + +#define GET_MATCHER_IMPLEMENTATION +#define GET_REGISTER_MATCHER +#include "HexagonGenAsmMatcher.inc" + +namespace { +bool previousEqual(OperandVector &Operands, size_t Index, StringRef String) { + if (Index >= Operands.size()) + return false; + MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1]; + if (!Operand.isToken()) + return false; + return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String); +} +bool previousIsLoop(OperandVector &Operands, size_t Index) { + return previousEqual(Operands, Index, "loop0") || + previousEqual(Operands, Index, "loop1") || + previousEqual(Operands, Index, "sp1loop0") || + previousEqual(Operands, Index, "sp2loop0") || + previousEqual(Operands, Index, "sp3loop0"); +} +} + +bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) { + AsmToken const &Token = getParser().getTok(); + StringRef String = Token.getString(); + SMLoc Loc = Token.getLoc(); + getLexer().Lex(); + do { + std::pair<StringRef, StringRef> HeadTail = String.split('.'); + if (!HeadTail.first.empty()) + Operands.push_back(HexagonOperand::CreateToken(HeadTail.first, Loc)); + if (!HeadTail.second.empty()) + Operands.push_back(HexagonOperand::CreateToken( + String.substr(HeadTail.first.size(), 1), Loc)); + String = HeadTail.second; + } while (!String.empty()); + return false; +} + +bool HexagonAsmParser::parseOperand(OperandVector &Operands) { + unsigned Register; + SMLoc Begin; + SMLoc End; + MCAsmLexer &Lexer = getLexer(); + if (!ParseRegister(Register, Begin, End)) { + if (!ErrorMissingParenthesis) + switch (Register) { + default: + break; + case Hexagon::P0: + case Hexagon::P1: + case Hexagon::P2: + case Hexagon::P3: + if (previousEqual(Operands, 0, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.push_back(HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + if (previousEqual(Operands, 0, "!") && + previousEqual(Operands, 1, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.insert(Operands.end () - 1, + HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + break; + } + Operands.push_back(HexagonOperand::CreateReg( + Register, Begin, End)); + return false; + } + return splitIdentifier(Operands); +} + +bool HexagonAsmParser::isLabel(AsmToken &Token) { + MCAsmLexer &Lexer = getLexer(); + AsmToken const &Second = Lexer.getTok(); + AsmToken Third = Lexer.peekTok(); + StringRef String = Token.getString(); + if (Token.is(AsmToken::TokenKind::LCurly) || + Token.is(AsmToken::TokenKind::RCurly)) + return false; + if 
(!Token.is(AsmToken::TokenKind::Identifier)) + return true; + if (!MatchRegisterName(String.lower())) + return true; + (void)Second; + assert(Second.is(AsmToken::Colon)); + StringRef Raw (String.data(), Third.getString().data() - String.data() + + Third.getString().size()); + std::string Collapsed = Raw; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef Whole = Collapsed; + std::pair<StringRef, StringRef> DotSplit = Whole.split('.'); + if (!MatchRegisterName(DotSplit.first.lower())) + return true; + return false; +} + +bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) { + if (!Contigious && ErrorNoncontigiousRegister) { + Error(Loc, "Register name is not contigious"); + return true; + } + if (!Contigious && WarnNoncontigiousRegister) + Warning(Loc, "Register name is not contigious"); + return false; +} + +bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmLexer &Lexer = getLexer(); + StartLoc = getLexer().getLoc(); + SmallVector<AsmToken, 5> Lookahead; + StringRef RawString(Lexer.getTok().getString().data(), 0); + bool Again = Lexer.is(AsmToken::Identifier); + bool NeededWorkaround = false; + while (Again) { + AsmToken const &Token = Lexer.getTok(); + RawString = StringRef(RawString.data(), + Token.getString().data() - RawString.data () + + Token.getString().size()); + Lookahead.push_back(Token); + Lexer.Lex(); + bool Contigious = Lexer.getTok().getString().data() == + Lookahead.back().getString().data() + + Lookahead.back().getString().size(); + bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) || + Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) || + Lexer.is(AsmToken::Colon); + bool Workaround = Lexer.is(AsmToken::Colon) || + Lookahead.back().is(AsmToken::Colon); + Again = (Contigious && Type) || (Workaround && Type); + NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type)); + } + std::string Collapsed = RawString; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef FullString = Collapsed; + std::pair<StringRef, StringRef> DotSplit = FullString.split('.'); + unsigned DotReg = MatchRegisterName(DotSplit.first.lower()); + if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + if (DotSplit.second.empty()) { + RegNo = DotReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } else { + RegNo = DotReg; + size_t First = RawString.find('.'); + StringRef DotString (RawString.data() + First, RawString.size() - First); + Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString)); + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + } + std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':'); + unsigned ColonReg = MatchRegisterName(ColonSplit.first.lower()); + if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + RegNo = ColonReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + while (!Lookahead.empty()) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + } + return true; +} + +bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) { + if 
(previousEqual(Operands, 0, "call")) + return true; + if (previousEqual(Operands, 0, "jump")) + if (!getLexer().getTok().is(AsmToken::Colon)) + return true; + if (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1)) + return true; + if (previousEqual(Operands, 1, ":") && previousEqual(Operands, 2, "jump") && + (previousEqual(Operands, 0, "nt") || previousEqual(Operands, 0, "t"))) + return true; + return false; +} + +bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) { + llvm::SmallVector<AsmToken, 4> Tokens; + MCAsmLexer &Lexer = getLexer(); + bool Done = false; + static char const * Comma = ","; + do { + Tokens.emplace_back (Lexer.getTok()); + Lexer.Lex(); + switch (Tokens.back().getKind()) + { + case AsmToken::TokenKind::Hash: + if (Tokens.size () > 1) + if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) { + Tokens.insert(Tokens.end() - 2, + AsmToken(AsmToken::TokenKind::Comma, Comma)); + Done = true; + } + break; + case AsmToken::TokenKind::RCurly: + case AsmToken::TokenKind::EndOfStatement: + case AsmToken::TokenKind::Eof: + Done = true; + break; + default: + break; + } + } while (!Done); + while (!Tokens.empty()) { + Lexer.UnLex(Tokens.back()); + Tokens.pop_back(); + } + return getParser().parseExpression(Expr); +} + +bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) { + if (implicitExpressionLocation(Operands)) { + MCAsmParser &Parser = getParser(); + SMLoc Loc = Parser.getLexer().getLoc(); + std::unique_ptr<HexagonOperand> Expr = + HexagonOperand::CreateImm(nullptr, Loc, Loc); + MCExpr const *& Val = Expr->Imm.Val; + Operands.push_back(std::move(Expr)); + return parseExpression(Val); + } + return parseOperand(Operands); +} + +/// Parse an instruction. +bool HexagonAsmParser::parseInstruction(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + AsmToken const &Token = Parser.getTok(); + switch (Token.getKind()) { + case AsmToken::EndOfStatement: { + Lexer.Lex(); + return false; + } + case AsmToken::LCurly: { + if (!Operands.empty()) + return true; + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + return false; + } + case AsmToken::RCurly: { + if (Operands.empty()) { + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + } + return false; + } + case AsmToken::Comma: { + Lexer.Lex(); + continue; + } + case AsmToken::EqualEqual: + case AsmToken::ExclaimEqual: + case AsmToken::GreaterEqual: + case AsmToken::GreaterGreater: + case AsmToken::LessEqual: + case AsmToken::LessLess: { + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(0, 1), Token.getLoc())); + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(1, 1), Token.getLoc())); + Lexer.Lex(); + continue; + } + case AsmToken::Hash: { + bool MustNotExtend = false; + bool ImplicitExpression = implicitExpressionLocation(Operands); + std::unique_ptr<HexagonOperand> Expr = HexagonOperand::CreateImm( + nullptr, Lexer.getLoc(), Lexer.getLoc()); + if (!ImplicitExpression) + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + bool MustExtend = false; + bool HiOnly = false; + bool LoOnly = false; + if (Lexer.is(AsmToken::Hash)) { + Lexer.Lex(); + MustExtend = true; + } else if (ImplicitExpression) + MustNotExtend = true; + AsmToken const &Token = Parser.getTok(); + if (Token.is(AsmToken::Identifier)) { + StringRef String 
= Token.getString(); + AsmToken IDToken = Token; + if (String.lower() == "hi") { + HiOnly = true; + } else if (String.lower() == "lo") { + LoOnly = true; + } + if (HiOnly || LoOnly) { + AsmToken LParen = Lexer.peekTok(); + if (!LParen.is(AsmToken::LParen)) { + HiOnly = false; + LoOnly = false; + } else { + Lexer.Lex(); + } + } + } + if (parseExpression(Expr->Imm.Val)) + return true; + int64_t Value; + MCContext &Context = Parser.getContext(); + assert(Expr->Imm.Val != nullptr); + if (Expr->Imm.Val->evaluateAsAbsolute(Value)) { + if (HiOnly) + Expr->Imm.Val = MCBinaryExpr::createLShr( + Expr->Imm.Val, MCConstantExpr::create(16, Context), Context); + if (HiOnly || LoOnly) + Expr->Imm.Val = MCBinaryExpr::createAnd( + Expr->Imm.Val, MCConstantExpr::create(0xffff, Context), Context); + } + if (MustNotExtend) + Expr->Imm.Val = HexagonNoExtendOperand::Create(Expr->Imm.Val, Context); + Expr->Imm.MustExtend = MustExtend; + Operands.push_back(std::move(Expr)); + continue; + } + default: + break; + } + if (parseExpressionOrOperand(Operands)) + return true; + } +} + +bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + AsmToken ID, + OperandVector &Operands) { + getLexer().UnLex(ID); + return parseInstruction(Operands); +} + +namespace { +MCInst makeCombineInst(int opCode, MCOperand &Rdd, + MCOperand &MO1, MCOperand &MO2) { + MCInst TmpInst; + TmpInst.setOpcode(opCode); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(MO1); + TmpInst.addOperand(MO2); + + return TmpInst; +} +} + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp); + + switch (Kind) { + case MCK_0: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0 + ? Match_Success + : Match_InvalidOperand; + } + case MCK_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1 + ? Match_Success + : Match_InvalidOperand; + } + case MCK__MINUS_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == -1 + ? 
Match_Success + : Match_InvalidOperand; + } + } + if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) { + StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length); + if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind) + return Match_Success; + if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind) + return Match_Success; + } + + DEBUG(dbgs() << "Unmatched Operand:"); + DEBUG(Op->dump()); + DEBUG(dbgs() << "\n"); + + return Match_InvalidOperand; +} + +void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) { + std::string errStr; + raw_string_ostream ES(errStr); + ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: "; + if (Max >= 0) + ES << "0-" << Max; + else + ES << Max << "-" << (-Max - 1); + Error(IDLoc, ES.str().c_str()); +} + +int HexagonAsmParser::processInstruction(MCInst &Inst, + OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend) { + MCContext &Context = getParser().getContext(); + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + std::string r = "r"; + std::string v = "v"; + std::string Colon = ":"; + + bool is32bit = false; // used to distinguish between CONST32 and CONST64 + switch (Inst.getOpcode()) { + default: + break; + + case Hexagon::M4_mpyrr_addr: + case Hexagon::S4_addi_asl_ri: + case Hexagon::S4_addi_lsr_ri: + case Hexagon::S4_andi_asl_ri: + case Hexagon::S4_andi_lsr_ri: + case Hexagon::S4_ori_asl_ri: + case Hexagon::S4_ori_lsr_ri: + case Hexagon::S4_or_andix: + case Hexagon::S4_subi_asl_ri: + case Hexagon::S4_subi_lsr_ri: { + MCOperand &Ry = Inst.getOperand(0); + MCOperand &src = Inst.getOperand(2); + if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg())) + return Match_InvalidOperand; + break; + } + + case Hexagon::C2_cmpgei: { + MCOperand &MO = Inst.getOperand(2); + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgti); + break; + } + + case Hexagon::C2_cmpgeui: { + MCOperand &MO = Inst.getOperand(2); + int64_t Value; + bool Success = MO.getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success && "Assured by matcher"); + if (Value == 0) { + MCInst TmpInst; + MCOperand &Pd = Inst.getOperand(0); + MCOperand &Rt = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::C2_cmpeq); + TmpInst.addOperand(Pd); + TmpInst.addOperand(Rt); + TmpInst.addOperand(Rt); + Inst = TmpInst; + } else { + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgtui); + } + break; + } + case Hexagon::J2_loop1r: + case Hexagon::J2_loop1i: + case Hexagon::J2_loop0r: + case Hexagon::J2_loop0i: { + MCOperand &MO = Inst.getOperand(0); + // Loop has different opcodes for extended vs not extended, but we should + // not use the other opcode as it is a legacy artifact of TD files. + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + // if the operand can fit within a 7:2 field + if (Value < (1 << 8) && Value >= -(1 << 8)) { + SMLoc myLoc = Operands[2]->getStartLoc(); + // # is left in startLoc in the case of ## + // If '##' found then force extension. + if (*myLoc.getPointer() == '#') { + MustExtend = true; + break; + } + } else { + // If immediate and out of 7:2 range. 
+ MustExtend = true; + } + } + break; + } + + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = Inst.getOperand(1); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode(Hexagon::A2_combinew); + break; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + break; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + break; + } + + // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) " + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + is32bit = true; + // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) " + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + MCOperand &MO_1 = Inst.getOperand(1); + MCOperand &MO_0 = Inst.getOperand(0); + + // push section onto section stack + MES->PushSection(); + + std::string myCharStr; + MCSectionELF *mySection; + + // check if this as an immediate or a symbol + int64_t Value; + bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value); + if (Absolute) { + // Create a new section - one for each constant + // Some or all of the zeros are replaced with the given immediate. 
+ if (is32bit) { + std::string myImmStr = utohexstr(static_cast<uint32_t>(Value)); + myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } else { + std::string myImmStr = utohexstr(Value); + myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } + + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else if (MO_1.isExpr()) { + // .lita - for expressions + myCharStr = ".lita"; + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->SwitchSection(mySection); + unsigned byteSize = is32bit ? 4 : 8; + getStreamer().EmitCodeAlignment(byteSize, byteSize); + + MCSymbol *Sym; + + // for symbols, get rid of prepended ".gnu.linkonce.lx." + + // emit symbol if needed + if (Absolute) { + Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16)); + if (Sym->isUndefined()) { + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Global); + getStreamer().EmitIntValue(Value, byteSize); + } + } else if (MO_1.isExpr()) { + const char *StringStart = 0; + const char *StringEnd = 0; + if (*Operands[4]->getStartLoc().getPointer() == '#') { + StringStart = Operands[5]->getStartLoc().getPointer(); + StringEnd = Operands[6]->getStartLoc().getPointer(); + } else { // no pound + StringStart = Operands[4]->getStartLoc().getPointer(); + StringEnd = Operands[5]->getStartLoc().getPointer(); + } + + unsigned size = StringEnd - StringStart; + std::string DotConst = ".CONST_"; + Sym = getContext().getOrCreateSymbol(DotConst + + StringRef(StringStart, size)); + + if (Sym->isUndefined()) { + // case where symbol is not yet defined: emit symbol + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Local); + getStreamer().EmitValue(MO_1.getExpr(), 4); + } + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->PopSection(); + + if (Sym) { + MCInst TmpInst; + if (is32bit) // 32 bit + TmpInst.setOpcode(Hexagon::L2_loadrigp); + else // 64 bit + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + + TmpInst.addOperand(MO_0); + TmpInst.addOperand( + MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext()))); + Inst = TmpInst; + } + } + break; + + // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)" + case Hexagon::A2_tfrpi: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? 
-1 : 0; + MCOperand imm(MCOperand::createExpr(MCConstantExpr::create(sVal, Context))); + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO); + break; + } + + // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)" + case Hexagon::TFRI64_V4: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + unsigned long long u64 = Value; + signed int s8 = (u64 >> 32) & 0xFFFFFFFF; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(s8, Context))); // upper 32 + MCOperand imm2(MCOperand::createExpr( + MCConstantExpr::create(u64 & 0xFFFFFFFF, Context))); // lower 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2); + } else { + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(0, Context))); // upper 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO); + } + break; + } + + // Handle $Rdd = combine(##imm, #imm)" + case Hexagon::TFRI64_V2_ext: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + MCOperand &MO2 = Inst.getOperand(2); + int64_t Value; + if (MO2.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2); + break; + } + + // Handle $Rdd = combine(#imm, ##imm)" + case Hexagon::A4_combineii: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + int64_t Value; + if (MO1.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + MCOperand &MO2 = Inst.getOperand(2); + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2); + break; + } + + case Hexagon::S2_tableidxb_goodsyntax: { + Inst.setOpcode(Hexagon::S2_tableidxb); + break; + } + + case Hexagon::S2_tableidxh_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxh); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxw_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(2, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxw); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxd_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(3, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxd); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + 
TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::M2_mpyui: { + Inst.setOpcode(Hexagon::M2_mpyi); + break; + } + case Hexagon::M2_mpysmi: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (!MustExtend) { + if (Value < 0 && Value > -256) { + Imm.setExpr(MCConstantExpr::create(Value * -1, Context)); + TmpInst.setOpcode(Hexagon::M2_mpysin); + } else if (Value < 256 && Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } else { + if (Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + MCInst TmpInst; + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rd = $Rs + TmpInst.setOpcode(Hexagon::A2_tfr); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + } + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1]) + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. 
+ std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S2_asr_i_p_rnd); + } + break; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + Inst.setOpcode(Hexagon::A4_boundscheck_hi); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // raw:lo + Inst.setOpcode(Hexagon::A4_boundscheck_lo); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::A2_addsp: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi + Inst.setOpcode(Hexagon::A2_addsph); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped raw:lo + Inst.setOpcode(Hexagon::A2_addspl); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_acc_s1: { + MCInst TmpInst; + MCOperand &Rxx = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(2); + MCOperand &Rt = Inst.getOperand(3); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + // Registers are in different positions + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rss); + TmpInst.addOperand(Rt); + Inst = TmpInst; + break; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + std::string Name = + r + llvm::utostr_32(RegNum) + 
Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped rnd:sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) + Inst.setOpcode(Hexagon::S2_vsathub); + else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + } + break; + } + + case Hexagon::S5_vasrhrnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_vasrhrnd); + } + break; + } + + case Hexagon::A2_not: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::A2_subri); + TmpInst.addOperand(Rd); + TmpInst.addOperand( + MCOperand::createExpr(MCConstantExpr::create(-1, Context))); + TmpInst.addOperand(Rs); + Inst = TmpInst; + break; + } + } // switch + + return Match_Success; +} diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp index cb7e633..ea96eb0 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -868,7 +868,7 @@ void BT::visitNonBranch(const MachineInstr *MI) { continue; bool Changed = false; - if (!Eval || !ResMap.has(RD.Reg)) { + if (!Eval || ResMap.count(RD.Reg) == 0) { // Set to "ref" (aka "bottom"). uint16_t DefBW = ME.getRegBitWidth(RD); RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW); @@ -951,11 +951,11 @@ void BT::visitBranchesFrom(const MachineInstr *BI) { // be processed. for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) { const MachineBasicBlock *SB = *I; - if (SB->isLandingPad()) + if (SB->isEHPad()) Targets.insert(SB); } if (FallsThrough) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); if (Next != MF.end()) Targets.insert(&*Next); @@ -1005,7 +1005,7 @@ void BT::put(RegisterRef RR, const RegisterCell &RC) { // Replace all references to bits from OldRR with the corresponding bits // in NewRR. 
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { - assert(Map.has(OldRR.Reg) && "OldRR not present in map"); + assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map"); BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub); BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub); uint16_t OMB = OM.first(), OME = OM.last(); @@ -1104,9 +1104,9 @@ void BT::run() { } // If block end has been reached, add the fall-through edge to the queue. if (It == End) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); - if (Next != MF.end()) { + if (Next != MF.end() && B.isSuccessor(&*Next)) { int ThisN = B.getNumber(); int NextN = Next->getNumber(); FlowQ.push(CFGEdge(ThisN, NextN)); diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h index ed002a7..959c831 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.h +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h @@ -36,9 +36,7 @@ struct BitTracker { typedef SetVector<const MachineBasicBlock *> BranchTargetList; - struct CellMapType : public std::map<unsigned,RegisterCell> { - bool has(unsigned Reg) const; - }; + typedef std::map<unsigned, RegisterCell> CellMapType; BitTracker(const MachineEvaluator &E, MachineFunction &F); ~BitTracker(); @@ -79,7 +77,6 @@ private: // Abstraction of a reference to bit at position Pos from a register Reg. struct BitTracker::BitRef { BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {} - BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {} bool operator== (const BitRef &BR) const { // If Reg is 0, disregard Pos. return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos); @@ -146,7 +143,6 @@ struct BitTracker::BitValue { BitValue(ValueType T = Top) : Type(T) {} BitValue(bool B) : Type(B ? One : Zero) {} - BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {} BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {} bool operator== (const BitValue &V) const { @@ -279,11 +275,6 @@ struct BitTracker::RegisterCell { return !operator==(RC); } - const RegisterCell &operator=(const RegisterCell &RC) { - Bits = RC.Bits; - return *this; - } - // Generate a "ref" cell for the corresponding register. In the resulting // cell each bit will be described as being the same as the corresponding // bit in register Reg (i.e. the cell is "defined" by register Reg). @@ -344,11 +335,6 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) { return RC; } - -inline bool BitTracker::CellMapType::has(unsigned Reg) const { - return find(Reg) != end(); -} - // A class to evaluate target's instructions and update the cell maps. // This is used internally by the bit tracker. 
A target that wants to // utilize this should implement the evaluation functions (noted below) diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 9cc1e94..4a9c341 100644 --- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -7,42 +7,45 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "hexagon-disassembler" + #include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" -#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" - -#include "llvm/MC/MCContext.h" +#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonInstPrinter.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/MemoryObject.h" #include "llvm/Support/raw_ostream.h" -#include <array> +#include "llvm/Support/TargetRegistry.h" #include <vector> using namespace llvm; using namespace Hexagon; -#define DEBUG_TYPE "hexagon-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; +typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// \brief Hexagon disassembler for all Hexagon platforms. class HexagonDisassembler : public MCDisassembler { public: + std::unique_ptr<MCInstrInfo const> const MCII; std::unique_ptr<MCInst *> CurrentBundle; - HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx), CurrentBundle(new MCInst *) {} + HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {} DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -52,23 +55,57 @@ public: ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; + + void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const; + void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const; }; } -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, +// Forward declare these because the auto-generated code will reference them. +// Definitions are further down. 
+ +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - void const *Decoder); + const void *Decoder); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn); +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder); static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op, raw_ostream &os); -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst); +static unsigned getRegFromSubinstEncoding(unsigned encoded_reg); + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t Address, const void *Decoder); static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, @@ -95,129 +132,19 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); - -static const uint16_t IntRegDecoderTable[] = { - Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, - Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, - Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, - Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, - Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, - Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, - Hexagon::R30, Hexagon::R31}; - -static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, - Hexagon::P2, Hexagon::P3}; - -static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { - Inst.addOperand(MCOperand::createReg(Table[RegNo])); - return MCDisassembler::Success; - } else - return MCDisassembler::Fail; -} - -static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Register = IntRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t CtrlRegDecoderTable[] = { - Hexagon::SA0, 
Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, - Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7, - Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, - Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH}; - - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - static const uint16_t CtrlReg64DecoderTable[] = { - Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2, - Hexagon::NoRegister, Hexagon::NoRegister, Hexagon::NoRegister, - Hexagon::C7_6, Hexagon::NoRegister, Hexagon::C9_8, - Hexagon::NoRegister, Hexagon::C11_10, Hexagon::NoRegister, - Hexagon::CS, Hexagon::NoRegister, Hexagon::UPC, - Hexagon::NoRegister}; - - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlReg64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - unsigned Register = 0; - switch (RegNo) { - case 0: - Register = Hexagon::M0; - break; - case 1: - Register = Hexagon::M1; - break; - default: - return MCDisassembler::Fail; - } - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t DoubleRegDecoderTable[] = { - Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, - Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, - Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, - Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); -} - -static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 3) - return MCDisassembler::Fail; - - unsigned Register = PredRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); #include "HexagonGenDisassemblerTables.inc" -static MCDisassembler *createHexagonDisassembler(Target const &T, - MCSubtargetInfo const &STI, +static MCDisassembler *createHexagonDisassembler(const Target &T, + const MCSubtargetInfo &STI, MCContext &Ctx) { - return new HexagonDisassembler(STI, Ctx); + return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo()); } extern "C" void LLVMInitializeHexagonDisassembler() { @@ -235,8 +162,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size = 0; *CurrentBundle = &MI; - MI.setOpcode(Hexagon::BUNDLE); - 
MI.addOperand(MCOperand::createImm(0)); + MI = HexagonMCInstrInfo::createBundle(); while (Result == Success && Complete == false) { if (Bytes.size() < HEXAGON_INSTR_SIZE) return MCDisassembler::Fail; @@ -246,7 +172,21 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); } - return Result; + if(Result == MCDisassembler::Fail) + return Result; + HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo()); + if(!Checker.check()) + return MCDisassembler::Fail; + return MCDisassembler::Success; +} + +namespace { +HexagonDisassembler const &disassembler(void const *Decoder) { + return *static_cast<HexagonDisassembler const *>(Decoder); +} +MCContext &contextFromDecoder(void const *Decoder) { + return disassembler(Decoder).getContext(); +} } DecodeStatus HexagonDisassembler::getSingleInstruction( @@ -255,8 +195,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( assert(Bytes.size() >= HEXAGON_INSTR_SIZE); uint32_t Instruction = - llvm::support::endian::read<uint32_t, llvm::support::little, - llvm::support::unaligned>(Bytes.data()); + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB); if ((Instruction & HexagonII::INST_PARSE_MASK) == @@ -360,8 +299,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( MILow->setOpcode(opLow); MCInst *MIHigh = new (getContext()) MCInst; MIHigh->setOpcode(opHigh); - AddSubinstOperands(MILow, opLow, instLow); - AddSubinstOperands(MIHigh, opHigh, instHigh); + addSubinstOperands(MILow, opLow, instLow); + addSubinstOperands(MIHigh, opHigh, instHigh); // see ConvertToSubInst() in // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -378,102 +317,774 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( // Calling the auto-generated decoder function. Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI); + + // If a, "standard" insn isn't found check special cases. + if (MCDisassembler::Success != Result || + MI.getOpcode() == Hexagon::A4_ext) { + Result = decodeImmext(MI, Instruction, this); + if (MCDisassembler::Success != Result) { + Result = decodeSpecial(MI, Instruction); + } + } else { + // If the instruction is a compound instruction, register values will + // follow the duplex model, so the register values in the MCInst are + // incorrect. If the instruction is a compound, loop through the + // operands and change registers appropriately. 
+ if (llvm::HexagonMCInstrInfo::getType(*MCII, MI) == + HexagonII::TypeCOMPOUND) { + for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) { + if (i->isReg()) { + unsigned reg = i->getReg() - Hexagon::R0; + i->setReg(getRegFromSubinstEncoding(reg)); + } + } + } + } + } + + if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) { + unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI); + MCOperand &MCO = MI.getOperand(OpIndex); + assert(MCO.isReg() && "New value consumers must be registers"); + unsigned Register = + getContext().getRegisterInfo()->getEncodingValue(MCO.getReg()); + if ((Register & 0x6) == 0) + // HexagonPRM 10.11 Bit 1-2 == 0 is reserved + return MCDisassembler::Fail; + unsigned Lookback = (Register & 0x6) >> 1; + unsigned Offset = 1; + bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI); + auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle); + auto i = Instructions.end() - 1; + for (auto n = Instructions.begin() - 1;; --i, ++Offset) { + if (i == n) + // Couldn't find producer + return MCDisassembler::Fail; + if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst())) + // Skip scalars when calculating distances for vectors + ++Lookback; + if (HexagonMCInstrInfo::isImmext(*i->getInst())) + ++Lookback; + if (Offset == Lookback) + break; + } + auto const &Inst = *i->getInst(); + bool SubregBit = (Register & 0x1) != 0; + if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) { + // If subreg bit is set we're selecting the second produced newvalue + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg(); + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); + if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) + Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; + else if (SubregBit) + // Subreg bit should not be set for non-doublevector newvalue producers + return MCDisassembler::Fail; + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else + return MCDisassembler::Fail; } + adjustExtendedInstructions(MI, MCB); + MCInst const *Extender = + HexagonMCInstrInfo::extenderForIndex(MCB, + HexagonMCInstrInfo::bundleSize(MCB)); + if(Extender != nullptr) { + MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ? + *MI.getOperand(1).getInst() : MI; + if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) && + !HexagonMCInstrInfo::isExtended(*MCII, Inst)) + return MCDisassembler::Fail; + } return Result; } +void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI, + MCInst const &MCB) const { + if (!HexagonMCInstrInfo::hasExtenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB))) { + unsigned opcode; + // This code is used by the disassembler to disambiguate between GP + // relative and absolute addressing instructions since they both have + // same encoding bits. However, an absolute addressing instruction must + // follow an immediate extender. Disassembler alwaus select absolute + // addressing instructions first and uses this code to change them into + // GP relative instruction in the absence of the corresponding immediate + // extender. 
+ switch (MCI.getOpcode()) { + case Hexagon::S2_storerbabs: + opcode = Hexagon::S2_storerbgp; + break; + case Hexagon::S2_storerhabs: + opcode = Hexagon::S2_storerhgp; + break; + case Hexagon::S2_storerfabs: + opcode = Hexagon::S2_storerfgp; + break; + case Hexagon::S2_storeriabs: + opcode = Hexagon::S2_storerigp; + break; + case Hexagon::S2_storerbnewabs: + opcode = Hexagon::S2_storerbnewgp; + break; + case Hexagon::S2_storerhnewabs: + opcode = Hexagon::S2_storerhnewgp; + break; + case Hexagon::S2_storerinewabs: + opcode = Hexagon::S2_storerinewgp; + break; + case Hexagon::S2_storerdabs: + opcode = Hexagon::S2_storerdgp; + break; + case Hexagon::L4_loadrb_abs: + opcode = Hexagon::L2_loadrbgp; + break; + case Hexagon::L4_loadrub_abs: + opcode = Hexagon::L2_loadrubgp; + break; + case Hexagon::L4_loadrh_abs: + opcode = Hexagon::L2_loadrhgp; + break; + case Hexagon::L4_loadruh_abs: + opcode = Hexagon::L2_loadruhgp; + break; + case Hexagon::L4_loadri_abs: + opcode = Hexagon::L2_loadrigp; + break; + case Hexagon::L4_loadrd_abs: + opcode = Hexagon::L2_loadrdgp; + break; + default: + opcode = MCI.getOpcode(); + } + MCI.setOpcode(opcode); + } +} + +namespace llvm { +extern const MCInstrDesc HexagonInsts[]; +} + +static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, + ArrayRef<MCPhysReg> Table) { + if (RegNo < Table.size()) { + Inst.addOperand(MCOperand::createReg(Table[RegNo])); + return MCDisassembler::Success; + } + + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + static const MCPhysReg IntRegDecoderTable[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, + Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, + Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, + Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, + Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, + Hexagon::R30, Hexagon::R31}; + + return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); +} + +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecRegDecoderTable[] = { + Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, + Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14, + Hexagon::V15, Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19, + Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23, Hexagon::V24, + Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, + Hexagon::V30, Hexagon::V31}; + + return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable); +} + +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg DoubleRegDecoderTable[] = { + Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, + Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, + Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, + Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; + + return DecodeRegisterClass(Inst, RegNo >> 1, 
DoubleRegDecoderTable); +} + +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecDblRegDecoderTable[] = { + Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, + Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, + Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; + + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); +} + +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, + Hexagon::P2, Hexagon::P3}; + + return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable); +} + +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, + Hexagon::Q2, Hexagon::Q3}; + + return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable); +} + +static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlRegDecoderTable[] = { + Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, + Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7, + Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, + Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC + }; + + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) + return MCDisassembler::Fail; + + if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlRegDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlReg64DecoderTable[] = { + Hexagon::C1_0, Hexagon::NoRegister, + Hexagon::C3_2, Hexagon::NoRegister, + Hexagon::C7_6, Hexagon::NoRegister, + Hexagon::C9_8, Hexagon::NoRegister, + Hexagon::C11_10, Hexagon::NoRegister, + Hexagon::CS, Hexagon::NoRegister, + Hexagon::UPC, Hexagon::NoRegister + }; + + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) + return MCDisassembler::Fail; + + if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlReg64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + unsigned Register = 0; + switch (RegNo) { + case 0: + Register = Hexagon::M0; + break; + case 1: + Register = Hexagon::M1; + break; + default: + return MCDisassembler::Fail; + } + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +namespace { +uint32_t fullValue(MCInstrInfo const &MCII, + MCInst &MCB, + MCInst &MI, + int64_t Value) { + MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB)); + if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI)) + return Value; + unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI); + uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f; + int64_t Bits; + bool Success = 
Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits); + assert(Success);(void)Success; + uint32_t Upper26 = static_cast<uint32_t>(Bits); + uint32_t Operand = Upper26 | Lower6; + return Operand; +} +template <size_t T> +void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64<T>(tmp)); + int64_t Extended = SignExtend64<32>(FullValue); + HexagonMCInstrInfo::addConstant(MI, Extended, + Disassembler.getContext()); +} +} + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, tmp); + assert(FullValue >= 0 && "Negative in unsigned decoder"); + HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext()); + return MCDisassembler::Success; +} + static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<16>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<16>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<12>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<11>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<11>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder)); return MCDisassembler::Success; } static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<13>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<13>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<14>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<14>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<10>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<10>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<8>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<8>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t 
/*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<4>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<4>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<5>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<5>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<7>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<7>(MI, tmp, Decoder); return MCDisassembler::Success; } +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<10>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<19>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +// custom decoder for various jump/call immediates +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); + // r13_2 is not extendable, so if there are no extent bits, it's r13_2 + if (Bits == 0) + Bits = 15; + uint32_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64(tmp, Bits)); + int64_t Extended = SignExtend64<32>(FullValue) + Address; + if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, + 0, 4)) + HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext()); + return MCDisassembler::Success; +} + +// Addressing mode dependent load store opcode map. +// - If an insn is preceded by an extender the address is absolute. +// - memw(##symbol) = r0 +// - If an insn is not preceded by an extender the address is GP relative. +// - memw(gp + #symbol) = r0 +// Please note that the instructions must be ordered in the descending order +// of their opcode. 
+// HexagonII::INST_ICLASS_ST +static const unsigned int StoreConditionalOpcodeData[][2] = { + {S4_pstorerdfnew_abs, 0xafc02084}, + {S4_pstorerdtnew_abs, 0xafc02080}, + {S4_pstorerdf_abs, 0xafc00084}, + {S4_pstorerdt_abs, 0xafc00080}, + {S4_pstorerinewfnew_abs, 0xafa03084}, + {S4_pstorerinewtnew_abs, 0xafa03080}, + {S4_pstorerhnewfnew_abs, 0xafa02884}, + {S4_pstorerhnewtnew_abs, 0xafa02880}, + {S4_pstorerbnewfnew_abs, 0xafa02084}, + {S4_pstorerbnewtnew_abs, 0xafa02080}, + {S4_pstorerinewf_abs, 0xafa01084}, + {S4_pstorerinewt_abs, 0xafa01080}, + {S4_pstorerhnewf_abs, 0xafa00884}, + {S4_pstorerhnewt_abs, 0xafa00880}, + {S4_pstorerbnewf_abs, 0xafa00084}, + {S4_pstorerbnewt_abs, 0xafa00080}, + {S4_pstorerifnew_abs, 0xaf802084}, + {S4_pstoreritnew_abs, 0xaf802080}, + {S4_pstorerif_abs, 0xaf800084}, + {S4_pstorerit_abs, 0xaf800080}, + {S4_pstorerhfnew_abs, 0xaf402084}, + {S4_pstorerhtnew_abs, 0xaf402080}, + {S4_pstorerhf_abs, 0xaf400084}, + {S4_pstorerht_abs, 0xaf400080}, + {S4_pstorerbfnew_abs, 0xaf002084}, + {S4_pstorerbtnew_abs, 0xaf002080}, + {S4_pstorerbf_abs, 0xaf000084}, + {S4_pstorerbt_abs, 0xaf000080}}; +// HexagonII::INST_ICLASS_LD + +// HexagonII::INST_ICLASS_LD_ST_2 +static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, + {L4_loadri_abs, 0x49800000}, + {L4_loadruh_abs, 0x49600000}, + {L4_loadrh_abs, 0x49400000}, + {L4_loadrub_abs, 0x49200000}, + {L4_loadrb_abs, 0x49000000}, + {S2_storerdabs, 0x48c00000}, + {S2_storerinewabs, 0x48a01000}, + {S2_storerhnewabs, 0x48a00800}, + {S2_storerbnewabs, 0x48a00000}, + {S2_storeriabs, 0x48800000}, + {S2_storerfabs, 0x48600000}, + {S2_storerhabs, 0x48400000}, + {S2_storerbabs, 0x48000000}}; +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { + + unsigned MachineOpcode = 0; + unsigned LLVMOpcode = 0; + + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { + for (size_t i = 0; i < NumCondS; ++i) { + if ((insn & StoreConditionalOpcodeData[i][1]) == + StoreConditionalOpcodeData[i][1]) { + MachineOpcode = StoreConditionalOpcodeData[i][1]; + LLVMOpcode = StoreConditionalOpcodeData[i][0]; + break; + } + } + } + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { + for (size_t i = 0; i < NumLS; ++i) { + if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { + MachineOpcode = LoadStoreOpcodeData[i][1]; + LLVMOpcode = LoadStoreOpcodeData[i][0]; + break; + } + } + } + + if (MachineOpcode) { + unsigned Value = 0; + unsigned shift = 0; + MI.setOpcode(LLVMOpcode); + // Remove the parse bits from the insn. 
+ insn &= ~HexagonII::INST_PARSE_MASK; + + switch (LLVMOpcode) { + default: + return MCDisassembler::Fail; + break; + + case Hexagon::S4_pstorerdf_abs: + case Hexagon::S4_pstorerdt_abs: + case Hexagon::S4_pstorerdfnew_abs: + case Hexagon::S4_pstorerdtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbnewf_abs: + case Hexagon::S4_pstorerbnewt_abs: + case Hexagon::S4_pstorerbnewfnew_abs: + case Hexagon::S4_pstorerbnewtnew_abs: + case Hexagon::S4_pstorerhnewf_abs: + case Hexagon::S4_pstorerhnewt_abs: + case Hexagon::S4_pstorerhnewfnew_abs: + case Hexagon::S4_pstorerhnewtnew_abs: + case Hexagon::S4_pstorerinewf_abs: + case Hexagon::S4_pstorerinewt_abs: + case Hexagon::S4_pstorerinewfnew_abs: + case Hexagon::S4_pstorerinewtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbf_abs: + case Hexagon::S4_pstorerbt_abs: + case Hexagon::S4_pstorerbfnew_abs: + case Hexagon::S4_pstorerbtnew_abs: + case Hexagon::S4_pstorerhf_abs: + case Hexagon::S4_pstorerht_abs: + case Hexagon::S4_pstorerhfnew_abs: + case Hexagon::S4_pstorerhtnew_abs: + case Hexagon::S4_pstorerif_abs: + case Hexagon::S4_pstorerit_abs: + case Hexagon::S4_pstorerifnew_abs: + case Hexagon::S4_pstoreritnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::L4_ploadrdf_abs: + case Hexagon::L4_ploadrdt_abs: + case Hexagon::L4_ploadrdfnew_abs: + case Hexagon::L4_ploadrdtnew_abs: { + // op: Rdd + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = ((insn >> 9) & UINT64_C(3)); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = ((insn >> 15) & UINT64_C(62)); + Value |= ((insn >> 8) & UINT64_C(1)); + MI.addOperand(MCOperand::createImm(Value)); + break; + } + + case Hexagon::L4_ploadrbf_abs: + case Hexagon::L4_ploadrbt_abs: + case Hexagon::L4_ploadrbfnew_abs: + case Hexagon::L4_ploadrbtnew_abs: + case Hexagon::L4_ploadrhf_abs: + case Hexagon::L4_ploadrht_abs: + case Hexagon::L4_ploadrhfnew_abs: + case Hexagon::L4_ploadrhtnew_abs: + case Hexagon::L4_ploadrubf_abs: + case Hexagon::L4_ploadrubt_abs: + case Hexagon::L4_ploadrubfnew_abs: + case Hexagon::L4_ploadrubtnew_abs: + case Hexagon::L4_ploadruhf_abs: + case Hexagon::L4_ploadruht_abs: + case Hexagon::L4_ploadruhfnew_abs: + case Hexagon::L4_ploadruhtnew_abs: + case Hexagon::L4_ploadrif_abs: + case Hexagon::L4_ploadrit_abs: + case Hexagon::L4_ploadrifnew_abs: + case Hexagon::L4_ploadritnew_abs: + // op: Rd + Value = insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = (insn >> 9) & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 
+ Value = (insn >> 15) & UINT64_C(62); + Value |= (insn >> 8) & UINT64_C(1); + MI.addOperand(MCOperand::createImm(Value)); + break; + + // op: g16_2 + case (Hexagon::L4_loadri_abs): + ++shift; + // op: g16_1 + case Hexagon::L4_loadrh_abs: + case Hexagon::L4_loadruh_abs: + ++shift; + // op: g16_0 + case Hexagon::L4_loadrb_abs: + case Hexagon::L4_loadrub_abs: { + // op: Rd + Value |= insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << shift)); + break; + } + + case Hexagon::L4_loadrd_abs: { + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << 3)); + break; + } + + case Hexagon::S2_storerdabs: { + // op: g16_3 + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << 3)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storerinewabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhnewabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbnewabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storeriabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhabs: + case Hexagon::S2_storerfabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + } + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder) { + + // Instruction Class for a constant a extender: bits 31:28 = 0x0000 + if ((~insn & 0xf0000000) == 0xf0000000) { + unsigned Value; + // 27:16 High 12 bits of 26-bit extender. + Value = (insn & 0x0fff0000) << 4; + // 13:0 Low 14 bits of 26-bit extender. 
+ Value |= ((insn & 0x3fff) << 6); + MI.setOpcode(Hexagon::A4_ext); + HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder)); + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + // These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td enum subInstBinaryValues { V4_SA1_addi_BITS = 0x0000, @@ -731,6 +1342,8 @@ static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) { return Hexagon::R0 + encoded_reg; else if (encoded_reg < 16) return Hexagon::R0 + encoded_reg + 8; + + // patently false value return Hexagon::NoRegister; } @@ -739,10 +1352,13 @@ static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) { return Hexagon::D0 + encoded_dreg; else if (encoded_dreg < 8) return Hexagon::D0 + encoded_dreg + 4; + + // patently false value return Hexagon::NoRegister; } -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { +void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode, + unsigned inst) const { int64_t operand; MCOperand Op; switch (opcode) { @@ -762,8 +1378,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_allocframe: // u 8-4{5_3} operand = ((inst & 0x1f0) >> 4) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadri_io: // Rd 3-0, Rs 7-4, u 11-8{4_2} @@ -774,8 +1389,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 6; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadrub_io: // Rd 3-0, Rs 7-4, u 11-8 @@ -786,8 +1400,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrb_io: // Rd 3-0, Rs 7-4, u 10-8 @@ -798,8 +1411,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x700) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrh_io: case Hexagon::V4_SL2_loadruh_io: @@ -811,8 +1423,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrd_sp: // Rdd 2-0, u 7-3{5_3} @@ -820,8 +1431,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x0f8) >> 3) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadri_sp: // Rd 3-0, u 8-4{5_2} @@ -829,8 +1439,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - 
MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addi: // Rx 3-0 (x2), s7 10-4 @@ -839,8 +1448,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { MI->addOperand(Op); MI->addOperand(Op); operand = SignExtend64<7>((inst & 0x7f0) >> 4); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addrx: // Rx 3-0 (x2), Rs 7-4 @@ -873,8 +1481,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x3f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_seti: // Rd 3-0, u 9-4 @@ -882,8 +1489,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x3f0) >> 4; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_clrf: case Hexagon::V4_SA1_clrfnew: @@ -901,8 +1507,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0x3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combine0i: case Hexagon::V4_SA1_combine1i: @@ -913,8 +1518,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x060) >> 5; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combinerz: case Hexagon::V4_SA1_combinezr: @@ -932,8 +1536,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -944,8 +1547,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0xf00) >> 8) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -957,8 +1559,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0xf; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_storewi0: case Hexagon::V4_SS2_storewi1: @@ -967,25 +1568,23 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_stored_sp: // s 
8-3{6_3}, Rtt 2-0 operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getDRegFromSubinstEncoding(inst & 0x7); Op = MCOperand::createReg(operand); MI->addOperand(Op); + break; case Hexagon::V4_SS2_storeh_io: // Rs 7-4, u 10-8{3_1}, Rt 3-0 operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4); Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -993,8 +1592,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_storew_sp: // u 8-4{5_2}, Rd 3-0 operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h index d360be2..ed7d957 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.h +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h @@ -47,15 +47,8 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MachineInstr; - class MCInst; - class MCInstrInfo; - class HexagonAsmPrinter; class HexagonTargetMachine; - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, - HexagonAsmPrinter &AP); - /// \brief Creates a Hexagon-specific Target Transformation Info pass. ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); } // end namespace llvm; diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td index 53a687c..5a7eb21 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.td +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td @@ -24,14 +24,32 @@ include "llvm/Target/Target.td" // Hexagon Architectures def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">; def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">; +def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">; +def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">; + +// Hexagon ISA Extensions +def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", + "true", "Hexagon HVX instructions">; +def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", + "true", "Hexagon HVX Double instructions">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// -def HasV5T : Predicate<"HST->hasV5TOps()">; -def NoV5T : Predicate<"!HST->hasV5TOps()">; -def UseMEMOP : Predicate<"HST->useMemOps()">; -def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def HasV5T : Predicate<"HST->hasV5TOps()">; +def NoV5T : Predicate<"!HST->hasV5TOps()">; +def HasV55T : Predicate<"HST->hasV55TOps()">, + AssemblerPredicate<"ArchV55">; +def HasV60T : Predicate<"HST->hasV60TOps()">, + AssemblerPredicate<"ArchV60">; +def UseMEMOP : Predicate<"HST->useMemOps()">; +def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def UseHVXDbl : Predicate<"HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVXDbl">; +def UseHVXSgl : Predicate<"HST->useHVXSglOps()">; + +def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVX">; //===----------------------------------------------------------------------===// // Classes used for relation maps. @@ -53,6 +71,7 @@ class NewValueRel: PredNewRel; // NewValueRel - Filter class used to relate load/store instructions having // different addressing modes with each other. class AddrModeRel: NewValueRel; +class IntrinsicsRel; //===----------------------------------------------------------------------===// // Generate mapping table to relate non-predicate instructions with their @@ -62,7 +81,7 @@ class AddrModeRel: NewValueRel; def getPredOpcode : InstrMapping { let FilterClass = "PredRel"; // Instructions with the same BaseOpcode and isNVStore values form a row. - let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"]; + let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"]; // Instructions with the same predicate sense form a column. let ColFields = ["PredSense"]; // The key column is the unpredicated instructions. 
@@ -77,7 +96,7 @@ def getPredOpcode : InstrMapping { // def getFalsePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; @@ -89,7 +108,7 @@ def getFalsePredOpcode : InstrMapping { // def getTruePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -125,7 +144,7 @@ def getPredOldOpcode : InstrMapping { // def getNewValueOpcode : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -137,16 +156,16 @@ def getNewValueOpcode : InstrMapping { // def getNonNVStore : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; } -def getBasedWithImmOffset : InstrMapping { +def getBaseWithImmOffset : InstrMapping { let FilterClass = "AddrModeRel"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore", - "isMEMri", "isFloat"]; + "isFloat"]; let ColFields = ["addrMode"]; let KeyCol = ["Absolute"]; let ValueCols = [["BaseImmOffset"]]; @@ -168,6 +187,37 @@ def getRegForm : InstrMapping { let ValueCols = [["reg"]]; } +def getRegShlForm : InstrMapping { + let FilterClass = "ImmRegShl"; + let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; + let ColFields = ["InputType"]; + let KeyCol = ["imm"]; + let ValueCols = [["reg"]]; +} + +def notTakenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["true"]; + let ValueCols = [["false"]]; +} + +def takenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["false"]; + let ValueCols = [["true"]]; +} + +def getRealHWInstr : InstrMapping { + let FilterClass = "IntrinsicsRel"; + let RowFields = ["BaseOpcode"]; + let ColFields = ["InstrType"]; + let KeyCol = ["Pseudo"]; + let ValueCols = [["Pseudo"], ["Real"]]; +} //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// @@ -192,12 +242,27 @@ def : Proc<"hexagonv4", HexagonModelV4, [ArchV4]>; def : Proc<"hexagonv5", HexagonModelV4, [ArchV4, ArchV5]>; +def : Proc<"hexagonv55", HexagonModelV55, + [ArchV4, ArchV5, ArchV55]>; +def : Proc<"hexagonv60", HexagonModelV60, + [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>; //===----------------------------------------------------------------------===// // Declare the target which we are implementing 
//===----------------------------------------------------------------------===// +def HexagonAsmParser : AsmParser { + bit HasMnemonicFirst = 0; +} + +def HexagonAsmParserVariant : AsmParserVariant { + int Variant = 0; + string TokenizingCharacters = "#()=:.<>!+*"; +} + def Hexagon : Target { // Pull in Instruction Info: let InstructionSet = HexagonInstrInfo; + let AssemblyParsers = [HexagonAsmParser]; + let AssemblyParserVariants = [HexagonAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 05728d2..4c7c039 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -40,11 +40,13 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" @@ -56,12 +58,27 @@ using namespace llvm; +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + #define DEBUG_TYPE "asm-printer" static cl::opt<bool> AlignCalls( "hexagon-align-calls", cl::Hidden, cl::init(true), cl::desc("Insert falign after call instruction for Hexagon target")); +// Given a scalar register return its pair. +inline static unsigned getHexagonRegisterPair(unsigned Reg, + const MCRegisterInfo *RI) { + assert(Hexagon::IntRegsRegClass.contains(Reg)); + MCSuperRegIterator SR(Reg, RI, false); + unsigned Pair = *SR; + assert(Hexagon::DoubleRegsRegClass.contains(Pair)); + return Pair; +} + HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {} @@ -102,9 +119,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, // bool HexagonAsmPrinter:: isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { - if (MBB->hasAddressTaken()) { + if (MBB->hasAddressTaken()) return false; - } return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); } @@ -117,7 +133,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) return true; // Unknown modifier. + if (ExtraCode[1] != 0) + return true; // Unknown modifier. switch (ExtraCode[0]) { default: @@ -173,45 +190,407 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } +static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, + MCStreamer &OutStreamer, const MCOperand &Imm, + int AlignSize) { + MCSymbol *Sym; + int64_t Value; + if (Imm.getExpr()->evaluateAsAbsolute(Value)) { + StringRef sectionPrefix; + std::string ImmString; + StringRef Name; + if (AlignSize == 8) { + Name = ".CONST_0000000000000000"; + sectionPrefix = ".gnu.linkonce.l8"; + ImmString = utohexstr(Value); + } else { + Name = ".CONST_00000000"; + sectionPrefix = ".gnu.linkonce.l4"; + ImmString = utohexstr(static_cast<uint32_t>(Value)); + } + + std::string symbolName = // Yes, leading zeros are kept. 
+ Name.drop_back(ImmString.size()).str() + ImmString; + std::string sectionName = sectionPrefix.str() + symbolName; + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + OutStreamer.SwitchSection(Section); + + Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + OutStreamer.EmitIntValue(Value, AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } else { + assert(Imm.isExpr() && "Expected expression and found none"); + const MachineOperand &MO = MI.getOperand(1); + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = AP.getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = AP.GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = AP.GetJTISymbol(MO.getIndex()); + else + llvm_unreachable("Unknown operand type!"); + + StringRef SymbolName = MOSymbol->getName(); + std::string LitaName = ".CONST_" + SymbolName.str(); + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + + OutStreamer.SwitchSection(Section); + Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local); + OutStreamer.EmitValue(Imm.getExpr(), AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } + return Sym; +} + +void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MI) { + MCInst &MappedInst = static_cast<MCInst &>(Inst); + const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + default: return; + + // "$dst = CONST64(#$src1)", + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + if (!OutStreamer->hasRawTextSupport()) { + const MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8); + + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + + } + break; + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + if (!OutStreamer->hasRawTextSupport()) { + MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4); + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrigp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + } + break; + + // C2_pxfer_map maps to the C2_or instruction. It is possible to use + // C2_or during instruction selection itself, but that results + // in suboptimal code. 
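One detail of the smallData() helper above that is easy to miss: the symbol name keeps its leading zeros because the hex digits are overlaid onto the right end of a fixed template of zeros. A stand-alone sketch of that computation, using plain std::string in place of LLVM's StringRef/Twine:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    std::string constSymbolName(uint64_t Value, int AlignSize) {
      std::string Name = AlignSize == 8 ? ".CONST_0000000000000000"
                                        : ".CONST_00000000";
      char Buf[17];
      std::snprintf(Buf, sizeof(Buf), "%llx",
                    AlignSize == 8 ? (unsigned long long)Value
                                   : (unsigned long long)(uint32_t)Value);
      std::string Imm(Buf);
      // Same effect as Name.drop_back(ImmString.size()).str() + ImmString.
      assert(Name.size() - Imm.size() >= 7 && "digits fit after .CONST_");
      return Name.substr(0, Name.size() - Imm.size()) + Imm;
    }
    // constSymbolName(0x2a, 4) == ".CONST_0000002a"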
+ case Hexagon::C2_pxfer_map: { + MCOperand &Ps = Inst.getOperand(1); + MappedInst.setOpcode(Hexagon::C2_or); + MappedInst.addOperand(Ps); + return; + } + + // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo + // The insn is mapped from the 4 operand to the 3 operand raw form taking + // 3 register pairs. + case Hexagon::M2_vrcmpys_acc_s1: { + MCOperand &Rt = Inst.getOperand(3); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + assert (Rs.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rs.getReg()); + if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + MappedInst.setOpcode(Hexagon::A4_boundscheck_hi); + else // raw:lo + MappedInst.setOpcode(Hexagon::A4_boundscheck_lo); + Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI)); + return; + } + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &MO = MappedInst.getOperand(2); + int64_t Imm; + MCExpr const *Expr = MO.getExpr(); + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::S2_vsathub); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::S5_vasrhrnd_goodsyntax: + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &MO2 = MappedInst.getOperand(2); + MCExpr const *Expr = MO2.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(MappedInst.getOperand(0)); + MCOperand &MO1 = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::subreg_loreg); + // Add a new operand for the second register in the pair. 
+ TmpInst.addOperand(MCOperand::createReg(High)); + TmpInst.addOperand(MCOperand::createReg(Low)); + MappedInst = TmpInst; + return; + } + + if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax) + TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd); + else + TmpInst.setOpcode(Hexagon::S5_vasrhrnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + // if ("#u5==0") Assembler mapped to: "Rd=Rs"; else Rd=asr(Rs,#u5-1):rnd + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &MO = Inst.getOperand(2); + MCExpr const *Expr = MO.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_tfr); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::TFRI_f: + MappedInst.setOpcode(Hexagon::A2_tfrsi); + return; + case Hexagon::TFRI_cPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveit); + return; + case Hexagon::TFRI_cNotPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveif); + return; + case Hexagon::MUX_ri_f: + MappedInst.setOpcode(Hexagon::C2_muxri); + return; + case Hexagon::MUX_ir_f: + MappedInst.setOpcode(Hexagon::C2_muxir); + return; + + // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)" + case Hexagon::A2_tfrpi: { + MCInst TmpInst; + MCOperand &Rdd = MappedInst.getOperand(0); + MCOperand &MO = MappedInst.getOperand(1); + + TmpInst.setOpcode(Hexagon::A2_combineii); + TmpInst.addOperand(Rdd); + int64_t Imm; + bool Success = MO.getExpr()->evaluateAsAbsolute(Imm); + if (Success && Imm < 0) { + const MCExpr *MOne = MCConstantExpr::create(-1, OutContext); + TmpInst.addOperand(MCOperand::createExpr(MOne)); + } else { + const MCExpr *Zero = MCConstantExpr::create(0, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Zero)); + } + TmpInst.addOperand(MO); + MappedInst = TmpInst; + return; + } + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode(Hexagon::A2_combinew); + return; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? 
Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + return; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + return; + } + + case Hexagon::M2_mpysmi: { + MCOperand &Imm = MappedInst.getOperand(2); + MCExpr const *Expr = Imm.getExpr(); + int64_t Value; + bool Success = Expr->evaluateAsAbsolute(Value); + assert(Success);(void)Success; + if (Value < 0 && Value > -256) { + MappedInst.setOpcode(Hexagon::M2_mpysin); + Imm.setExpr(MCUnaryExpr::createMinus(Expr, OutContext)); + } + else + MappedInst.setOpcode(Hexagon::M2_mpysip); + return; + } + + case Hexagon::A2_addsp: { + MCOperand &Rt = Inst.getOperand(1); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::A2_addsph); + else + MappedInst.setOpcode(Hexagon::A2_addspl); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::HEXAGON_V6_vd0_pseudo: + case Hexagon::HEXAGON_V6_vd0_pseudo_128B: { + MCInst TmpInst; + assert (Inst.getOperand(0).isReg() && + "Expected register and none was found"); + + TmpInst.setOpcode(Hexagon::V6_vxor); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + MappedInst = TmpInst; + return; + } + + } +} + /// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to /// the current output stream. /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst MCB; - MCB.setOpcode(Hexagon::BUNDLE); - MCB.addOperand(MCOperand::createImm(0)); + MCInst MCB = HexagonMCInstrInfo::createBundle(); + const MCInstrInfo &MCII = *Subtarget->getInstrInfo(); if (MI->isBundle()) { const MachineBasicBlock* MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator MII = MI; + MachineBasicBlock::const_instr_iterator MII = MI->getIterator(); unsigned IgnoreCount = 0; - for (++MII; MII != MBB->end() && MII->isInsideBundle(); ++MII) { + for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) if (MII->getOpcode() == TargetOpcode::DBG_VALUE || MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; - else { - HexagonLowerToMC(MII, MCB, *this); - } - } + else + HexagonLowerToMC(MCII, &*MII, MCB, *this); } - else { - HexagonLowerToMC(MI, MCB, *this); - HexagonMCInstrInfo::padEndloop(MCB); - } - // Examine the packet and try to find instructions that can be converted - // to compounds. - HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(), - OutStreamer->getContext(), MCB); - // Examine the packet and convert pairs of instructions to duplex - // instructions when possible. 
- SmallVector<DuplexCandidate, 8> possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties( - *Subtarget->getInstrInfo(), MCB); - HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget, - OutStreamer->getContext(), MCB, possibleDuplexes); - EmitToStreamer(*OutStreamer, MCB); + else + HexagonLowerToMC(MCII, MI, MCB, *this); + + bool Ok = HexagonMCInstrInfo::canonicalizePacket( + MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr); + assert(Ok); + (void)Ok; + if(HexagonMCInstrInfo::bundleSize(MCB) == 0) + return; + OutStreamer->EmitInstruction(MCB, getSubtargetInfo()); } extern "C" void LLVMInitializeHexagonAsmPrinter() { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 792fc8b..a78d97e 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -42,6 +42,10 @@ namespace llvm { void EmitInstruction(const MachineInstr *MI) override; + void HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MBB); + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp new file mode 100644 index 0000000..4d2b545 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -0,0 +1,2790 @@ +//===--- HexagonBitSimplify.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexbit" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +using namespace llvm; + +namespace llvm { + void initializeHexagonBitSimplifyPass(PassRegistry& Registry); + FunctionPass *createHexagonBitSimplify(); +} + +namespace { + // Set of virtual registers, based on BitVector. 
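The RegisterSet below stores virtual registers in an ordinary BitVector by translating register numbers to compact indices and back (v2x/x2v). A minimal sketch of that translation, assuming the tag-bit convention LLVM used in this era for virtual-register numbers (which is also what makes 0 safe as the not-found sentinel of find_first/find_next):

    #include <cstdint>

    constexpr uint32_t VirtRegTag = 1u << 31; // assumed virtual-register marker

    constexpr uint32_t v2x(uint32_t VReg)  { return VReg & ~VirtRegTag; }
    constexpr uint32_t x2v(uint32_t Index) { return Index | VirtRegTag; }

    static_assert(x2v(v2x(VirtRegTag | 5)) == (VirtRegTag | 5), "round trip");
    static_assert(x2v(0) != 0, "0 never names a virtual register");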
+ struct RegisterSet : private BitVector { + RegisterSet() : BitVector() {} + explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + RegisterSet(const RegisterSet &RS) : BitVector(RS) {} + + using BitVector::clear; + using BitVector::count; + + unsigned find_first() const { + int First = BitVector::find_first(); + if (First < 0) + return 0; + return x2v(First); + } + + unsigned find_next(unsigned Prev) const { + int Next = BitVector::find_next(v2x(Prev)); + if (Next < 0) + return 0; + return x2v(Next); + } + + RegisterSet &insert(unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return static_cast<RegisterSet&>(BitVector::set(Idx)); + } + RegisterSet &remove(unsigned R) { + unsigned Idx = v2x(R); + if (Idx >= size()) + return *this; + return static_cast<RegisterSet&>(BitVector::reset(Idx)); + } + + RegisterSet &insert(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::operator|=(Rs)); + } + RegisterSet &remove(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::reset(Rs)); + } + + reference operator[](unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return BitVector::operator[](Idx); + } + bool operator[](unsigned R) const { + unsigned Idx = v2x(R); + assert(Idx < size()); + return BitVector::operator[](Idx); + } + bool has(unsigned R) const { + unsigned Idx = v2x(R); + if (Idx >= size()) + return false; + return BitVector::test(Idx); + } + + bool empty() const { + return !BitVector::any(); + } + bool includes(const RegisterSet &Rs) const { + // A.BitVector::test(B) <=> A-B != {} + return !Rs.BitVector::test(*this); + } + bool intersects(const RegisterSet &Rs) const { + return BitVector::anyCommon(Rs); + } + + private: + void ensure(unsigned Idx) { + if (size() <= Idx) + resize(std::max(Idx+1, 32U)); + } + static inline unsigned v2x(unsigned v) { + return TargetRegisterInfo::virtReg2Index(v); + } + static inline unsigned x2v(unsigned x) { + return TargetRegisterInfo::index2VirtReg(x); + } + }; + + + struct PrintRegSet { + PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI) + : RS(S), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, + const PrintRegSet &P); + private: + const RegisterSet &RS; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) + LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { + OS << '{'; + for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) + OS << ' ' << PrintReg(R, P.TRI); + OS << " }"; + return OS; + } +} + + +namespace { + class Transformation; + + class HexagonBitSimplify : public MachineFunctionPass { + public: + static char ID; + HexagonBitSimplify() : MachineFunctionPass(ID), MDT(0) { + initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon bit simplification"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs); + static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses); + static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1, + const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W); + static bool isConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool 
isZero(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W, uint64_t &U); + static bool replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI); + static bool getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI); + static bool replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI); + static bool replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI); + static bool parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH); + + static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin); + static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits, + uint16_t Begin, const HexagonInstrInfo &HII); + + static const TargetRegisterClass *getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI); + static bool isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI); + + private: + MachineDominatorTree *MDT; + + bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs); + }; + + char HexagonBitSimplify::ID = 0; + typedef HexagonBitSimplify HBS; + + + // The purpose of this class is to provide a common facility to traverse + // the function top-down or bottom-up via the dominator tree, and keep + // track of the available registers. + class Transformation { + public: + bool TopDown; + Transformation(bool TD) : TopDown(TD) {} + virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0; + virtual ~Transformation() {} + }; +} + +INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) + + +bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T, + RegisterSet &AVs) { + MachineDomTreeNode *N = MDT->getNode(&B); + typedef GraphTraits<MachineDomTreeNode*> GTN; + bool Changed = false; + + if (T.TopDown) + Changed = T.processBlock(B, AVs); + + RegisterSet Defs; + for (auto &I : B) + getInstrDefs(I, Defs); + RegisterSet NewAVs = AVs; + NewAVs.insert(Defs); + + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) { + MachineBasicBlock *SB = (*I)->getBlock(); + Changed |= visitBlock(*SB, T, NewAVs); + } + if (!T.TopDown) + Changed |= T.processBlock(B, AVs); + + return Changed; +} + +// +// Utility functions: +// +void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI, + RegisterSet &Defs) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Defs.insert(R); + } +} + +void HexagonBitSimplify::getInstrUses(const MachineInstr &MI, + RegisterSet &Uses) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Uses.insert(R); + } +} + +// Check if all the bits in range [B, E) in both cells are equal. 
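In simplified form, with a three-valued bit type standing in for BitTracker cells, the comparison that follows amounts to this sketch: a bit whose value is unknown ("bottom") can never be proven equal to anything.

    #include <vector>

    enum class Bit { Zero, One, Bottom }; // Bottom = value not known

    bool rangesEqual(const std::vector<Bit> &C1, unsigned B1,
                     const std::vector<Bit> &C2, unsigned B2, unsigned W) {
      for (unsigned i = 0; i < W; ++i) {
        if (C1[B1 + i] == Bit::Bottom || C2[B2 + i] == Bit::Bottom)
          return false; // unknown bits are never provably equal
        if (C1[B1 + i] != C2[B2 + i])
          return false;
      }
      return true;
    }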
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1, + uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2, + uint16_t W) { + for (uint16_t i = 0; i < W; ++i) { + // If RC1[i] is "bottom", it cannot be proven equal to RC2[i]. + if (RC1[B1+i].Type == BitTracker::BitValue::Ref && RC1[B1+i].RefI.Reg == 0) + return false; + // Same for RC2[i]. + if (RC2[B2+i].Type == BitTracker::BitValue::Ref && RC2[B2+i].RefI.Reg == 0) + return false; + if (RC1[B1+i] != RC2[B2+i]) + return false; + } + return true; +} + + +bool HexagonBitSimplify::isConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].num()) + return false; + return true; +} + + +bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].is(0)) + return false; + return true; +} + + +bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W, uint64_t &U) { + assert(B < RC.width() && B+W <= RC.width()); + int64_t T = 0; + for (uint16_t i = B+W; i > B; --i) { + const BitTracker::BitValue &BV = RC[i-1]; + T <<= 1; + if (BV.is(1)) + T |= 1; + else if (!BV.is(0)) + return false; + } + U = T; + return true; +} + + +bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + if (I->getSubReg() != OldSR) + continue; + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +// For a register ref (pair Reg:Sub), set Begin to the position of the LSB +// of Sub in Reg, and set Width to the size of Sub in bits. Return true, +// if this succeeded, otherwise return false. +bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); + if (RC == &Hexagon::IntRegsRegClass) { + assert(RR.Sub == 0); + Begin = 0; + Width = 32; + return true; + } + if (RC == &Hexagon::DoubleRegsRegClass) { + if (RR.Sub == 0) { + Begin = 0; + Width = 64; + return true; + } + assert(RR.Sub == Hexagon::subreg_loreg || RR.Sub == Hexagon::subreg_hireg); + Width = 32; + Begin = (RR.Sub == Hexagon::subreg_loreg ? 
0 : 32); + return true; + } + return false; +} + + +// For a REG_SEQUENCE, set SL to the low subregister and SH to the high +// subregister. +bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH) { + assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE); + unsigned Sub1 = I.getOperand(2).getImm(), Sub2 = I.getOperand(4).getImm(); + assert(Sub1 != Sub2); + if (Sub1 == Hexagon::subreg_loreg && Sub2 == Hexagon::subreg_hireg) { + SL = I.getOperand(1); + SH = I.getOperand(3); + return true; + } + if (Sub1 == Hexagon::subreg_hireg && Sub2 == Hexagon::subreg_loreg) { + SH = I.getOperand(1); + SL = I.getOperand(3); + return true; + } + return false; +} + + +// All stores (except 64-bit stores) take a 32-bit register as the source +// of the value to be stored. If the instruction stores into a location +// that is shorter than 32 bits, some bits of the source register are not +// used. For each store instruction, calculate the set of used bits in +// the source register, and set appropriate bits in Bits. Return true if +// the bits are calculated, false otherwise. +bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin) { + using namespace Hexagon; + + switch (Opc) { + // Store byte + case S2_storerb_io: // memb(Rs32+#s11:0)=Rt32 + case S2_storerbnew_io: // memb(Rs32+#s11:0)=Nt8.new + case S2_pstorerbt_io: // if (Pv4) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbf_io: // if (!Pv4) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbnewt_io: // if (Pv4) memb(Rs32+#u6:0)=Nt8.new + case S2_pstorerbnewf_io: // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S2_storerb_pi: // memb(Rx32++#s4:0)=Rt32 + case S2_storerbnew_pi: // memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbt_pi: // if (Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbnewt_pi: // if (Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S4_storerb_ap: // memb(Re32=#U6)=Rt32 + case S4_storerbnew_ap: // memb(Re32=#U6)=Nt8.new + case S2_storerb_pr: // memb(Rx32++Mu2)=Rt32 + case S2_storerbnew_pr: // memb(Rx32++Mu2)=Nt8.new + case S4_storerb_ur: // memb(Ru32<<#u2+#U6)=Rt32 + case S4_storerbnew_ur: // memb(Ru32<<#u2+#U6)=Nt8.new + case S2_storerb_pbr: // memb(Rx32++Mu2:brev)=Rt32 + case S2_storerbnew_pbr: // memb(Rx32++Mu2:brev)=Nt8.new + case S2_storerb_pci: // memb(Rx32++#s4:0:circ(Mu2))=Rt32 + case S2_storerbnew_pci: // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new + case S2_storerb_pcr: // memb(Rx32++I:circ(Mu2))=Rt32 + case S2_storerbnew_pcr: // memb(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerb_rr: // memb(Rs32+Ru32<<#u2)=Rt32 + case S4_storerbnew_rr: // memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbfnew_rr: // if (!Pv4.new) 
memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbnewt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerbgp: // memb(gp+#u16:0)=Rt32 + case S2_storerbnewgp: // memb(gp+#u16:0)=Nt8.new + case S4_pstorerbt_abs: // if (Pv4) memb(#u6)=Rt32 + case S4_pstorerbf_abs: // if (!Pv4) memb(#u6)=Rt32 + case S4_pstorerbtnew_abs: // if (Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbfnew_abs: // if (!Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbnewt_abs: // if (Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewf_abs: // if (!Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewtnew_abs: // if (Pv4.new) memb(#u6)=Nt8.new + case S4_pstorerbnewfnew_abs: // if (!Pv4.new) memb(#u6)=Nt8.new + Bits.set(Begin, Begin+8); + return true; + + // Store low half + case S2_storerh_io: // memh(Rs32+#s11:1)=Rt32 + case S2_storerhnew_io: // memh(Rs32+#s11:1)=Nt8.new + case S2_pstorerht_io: // if (Pv4) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhf_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhnewt_io: // if (Pv4) memh(Rs32+#u6:1)=Nt8.new + case S2_pstorerhnewf_io: // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S2_storerh_pi: // memh(Rx32++#s4:1)=Rt32 + case S2_storerhnew_pi: // memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerht_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhnewt_pi: // if (Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S4_storerh_ap: // memh(Re32=#U6)=Rt32 + case S4_storerhnew_ap: // memh(Re32=#U6)=Nt8.new + case S2_storerh_pr: // memh(Rx32++Mu2)=Rt32 + case S2_storerhnew_pr: // memh(Rx32++Mu2)=Nt8.new + case S4_storerh_ur: // memh(Ru32<<#u2+#U6)=Rt32 + case S4_storerhnew_ur: // memh(Ru32<<#u2+#U6)=Nt8.new + case S2_storerh_pbr: // memh(Rx32++Mu2:brev)=Rt32 + case S2_storerhnew_pbr: // memh(Rx32++Mu2:brev)=Nt8.new + case S2_storerh_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt32 + case S2_storerhnew_pci: // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new + case S2_storerh_pcr: // memh(Rx32++I:circ(Mu2))=Rt32 + case S2_storerhnew_pcr: // memh(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerh_rr: // memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerht_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_storerhnew_rr: // memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewt_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerhgp: // memh(gp+#u16:1)=Rt32 
+ case S2_storerhnewgp: // memh(gp+#u16:1)=Nt8.new + case S4_pstorerht_abs: // if (Pv4) memh(#u6)=Rt32 + case S4_pstorerhf_abs: // if (!Pv4) memh(#u6)=Rt32 + case S4_pstorerhtnew_abs: // if (Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhfnew_abs: // if (!Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhnewt_abs: // if (Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewf_abs: // if (!Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewtnew_abs: // if (Pv4.new) memh(#u6)=Nt8.new + case S4_pstorerhnewfnew_abs: // if (!Pv4.new) memh(#u6)=Nt8.new + Bits.set(Begin, Begin+16); + return true; + + // Store high half + case S2_storerf_io: // memh(Rs32+#s11:1)=Rt.H32 + case S2_pstorerft_io: // if (Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S2_pstorerff_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerftnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerffnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S2_storerf_pi: // memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerft_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerff_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerftnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerffnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S4_storerf_ap: // memh(Re32=#U6)=Rt.H32 + case S2_storerf_pr: // memh(Rx32++Mu2)=Rt.H32 + case S4_storerf_ur: // memh(Ru32<<#u2+#U6)=Rt.H32 + case S2_storerf_pbr: // memh(Rx32++Mu2:brev)=Rt.H32 + case S2_storerf_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32 + case S2_storerf_pcr: // memh(Rx32++I:circ(Mu2))=Rt.H32 + case S4_storerf_rr: // memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerft_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerff_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerftnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerffnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S2_storerfgp: // memh(gp+#u16:1)=Rt.H32 + case S4_pstorerft_abs: // if (Pv4) memh(#u6)=Rt.H32 + case S4_pstorerff_abs: // if (!Pv4) memh(#u6)=Rt.H32 + case S4_pstorerftnew_abs: // if (Pv4.new) memh(#u6)=Rt.H32 + case S4_pstorerffnew_abs: // if (!Pv4.new) memh(#u6)=Rt.H32 + Bits.set(Begin+16, Begin+32); + return true; + } + + return false; +} + + +// For an instruction with opcode Opc, calculate the set of bits that it +// uses in a register in operand OpN. This only calculates the set of used +// bits for cases where it does not depend on any operands (as is the case +// in shifts, for example). For concrete instructions from a program, the +// operand may be a subregister of a larger register, while Bits would +// correspond to the larger register in its entirety. Because of that, +// the parameter Begin can be used to indicate which bit of Bits should be +// considered the LSB of the operand. +bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN, + BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) { + using namespace Hexagon; + + const MCInstrDesc &D = HII.get(Opc); + if (D.mayStore()) { + if (OpN == D.getNumOperands()-1) + return getUsedBitsInStore(Opc, Bits, Begin); + return false; + } + + switch (Opc) { + // One register source. Used bits: R1[0-7]. + case A2_sxtb: + case A2_zxtb: + case A4_cmpbeqi: + case A4_cmpbgti: + case A4_cmpbgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // One register source. Used bits: R1[0-15]. 
+ case A2_aslh: + case A2_sxth: + case A2_zxth: + case A4_cmpheqi: + case A4_cmphgti: + case A4_cmphgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // One register source. Used bits: R1[16-31]. + case A2_asrh: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources. Used bits: R1[0-7], R2[0-7]. + case A4_cmpbeq: + case A4_cmpbgt: + case A4_cmpbgtu: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[0-15]. + case A4_cmpheq: + case A4_cmphgt: + case A4_cmphgtu: + case A2_addh_h16_ll: + case A2_addh_h16_sat_ll: + case A2_addh_l16_ll: + case A2_addh_l16_sat_ll: + case A2_combine_ll: + case A2_subh_h16_ll: + case A2_subh_h16_sat_ll: + case A2_subh_l16_ll: + case A2_subh_l16_sat_ll: + case M2_mpy_acc_ll_s0: + case M2_mpy_acc_ll_s1: + case M2_mpy_acc_sat_ll_s0: + case M2_mpy_acc_sat_ll_s1: + case M2_mpy_ll_s0: + case M2_mpy_ll_s1: + case M2_mpy_nac_ll_s0: + case M2_mpy_nac_ll_s1: + case M2_mpy_nac_sat_ll_s0: + case M2_mpy_nac_sat_ll_s1: + case M2_mpy_rnd_ll_s0: + case M2_mpy_rnd_ll_s1: + case M2_mpy_sat_ll_s0: + case M2_mpy_sat_ll_s1: + case M2_mpy_sat_rnd_ll_s0: + case M2_mpy_sat_rnd_ll_s1: + case M2_mpyd_acc_ll_s0: + case M2_mpyd_acc_ll_s1: + case M2_mpyd_ll_s0: + case M2_mpyd_ll_s1: + case M2_mpyd_nac_ll_s0: + case M2_mpyd_nac_ll_s1: + case M2_mpyd_rnd_ll_s0: + case M2_mpyd_rnd_ll_s1: + case M2_mpyu_acc_ll_s0: + case M2_mpyu_acc_ll_s1: + case M2_mpyu_ll_s0: + case M2_mpyu_ll_s1: + case M2_mpyu_nac_ll_s0: + case M2_mpyu_nac_ll_s1: + case M2_mpyud_acc_ll_s0: + case M2_mpyud_acc_ll_s1: + case M2_mpyud_ll_s0: + case M2_mpyud_ll_s1: + case M2_mpyud_nac_ll_s0: + case M2_mpyud_nac_ll_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[16-31]. + case A2_addh_h16_lh: + case A2_addh_h16_sat_lh: + case A2_combine_lh: + case A2_subh_h16_lh: + case A2_subh_h16_sat_lh: + case M2_mpy_acc_lh_s0: + case M2_mpy_acc_lh_s1: + case M2_mpy_acc_sat_lh_s0: + case M2_mpy_acc_sat_lh_s1: + case M2_mpy_lh_s0: + case M2_mpy_lh_s1: + case M2_mpy_nac_lh_s0: + case M2_mpy_nac_lh_s1: + case M2_mpy_nac_sat_lh_s0: + case M2_mpy_nac_sat_lh_s1: + case M2_mpy_rnd_lh_s0: + case M2_mpy_rnd_lh_s1: + case M2_mpy_sat_lh_s0: + case M2_mpy_sat_lh_s1: + case M2_mpy_sat_rnd_lh_s0: + case M2_mpy_sat_rnd_lh_s1: + case M2_mpyd_acc_lh_s0: + case M2_mpyd_acc_lh_s1: + case M2_mpyd_lh_s0: + case M2_mpyd_lh_s1: + case M2_mpyd_nac_lh_s0: + case M2_mpyd_nac_lh_s1: + case M2_mpyd_rnd_lh_s0: + case M2_mpyd_rnd_lh_s1: + case M2_mpyu_acc_lh_s0: + case M2_mpyu_acc_lh_s1: + case M2_mpyu_lh_s0: + case M2_mpyu_lh_s1: + case M2_mpyu_nac_lh_s0: + case M2_mpyu_nac_lh_s1: + case M2_mpyud_acc_lh_s0: + case M2_mpyud_acc_lh_s1: + case M2_mpyud_lh_s0: + case M2_mpyud_lh_s1: + case M2_mpyud_nac_lh_s0: + case M2_mpyud_nac_lh_s1: + // These four are actually LH. + case A2_addh_l16_hl: + case A2_addh_l16_sat_hl: + case A2_subh_l16_hl: + case A2_subh_l16_sat_hl: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + if (OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[0-15]. 
+ case A2_addh_h16_hl: + case A2_addh_h16_sat_hl: + case A2_combine_hl: + case A2_subh_h16_hl: + case A2_subh_h16_sat_hl: + case M2_mpy_acc_hl_s0: + case M2_mpy_acc_hl_s1: + case M2_mpy_acc_sat_hl_s0: + case M2_mpy_acc_sat_hl_s1: + case M2_mpy_hl_s0: + case M2_mpy_hl_s1: + case M2_mpy_nac_hl_s0: + case M2_mpy_nac_hl_s1: + case M2_mpy_nac_sat_hl_s0: + case M2_mpy_nac_sat_hl_s1: + case M2_mpy_rnd_hl_s0: + case M2_mpy_rnd_hl_s1: + case M2_mpy_sat_hl_s0: + case M2_mpy_sat_hl_s1: + case M2_mpy_sat_rnd_hl_s0: + case M2_mpy_sat_rnd_hl_s1: + case M2_mpyd_acc_hl_s0: + case M2_mpyd_acc_hl_s1: + case M2_mpyd_hl_s0: + case M2_mpyd_hl_s1: + case M2_mpyd_nac_hl_s0: + case M2_mpyd_nac_hl_s1: + case M2_mpyd_rnd_hl_s0: + case M2_mpyd_rnd_hl_s1: + case M2_mpyu_acc_hl_s0: + case M2_mpyu_acc_hl_s1: + case M2_mpyu_hl_s0: + case M2_mpyu_hl_s1: + case M2_mpyu_nac_hl_s0: + case M2_mpyu_nac_hl_s1: + case M2_mpyud_acc_hl_s0: + case M2_mpyud_acc_hl_s1: + case M2_mpyud_hl_s0: + case M2_mpyud_hl_s1: + case M2_mpyud_nac_hl_s0: + case M2_mpyud_nac_hl_s1: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + if (OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[16-31]. + case A2_addh_h16_hh: + case A2_addh_h16_sat_hh: + case A2_combine_hh: + case A2_subh_h16_hh: + case A2_subh_h16_sat_hh: + case M2_mpy_acc_hh_s0: + case M2_mpy_acc_hh_s1: + case M2_mpy_acc_sat_hh_s0: + case M2_mpy_acc_sat_hh_s1: + case M2_mpy_hh_s0: + case M2_mpy_hh_s1: + case M2_mpy_nac_hh_s0: + case M2_mpy_nac_hh_s1: + case M2_mpy_nac_sat_hh_s0: + case M2_mpy_nac_sat_hh_s1: + case M2_mpy_rnd_hh_s0: + case M2_mpy_rnd_hh_s1: + case M2_mpy_sat_hh_s0: + case M2_mpy_sat_hh_s1: + case M2_mpy_sat_rnd_hh_s0: + case M2_mpy_sat_rnd_hh_s1: + case M2_mpyd_acc_hh_s0: + case M2_mpyd_acc_hh_s1: + case M2_mpyd_hh_s0: + case M2_mpyd_hh_s1: + case M2_mpyd_nac_hh_s0: + case M2_mpyd_nac_hh_s1: + case M2_mpyd_rnd_hh_s0: + case M2_mpyd_rnd_hh_s1: + case M2_mpyu_acc_hh_s0: + case M2_mpyu_acc_hh_s1: + case M2_mpyu_hh_s0: + case M2_mpyu_hh_s1: + case M2_mpyu_nac_hh_s0: + case M2_mpyu_nac_hh_s1: + case M2_mpyud_acc_hh_s0: + case M2_mpyud_acc_hh_s1: + case M2_mpyud_hh_s0: + case M2_mpyud_hh_s1: + case M2_mpyud_nac_hh_s0: + case M2_mpyud_nac_hh_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + } + + return false; +} + + +// Calculate the register class that matches Reg:Sub. For example, if +// vreg1 is a double register, then vreg1:subreg_hireg would match the "int" +// register class. +const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return nullptr; + auto *RC = MRI.getRegClass(RR.Reg); + if (RR.Sub == 0) + return RC; + + auto VerifySR = [] (unsigned Sub) -> void { + assert(Sub == Hexagon::subreg_hireg || Sub == Hexagon::subreg_loreg); + }; + + switch (RC->getID()) { + case Hexagon::DoubleRegsRegClassID: + VerifySR(RR.Sub); + return &Hexagon::IntRegsRegClass; + } + return nullptr; +} + + +// Check if RD could be replaced with RS at any possible use of RD. +// For example, a predicate register cannot be replaced with an integer +// register, but a 64-bit register with a subregister can be replaced +// with a 32-bit register. 
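Reduced to its essence (with a hypothetical enum standing in for the real register classes), the test defined next compares "final" classes, where a 64-bit pair accessed through a 32-bit subregister behaves as a 32-bit register:

    enum class RC { Int32, Double64, Pred };

    struct Ref { RC Cls; bool Has32BitSub; };

    RC finalClass(Ref R) {
      if (R.Cls == RC::Double64 && R.Has32BitSub)
        return RC::Int32; // pair:lo and pair:hi act as 32-bit registers
      return R.Cls;
    }

    bool transparentCopy(Ref D, Ref S) {
      return finalClass(D) == finalClass(S); // Pred vs Int32 -> false, etc.
    }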
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) || + !TargetRegisterInfo::isVirtualRegister(RS.Reg)) + return false; + // Return false if one (or both) classes are nullptr. + auto *DRC = getFinalVRegClass(RD, MRI); + if (!DRC) + return false; + + return DRC == getFinalVRegClass(RS, MRI); +} + + +// +// Dead code elimination +// +namespace { + class DeadCodeElimination { + public: + DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt) + : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()), + MDT(mdt), MRI(mf.getRegInfo()) {} + + bool run() { + return runOnNode(MDT.getRootNode()); + } + + private: + bool isDead(unsigned R) const; + bool runOnNode(MachineDomTreeNode *N); + + MachineFunction &MF; + const HexagonInstrInfo &HII; + MachineDominatorTree &MDT; + MachineRegisterInfo &MRI; + }; +} + + +bool DeadCodeElimination::isDead(unsigned R) const { + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + MachineInstr *UseI = I->getParent(); + if (UseI->isDebugValue()) + continue; + if (UseI->isPHI()) { + assert(!UseI->getOperand(0).getSubReg()); + unsigned DR = UseI->getOperand(0).getReg(); + if (DR == R) + continue; + } + return false; + } + return true; +} + + +bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { + bool Changed = false; + typedef GraphTraits<MachineDomTreeNode*> GTN; + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + Changed |= runOnNode(*I); + + MachineBasicBlock *B = N->getBlock(); + std::vector<MachineInstr*> Instrs; + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) + Instrs.push_back(&*I); + + for (auto MI : Instrs) { + unsigned Opc = MI->getOpcode(); + // Do not touch lifetime markers. This is why the target-independent DCE + // cannot be used. + if (Opc == TargetOpcode::LIFETIME_START || + Opc == TargetOpcode::LIFETIME_END) + continue; + bool Store = false; + if (MI->isInlineAsm()) + continue; + // Delete PHIs if possible. + if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store)) + continue; + + bool AllDead = true; + SmallVector<unsigned,2> Regs; + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) { + AllDead = false; + break; + } + Regs.push_back(R); + } + if (!AllDead) + continue; + + B->erase(MI); + for (unsigned i = 0, n = Regs.size(); i != n; ++i) + MRI.markUsesInDebugValueAsUndef(Regs[i]); + Changed = true; + } + + return Changed; +} + + +// +// Eliminate redundant instructions +// +// This transformation will identify instructions where the output register +// is the same as one of its input registers. This only works on instructions +// that define a single register (unlike post-increment loads, for example). +// The equality check is actually more detailed: the code calculates which +// bits of the output are used, and only compares these bits with the input +// registers. +// If the output matches an input, the instruction is replaced with COPY. +// The copies will be removed by another transformation. 
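A toy model of that used-bits comparison, shrunk to 8-bit "registers" (the values and mask are invented for illustration): a definition is redundant when it agrees with one of its sources on every bit that downstream users actually read.

    #include <cstdint>

    bool equalOnUsedBits(uint8_t Def, uint8_t Src, uint8_t UsedMask) {
      return ((Def ^ Src) & UsedMask) == 0;
    }

    // Def = 0xA1 and Src = 0x01 differ only in the high nibble; if the sole
    // consumer is an "and #0x0F", UsedMask is 0x0F and the defining
    // instruction can be rewritten as a plain copy of Src.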
+namespace { + class RedundantInstrElimination : public Transformation { + public: + RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool computeUsedBits(unsigned Reg, BitVector &Bits); + bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits, + uint16_t Begin); + bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the instruction is a lossy shift left, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN, Width; + switch (Opc) { + case S2_asl_i_p: + ImN = 2; + RegN = 1; + Width = 64; + break; + case S2_asl_i_p_acc: + case S2_asl_i_p_and: + case S2_asl_i_p_nac: + case S2_asl_i_p_or: + case S2_asl_i_p_xacc: + ImN = 3; + RegN = 2; + Width = 64; + break; + case S2_asl_i_r: + ImN = 2; + RegN = 1; + Width = 32; + break; + case S2_addasl_rrri: + case S4_andi_asl_ri: + case S4_ori_asl_ri: + case S4_addi_asl_ri: + case S4_subi_asl_ri: + case S2_asl_i_r_acc: + case S2_asl_i_r_and: + case S2_asl_i_r_nac: + case S2_asl_i_r_or: + case S2_asl_i_r_sat: + case S2_asl_i_r_xacc: + ImN = 3; + RegN = 2; + Width = 32; + break; + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + if (S == 0) + return false; + LostB = Width-S; + LostE = Width; + return true; +} + + +// Check if the instruction is a lossy shift right, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN; + switch (Opc) { + case S2_asr_i_p: + case S2_lsr_i_p: + ImN = 2; + RegN = 1; + break; + case S2_asr_i_p_acc: + case S2_asr_i_p_and: + case S2_asr_i_p_nac: + case S2_asr_i_p_or: + case S2_lsr_i_p_acc: + case S2_lsr_i_p_and: + case S2_lsr_i_p_nac: + case S2_lsr_i_p_or: + case S2_lsr_i_p_xacc: + ImN = 3; + RegN = 2; + break; + case S2_asr_i_r: + case S2_lsr_i_r: + ImN = 2; + RegN = 1; + break; + case S4_andi_lsr_ri: + case S4_ori_lsr_ri: + case S4_addi_lsr_ri: + case S4_subi_lsr_ri: + case S2_asr_i_r_acc: + case S2_asr_i_r_and: + case S2_asr_i_r_nac: + case S2_asr_i_r_or: + case S2_lsr_i_r_acc: + case S2_lsr_i_r_and: + case S2_lsr_i_r_nac: + case S2_lsr_i_r_or: + case S2_lsr_i_r_xacc: + ImN = 3; + RegN = 2; + break; + + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + LostB = 0; + LostE = S; + return true; +} + + +// Calculate the bit vector that corresponds to the used bits of register Reg. 
+// The vector Bits has the same size as Reg in bits. If the calculation +// fails (i.e. the used bits are unknown), it returns false. Otherwise, +// it returns true and sets the corresponding bits in Bits. +bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) { + BitVector Used(Bits.size()); + RegisterSet Visited; + std::vector<unsigned> Pending; + Pending.push_back(Reg); + + for (unsigned i = 0; i < Pending.size(); ++i) { + unsigned R = Pending[i]; + if (Visited.has(R)) + continue; + Visited.insert(R); + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + BitTracker::RegisterRef UR = *I; + unsigned B, W; + if (!HBS::getSubregMask(UR, B, W, MRI)) + return false; + MachineInstr &UseI = *I->getParent(); + if (UseI.isPHI() || UseI.isCopy()) { + unsigned DefR = UseI.getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + return false; + Pending.push_back(DefR); + } else { + if (!computeUsedBits(UseI, I.getOperandNo(), Used, B)) + return false; + } + } + } + Bits |= Used; + return true; +} + + +// Calculate the bits used by instruction MI in a register in operand OpN. +// Return true/false if the calculation succeeds/fails. If it succeeds, set +// used bits in Bits. This function does not reset any bits in Bits, so +// subsequent calls over different instructions will result in the union +// of the used bits in all these instructions. +// The register in question may be used with a sub-register, whereas Bits +// holds the bits for the entire register. To keep track of that, the +// argument Begin indicates where in Bits is the least-significant bit +// of the register used in operand OpN. For example, in instruction: +// vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10 +// operand 1 is a 32-bit register, which happens to be a subregister +// of the 64-bit register vreg2, and that subregister starts at position 32. +// In this case Begin=32, since Bits[32] would be the least-significant bit +// of vreg2:subreg_hireg. +bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, + unsigned OpN, BitVector &Bits, uint16_t Begin) { + unsigned Opc = MI.getOpcode(); + BitVector T(Bits.size()); + bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII); + // Even if we don't have bits yet, we could still provide some information + // if the instruction is a lossy shift: the lost bits will be marked as + // not used. + unsigned LB, LE; + if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) { + assert(MI.getOperand(OpN).isReg()); + BitTracker::RegisterRef RR = MI.getOperand(OpN); + const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); + uint16_t Width = RC->getSize()*8; + + if (!GotBits) + T.set(Begin, Begin+Width); + assert(LB <= LE && LB < Width && LE <= Width); + T.reset(Begin+LB, Begin+LE); + GotBits = true; + } + if (GotBits) + Bits |= T; + return GotBits; +} + + +// Calculates the used bits in RD ("defined register"), and checks if these +// bits in RS ("used register") and RD are identical. 
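To make the Begin convention described above concrete, here is a sketch (hypothetical helper, fixed 64-bit pair) for the quoted example "vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10": the high subregister occupies bits 32..63 of the pair, and the right shift discards the operand's low 10 bits.

    #include <bitset>

    std::bitset<64> usedBitsOfHiSubregUse(unsigned ShiftAmt) {
      std::bitset<64> Bits;
      const unsigned Begin = 32; // LSB of subreg_hireg within the 64-bit pair
      for (unsigned i = ShiftAmt; i < 32; ++i)
        Bits.set(Begin + i); // lsr #10 leaves subregister bits 10..31 used
      return Bits; // bits 42..63 of vreg2 when ShiftAmt == 10
    }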
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD, + BitTracker::RegisterRef RS) { + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + + unsigned DB, DW; + if (!HBS::getSubregMask(RD, DB, DW, MRI)) + return false; + unsigned SB, SW; + if (!HBS::getSubregMask(RS, SB, SW, MRI)) + return false; + if (SW != DW) + return false; + + BitVector Used(DC.width()); + if (!computeUsedBits(RD.Reg, Used)) + return false; + + for (unsigned i = 0; i != DW; ++i) + if (Used[i+DB] && DC[DB+i] != SC[SB+i]) + return false; + return true; +} + + +bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, + const RegisterSet&) { + bool Changed = false; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) { + NextI = std::next(I); + MachineInstr *MI = &*I; + + if (MI->getOpcode() == TargetOpcode::COPY) + continue; + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + continue; + unsigned NumD = MI->getDesc().getNumDefs(); + if (NumD != 1) + continue; + + BitTracker::RegisterRef RD = MI->getOperand(0); + if (!BT.has(RD.Reg)) + continue; + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + + // Find a source operand that is equal to the result. + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + if (!HBS::isTransparentCopy(RD, RS, MRI)) + continue; + + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + if (!usedBitsEqual(RD, RS) && !HBS::isEqual(DC, 0, SC, BN, BW)) + continue; + + // If found, replace the instruction with a COPY. + DebugLoc DL = MI->getDebugLoc(); + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(RS.Reg, 0, RS.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), SC); + Changed = true; + break; + } + } + + return Changed; +} + + +// +// Const generation +// +// Recognize instructions that produce constant values known at compile-time. +// Replace them with register definitions that load these constants directly. 
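A stand-alone version of the constant-recovery loop that ConstGeneration::isConst performs below on BitTracker cells, with a simplified bit type: fold the tracked bits MSB-first, and abort on the first unknown bit.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    enum class Bit { Zero, One, Unknown };

    bool foldConstant(const std::vector<Bit> &Cell, int64_t &C) {
      int64_t T = 0;
      for (std::size_t i = Cell.size(); i > 0; --i) { // Cell.back() is the MSB
        T <<= 1;
        if (Cell[i - 1] == Bit::One)
          T |= 1;
        else if (Cell[i - 1] != Bit::Zero)
          return false; // unknown bit => not a compile-time constant
      }
      C = T;
      return true;
    }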
+namespace { + class ConstGeneration : public Transformation { + public: + ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isTfrConst(const MachineInstr *MI) const; + bool isConst(unsigned R, int64_t &V) const; + unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + +bool ConstGeneration::isConst(unsigned R, int64_t &C) const { + if (!BT.has(R)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(R); + int64_t T = 0; + for (unsigned i = RC.width(); i > 0; --i) { + const BitTracker::BitValue &V = RC[i-1]; + T <<= 1; + if (V.is(1)) + T |= 1; + else if (!V.is(0)) + return false; + } + C = T; + return true; +} + + +bool ConstGeneration::isTfrConst(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_tfrsi: + case Hexagon::A2_tfrpi: + case Hexagon::TFR_PdTrue: + case Hexagon::TFR_PdFalse: + case Hexagon::CONST32_Int_Real: + case Hexagon::CONST64_Int_Real: + return true; + } + return false; +} + + +// Generate a transfer-immediate instruction that is appropriate for the +// register class and the actual value being transferred. +unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) { + unsigned Reg = MRI.createVirtualRegister(RC); + if (RC == &Hexagon::IntRegsRegClass) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg) + .addImm(int32_t(C)); + return Reg; + } + + if (RC == &Hexagon::DoubleRegsRegClass) { + if (isInt<8>(C)) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg) + .addImm(C); + return Reg; + } + + unsigned Lo = Lo_32(C), Hi = Hi_32(C); + if (isInt<8>(Lo) || isInt<8>(Hi)) { + unsigned Opc = isInt<8>(Lo) ? Hexagon::A2_combineii + : Hexagon::A4_combineii; + BuildMI(B, At, DL, HII.get(Opc), Reg) + .addImm(int32_t(Hi)) + .addImm(int32_t(Lo)); + return Reg; + } + + BuildMI(B, At, DL, HII.get(Hexagon::CONST64_Int_Real), Reg) + .addImm(C); + return Reg; + } + + if (RC == &Hexagon::PredRegsRegClass) { + unsigned Opc; + if (C == 0) + Opc = Hexagon::TFR_PdFalse; + else if ((C & 0xFF) == 0xFF) + Opc = Hexagon::TFR_PdTrue; + else + return 0; + BuildMI(B, At, DL, HII.get(Opc), Reg); + return Reg; + } + + return 0; +} + + +bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) { + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I) { + if (isTfrConst(I)) + continue; + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DR)) + continue; + int64_t C; + if (isConst(DR, C)) { + DebugLoc DL = I->getDebugLoc(); + auto At = I->isPHI() ? B.getFirstNonPHI() : I; + unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL); + if (ImmReg) { + HBS::replaceReg(DR, ImmReg, MRI); + BT.put(ImmReg, BT.lookup(DR)); + Changed = true; + } + } + } + return Changed; +} + + +// +// Copy generation +// +// Identify pairs of available registers which hold identical values. 
+// In such cases, only one of them needs to be calculated, the other one +// will be defined as a copy of the first. +// +// Copy propagation +// +// Eliminate register copies RD = RS, by replacing the uses of RD with +// with uses of RS. +namespace { + class CopyGeneration : public Transformation { + public: + CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; + + class CopyPropagation : public Transformation { + public: + CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(false), MRI(mri) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + static bool isCopyReg(unsigned Opc); + private: + bool propagateRegCopy(MachineInstr &MI); + + MachineRegisterInfo &MRI; + }; + +} + + +/// Check if there is a register in AVs that is identical to Inp. If so, +/// set Out to the found register. The output may be a pair Reg:Sub. +bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs) { + if (!BT.has(Inp.Reg)) + return false; + const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg); + unsigned B, W; + if (!HBS::getSubregMask(Inp, B, W, MRI)) + return false; + + for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) { + if (!BT.has(R) || !HBS::isTransparentCopy(R, Inp, MRI)) + continue; + const BitTracker::RegisterCell &RC = BT.lookup(R); + unsigned RW = RC.width(); + if (W == RW) { + if (MRI.getRegClass(Inp.Reg) != MRI.getRegClass(R)) + continue; + if (!HBS::isEqual(InpRC, B, RC, 0, W)) + continue; + Out.Reg = R; + Out.Sub = 0; + return true; + } + // Check if there is a super-register, whose part (with a subregister) + // is equal to the input. + // Only do double registers for now. + if (W*2 != RW) + continue; + if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass) + continue; + + if (HBS::isEqual(InpRC, B, RC, 0, W)) + Out.Sub = Hexagon::subreg_loreg; + else if (HBS::isEqual(InpRC, B, RC, W, W)) + Out.Sub = Hexagon::subreg_hireg; + else + continue; + Out.Reg = R; + return true; + } + return false; +} + + +bool CopyGeneration::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + RegisterSet AVB(AVs); + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; + ++I, AVB.insert(Defs)) { + NextI = std::next(I); + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + + unsigned Opc = I->getOpcode(); + if (CopyPropagation::isCopyReg(Opc)) + continue; + + for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) { + BitTracker::RegisterRef MR; + if (!findMatch(R, MR, AVB)) + continue; + DebugLoc DL = I->getDebugLoc(); + auto *FRC = HBS::getFinalVRegClass(MR, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + auto At = I->isPHI() ? 
B.getFirstNonPHI() : I; + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(MR.Reg, 0, MR.Sub); + BT.put(BitTracker::RegisterRef(NewR), BT.get(MR)); + } + } + + return Changed; +} + + +bool CopyPropagation::isCopyReg(unsigned Opc) { + switch (Opc) { + case TargetOpcode::COPY: + case TargetOpcode::REG_SEQUENCE: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: + return true; + default: + break; + } + return false; +} + + +bool CopyPropagation::propagateRegCopy(MachineInstr &MI) { + bool Changed = false; + unsigned Opc = MI.getOpcode(); + BitTracker::RegisterRef RD = MI.getOperand(0); + assert(MI.getOperand(0).getSubReg() == 0); + + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: { + BitTracker::RegisterRef RS = MI.getOperand(1); + if (!HBS::isTransparentCopy(RD, RS, MRI)) + break; + if (RS.Sub != 0) + Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI); + else + Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI); + break; + } + case TargetOpcode::REG_SEQUENCE: { + BitTracker::RegisterRef SL, SH; + if (HBS::parseRegSequence(MI, SL, SH)) { + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + SL.Reg, SL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + SH.Reg, SH.Sub, MRI); + } + break; + } + case Hexagon::A2_combinew: { + BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2); + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + RL.Reg, RL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + RH.Reg, RH.Sub, MRI); + break; + } + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: { + unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1; + unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::subreg_loreg + : Hexagon::subreg_hireg; + BitTracker::RegisterRef RS = MI.getOperand(SrcX); + Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI); + break; + } + } + return Changed; +} + + +bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) { + std::vector<MachineInstr*> Instrs; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) + Instrs.push_back(&*I); + + bool Changed = false; + for (auto I : Instrs) { + unsigned Opc = I->getOpcode(); + if (!CopyPropagation::isCopyReg(Opc)) + continue; + Changed |= propagateRegCopy(*I); + } + + return Changed; +} + + +// +// Bit simplification +// +// Recognize patterns that can be simplified and replace them with the +// simpler forms. +// This is by no means complete +namespace { + class BitSimplification : public Transformation { + public: + BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + struct RegHalf : public BitTracker::RegisterRef { + bool Low; // Low/High halfword. 
+ }; + + bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC, + unsigned B, RegHalf &RH); + + bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC, + BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt); + unsigned getCombineOpcode(bool HLow, bool LLow); + + bool genStoreUpperHalf(MachineInstr *MI); + bool genStoreImmediate(MachineInstr *MI); + bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the bits [B..B+16) in register cell RC form a valid halfword, +// i.e. [0..16), [16..32), etc. of some register. If so, return true and +// set the information about the found register in RH. +bool BitSimplification::matchHalf(unsigned SelfR, + const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) { + // XXX This could be searching in the set of available registers, in case + // the match is not exact. + + // Match 16-bit chunks, where the RC[B..B+15] references exactly one + // register and all the bits B..B+15 match between RC and the register. + // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... }, + // and RC = { [0]:0 [1-15]:v1[1-15]... }. + bool Low = false; + unsigned I = B; + while (I < B+16 && RC[I].num()) + I++; + if (I == B+16) + return false; + + unsigned Reg = RC[I].RefI.Reg; + unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B. + if (P < I-B) + return false; + unsigned Pos = P - (I-B); + + if (Reg == 0 || Reg == SelfR) // Don't match "self". + return false; + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return false; + if (!BT.has(Reg)) + return false; + + const BitTracker::RegisterCell &SC = BT.lookup(Reg); + if (Pos+16 > SC.width()) + return false; + + for (unsigned i = 0; i < 16; ++i) { + const BitTracker::BitValue &RV = RC[i+B]; + if (RV.Type == BitTracker::BitValue::Ref) { + if (RV.RefI.Reg != Reg) + return false; + if (RV.RefI.Pos != i+Pos) + return false; + continue; + } + if (RC[i+B] != SC[i+Pos]) + return false; + } + + unsigned Sub = 0; + switch (Pos) { + case 0: + Sub = Hexagon::subreg_loreg; + Low = true; + break; + case 16: + Sub = Hexagon::subreg_loreg; + Low = false; + break; + case 32: + Sub = Hexagon::subreg_hireg; + Low = true; + break; + case 48: + Sub = Hexagon::subreg_hireg; + Low = false; + break; + default: + return false; + } + + RH.Reg = Reg; + RH.Sub = Sub; + RH.Low = Low; + // If the subregister is not valid with the register, set it to 0. + if (!HBS::getFinalVRegClass(RH, MRI)) + RH.Sub = 0; + + return true; +} + + +// Check if RC matches the pattern of a S2_packhl. If so, return true and +// set the inputs Rs and Rt. 
+bool BitSimplification::matchPackhl(unsigned SelfR, + const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs, + BitTracker::RegisterRef &Rt) { + RegHalf L1, H1, L2, H2; + + if (!matchHalf(SelfR, RC, 0, L2) || !matchHalf(SelfR, RC, 16, L1)) + return false; + if (!matchHalf(SelfR, RC, 32, H2) || !matchHalf(SelfR, RC, 48, H1)) + return false; + + // Rs = H1.L1, Rt = H2.L2 + if (H1.Reg != L1.Reg || H1.Sub != L1.Sub || H1.Low || !L1.Low) + return false; + if (H2.Reg != L2.Reg || H2.Sub != L2.Sub || H2.Low || !L2.Low) + return false; + + Rs = H1; + Rt = H2; + return true; +} + + +unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) { + return HLow ? LLow ? Hexagon::A2_combine_ll + : Hexagon::A2_combine_lh + : LLow ? Hexagon::A2_combine_hl + : Hexagon::A2_combine_hh; +} + + +// If MI stores the upper halfword of a register (potentially obtained via +// shifts or extracts), replace it with a storerf instruction. This could +// cause the "extraction" code to become dead. +bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_storerh_io) + return false; + + MachineOperand &ValOp = MI->getOperand(2); + BitTracker::RegisterRef RS = ValOp; + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + RegHalf H; + if (!matchHalf(0, RC, 0, H)) + return false; + if (H.Low) + return false; + MI->setDesc(HII.get(Hexagon::S2_storerf_io)); + ValOp.setReg(H.Reg); + ValOp.setSubReg(H.Sub); + return true; +} + + +// If MI stores a value known at compile-time, and the value is within a range +// that avoids using constant-extenders, replace it with a store-immediate. +bool BitSimplification::genStoreImmediate(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + unsigned Align = 0; + switch (Opc) { + case Hexagon::S2_storeri_io: + Align++; + case Hexagon::S2_storerh_io: + Align++; + case Hexagon::S2_storerb_io: + break; + default: + return false; + } + + // Avoid stores to frame-indices (due to an unknown offset). + if (!MI->getOperand(0).isReg()) + return false; + MachineOperand &OffOp = MI->getOperand(1); + if (!OffOp.isImm()) + return false; + + int64_t Off = OffOp.getImm(); + // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x). + if (!isUIntN(6+Align, Off) || (Off & ((1<<Align)-1))) + return false; + // Source register: + BitTracker::RegisterRef RS = MI->getOperand(2); + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + uint64_t U; + if (!HBS::getConst(RC, 0, RC.width(), U)) + return false; + + // Only consider 8-bit values to avoid constant-extenders. + int V; + switch (Opc) { + case Hexagon::S2_storerb_io: + V = int8_t(U); + break; + case Hexagon::S2_storerh_io: + V = int16_t(U); + break; + case Hexagon::S2_storeri_io: + V = int32_t(U); + break; + } + if (!isInt<8>(V)) + return false; + + MI->RemoveOperand(2); + switch (Opc) { + case Hexagon::S2_storerb_io: + MI->setDesc(HII.get(Hexagon::S4_storeirb_io)); + break; + case Hexagon::S2_storerh_io: + MI->setDesc(HII.get(Hexagon::S4_storeirh_io)); + break; + case Hexagon::S2_storeri_io: + MI->setDesc(HII.get(Hexagon::S4_storeiri_io)); + break; + } + MI->addOperand(MachineOperand::CreateImm(V)); + return true; +} + + +// If MI is equivalent o S2_packhl, generate the S2_packhl. MI could be the +// last instruction in a sequence that results in something equivalent to +// the pack-halfwords. The intent is to cause the entire sequence to become +// dead. 
+bool BitSimplification::genPackhl(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc == Hexagon::S2_packhl) + return false; + BitTracker::RegisterRef Rs, Rt; + if (!matchPackhl(RD.Reg, RC, Rs, Rt)) + return false; + + MachineBasicBlock &B = *MI->getParent(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); + DebugLoc DL = MI->getDebugLoc(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR) + .addReg(Rs.Reg, 0, Rs.Sub) + .addReg(Rt.Reg, 0, Rt.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI produces halfword of the input in the low half of the output, +// replace it with zero-extend or extractu. +bool BitSimplification::genExtractHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L; + // Check for halfword in low 16 bits, zeros elsewhere. + if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16)) + return false; + + unsigned Opc = MI->getOpcode(); + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Prefer zxth, since zxth can go in any slot, while extractu only in + // slots 2 and 3. + unsigned NewR = 0; + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + if (L.Low && Opc != Hexagon::A2_zxth) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR) + .addReg(L.Reg, 0, L.Sub); + } else if (!L.Low && Opc != Hexagon::S2_extractu) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, MI, DL, HII.get(Hexagon::S2_extractu), NewR) + .addReg(L.Reg, 0, L.Sub) + .addImm(16) + .addImm(16); + } + if (NewR == 0) + return false; + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI is equivalent to a combine(.L/.H, .L/.H) replace with with the +// combine. +bool BitSimplification::genCombineHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L, H; + // Check for combine h/l + if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H)) + return false; + // Do nothing if this is just a reg copy. + if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low) + return false; + + unsigned Opc = MI->getOpcode(); + unsigned COpc = getCombineOpcode(H.Low, L.Low); + if (COpc == Opc) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(COpc), NewR) + .addReg(H.Reg, 0, H.Sub) + .addReg(L.Reg, 0, L.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI resets high bits of a register and keeps the lower ones, replace it +// with zero-extend byte/half, and-immediate, or extractu, as appropriate. 
+bool BitSimplification::genExtractLow(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + case Hexagon::S2_extractu: + return false; + } + if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) { + int32_t Imm = MI->getOperand(2).getImm(); + if (isInt<10>(Imm)) + return false; + } + + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + return false; + unsigned W = RC.width(); + while (W > 0 && RC[W-1].is(0)) + W--; + if (W == 0 || W == RC.width()) + return false; + unsigned NewOpc = (W == 8) ? Hexagon::A2_zxtb + : (W == 16) ? Hexagon::A2_zxth + : (W < 10) ? Hexagon::A2_andir + : Hexagon::S2_extractu; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W)) + continue; + + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) + .addReg(RS.Reg, 0, RS.Sub); + if (NewOpc == Hexagon::A2_andir) + MIB.addImm((1 << W) - 1); + else if (NewOpc == Hexagon::S2_extractu) + MIB.addImm(W).addImm(0); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; + } + return false; +} + + +// Check for tstbit simplification opportunity, where the bit being checked +// can be tracked back to another register. For example: +// vreg2 = S2_lsr_i_r vreg1, 5 +// vreg3 = S2_tstbit_i vreg2, 0 +// => +// vreg3 = S2_tstbit_i vreg1, 5 +bool BitSimplification::simplifyTstbit(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_tstbit_i) + return false; + + unsigned BN = MI->getOperand(2).getImm(); + BitTracker::RegisterRef RS = MI->getOperand(1); + unsigned F, W; + DebugLoc DL = MI->getDebugLoc(); + if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI)) + return false; + MachineBasicBlock &B = *MI->getParent(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + const BitTracker::BitValue &V = SC[F+BN]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) { + const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg); + // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is + // a double register, need to use a subregister and adjust bit + // number. 
+ unsigned P = UINT_MAX; + BitTracker::RegisterRef RR(V.RefI.Reg, 0); + if (TC == &Hexagon::DoubleRegsRegClass) { + P = V.RefI.Pos; + RR.Sub = Hexagon::subreg_loreg; + if (P >= 32) { + P -= 32; + RR.Sub = Hexagon::subreg_hireg; + } + } else if (TC == &Hexagon::IntRegsRegClass) { + P = V.RefI.Pos; + } + if (P != UINT_MAX) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR) + .addReg(RR.Reg, 0, RR.Sub) + .addImm(P); + HBS::replaceReg(RD.Reg, NewR, MRI); + BT.put(NewR, RC); + return true; + } + } else if (V.is(0) || V.is(1)) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + unsigned NewOpc = V.is(0) ? Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue; + BuildMI(B, At, DL, HII.get(NewOpc), NewR); + HBS::replaceReg(RD.Reg, NewR, MRI); + return true; + } + + return false; +} + + +bool BitSimplification::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + bool Changed = false; + RegisterSet AVB = AVs; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) { + MachineInstr *MI = &*I; + Defs.clear(); + HBS::getInstrDefs(*MI, Defs); + + unsigned Opc = MI->getOpcode(); + if (Opc == TargetOpcode::COPY || Opc == TargetOpcode::REG_SEQUENCE) + continue; + + if (MI->mayStore()) { + bool T = genStoreUpperHalf(MI); + T = T || genStoreImmediate(MI); + Changed |= T; + continue; + } + + if (Defs.count() != 1) + continue; + const MachineOperand &Op0 = MI->getOperand(0); + if (!Op0.isReg() || !Op0.isDef()) + continue; + BitTracker::RegisterRef RD = Op0; + if (!BT.has(RD.Reg)) + continue; + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg); + + if (FRC->getID() == Hexagon::DoubleRegsRegClassID) { + bool T = genPackhl(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::IntRegsRegClassID) { + bool T = genExtractHalf(MI, RD, RC); + T = T || genCombineHalf(MI, RD, RC); + T = T || genExtractLow(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::PredRegsRegClassID) { + bool T = simplifyTstbit(MI, RD, RC); + Changed |= T; + continue; + } + } + return Changed; +} + + +bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HRI = *HST.getRegisterInfo(); + auto &HII = *HST.getInstrInfo(); + + MDT = &getAnalysis<MachineDominatorTree>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + bool Changed; + + Changed = DeadCodeElimination(MF, *MDT).run(); + + const HexagonEvaluator HE(HRI, MRI, HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + + MachineBasicBlock &Entry = MF.front(); + + RegisterSet AIG; // Available registers for IG. + ConstGeneration ImmG(BT, HII, MRI); + Changed |= visitBlock(Entry, ImmG, AIG); + + RegisterSet ARE; // Available registers for RIE. + RedundantInstrElimination RIE(BT, HII, MRI); + Changed |= visitBlock(Entry, RIE, ARE); + + RegisterSet ACG; // Available registers for CG. + CopyGeneration CopyG(BT, HII, MRI); + Changed |= visitBlock(Entry, CopyG, ACG); + + RegisterSet ACP; // Available registers for CP. + CopyPropagation CopyP(HRI, MRI); + Changed |= visitBlock(Entry, CopyP, ACP); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + BT.run(); + RegisterSet ABS; // Available registers for BS. 
+ BitSimplification BitS(BT, HII, MRI); + Changed |= visitBlock(Entry, BitS, ABS); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + if (Changed) { + for (auto &B : MF) + for (auto &I : B) + I.clearKillInfo(); + DeadCodeElimination(MF, *MDT).run(); + } + return Changed; +} + + +// Recognize loops where the code at the end of the loop matches the code +// before the entry of the loop, and the matching code is such that is can +// be simplified. This pass relies on the bit simplification above and only +// prepares code in a way that can be handled by the bit simplifcation. +// +// This is the motivating testcase (and explanation): +// +// { +// loop0(.LBB0_2, r1) // %for.body.preheader +// r5:4 = memd(r0++#8) +// } +// { +// r3 = lsr(r4, #16) +// r7:6 = combine(r5, r5) +// } +// { +// r3 = insert(r5, #16, #16) +// r7:6 = vlsrw(r7:6, #16) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r6 # R6 is really R5.H +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r3 # R3 is really R4.H +// } +// { +// r5:4 = memd(r0++#8) +// } +// { # "Shuffling" code that sets up R3 and R6 +// r3 = lsr(r4, #16) # so that their halves can be stored in the +// r7:6 = combine(r5, r5) # next iteration. This could be folded into +// } # the stores if the code was at the beginning +// { # of the loop iteration. Since the same code +// r3 = insert(r5, #16, #16) # precedes the loop, it can actually be moved +// r7:6 = vlsrw(r7:6, #16) # there. +// }:endloop0 +// +// +// The outcome: +// +// { +// loop0(.LBB0_2, r1) +// r5:4 = memd(r0++#8) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r5.h +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r4.h +// } +// { +// r5:4 = memd(r0++#8) +// }:endloop0 + +namespace llvm { + FunctionPass *createHexagonLoopRescheduling(); + void initializeHexagonLoopReschedulingPass(PassRegistry&); +} + +namespace { + class HexagonLoopRescheduling : public MachineFunctionPass { + public: + static char ID; + HexagonLoopRescheduling() : MachineFunctionPass(ID), + HII(0), HRI(0), MRI(0), BTP(0) { + initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + MachineRegisterInfo *MRI; + BitTracker *BTP; + + struct LoopCand { + LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb, + MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {} + MachineBasicBlock *LB, *PB, *EB; + }; + typedef std::vector<MachineInstr*> InstrList; + struct InstrGroup { + BitTracker::RegisterRef Inp, Out; + InstrList Ins; + }; + struct PhiInfo { + PhiInfo(MachineInstr &P, MachineBasicBlock &B); + unsigned DefR; + BitTracker::RegisterRef LR, PR; + MachineBasicBlock *LB, *PB; + }; + + static unsigned getDefReg(const MachineInstr *MI); + bool isConst(unsigned Reg) const; + bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const; + bool isStoreInput(const MachineInstr *MI, unsigned DefR) const; + bool isShuffleOf(unsigned OutR, unsigned InpR) const; + bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2, + unsigned &InpR2) const; + void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB, + MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR); + bool processLoop(LoopCand &C); + }; +} + +char HexagonLoopRescheduling::ID = 0; + +INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched", + "Hexagon Loop Rescheduling", false, false) + 
+ +HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P, + MachineBasicBlock &B) { + DefR = HexagonLoopRescheduling::getDefReg(&P); + LB = &B; + PB = nullptr; + for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) { + const MachineOperand &OpB = P.getOperand(i+1); + if (OpB.getMBB() == &B) { + LR = P.getOperand(i); + continue; + } + PB = OpB.getMBB(); + PR = P.getOperand(i); + } +} + + +unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) { + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + if (Defs.count() != 1) + return 0; + return Defs.find_first(); +} + + +bool HexagonLoopRescheduling::isConst(unsigned Reg) const { + if (!BTP->has(Reg)) + return false; + const BitTracker::RegisterCell &RC = BTP->lookup(Reg); + for (unsigned i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (!V.is(0) && !V.is(1)) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI, + unsigned DefR) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::S2_lsr_i_r: + case Hexagon::S2_asr_i_r: + case Hexagon::S2_asl_i_r: + case Hexagon::S2_lsr_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_insert: + case Hexagon::A2_or: + case Hexagon::A2_orp: + case Hexagon::A2_and: + case Hexagon::A2_andp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineri: + case Hexagon::A4_combineir: + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_combine_ll: + case Hexagon::A2_combine_lh: + case Hexagon::A2_combine_hl: + case Hexagon::A2_combine_hh: + return true; + } + return false; +} + + +bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI, + unsigned InpR) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg()) + continue; + if (Op.getReg() == InpR) + return i == n-1; + } + return false; +} + + +bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const { + if (!BTP->has(OutR) || !BTP->has(InpR)) + return false; + const BitTracker::RegisterCell &OutC = BTP->lookup(OutR); + for (unsigned i = 0, w = OutC.width(); i < w; ++i) { + const BitTracker::BitValue &V = OutC[i]; + if (V.Type != BitTracker::BitValue::Ref) + continue; + if (V.RefI.Reg != InpR) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1, + unsigned OutR2, unsigned &InpR2) const { + if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2)) + return false; + const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1); + const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2); + unsigned W = OutC1.width(); + unsigned MatchR = 0; + if (W != OutC2.width()) + return false; + for (unsigned i = 0; i < W; ++i) { + const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i]; + if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One) + return false; + if (V1.Type != BitTracker::BitValue::Ref) + continue; + if (V1.RefI.Pos != V2.RefI.Pos) + return false; + if (V1.RefI.Reg != InpR1) + return false; + if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2) + return false; + if (!MatchR) + MatchR = V2.RefI.Reg; + else if (V2.RefI.Reg != MatchR) + return false; + } + InpR2 = MatchR; + return true; +} + + +void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, + MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR, + unsigned NewPredR) { + 
DenseMap<unsigned,unsigned> RegMap; + + const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR); + unsigned PhiR = MRI->createVirtualRegister(PhiRC); + BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) + .addReg(NewPredR) + .addMBB(&PB) + .addReg(G.Inp.Reg) + .addMBB(&LB); + RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); + + for (unsigned i = G.Ins.size(); i > 0; --i) { + const MachineInstr *SI = G.Ins[i-1]; + unsigned DR = getDefReg(SI); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + unsigned NewDR = MRI->createVirtualRegister(RC); + DebugLoc DL = SI->getDebugLoc(); + + auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); + for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) { + const MachineOperand &Op = SI->getOperand(j); + if (!Op.isReg()) { + MIB.addOperand(Op); + continue; + } + if (!Op.isUse()) + continue; + unsigned UseR = RegMap[Op.getReg()]; + MIB.addReg(UseR, 0, Op.getSubReg()); + } + RegMap.insert(std::make_pair(DR, NewDR)); + } + + HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI); +} + + +bool HexagonLoopRescheduling::processLoop(LoopCand &C) { + DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n"); + std::vector<PhiInfo> Phis; + for (auto &I : *C.LB) { + if (!I.isPHI()) + break; + unsigned PR = getDefReg(&I); + if (isConst(PR)) + continue; + bool BadUse = false, GoodUse = false; + for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() != C.LB) { + BadUse = true; + break; + } + if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR)) + GoodUse = true; + } + if (BadUse || !GoodUse) + continue; + + Phis.push_back(PhiInfo(I, *C.LB)); + } + + DEBUG({ + dbgs() << "Phis: {"; + for (auto &I : Phis) { + dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi(" + << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber() + << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b" + << I.LB->getNumber() << ')'; + } + dbgs() << " }\n"; + }); + + if (Phis.empty()) + return false; + + bool Changed = false; + InstrList ShufIns; + + // Go backwards in the block: for each bit shuffling instruction, check + // if that instruction could potentially be moved to the front of the loop: + // the output of the loop cannot be used in a non-shuffling instruction + // in this loop. + for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) { + if (I->isTerminator()) + continue; + if (I->isPHI()) + break; + + RegisterSet Defs; + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DefR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + continue; + if (!isBitShuffle(&*I, DefR)) + continue; + + bool BadUse = false; + for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() == C.LB) { + if (UseI->isPHI()) { + // If the use is in a phi node in this loop, then it should be + // the value corresponding to the back edge. + unsigned Idx = UI.getOperandNo(); + if (UseI->getOperand(Idx+1).getMBB() != C.LB) + BadUse = true; + } else { + auto F = std::find(ShufIns.begin(), ShufIns.end(), UseI); + if (F == ShufIns.end()) + BadUse = true; + } + } else { + // There is a use outside of the loop, but there is no epilog block + // suitable for a copy-out. 
+ if (C.EB == nullptr) + BadUse = true; + } + if (BadUse) + break; + } + + if (BadUse) + continue; + ShufIns.push_back(&*I); + } + + // Partition the list of shuffling instructions into instruction groups, + // where each group has to be moved as a whole (i.e. a group is a chain of + // dependent instructions). A group produces a single live output register, + // which is meant to be the input of the loop phi node (although this is + // not checked here yet). It also uses a single register as its input, + // which is some value produced in the loop body. After moving the group + // to the beginning of the loop, that input register would need to be + // the loop-carried register (through a phi node) instead of the (currently + // loop-carried) output register. + typedef std::vector<InstrGroup> InstrGroupList; + InstrGroupList Groups; + + for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) { + MachineInstr *SI = ShufIns[i]; + if (SI == nullptr) + continue; + + InstrGroup G; + G.Ins.push_back(SI); + G.Out.Reg = getDefReg(SI); + RegisterSet Inputs; + HBS::getInstrUses(*SI, Inputs); + + for (unsigned j = i+1; j < n; ++j) { + MachineInstr *MI = ShufIns[j]; + if (MI == nullptr) + continue; + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + // If this instruction does not define any pending inputs, skip it. + if (!Defs.intersects(Inputs)) + continue; + // Otherwise, add it to the current group and remove the inputs that + // are defined by MI. + G.Ins.push_back(MI); + Inputs.remove(Defs); + // Then add all registers used by MI. + HBS::getInstrUses(*MI, Inputs); + ShufIns[j] = nullptr; + } + + // Only add a group if it requires at most one register. + if (Inputs.count() > 1) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + if (std::find_if(Phis.begin(), Phis.end(), LoopInpEq) == Phis.end()) + continue; + + G.Inp.Reg = Inputs.find_first(); + Groups.push_back(G); + } + + DEBUG({ + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + dbgs() << "Group[" << i << "] inp: " + << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub) + << " out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n"; + for (unsigned j = 0, m = G.Ins.size(); j < m; ++j) + dbgs() << " " << *G.Ins[j]; + } + }); + + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + auto F = std::find_if(Phis.begin(), Phis.end(), LoopInpEq); + if (F == Phis.end()) + continue; + unsigned PredR = 0; + if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PredR)) { + const MachineInstr *DefPredR = MRI->getVRegDef(F->PR.Reg); + unsigned Opc = DefPredR->getOpcode(); + if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi) + continue; + if (!DefPredR->getOperand(1).isImm()) + continue; + if (DefPredR->getOperand(1).getImm() != 0) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg); + if (RC != MRI->getRegClass(F->PR.Reg)) { + PredR = MRI->createVirtualRegister(RC); + unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi + : Hexagon::A2_tfrpi; + auto T = C.PB->getFirstTerminator(); + DebugLoc DL = (T != C.PB->end()) ? 
T->getDebugLoc() : DebugLoc(); + BuildMI(*C.PB, T, DL, HII->get(TfrI), PredR) + .addImm(0); + } else { + PredR = F->PR.Reg; + } + } + assert(MRI->getRegClass(PredR) == MRI->getRegClass(G.Inp.Reg)); + moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PredR); + Changed = true; + } + + return Changed; +} + + +bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + HII = HST.getInstrInfo(); + HRI = HST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + BTP = &BT; + + std::vector<LoopCand> Cand; + + for (auto &B : MF) { + if (B.pred_size() != 2 || B.succ_size() != 2) + continue; + MachineBasicBlock *PB = nullptr; + bool IsLoop = false; + for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) { + if (*PI != &B) + PB = *PI; + else + IsLoop = true; + } + if (!IsLoop) + continue; + + MachineBasicBlock *EB = nullptr; + for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) { + if (*SI == &B) + continue; + // Set EP to the epilog block, if it has only 1 predecessor (i.e. the + // edge from B to EP is non-critical. + if ((*SI)->pred_size() == 1) + EB = *SI; + break; + } + + Cand.push_back(LoopCand(&B, PB, EB)); + } + + bool Changed = false; + for (auto &C : Cand) + Changed |= processLoop(C); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonLoopRescheduling() { + return new HexagonLoopRescheduling(); +} + +FunctionPass *llvm::createHexagonBitSimplify() { + return new HexagonBitSimplify(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 021e58a..d5848dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -84,6 +84,8 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); switch (ID) { case DoubleRegsRegClassID: + case VecDblRegsRegClassID: + case VecDblRegs128BRegClassID: return (Sub == subreg_loreg) ? BT::BitMask(0, RW-1) : BT::BitMask(RW, 2*RW-1); default: @@ -95,30 +97,29 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { llvm_unreachable("Unexpected register/subregister"); } - namespace { - struct RegisterRefs : public std::vector<BT::RegisterRef> { - typedef std::vector<BT::RegisterRef> Base; - RegisterRefs(const MachineInstr *MI); - const BT::RegisterRef &operator[](unsigned n) const { - // The main purpose of this operator is to assert with bad argument. - assert(n < size()); - return Base::operator[](n); - } - }; +class RegisterRefs { + std::vector<BT::RegisterRef> Vector; - RegisterRefs::RegisterRefs(const MachineInstr *MI) - : Base(MI->getNumOperands()) { - for (unsigned i = 0, n = size(); i < n; ++i) { +public: + RegisterRefs(const MachineInstr *MI) : Vector(MI->getNumOperands()) { + for (unsigned i = 0, n = Vector.size(); i < n; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) - at(i) = BT::RegisterRef(MO); + Vector[i] = BT::RegisterRef(MO); // For indices that don't correspond to registers, the entry will // remain constructed via the default constructor. 
} } -} + size_t size() const { return Vector.size(); } + const BT::RegisterRef &operator[](unsigned n) const { + // The main purpose of this operator is to assert with bad argument. + assert(n < Vector.size()); + return Vector[n]; + } +}; +} bool HexagonEvaluator::evaluate(const MachineInstr *MI, const CellMapType &Inputs, CellMapType &Outputs) const { @@ -189,7 +190,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI, return true; }; // Get the cell corresponding to the N-th operand. - auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W) + auto cop = [this,&Reg,&MI,&Inputs] (unsigned N, uint16_t W) -> BT::RegisterCell { const MachineOperand &Op = MI->getOperand(N); if (Op.isImm()) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 3753b745..efafdd0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -102,7 +102,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). @@ -210,16 +208,15 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // The live-in to LayoutSucc is now all values live-in to // JumpAroundTarget. // - std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(), - LayoutSucc->livein_end()); - std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(), - JumpAroundTarget->livein_end()); - for (unsigned i = 0; i < OrigLiveIn.size(); ++i) { - LayoutSucc->removeLiveIn(OrigLiveIn[i]); - } - for (unsigned i = 0; i < NewLiveIn.size(); ++i) { - LayoutSucc->addLiveIn(NewLiveIn[i]); - } + std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn( + LayoutSucc->livein_begin(), LayoutSucc->livein_end()); + std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn( + JumpAroundTarget->livein_begin(), + JumpAroundTarget->livein_end()); + for (const auto &OrigLI : OrigLiveIn) + LayoutSucc->removeLiveIn(OrigLI.PhysReg); + for (const auto &NewLI : NewLiveIn) + LayoutSucc->addLiveIn(NewLI); } } } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 9f5fac1..931db66 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -59,30 +59,23 @@ namespace { // Numbering map for gep nodes. Used to keep track of ordering for // gep nodes. 
- struct NodeNumbering : public std::map<const GepNode*,unsigned> { - }; - - struct NodeOrdering : public NodeNumbering { + struct NodeOrdering { NodeOrdering() : LastNum(0) {} -#ifdef _MSC_VER - void special_insert_for_special_msvc(const GepNode *N) -#else - using NodeNumbering::insert; - void insert(const GepNode* N) -#endif - { - insert(std::make_pair(N, ++LastNum)); - } - bool operator() (const GepNode* N1, const GepNode *N2) const { - const_iterator F1 = find(N1), F2 = find(N2); - assert(F1 != end() && F2 != end()); + + void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); } + void clear() { Map.clear(); } + + bool operator()(const GepNode *N1, const GepNode *N2) const { + auto F1 = Map.find(N1), F2 = Map.find(N2); + assert(F1 != Map.end() && F2 != Map.end()); return F1->second < F2->second; } + private: + std::map<const GepNode *, unsigned> Map; unsigned LastNum; }; - class HexagonCommonGEP : public FunctionPass { public: static char ID; @@ -360,11 +353,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Us.insert(&UI.getUse()); } Nodes.push_back(N); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(N); -#else NodeOrder.insert(N); -#endif // Skip the first index operand, since we only handle 0. This dereferences // the pointer operand. @@ -379,11 +368,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Nx->PTy = PtrTy; Nx->Idx = Op; Nodes.push_back(Nx); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(Nx); -#else NodeOrder.insert(Nx); -#endif PN = Nx; PtrTy = next_type(PtrTy, Op); @@ -404,7 +389,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, void HexagonCommonGEP::collect() { // Establish depth-first traversal order of the dominator tree. ValueVect BO; - getBlockTraversalOrder(Fn->begin(), BO); + getBlockTraversalOrder(&Fn->front(), BO); // The creation of gep nodes requires DT-traversal. When processing a GEP // instruction that uses another GEP instruction as the base pointer, the @@ -737,7 +722,7 @@ namespace { Instruction *In = cast<Instruction>(V); if (In->getParent() != B) continue; - BasicBlock::iterator It = In; + BasicBlock::iterator It = In->getIterator(); if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd)) FirstUse = It; } @@ -1135,7 +1120,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At, ArrayRef<Value*> A(IdxList, IdxC); Type *InpTy = Input->getType(); Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType(); - NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At); + NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At); DEBUG(dbgs() << "new GEP: " << *NewInst << '\n'); Input = NewInst; } while (nax <= Num); @@ -1213,7 +1198,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { Last = Child; } while (true); - BasicBlock::iterator InsertAt = LastB->getTerminator(); + BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator(); if (LastUsed || LastCN > 0) { ValueVect Urs; getAllUsersForNode(Root, Urs, NCM); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp new file mode 100644 index 0000000..ee0c318 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -0,0 +1,1063 @@ +//===--- HexagonEarlyIfConv.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements a Hexagon-specific if-conversion pass that runs on the +// SSA form. +// In SSA it is not straightforward to represent instructions that condi- +// tionally define registers, since a conditionally-defined register may +// only be used under the same condition on which the definition was based. +// To avoid complications of this nature, this patch will only generate +// predicated stores, and speculate other instructions from the "if-conver- +// ted" block. +// The code will recognize CFG patterns where a block with a conditional +// branch "splits" into a "true block" and a "false block". Either of these +// could be omitted (in case of a triangle, for example). +// If after conversion of the side block(s) the CFG allows it, the resul- +// ting blocks may be merged. If the "join" block contained PHI nodes, they +// will be replaced with MUX (or MUX-like) instructions to maintain the +// semantics of the PHI. +// +// Example: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead> +// J2_jump <BB#4>, %PC<imp-def,dead> +// Successors according to CFG: BB#4(62) BB#5(62) +// +// BB#4: derived from LLVM BB %if.then +// Predecessors according to CFG: BB#3 +// %vreg11<def> = A2_addp %vreg6, %vreg10 +// S2_storerd_io %vreg32, 16, %vreg11 +// Successors according to CFG: BB#5 +// +// BB#5: derived from LLVM BB %if.end +// Predecessors according to CFG: BB#3 BB#4 +// %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4> +// %vreg13<def> = A2_addp %vreg7, %vreg12 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6(4) BB#3(124) +// +// would become: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// spec-> %vreg11<def> = A2_addp %vreg6, %vreg10 +// pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11 +// %vreg46<def> = MUX64_rr %vreg41, %vreg6, %vreg11 +// %vreg13<def> = A2_addp %vreg7, %vreg46 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6 BB#3 + +#define DEBUG_TYPE "hexagon-eif" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "HexagonTargetMachine.h" + +#include <functional> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonEarlyIfConversion(); + void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry); +} + +namespace { + cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden, + cl::init(false), cl::desc("Enable branch probability info")); + cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden, + cl::desc("Size limit in Hexagon early if-conversion")); + 
+ struct PrintMB { + PrintMB(const MachineBasicBlock *B) : MB(B) {} + const MachineBasicBlock *MB; + }; + raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) { + if (!P.MB) + return OS << "<none>"; + return OS << '#' << P.MB->getNumber(); + } + + struct FlowPattern { + FlowPattern() : SplitB(0), TrueB(0), FalseB(0), JoinB(0), PredR(0) {} + FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB, + MachineBasicBlock *FB, MachineBasicBlock *JB) + : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {} + + MachineBasicBlock *SplitB; + MachineBasicBlock *TrueB, *FalseB, *JoinB; + unsigned PredR; + }; + struct PrintFP { + PrintFP(const FlowPattern &P, const TargetRegisterInfo &T) + : FP(P), TRI(T) {} + const FlowPattern &FP; + const TargetRegisterInfo &TRI; + friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P); + }; + raw_ostream &operator<<(raw_ostream &OS, + const PrintFP &P) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) { + OS << "{ SplitB:" << PrintMB(P.FP.SplitB) + << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI) + << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:" + << PrintMB(P.FP.FalseB) + << ", JoinB:" << PrintMB(P.FP.JoinB) << " }"; + return OS; + } + + class HexagonEarlyIfConversion : public MachineFunctionPass { + public: + static char ID; + HexagonEarlyIfConversion() : MachineFunctionPass(ID), + TII(0), TRI(0), MFN(0), MRI(0), MDT(0), MLI(0) { + initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon early if conversion"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + typedef DenseSet<MachineBasicBlock*> BlockSetType; + + bool isPreheader(const MachineBasicBlock *B) const; + bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L, + FlowPattern &FP); + bool visitBlock(MachineBasicBlock *B, MachineLoop *L); + bool visitLoop(MachineLoop *L); + + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool isValidCandidate(const MachineBasicBlock *B) const; + bool usesUndefVReg(const MachineInstr *MI) const; + bool isValid(const FlowPattern &FP) const; + unsigned countPredicateDefs(const MachineBasicBlock *B) const; + unsigned computePhiCost(MachineBasicBlock *B) const; + bool isProfitable(const FlowPattern &FP) const; + bool isPredicableStore(const MachineInstr *MI) const; + bool isSafeToSpeculate(const MachineInstr *MI) const; + + unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const; + void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At, + MachineInstr *MI, unsigned PredR, bool IfTrue); + void predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue); + + void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP); + void convert(const FlowPattern &FP); + + void removeBlock(MachineBasicBlock *B); + void eliminatePhis(MachineBasicBlock *B); + void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB); + void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB); + void simplifyFlowGraph(const FlowPattern &FP); + + const 
TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineFunction *MFN; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + MachineLoopInfo *MLI; + BlockSetType Deleted; + const MachineBranchProbabilityInfo *MBPI; + }; + + char HexagonEarlyIfConversion::ID = 0; +} + +INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif", + "Hexagon early if conversion", false, false) + +bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const { + if (B->succ_size() != 1) + return false; + MachineBasicBlock *SB = *B->succ_begin(); + MachineLoop *L = MLI->getLoopFor(SB); + return L && SB == L->getHeader(); +} + + +bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, + MachineLoop *L, FlowPattern &FP) { + DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n"); + + // Interested only in conditional branches, no .new, no new-value, etc. + // Check the terminators directly, it's easier than handling all responses + // from AnalyzeBranch. + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock::const_iterator T1I = B->getFirstTerminator(); + if (T1I == B->end()) + return false; + unsigned Opc = T1I->getOpcode(); + if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf) + return false; + unsigned PredR = T1I->getOperand(0).getReg(); + + // Get the layout successor, or 0 if B does not have one. + MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B)); + MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : 0; + + MachineBasicBlock *T1B = T1I->getOperand(1).getMBB(); + MachineBasicBlock::const_iterator T2I = std::next(T1I); + // The second terminator should be an unconditional branch. + assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump); + MachineBasicBlock *T2B = (T2I == B->end()) ? NextB + : T2I->getOperand(0).getMBB(); + if (T1B == T2B) { + // XXX merge if T1B == NextB, or convert branch to unconditional. + // mark as diamond with both sides equal? + return false; + } + // Loop could be null for both. + if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L) + return false; + + // Record the true/false blocks in such a way that "true" means "if (PredR)", + // and "false" means "if (!PredR)". + if (Opc == Hexagon::J2_jumpt) + TB = T1B, FB = T2B; + else + TB = T2B, FB = T1B; + + if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB)) + return false; + + // Detect triangle first. In case of a triangle, one of the blocks TB/FB + // can fall through into the other, in other words, it will be executed + // in both cases. We only want to predicate the block that is executed + // conditionally. + unsigned TNP = TB->pred_size(), FNP = FB->pred_size(); + unsigned TNS = TB->succ_size(), FNS = FB->succ_size(); + + // A block is predicable if it has one predecessor (it must be B), and + // it has a single successor. In fact, the block has to end either with + // an unconditional branch (which can be predicated), or with a fall- + // through. + bool TOk = (TNP == 1) && (TNS == 1); + bool FOk = (FNP == 1) && (FNS == 1); + + // If neither is predicable, there is nothing interesting. + if (!TOk && !FOk) + return false; + + MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : 0; + MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : 0; + MachineBasicBlock *JB = 0; + + if (TOk) { + if (FOk) { + if (TSB == FSB) + JB = TSB; + // Diamond: "if (P) then TB; else FB;". 
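+      // Schematically, the two shapes being matched here and below
+      // (a sketch; JB is the join block):
+      //
+      //        B                 B
+      //       / \               / \
+      //     TB   FB           TB   |
+      //       \ /               \  |
+      //        JB                JB
+      //     (diamond)        (triangle)
+      //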
+ } else { + // TOk && !FOk + if (TSB == FB) { + JB = FB; + FB = 0; + } + } + } else { + // !TOk && FOk (at least one must be true by now). + if (FSB == TB) { + JB = TB; + TB = 0; + } + } + // Don't try to predicate loop preheaders. + if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) { + DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB) + << " is a loop preheader. Skipping.\n"); + return false; + } + + FP = FlowPattern(B, PredR, TB, FB, JB); + DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n"); + return true; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that +// contains EH_LABEL. +bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize +// that a block can never fall-through. +bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) + return true; + ++I; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B) + const { + if (!B) + return true; + if (B->isEHPad() || B->hasAddressTaken()) + return false; + if (B->succ_size() == 0) + return false; + + for (auto &MI : *B) { + if (MI.isDebugValue()) + continue; + if (MI.isConditionalBranch()) + return false; + unsigned Opc = MI.getOpcode(); + bool IsJMP = (Opc == Hexagon::J2_jump); + if (!isPredicableStore(&MI) && !IsJMP && !isSafeToSpeculate(&MI)) + return false; + // Look for predicate registers defined by this instruction. It's ok + // to speculate such an instruction, but the predicate register cannot + // be used outside of this block (or else it won't be possible to + // update the use of it after predication). PHI uses will be updated + // to use a result of a MUX, and a MUX cannot be created for predicate + // registers. + for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass) + continue; + for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U) + if (U->getParent()->isPHI()) + return false; + } + } + return true; +} + + +bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const { + for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isUse()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + const MachineInstr *DefI = MRI->getVRegDef(R); + // "Undefined" virtual registers are actually defined via IMPLICIT_DEF. + assert(DefI && "Expecting a reaching def in MRI"); + if (DefI->isImplicitDef()) + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { + if (hasEHLabel(FP.SplitB)) // KLUDGE: see function definition + return false; + if (FP.TrueB && !isValidCandidate(FP.TrueB)) + return false; + if (FP.FalseB && !isValidCandidate(FP.FalseB)) + return false; + // Check the PHIs in the join block. If any of them use a register + // that is defined as IMPLICIT_DEF, do not convert this. This can + // legitimately happen if one side of the split never executes, but + // the compiler is unable to prove it. 
That side may then seem to
+  // provide an "undef" value to the join block, even though it will never
+  // execute at run time. If we convert this case, the "undef" will
+  // be used in a MUX instruction, and that may look to other optimizations
+  // like an actual use of an undefined value. This could lead
+  // to trouble further down the optimization stream, cause assertions
+  // to fail, etc.
+  if (FP.JoinB) {
+    const MachineBasicBlock &B = *FP.JoinB;
+    for (auto &MI : B) {
+      if (!MI.isPHI())
+        break;
+      if (usesUndefVReg(&MI))
+        return false;
+      unsigned DefR = MI.getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+      if (RC == &Hexagon::PredRegsRegClass)
+        return false;
+    }
+  }
+  return true;
+}
+
+
+unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
+  assert(B->pred_size() <= 2);
+  if (B->pred_size() < 2)
+    return 0;
+
+  unsigned Cost = 0;
+  MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
+  for (I = B->begin(); I != E; ++I) {
+    const MachineOperand &RO1 = I->getOperand(1);
+    const MachineOperand &RO3 = I->getOperand(3);
+    assert(RO1.isReg() && RO3.isReg());
+    // Must have a MUX if the phi uses a subregister.
+    if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+      Cost++;
+      continue;
+    }
+    MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
+    MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+    if (!TII->isPredicable(Def1) || !TII->isPredicable(Def3))
+      Cost++;
+  }
+  return Cost;
+}
+
+
+unsigned HexagonEarlyIfConversion::countPredicateDefs(
+      const MachineBasicBlock *B) const {
+  unsigned PredDefs = 0;
+  for (auto &MI : *B) {
+    for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+      if (!MO->isReg() || !MO->isDef())
+        continue;
+      unsigned R = MO->getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R))
+        continue;
+      if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+        PredDefs++;
+    }
+  }
+  return PredDefs;
+}
+
+
+bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+  if (FP.TrueB && FP.FalseB) {
+
+    // Do not if-convert if the branch is one-sided.
+    if (MBPI) {
+      BranchProbability Prob(9, 10);
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
+        return false;
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
+        return false;
+    }
+
+    // If both sides are predicable, convert them if they join, and the
+    // join block has no other predecessors.
+    MachineBasicBlock *TSB = *FP.TrueB->succ_begin();
+    MachineBasicBlock *FSB = *FP.FalseB->succ_begin();
+    if (TSB != FSB)
+      return false;
+    if (TSB->pred_size() != 2)
+      return false;
+  }
+
+  // Calculate the total size of the predicated blocks.
+  // Assume instruction counts without branches to be the approximation of
+  // the code size. If the predicated blocks are smaller than a packet size,
+  // approximate the spare room in the packet that could be filled with the
+  // predicated/speculated instructions.
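+  // A worked example with illustrative numbers: a Hexagon packet holds up
+  // to HEXAGON_PACKET_SIZE = 4 instructions, so a 2-instruction TrueB and
+  // a 3-instruction FalseB give Spare = (4-2) + (4-3) = 3 and TotalIn = 5,
+  // which stays under the default SizeLimit of 6 plus Spare.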
+  unsigned TS = 0, FS = 0, Spare = 0;
+  if (FP.TrueB) {
+    TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
+    if (TS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-TS;
+  }
+  if (FP.FalseB) {
+    FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
+    if (FS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-FS;
+  }
+  unsigned TotalIn = TS+FS;
+  DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
+               << TotalIn << ", spare room: " << Spare << "\n");
+  if (TotalIn >= SizeLimit+Spare)
+    return false;
+
+  // Count the number of PHI nodes that will need to be updated (converted
+  // to MUX). Those can later be converted to predicated instructions, so
+  // they aren't always adding extra cost.
+  // KLUDGE: Also, count the number of predicate register definitions in
+  // each block. The scheduler may increase the pressure of these and cause
+  // expensive spills (e.g. bitmnp01).
+  unsigned TotalPh = 0;
+  unsigned PredDefs = countPredicateDefs(FP.SplitB);
+  if (FP.JoinB) {
+    TotalPh = computePhiCost(FP.JoinB);
+    PredDefs += countPredicateDefs(FP.JoinB);
+  } else {
+    if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.TrueB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+    if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.FalseB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+  }
+  DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+               << TotalPh << "\n");
+  if (TotalIn+TotalPh >= SizeLimit+Spare)
+    return false;
+
+  DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+  if (PredDefs > 4)
+    return false;
+
+  return true;
+}
+
+
+bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
+      MachineLoop *L) {
+  bool Changed = false;
+
+  // Visit all dominated blocks from the same loop first, then process B.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  // We will change CFG/DT during this traversal, so take precautions to
+  // avoid problems related to invalidated iterators. In fact, processing
+  // a child C of B cannot cause another child to be removed, but it can
+  // cause a new child to be added (which was a child of C before C itself
+  // was removed). Such a new child, however, would already have been
+  // processed as part of C's subtree, so there is no need to process it
+  // again. Simply keep a list of children of B, and traverse that list.
+  typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+  DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+  for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    if (!Deleted.count(SB))
+      Changed |= visitBlock(SB, L);
+  }
+  // When walking down the dominator tree, we want to traverse through
+  // blocks from nested (other) loops, because they can dominate blocks
+  // that are in L. Skip the non-L blocks only after the tree traversal.
+ if (MLI->getLoopFor(B) != L) + return Changed; + + FlowPattern FP; + if (!matchFlowPattern(B, L, FP)) + return Changed; + + if (!isValid(FP)) { + DEBUG(dbgs() << "Conversion is not valid\n"); + return Changed; + } + if (!isProfitable(FP)) { + DEBUG(dbgs() << "Conversion is not profitable\n"); + return Changed; + } + + convert(FP); + simplifyFlowGraph(FP); + return true; +} + + +bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) { + MachineBasicBlock *HB = L ? L->getHeader() : 0; + DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB) + : dbgs() << "Visiting function") << "\n"); + bool Changed = false; + if (L) { + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) + Changed |= visitLoop(*I); + } + + MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN); + Changed |= visitBlock(L ? HB : EntryB, L); + return Changed; +} + + +bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI) + const { + // Exclude post-increment stores. Those return a value, so we cannot + // predicate them. + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + switch (Opc) { + // Store byte: + case S2_storerb_io: case S4_storerb_rr: + case S2_storerbabs: case S4_storeirb_io: case S2_storerbgp: + // Store halfword: + case S2_storerh_io: case S4_storerh_rr: + case S2_storerhabs: case S4_storeirh_io: case S2_storerhgp: + // Store upper halfword: + case S2_storerf_io: case S4_storerf_rr: + case S2_storerfabs: case S2_storerfgp: + // Store word: + case S2_storeri_io: case S4_storeri_rr: + case S2_storeriabs: case S4_storeiri_io: case S2_storerigp: + // Store doubleword: + case S2_storerd_io: case S4_storerd_rr: + case S2_storerdabs: case S2_storerdgp: + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI) + const { + if (MI->mayLoad() || MI->mayStore()) + return false; + if (MI->isCall() || MI->isBarrier() || MI->isBranch()) + return false; + if (MI->hasUnmodeledSideEffects()) + return false; + + return true; +} + + +unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc, + bool IfTrue) const { + // Exclude post-increment stores. + using namespace Hexagon; + switch (Opc) { + case S2_storerb_io: + return IfTrue ? S2_pstorerbt_io : S2_pstorerbf_io; + case S4_storerb_rr: + return IfTrue ? S4_pstorerbt_rr : S4_pstorerbf_rr; + case S2_storerbabs: + case S2_storerbgp: + return IfTrue ? S4_pstorerbt_abs : S4_pstorerbf_abs; + case S4_storeirb_io: + return IfTrue ? S4_storeirbt_io : S4_storeirbf_io; + case S2_storerh_io: + return IfTrue ? S2_pstorerht_io : S2_pstorerhf_io; + case S4_storerh_rr: + return IfTrue ? S4_pstorerht_rr : S4_pstorerhf_rr; + case S2_storerhabs: + case S2_storerhgp: + return IfTrue ? S4_pstorerht_abs : S4_pstorerhf_abs; + case S2_storerf_io: + return IfTrue ? S2_pstorerft_io : S2_pstorerff_io; + case S4_storerf_rr: + return IfTrue ? S4_pstorerft_rr : S4_pstorerff_rr; + case S2_storerfabs: + case S2_storerfgp: + return IfTrue ? S4_pstorerft_abs : S4_pstorerff_abs; + case S4_storeirh_io: + return IfTrue ? S4_storeirht_io : S4_storeirhf_io; + case S2_storeri_io: + return IfTrue ? S2_pstorerit_io : S2_pstorerif_io; + case S4_storeri_rr: + return IfTrue ? S4_pstorerit_rr : S4_pstorerif_rr; + case S2_storeriabs: + case S2_storerigp: + return IfTrue ? S4_pstorerit_abs : S4_pstorerif_abs; + case S4_storeiri_io: + return IfTrue ? S4_storeirit_io : S4_storeirif_io; + case S2_storerd_io: + return IfTrue ? 
S2_pstorerdt_io : S2_pstorerdf_io; + case S4_storerd_rr: + return IfTrue ? S4_pstorerdt_rr : S4_pstorerdf_rr; + case S2_storerdabs: + case S2_storerdgp: + return IfTrue ? S4_pstorerdt_abs : S4_pstorerdf_abs; + } + llvm_unreachable("Unexpected opcode"); + return 0; +} + + +void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineInstr *MI, + unsigned PredR, bool IfTrue) { + DebugLoc DL; + if (At != ToB->end()) + DL = At->getDebugLoc(); + else if (!ToB->empty()) + DL = ToB->back().getDebugLoc(); + + unsigned Opc = MI->getOpcode(); + + if (isPredicableStore(MI)) { + unsigned COpc = getCondStoreOpcode(Opc, IfTrue); + assert(COpc); + MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, TII->get(COpc)) + .addReg(PredR); + for (MIOperands MO(MI); MO.isValid(); ++MO) + MIB.addOperand(*MO); + + // Set memory references. + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + MIB.setMemRefs(MMOBegin, MMOEnd); + + MI->eraseFromParent(); + return; + } + + if (Opc == Hexagon::J2_jump) { + MachineBasicBlock *TB = MI->getOperand(0).getMBB(); + const MCInstrDesc &D = TII->get(IfTrue ? Hexagon::J2_jumpt + : Hexagon::J2_jumpf); + BuildMI(*ToB, At, DL, D) + .addReg(PredR) + .addMBB(TB); + MI->eraseFromParent(); + return; + } + + // Print the offending instruction unconditionally as we are about to + // abort. + dbgs() << *MI; + llvm_unreachable("Unexpected instruction"); +} + + +// Predicate/speculate non-branch instructions from FromB into block ToB. +// Leave the branches alone, they will be handled later. Btw, at this point +// FromB should have at most one branch, and it should be unconditional. +void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue) { + DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n"); + MachineBasicBlock::iterator End = FromB->getFirstTerminator(); + MachineBasicBlock::iterator I, NextI; + + for (I = FromB->begin(); I != End; I = NextI) { + assert(!I->isPHI()); + NextI = std::next(I); + if (isSafeToSpeculate(&*I)) + ToB->splice(At, FromB, I); + else + predicateInstr(ToB, At, &*I, PredR, IfTrue); + } +} + + +void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, + const FlowPattern &FP) { + // Visit all PHI nodes in the WhereB block and generate MUX instructions + // in the split block. Update the PHI nodes with the values of the MUX. + auto NonPHI = WhereB->getFirstNonPHI(); + for (auto I = WhereB->begin(); I != NonPHI; ++I) { + MachineInstr *PN = &*I; + // Registers and subregisters corresponding to TrueB, FalseB and SplitB. + unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0; + for (int i = PN->getNumOperands()-2; i > 0; i -= 2) { + const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1); + if (BO.getMBB() == FP.SplitB) + SR = RO.getReg(), SSR = RO.getSubReg(); + else if (BO.getMBB() == FP.TrueB) + TR = RO.getReg(), TSR = RO.getSubReg(); + else if (BO.getMBB() == FP.FalseB) + FR = RO.getReg(), FSR = RO.getSubReg(); + else + continue; + PN->RemoveOperand(i+1); + PN->RemoveOperand(i); + } + if (TR == 0) + TR = SR, TSR = SSR; + else if (FR == 0) + FR = SR, FSR = SSR; + assert(TR && FR); + + using namespace Hexagon; + unsigned DR = PN->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + const MCInstrDesc &D = RC == &IntRegsRegClass ? 
TII->get(C2_mux)
+                                                  : TII->get(MUX64_rr);
+
+    MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
+    DebugLoc DL;
+    if (MuxAt != FP.SplitB->end())
+      DL = MuxAt->getDebugLoc();
+    unsigned MuxR = MRI->createVirtualRegister(RC);
+    BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
+      .addReg(FP.PredR)
+      .addReg(TR, 0, TSR)
+      .addReg(FR, 0, FSR);
+
+    PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+    PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
+  }
+}
+
+
+void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
+  MachineBasicBlock *TSB = 0, *FSB = 0;
+  MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator();
+  assert(OldTI != FP.SplitB->end());
+  DebugLoc DL = OldTI->getDebugLoc();
+
+  if (FP.TrueB) {
+    TSB = *FP.TrueB->succ_begin();
+    predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true);
+  }
+  if (FP.FalseB) {
+    FSB = *FP.FalseB->succ_begin();
+    MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator();
+    predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false);
+  }
+
+  // Regenerate the terminators in the split block and update the successors.
+  // First, remember any information that may be needed later and remove the
+  // existing terminators/successors from the split block.
+  MachineBasicBlock *SSB = 0;
+  FP.SplitB->erase(OldTI, FP.SplitB->end());
+  while (FP.SplitB->succ_size() > 0) {
+    MachineBasicBlock *T = *FP.SplitB->succ_begin();
+    // It's possible that the split block had a successor that is not a
+    // predicated block. This could only happen if there was only one block
+    // to be predicated. Example:
+    //   split_b:
+    //     if (p) jump true_b
+    //     jump unrelated2_b
+    //   unrelated1_b:
+    //     ...
+    //   unrelated2_b:  ; can have other predecessors, so it's not "false_b"
+    //     jump other_b
+    //   true_b:        ; only reachable from split_b, can be predicated
+    //     ...
+    //
+    // Find this successor (SSB) if it exists.
+    if (T != FP.TrueB && T != FP.FalseB) {
+      assert(!SSB);
+      SSB = T;
+    }
+    FP.SplitB->removeSuccessor(FP.SplitB->succ_begin());
+  }
+
+  // Insert new branches and update the successors of the split block. This
+  // may create unconditional branches to the layout successor, etc., but
+  // that will be cleaned up later. For now, make sure that correct code is
+  // generated.
+  if (FP.JoinB) {
+    assert(!SSB || SSB == FP.JoinB);
+    BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+      .addMBB(FP.JoinB);
+    FP.SplitB->addSuccessor(FP.JoinB);
+  } else {
+    bool HasBranch = false;
+    if (TSB) {
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jumpt))
+        .addReg(FP.PredR)
+        .addMBB(TSB);
+      FP.SplitB->addSuccessor(TSB);
+      HasBranch = true;
+    }
+    if (FSB) {
+      const MCInstrDesc &D = HasBranch ? TII->get(Hexagon::J2_jump)
+                                       : TII->get(Hexagon::J2_jumpf);
+      MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D);
+      if (!HasBranch)
+        MIB.addReg(FP.PredR);
+      MIB.addMBB(FSB);
+      FP.SplitB->addSuccessor(FSB);
+    }
+    if (SSB) {
+      // This cannot happen if both TSB and FSB are set. [TF]SB are the
+      // successor blocks of the TrueB and FalseB (or null if the TrueB
+      // or FalseB block is null). SSB is the potential successor block
+      // of the SplitB that is neither TrueB nor FalseB.
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+        .addMBB(SSB);
+      FP.SplitB->addSuccessor(SSB);
+    }
+  }
+
+  // What is left to do is to update the PHI nodes that could have entries
+  // referring to predicated blocks.
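+  // For instance, in the example from the file header, the join-block PHI
+  //   %vreg12 = PHI %vreg6, <BB#3>, %vreg11, <BB#4>
+  // is rewritten to take a single MUX result computed in the split block:
+  //   %vreg46 = MUX64_rr %vreg41, %vreg6, %vreg11
+  // (the then-trivial PHI disappears entirely once the blocks are merged).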
+  if (FP.JoinB) {
+    updatePhiNodes(FP.JoinB, FP);
+  } else {
+    if (TSB)
+      updatePhiNodes(TSB, FP);
+    if (FSB)
+      updatePhiNodes(FSB, FP);
+    // Nothing to update in SSB, since SSB's predecessors haven't changed.
+  }
+}
+
+
+void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+
+  // Transfer the immediate dominator information from B to its descendants.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  MachineDomTreeNode *IDN = N->getIDom();
+  if (IDN) {
+    MachineBasicBlock *IDB = IDN->getBlock();
+    typedef GraphTraits<MachineDomTreeNode*> GTN;
+    typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+    DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+    for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+      MachineBasicBlock *SB = (*I)->getBlock();
+      MDT->changeImmediateDominator(SB, IDB);
+    }
+  }
+
+  while (B->succ_size() > 0)
+    B->removeSuccessor(B->succ_begin());
+
+  for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
+    (*I)->removeSuccessor(B, true);
+
+  Deleted.insert(B);
+  MDT->eraseNode(B);
+  MFN->erase(B->getIterator());
+}
+
+
+void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+  MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
+  for (I = B->begin(); I != NonPHI; I = NextI) {
+    NextI = std::next(I);
+    MachineInstr *PN = &*I;
+    assert(PN->getNumOperands() == 3 && "Invalid phi node");
+    MachineOperand &UO = PN->getOperand(1);
+    unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
+    unsigned DefR = PN->getOperand(0).getReg();
+    unsigned NewR = UseR;
+    if (UseSR) {
+      // MRI.replaceVregUsesWith does not allow updating the subregister,
+      // so instead of doing the use-iteration here, create a copy into a
+      // "non-subregistered" register.
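+      // Schematically (register names illustrative):
+      //   %newR = COPY %useR:subreg
+      // after which all uses of %defR are redirected to %newR.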
+ DebugLoc DL = PN->getDebugLoc(); + const TargetRegisterClass *RC = MRI->getRegClass(DefR); + NewR = MRI->createVirtualRegister(RC); + NonPHI = BuildMI(*B, NonPHI, DL, TII->get(TargetOpcode::COPY), NewR) + .addReg(UseR, 0, UseSR); + } + MRI->replaceRegWith(DefR, NewR); + B->erase(I); + } +} + + +void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB, + MachineBasicBlock *NewB) { + for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) { + MachineBasicBlock *SB = *I; + MachineBasicBlock::iterator P, N = SB->getFirstNonPHI(); + for (P = SB->begin(); P != N; ++P) { + MachineInstr *PN = &*P; + for (MIOperands MO(PN); MO.isValid(); ++MO) + if (MO->isMBB() && MO->getMBB() == OldB) + MO->setMBB(NewB); + } + } +} + + +void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB, + MachineBasicBlock *SuccB) { + DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and " + << PrintMB(SuccB) << "\n"); + bool TermOk = hasUncondBranch(SuccB); + eliminatePhis(SuccB); + TII->RemoveBranch(*PredB); + PredB->removeSuccessor(SuccB); + PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end()); + MachineBasicBlock::succ_iterator I, E = SuccB->succ_end(); + for (I = SuccB->succ_begin(); I != E; ++I) + PredB->addSuccessor(*I); + PredB->normalizeSuccProbs(); + replacePhiEdges(SuccB, PredB); + removeBlock(SuccB); + if (!TermOk) + PredB->updateTerminator(); +} + + +void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) { + if (FP.TrueB) + removeBlock(FP.TrueB); + if (FP.FalseB) + removeBlock(FP.FalseB); + + FP.SplitB->updateTerminator(); + if (FP.SplitB->succ_size() != 1) + return; + + MachineBasicBlock *SB = *FP.SplitB->succ_begin(); + if (SB->pred_size() != 1) + return; + + // By now, the split block has only one successor (SB), and SB has only + // one predecessor. We can try to merge them. We will need to update ter- + // minators in FP.Split+SB, and that requires working AnalyzeBranch, which + // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends + // with an unconditional branch, we won't need to touch the terminators. + if (!hasEHLabel(SB) || hasUncondBranch(SB)) + mergeBlocks(FP.SplitB, SB); +} + + +bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MFN = &MF; + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + MLI = &getAnalysis<MachineLoopInfo>(); + MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() : + nullptr; + + Deleted.clear(); + bool Changed = false; + + for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) + Changed |= visitLoop(*I); + Changed |= visitLoop(0); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +FunctionPass *llvm::createHexagonEarlyIfConversion() { + return new HexagonEarlyIfConversion(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index e4c8d8f..6e2dbc0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -74,7 +74,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. 
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); ++MII) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 21a8996..7a52a1c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -147,6 +147,48 @@ static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit", cl::init(UINT_MAX), cl::Hidden, cl::ZeroOrMore, cl::desc("Max count of stack frame " "shrink-wraps")); +static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true), + cl::Hidden, cl::desc("Use allocframe more conservatively")); + + +namespace llvm { + void initializeHexagonCallFrameInformationPass(PassRegistry&); + FunctionPass *createHexagonCallFrameInformation(); +} + +namespace { + class HexagonCallFrameInformation : public MachineFunctionPass { + public: + static char ID; + HexagonCallFrameInformation() : MachineFunctionPass(ID) { + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeHexagonCallFrameInformationPass(PR); + } + bool runOnMachineFunction(MachineFunction &MF) override; + }; + + char HexagonCallFrameInformation::ID = 0; +} + +bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) { + auto &HFI = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering(); + bool NeedCFI = MF.getMMI().hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); + + if (!NeedCFI) + return false; + HFI.insertCFIInstructions(MF); + return true; +} + +INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi", + "Hexagon call frame information", false, false) + +FunctionPass *llvm::createHexagonCallFrameInformation() { + return new HexagonCallFrameInformation(); +} + + namespace { /// Map a register pair Reg to the subregister that has the greater "number", /// i.e. D3 (aka R7:6) will be mapped to R7, etc. @@ -370,11 +412,11 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, insertEpilogueInBlock(*EpilogB); } else { for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertCSRRestoresInBlock(B, CSI, HRI); for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertEpilogueInBlock(B); } } @@ -383,10 +425,7 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget()); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); DebugLoc dl; @@ -405,10 +444,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { bool AlignStack = (MaxAlign > getStackAlignment()); - // Check if frame moves are needed for EH. - bool needsFrameMoves = MMI.hasDebugInfo() || - MF.getFunction()->needsUnwindTableEntry(); - // Get the number of bytes to allocate from the FrameInfo. 
unsigned NumBytes = MFI->getStackSize(); unsigned SP = HRI.getStackRegister(); @@ -424,14 +459,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MI->eraseFromParent(); } - // - // Only insert ALLOCFRAME if we need to or at -O0 for the debugger. Think - // that this shouldn't be required, but doing so now because gcc does and - // gdb can't break at the start of the function without it. Will remove if - // this turns out to be a gdb bug. - // - bool NoOpt = (HTM.getOptLevel() == CodeGenOpt::None); - if (!NoOpt && !FuncInfo->hasClobberLR() && !hasFP(MF)) + if (!hasFP(MF)) return; // Check for overflow. @@ -469,92 +497,11 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { .addReg(SP) .addImm(-int64_t(MaxAlign)); } - - if (needsFrameMoves) { - std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions(); - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - - // Advance CFA. DW_CFA_def_cfa - unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); - unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); - - // CFA = FP + 8 - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R31 (return addr) = CFA - #4 - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwRAReg, -4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R30 (frame ptr) = CFA - #8) - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - unsigned int regsToMove[] = { - Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2, - Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18, - Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22, - Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26, - Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9, Hexagon::D10, - Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::NoRegister - }; - - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - - for (unsigned i = 0; regsToMove[i] != Hexagon::NoRegister; ++i) { - for (unsigned I = 0, E = CSI.size(); I < E; ++I) { - if (CSI[I].getReg() == regsToMove[i]) { - // Subtract 8 to make room for R30 and R31, which are added above. - int64_t Offset = getFrameIndexOffset(MF, CSI[I].getFrameIdx()) - 8; - - if (regsToMove[i] < Hexagon::D0 || regsToMove[i] > Hexagon::D15) { - unsigned DwarfReg = HRI.getDwarfRegNum(regsToMove[i], true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - DwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Split the double regs into subregs, and generate appropriate - // cfi_offsets. - // The only reason, we are split double regs is, llvm-mc does not - // understand paired registers for cfi_offset. 
- // Eg .cfi_offset r1:0, -64 - unsigned HiReg = getMax32BitSubRegister(regsToMove[i], HRI); - unsigned LoReg = getMax32BitSubRegister(regsToMove[i], HRI, false); - unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); - unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); - unsigned HiCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - HiDwarfReg, Offset+4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(HiCFIIndex); - unsigned LoCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - LoDwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(LoCFIIndex); - } - break; - } - } // for CSI.size() - } // for regsToMove - } // needsFrameMoves } void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); - // - // Only insert deallocframe if we need to. Also at -O0. See comment - // in insertPrologueInBlock above. - // - if (!hasFP(MF) && MF.getTarget().getOptLevel() != CodeGenOpt::None) + if (!hasFP(MF)) return; auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); @@ -630,12 +577,172 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { } +namespace { + bool IsAllocFrame(MachineBasicBlock::const_iterator It) { + if (!It->isBundle()) + return It->getOpcode() == Hexagon::S2_allocframe; + auto End = It->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator I = It.getInstrIterator(); + while (++I != End && I->isBundled()) + if (I->getOpcode() == Hexagon::S2_allocframe) + return true; + return false; + } + + MachineBasicBlock::iterator FindAllocFrame(MachineBasicBlock &B) { + for (auto &I : B) + if (IsAllocFrame(I)) + return I; + return B.end(); + } +} + + +void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { + for (auto &B : MF) { + auto AF = FindAllocFrame(B); + if (AF == B.end()) + continue; + insertCFIInstructionsAt(B, ++AF); + } +} + + +void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HII = *HST.getInstrInfo(); + auto &HRI = *HST.getRegisterInfo(); + + // If CFI instructions have debug information attached, something goes + // wrong with the final assembly generation: the prolog_end is placed + // in a wrong location. + DebugLoc DL; + const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION); + + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); + + if (hasFP(MF)) { + unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); + unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); + + // Define CFA via an offset from the value of FP. + // + // -8 -4 0 (SP) + // --+----+----+--------------------- + // | FP | LR | increasing addresses --> + // --+----+----+--------------------- + // | +-- Old SP (before allocframe) + // +-- New FP (after allocframe) + // + // MCCFIInstruction::createDefCfa subtracts the offset from the register. + // MCCFIInstruction::createOffset takes the offset without sign change. 
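+    // The three directives below are thus expected to come out in the
+    // final assembly roughly as (register names per the Hexagon ABI):
+    //   .cfi_def_cfa r30, 8
+    //   .cfi_offset r31, -4
+    //   .cfi_offset r30, -8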
+    auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(DefCfa));
+    // R31 (return addr) = CFA - 4
+    auto OffR31 = MCCFIInstruction::createOffset(FrameLabel, DwRAReg, -4);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(OffR31));
+    // R30 (frame ptr) = CFA - 8
+    auto OffR30 = MCCFIInstruction::createOffset(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(OffR30));
+  }
+
+  static unsigned int RegsToMove[] = {
+    Hexagon::R1,  Hexagon::R0,  Hexagon::R3,  Hexagon::R2,
+    Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
+    Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
+    Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
+    Hexagon::D0,  Hexagon::D1,  Hexagon::D8,  Hexagon::D9,
+    Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13,
+    Hexagon::NoRegister
+  };
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) {
+    unsigned Reg = RegsToMove[i];
+    auto IfR = [Reg] (const CalleeSavedInfo &C) -> bool {
+      return C.getReg() == Reg;
+    };
+    auto F = std::find_if(CSI.begin(), CSI.end(), IfR);
+    if (F == CSI.end())
+      continue;
+
+    // Subtract 8 to make room for R30 and R31, which are added above.
+    unsigned FrameReg;
+    int64_t Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg) - 8;
+
+    if (Reg < Hexagon::D0 || Reg > Hexagon::D15) {
+      unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true);
+      auto OffReg = MCCFIInstruction::createOffset(FrameLabel, DwarfReg,
+                                                   Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffReg));
+    } else {
+      // Split the double regs into subregs, and generate appropriate
+      // cfi_offsets.
+      // The only reason we split the double regs is that llvm-mc does not
+      // understand paired registers for cfi_offset.
+      // Eg .cfi_offset r1:0, -64
+
+      unsigned HiReg = HRI.getSubReg(Reg, Hexagon::subreg_hireg);
+      unsigned LoReg = HRI.getSubReg(Reg, Hexagon::subreg_loreg);
+      unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
+      unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
+      auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg,
+                                                  Offset+4);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffHi));
+      auto OffLo = MCCFIInstruction::createOffset(FrameLabel, LoDwarfReg,
+                                                  Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffLo));
+    }
+  }
+}
+
+
 bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const HexagonMachineFunctionInfo *FuncInfo =
-    MF.getInfo<HexagonMachineFunctionInfo>();
-  return MFI->hasCalls() || MFI->getStackSize() > 0 ||
-         FuncInfo->hasClobberLR();
+  auto &MFI = *MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  bool HasFixed = MFI.getNumFixedObjects();
+  bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
+                        .getLocalFrameObjectCount();
+  bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  bool HasAlloca = MFI.hasVarSizedObjects();
+
+  // Insert ALLOCFRAME if we need to or at -O0 for the debugger. Think
+  // that this shouldn't be required, but doing so now because gcc does and
+  // gdb can't break at the start of the function without it. Will remove if
+  // this turns out to be a gdb bug.
+  //
+  if (MF.getTarget().getOptLevel() == CodeGenOpt::None)
+    return true;
+
+  // By default we want to use SP (since it's always there). FP requires
+  // some setup (i.e.
ALLOCFRAME). + // Fixed and preallocated objects need FP if the distance from them to + // the SP is unknown (as is with alloca or aligna). + if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign)) + return true; + + if (MFI.getStackSize() > 0) { + if (UseAllocframe) + return true; + } + + if (MFI.hasCalls() || + MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR()) + return true; + + return false; } @@ -718,9 +825,89 @@ static void addCalleeSaveRegistersAsImpOperand(MachineInstr *Inst, } -int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - return MF.getFrameInfo()->getObjectOffset(FI); +int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + auto &MFI = *MF.getFrameInfo(); + auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + + // Large parts of this code are shared with HRI::eliminateFrameIndex. + int Offset = MFI.getObjectOffset(FI); + bool HasAlloca = MFI.hasVarSizedObjects(); + bool HasExtraAlign = HRI.needsStackRealignment(MF); + bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + + unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); + unsigned AP = 0; + if (const MachineInstr *AI = getAlignaInstr(MF)) + AP = AI->getOperand(0).getReg(); + unsigned FrameSize = MFI.getStackSize(); + + bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). + // Use FP at -O0, except when there are objects with extra alignment. + // That additional alignment requirement may cause a pad to be inserted, + // which will make it impossible to use FP to access objects located + // past the pad. + if (NoOpt && !HasExtraAlign) + UseFP = true; + if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { + // Fixed and preallocated objects will be located before any padding + // so FP must be used to access them. + UseFP |= (HasAlloca || HasExtraAlign); + } else { + if (HasAlloca) { + if (HasExtraAlign) + UseAP = true; + else + UseFP = true; + } + } + + // If FP was picked, then there had better be FP. + bool HasFP = hasFP(MF); + assert((HasFP || !UseFP) && "This function must have frame pointer"); + + // Having FP implies allocframe. Allocframe will store extra 8 bytes: + // FP/LR. If the base register is used to access an object across these + // 8 bytes, then the offset will need to be adjusted by 8. + // + // After allocframe: + // HexagonISelLowering adds 8 to ---+ + // the offsets of all stack-based | + // arguments (*) | + // | + // getObjectOffset < 0 0 8 getObjectOffset >= 8 + // ------------------------+-----+------------------------> increasing + // <local objects> |FP/LR| <input arguments> addresses + // -----------------+------+-----+------------------------> + // | | + // SP/AP point --+ +-- FP points here (**) + // somewhere on + // this side of FP/LR + // + // (*) See LowerFormalArguments. The FP/LR is assumed to be present. + // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR. + + // The lowering assumes that FP/LR is present, and so the offsets of + // the formal arguments start at 8. If FP/LR is not there we need to + // reduce the offset by 8. + if (Offset > 0 && !HasFP) + Offset -= 8; + + if (UseFP) + FrameReg = FP; + else if (UseAP) + FrameReg = AP; + else + FrameReg = SP; + + // Calculate the actual offset in the instruction. If there is no FP + // (in other words, no allocframe), then SP will not be adjusted (i.e. 
+ // there will be no SP -= FrameSize), so the frame size should not be + // added to the calculated offset. + int RealOffset = Offset; + if (!UseFP && !UseAP && HasFP) + RealOffset = FrameSize+Offset; + return RealOffset; } @@ -731,7 +918,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.begin(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useSpillFunction(MF, CSI)) { unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI); @@ -739,7 +926,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // Call spill function. DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); MachineInstr *SaveRegsCall = - BuildMI(MBB, MI, DL, TII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) + BuildMI(MBB, MI, DL, HII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) .addExternalSymbol(SpillFun); // Add callee-saved registers as use. addCalleeSaveRegistersAsImpOperand(SaveRegsCall, MaxReg, false); @@ -757,7 +944,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = CSI[i].getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); if (IsKill) MBB.addLiveIn(Reg); } @@ -772,7 +959,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.getFirstTerminator(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useRestoreFunction(MF, CSI)) { bool HasTC = hasTailCall(MBB) || !hasReturn(MBB); @@ -787,14 +974,14 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, if (HasTC) { unsigned ROpc = Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4; - DeallocCall = BuildMI(MBB, MI, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, MI, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); } else { // The block has a return. MachineBasicBlock::iterator It = MBB.getFirstTerminator(); assert(It->isReturn() && std::next(It) == MBB.end()); unsigned ROpc = Hexagon::RESTORE_DEALLOC_RET_JMP_V4; - DeallocCall = BuildMI(MBB, It, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, It, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); // Transfer the function live-out registers. DeallocCall->copyImplicitOps(MF, It); @@ -807,7 +994,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, unsigned Reg = CSI[i].getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = CSI[i].getFrameIdx(); - TII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); } return true; } @@ -832,9 +1019,9 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( // via AP, which may not be available at the particular place in the program. 
MachineFrameInfo *MFI = MF.getFrameInfo(); bool HasAlloca = MFI->hasVarSizedObjects(); - bool HasAligna = (MFI->getMaxAlignment() > getStackAlignment()); + bool NeedsAlign = (MFI->getMaxAlignment() > getStackAlignment()); - if (!HasAlloca || !HasAligna) + if (!HasAlloca || !NeedsAlign) return; unsigned LFS = MFI->getLocalFrameSize(); @@ -864,13 +1051,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (MRI.isPhysRegUsed(FreeReg)) + if (!MRI.reg_nodbg_empty(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (MRI.isPhysRegUsed(*AI)) { + if (!MRI.reg_nodbg_empty(*AI)) { IsCurrentRegUsed = true; break; } @@ -896,7 +1083,7 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF) // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator NextII; for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); @@ -1210,7 +1397,8 @@ bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const { } -MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { +const MachineInstr *HexagonFrameLowering::getAlignaInstr( + const MachineFunction &MF) const { for (auto &B : MF) for (auto &I : B) if (I.getOpcode() == Hexagon::ALIGNA) @@ -1219,6 +1407,7 @@ MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { } +// FIXME: Use Function::optForSize(). 
inline static bool isOptSize(const MachineFunction &MF) { AttributeSet AF = MF.getFunction()->getAttributes(); return AF.hasAttribute(AttributeSet::FunctionIndex, @@ -1226,8 +1415,7 @@ inline static bool isOptSize(const MachineFunction &MF) { } inline static bool isMinSize(const MachineFunction &MF) { - AttributeSet AF = MF.getFunction()->getAttributes(); - return AF.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return MF.getFunction()->optForMinSize(); } @@ -1289,4 +1477,3 @@ bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF, : SpillFuncThreshold; return Threshold < NumCSI; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index d39ee2c..683b303 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -51,7 +51,8 @@ public: bool targetHandlesStackFrameRounding() const override { return true; } - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasFP(const MachineFunction &MF) const override; const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) @@ -73,7 +74,9 @@ public: const override; bool needsAligna(const MachineFunction &MF) const; - MachineInstr *getAlignaInstr(MachineFunction &MF) const; + const MachineInstr *getAlignaInstr(const MachineFunction &MF) const; + + void insertCFIInstructions(MachineFunction &MF) const; private: typedef std::vector<CalleeSavedInfo> CSIVect; @@ -86,6 +89,8 @@ private: const HexagonRegisterInfo &HRI) const; bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, const HexagonRegisterInfo &HRI) const; + void insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const; void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const; bool replacePredRegPseudoSpillCode(MachineFunction &MF) const; @@ -94,7 +99,7 @@ private: void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const; - bool shouldInlineCSR(llvm::MachineFunction&, const CSIVect&) const; + bool shouldInlineCSR(llvm::MachineFunction &MF, const CSIVect &CSI) const; bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const; bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 4d32208..f26e2ff 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -195,7 +195,7 @@ bool HexagonGenExtract::convert(Instruction *In) { return false; } - IRBuilder<> IRB(BB, In); + IRBuilder<> IRB(In); Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 096da94..64a2b6c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -77,9 +77,8 @@ namespace { namespace { // Set of virtual registers, based on BitVector. 
struct RegisterSet : private BitVector { - RegisterSet() : BitVector() {} + RegisterSet() = default; explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} - RegisterSet(const RegisterSet &RS) : BitVector(RS) {} using BitVector::clear; @@ -1496,7 +1495,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { // version of DCE that preserves lifetime markers. Without it, merging // of stack objects can fail to recognize and merge disjoint objects // leading to unnecessary stack growth. - Changed |= removeDeadCode(MDT->getRootNode()); + Changed = removeDeadCode(MDT->getRootNode()); const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); BitTracker BTLoc(HE, MF); @@ -1534,7 +1533,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("pruning", "hexinsert", TimingDetail); @@ -1547,7 +1546,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("selection", "hexinsert", TimingDetail); @@ -1572,13 +1571,15 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, n = Out.size(); i < n; ++i) IFMap.erase(Out[i]); } + if (IFMap.empty()) + return Changed; { NamedRegionTimer _T("generation", "hexinsert", TimingDetail); - Changed = generateInserts(); + generateInserts(); } - return Changed; + return true; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp new file mode 100644 index 0000000..c059d56 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -0,0 +1,319 @@ +//===--- HexagonGenMux.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// During instruction selection, MUX instructions are generated for +// conditional assignments. Since such assignments often present an +// opportunity to predicate instructions, HexagonExpandCondsets +// expands MUXes into pairs of conditional transfers, and then proceeds +// with predication of the producers/consumers of the registers involved. +// This happens after exiting from the SSA form, but before the machine +// instruction scheduler. After the scheduler and after the register +// allocation there can be cases of pairs of conditional transfers +// resulting from a MUX where neither of them was further predicated. If +// these transfers are now placed far enough from the instruction defining +// the predicate register, they cannot use the .new form. In such cases it +// is better to collapse them back to a single MUX instruction. 
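+// For example (a sketch using post-RA physical registers): the pair
+//   r3 = A2_tfrt p0, r1      ; if (p0)  r3 = r1
+//   ...                      ; p0 not redefined in between
+//   r3 = A2_tfrf p0, r2      ; if (!p0) r3 = r2
+// placed far from the definition of p0 can be collapsed back into
+//   r3 = C2_mux p0, r1, r2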
+ +#define DEBUG_TYPE "hexmux" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonGenMux(); + void initializeHexagonGenMuxPass(PassRegistry& Registry); +} + +namespace { + class HexagonGenMux : public MachineFunctionPass { + public: + static char ID; + HexagonGenMux() : MachineFunctionPass(ID), HII(0), HRI(0) { + initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon generate mux instructions"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + + struct CondsetInfo { + unsigned PredR; + unsigned TrueX, FalseX; + CondsetInfo() : PredR(0), TrueX(UINT_MAX), FalseX(UINT_MAX) {} + }; + struct DefUseInfo { + BitVector Defs, Uses; + DefUseInfo() : Defs(), Uses() {} + DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {} + }; + struct MuxInfo { + MachineBasicBlock::iterator At; + unsigned DefR, PredR; + MachineOperand *SrcT, *SrcF; + MachineInstr *Def1, *Def2; + MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR, + MachineOperand *TOp, MachineOperand *FOp, + MachineInstr *D1, MachineInstr *D2) + : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(D1), + Def2(D2) {} + }; + typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap; + typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap; + typedef SmallVector<MuxInfo,4> MuxInfoList; + + bool isRegPair(unsigned Reg) const { + return Hexagon::DoubleRegsRegClass.contains(Reg); + } + void getSubRegs(unsigned Reg, BitVector &SRs) const; + void expandReg(unsigned Reg, BitVector &Set) const; + void getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const; + void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM); + bool isCondTransfer(unsigned Opc) const; + unsigned getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const; + bool genMuxInBlock(MachineBasicBlock &B); + }; + + char HexagonGenMux::ID = 0; +} + +INITIALIZE_PASS(HexagonGenMux, "hexagon-mux", + "Hexagon generate mux instructions", false, false) + + +void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const { + for (MCSubRegIterator I(Reg, HRI); I.isValid(); ++I) + SRs[*I] = true; +} + + +void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const { + if (isRegPair(Reg)) + getSubRegs(Reg, Set); + else + Set[Reg] = true; +} + + +void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const { + // First, get the implicit defs and uses for this instruction. + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &D = HII->get(Opc); + if (const MCPhysReg *R = D.ImplicitDefs) + while (*R) + expandReg(*R++, Defs); + if (const MCPhysReg *R = D.ImplicitUses) + while (*R) + expandReg(*R++, Uses); + + // Look over all operands, and collect explicit defs and uses. + for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) { + if (!Mo->isReg() || Mo->isImplicit()) + continue; + unsigned R = Mo->getReg(); + BitVector &Set = Mo->isDef() ? 
Defs : Uses; + expandReg(R, Set); + } +} + + +void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM) { + unsigned Index = 0; + unsigned NR = HRI->getNumRegs(); + BitVector Defs(NR), Uses(NR); + + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + MachineInstr *MI = &*I; + I2X.insert(std::make_pair(MI, Index)); + Defs.reset(); + Uses.reset(); + getDefsUses(MI, Defs, Uses); + DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses))); + Index++; + } +} + + +bool HexagonGenMux::isCondTransfer(unsigned Opc) const { + switch (Opc) { + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmoveif: + return true; + } + return false; +} + + +unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const { + bool IsReg1 = Src1.isReg(), IsReg2 = Src2.isReg(); + if (IsReg1) + return IsReg2 ? Hexagon::C2_mux : Hexagon::C2_muxir; + if (IsReg2) + return Hexagon::C2_muxri; + + // Neither is a register. The first source is extendable, but the second + // is not (s8). + if (Src2.isImm() && isInt<8>(Src2.getImm())) + return Hexagon::C2_muxii; + + return 0; +} + + +bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { + bool Changed = false; + InstrIndexMap I2X; + DefUseInfoMap DUM; + buildMaps(B, I2X, DUM); + + typedef DenseMap<unsigned,CondsetInfo> CondsetMap; + CondsetMap CM; + MuxInfoList ML; + + MachineBasicBlock::iterator NextI, End = B.end(); + for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) { + MachineInstr *MI = &*I; + NextI = std::next(I); + unsigned Opc = MI->getOpcode(); + if (!isCondTransfer(Opc)) + continue; + unsigned DR = MI->getOperand(0).getReg(); + if (isRegPair(DR)) + continue; + + unsigned PR = MI->getOperand(1).getReg(); + unsigned Idx = I2X.lookup(MI); + CondsetMap::iterator F = CM.find(DR); + bool IfTrue = HII->isPredicatedTrue(Opc); + + // If there is no record of a conditional transfer for this register, + // or the predicate register differs, create a new record for it. + if (F != CM.end() && F->second.PredR != PR) { + CM.erase(F); + F = CM.end(); + } + if (F == CM.end()) { + auto It = CM.insert(std::make_pair(DR, CondsetInfo())); + F = It.first; + F->second.PredR = PR; + } + CondsetInfo &CI = F->second; + if (IfTrue) + CI.TrueX = Idx; + else + CI.FalseX = Idx; + if (CI.TrueX == UINT_MAX || CI.FalseX == UINT_MAX) + continue; + + // There is now a complete definition of DR, i.e. we have the predicate + // register, the definition if-true, and definition if-false. + + // First, check if both definitions are far enough from the definition + // of the predicate register. + unsigned MinX = std::min(CI.TrueX, CI.FalseX); + unsigned MaxX = std::max(CI.TrueX, CI.FalseX); + unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0; + bool NearDef = false; + for (unsigned X = SearchX; X < MaxX; ++X) { + const DefUseInfo &DU = DUM.lookup(X); + if (!DU.Defs[PR]) + continue; + NearDef = true; + break; + } + if (NearDef) + continue; + + // The predicate register is not defined in the last few instructions. + // Check if the conversion to MUX is possible (either "up", i.e. at the + // place of the earlier partial definition, or "down", where the later + // definition is located). Examine all defs and uses between these two + // definitions. + // SR1, SR2 - source registers from the first and the second definition. 
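[Editor's note: the loop that follows implements the up/down legality scan described in the comment above. A standalone restatement of the rule, with a hypothetical DU struct in place of the pass's DefUseInfo bitvectors:]

#include <vector>

struct DU { bool DefsPR, DefsDR, UsesDR, DefsSR1, DefsSR2; };

// Between the two partial definitions: any write to the predicate or any
// touch of the destination kills the fold outright; a write to a source
// register only rules out one placement of the mux.
bool canFold(const std::vector<DU> &Between, bool &CanUp, bool &CanDown) {
  CanUp = CanDown = true;
  for (const DU &X : Between) {
    if (X.DefsPR || X.DefsDR || X.UsesDR)
      return false;
    if (X.DefsSR1)
      CanDown = false; // SR1 clobbered before the later ("down") position
    if (X.DefsSR2)
      CanUp = false;   // SR2 not yet available at the earlier ("up") position
  }
  return CanUp || CanDown;
}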
+ MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin(); + std::advance(It1, MinX); + std::advance(It2, MaxX); + MachineInstr *Def1 = It1, *Def2 = It2; + MachineOperand *Src1 = &Def1->getOperand(2), *Src2 = &Def2->getOperand(2); + unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; + unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; + bool Failure = false, CanUp = true, CanDown = true; + for (unsigned X = MinX+1; X < MaxX; X++) { + const DefUseInfo &DU = DUM.lookup(X); + if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { + Failure = true; + break; + } + if (CanDown && DU.Defs[SR1]) + CanDown = false; + if (CanUp && DU.Defs[SR2]) + CanUp = false; + } + if (Failure || (!CanUp && !CanDown)) + continue; + + MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2; + MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2; + // Prefer "down", since this will move the MUX farther away from the + // predicate definition. + MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; + ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); + } + + for (unsigned I = 0, N = ML.size(); I < N; ++I) { + MuxInfo &MX = ML[I]; + MachineBasicBlock &B = *MX.At->getParent(); + DebugLoc DL = MX.At->getDebugLoc(); + unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); + if (!MxOpc) + continue; + BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .addOperand(*MX.SrcT) + .addOperand(*MX.SrcF); + B.erase(MX.Def1); + B.erase(MX.Def2); + Changed = true; + } + + return Changed; +} + +bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + bool Changed = false; + for (auto &I : MF) + Changed |= genMuxInBlock(I); + return Changed; +} + +FunctionPass *llvm::createHexagonGenMux() { + return new HexagonGenMux(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 6905c4f..d9675b5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -250,7 +250,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { unsigned NewPR = MRI->createVirtualRegister(PredRC); // For convertible instructions, do not modify them, so that they can - // be coverted later. Generate a copy from Reg to NewPR. + // be converted later. Generate a copy from Reg to NewPR. if (isConvertibleToPredForm(DefI)) { MachineBasicBlock::iterator DefIt = DefI; BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 53b6bf6..d20a809 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -727,9 +727,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // Phis that may feed into the loop. LoopFeederMap LoopFeederPhi; - // Check if the inital value may be zero and can be decremented in the first + // Check if the initial value may be zero and can be decremented in the first // iteration. If the value is zero, the endloop instruction will not decrement - // the loop counter, so we shoudn't generate a hardware loop in this case. + // the loop counter, so we shouldn't generate a hardware loop in this case. 
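[Editor's note: to see the class of hazard the check below guards against, consider a trip count computed with unsigned arithmetic; the helper and the numbers are illustrative only.]

#include <cstdint>

// If Start can exceed End at runtime, an unsigned (End - Start) count
// underflows and the hardware loop would spin nearly 2^32 times instead
// of zero, which is why the pass refuses to emit one in that case.
uint32_t naiveTripCount(uint32_t Start, uint32_t End) {
  return End - Start; // naiveTripCount(5, 3) == 0xFFFFFFFE, not 0
}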
if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop, LoopFeederPhi)) return nullptr; @@ -1288,14 +1288,14 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, typedef MachineBasicBlock::instr_iterator instr_iterator; // Check if things are in order to begin with. - for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I) + for (instr_iterator I(BumpI), E = BB->instr_end(); I != E; ++I) if (&*I == CmpI) return true; // Out of order. unsigned PredR = CmpI->getOperand(0).getReg(); bool FoundBump = false; - instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt); + instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt); for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { MachineInstr *In = &*I; for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) { @@ -1307,9 +1307,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, } if (In == BumpI) { - instr_iterator After = BumpI; - instr_iterator From = CmpI; - BB->splice(std::next(After), BB, From); + BB->splice(++BumpI->getIterator(), BB, CmpI->getIterator()); FoundBump = true; break; } @@ -1440,7 +1438,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow( if (Comparison::isSigned(Cmp)) return false; - // Check if there is a comparison of the inital value. If the initial value + // Check if there is a comparison of the initial value. If the initial value // is greater than or not equal to another value, then assume this is a // range check. if ((Cmp & Comparison::G) || Cmp == Comparison::NE) @@ -1850,7 +1848,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( } MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock(); - MF->insert(Header, NewPH); + MF->insert(Header->getIterator(), NewPH); if (Header->pred_size() > 2) { // Ensure that the header has only two predecessors: the preheader and diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 9123057..a0da945 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -50,16 +50,21 @@ namespace { class HexagonDAGToDAGISel : public SelectionDAGISel { const HexagonTargetMachine& HTM; const HexagonSubtarget *HST; + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; public: explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), HTM(tm) { + : SelectionDAGISel(tm, OptLevel), HTM(tm), HST(nullptr), HII(nullptr), + HRI(nullptr) { initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. HST = &MF.getSubtarget<HexagonSubtarget>(); + HII = HST->getInstrInfo(); + HRI = HST->getRegisterInfo(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -104,7 +109,6 @@ public: SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); SDNode *SelectBitOp(SDNode *N); - bool isConstExtProfitable(SDNode *N) const; // XformMskToBitPosU5Imm - Returns the bit position which // the single bit 32 bit mask represents. @@ -139,8 +143,8 @@ public: // type i32 where the negative literal is transformed into a positive literal // for use in -= memops. 
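[Editor's note: for reference, the mapping performed by the helper defined just below, stated as a freestanding sketch with a made-up name.]

#include <cassert>

// m5 (-31..-1) maps to u5 (1..31): "x += -5" is encoded as "memw(...) -= #5".
int xformM5ToU5(int Imm) {
  assert(Imm >= -31 && Imm <= -1 && "Constant out of range for Memops");
  return -Imm; // xformM5ToU5(-5) == 5
}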
inline SDValue XformM5ToU5Imm(signed Imm, SDLoc DL) { - assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); - return CurDAG->getTargetConstant( - Imm, DL, MVT::i32); + assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); + return CurDAG->getTargetConstant(-Imm, DL, MVT::i32); } // XformU7ToU7M1Imm - Return a target constant decremented by 1, in range @@ -203,11 +207,10 @@ void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) { // Intrinsics that return a predicate. -static unsigned doesIntrinsicReturnPredicate(unsigned ID) -{ +static bool doesIntrinsicReturnPredicate(unsigned ID) { switch (ID) { default: - return 0; + return false; case Intrinsic::hexagon_C2_cmpeq: case Intrinsic::hexagon_C2_cmpgt: case Intrinsic::hexagon_C2_cmpgtu: @@ -244,7 +247,7 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID) case Intrinsic::hexagon_C2_tfrrp: case Intrinsic::hexagon_S2_tstbit_i: case Intrinsic::hexagon_S2_tstbit_r: - return 1; + return true; } } @@ -258,8 +261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConst = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MVT::Other, Base, TargetConst, @@ -312,8 +314,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -378,29 +379,46 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { // loads. ISD::LoadExtType ExtType = LD->getExtensionType(); bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD); + bool HasVecOffset = false; // Figure out the opcode. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); if (LoadedVT == MVT::i64) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadrd_pi; else Opcode = Hexagon::L2_loadrd_io; } else if (LoadedVT == MVT::i32) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadri_pi; else Opcode = Hexagon::L2_loadri_io; } else if (LoadedVT == MVT::i16) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi; else Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io; } else if (LoadedVT == MVT::i8) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi; else Opcode = IsZeroExt ?
Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io; + } else if (LoadedVT == MVT::v16i32 || LoadedVT == MVT::v8i64 || + LoadedVT == MVT::v32i16 || LoadedVT == MVT::v64i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi; + } + else + Opcode = Hexagon::V6_vL32b_ai; + // 128B + } else if (LoadedVT == MVT::v32i32 || LoadedVT == MVT::v16i64 || + LoadedVT == MVT::v64i16 || LoadedVT == MVT::v128i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi_128B; + } + else + Opcode = Hexagon::V6_vL32b_ai_128B; } else llvm_unreachable("unknown memory type"); @@ -411,7 +429,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD) return SelectIndexedLoadSignExtend64(LD, Opcode, dl); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode* Result = CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), @@ -420,15 +438,25 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); - const SDValue Froms[] = { SDValue(LD, 0), - SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result, 0), - SDValue(Result, 1), - SDValue(Result, 2) - }; - ReplaceUses(Froms, Tos, 3); + if (HasVecOffset) { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 2); + } else { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 1), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 3); + } return Result; } else { SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); @@ -487,8 +515,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { // Offset value must be within representable range // and must have correct alignment properties. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(StoredVT, Val)) { + if (HII->isValidAutoIncImm(StoredVT, Val)) { unsigned Opcode = 0; // Figure out the post inc version of opcode. 
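[Editor's note: the opcode choice in the next hunk picks between plain (_io) and post-increment (_pi) store forms. In source terms, the _pi form folds the pointer bump into the access; a minimal illustration, not taken from the patch.]

// One post-increment store (roughly S2_storeri_pi) does the work of a
// store plus an add: the hardware writes *P and advances P together.
void storeAndBump(int *&P, int V) {
  *P++ = V;
}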
@@ -496,7 +523,15 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi; - else llvm_unreachable("unknown memory type"); + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) { + Opcode = Hexagon::V6_vS32b_pi; + } + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) { + Opcode = Hexagon::V6_vS32b_pi_128B; + } else llvm_unreachable("unknown memory type"); if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) { assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store"); @@ -530,6 +565,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io; + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) + Opcode = Hexagon::V6_vS32b_ai; + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) + Opcode = Hexagon::V6_vS32b_ai_128B; else llvm_unreachable("unknown memory type"); // Build regular store. @@ -1113,14 +1155,12 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } if (Opc == ISD::AND) { - if (((ValueVT == MVT::i32) && - (!((Val & 0x80000000) || (Val & 0x7fffffff)))) || - ((ValueVT == MVT::i64) && - (!((Val & 0x8000000000000000) || (Val & 0x7fffffff))))) - // If it's simple AND, do the normal op. - return SelectCode(N); - else + // Check if this is a bit-clearing AND, if not select code the usual way. + if ((ValueVT == MVT::i32 && isPowerOf2_32(~Val)) || + (ValueVT == MVT::i64 && isPowerOf2_64(~Val))) Val = ~Val; + else + return SelectCode(N); } // If OR or AND is being fed by shl, srl and, sra don't do this change, @@ -1128,7 +1168,8 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { // Traverse the DAG to see if there is shl, srl and sra. if (Opc == ISD::OR || Opc == ISD::AND) { switch (N->getOperand(0)->getOpcode()) { - default: break; + default: + break; case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -1137,23 +1178,24 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } // Make sure it's power of 2. - unsigned bitpos = 0; + unsigned BitPos = 0; if (Opc != ISD::FABS && Opc != ISD::FNEG) { - if (((ValueVT == MVT::i32) && !isPowerOf2_32(Val)) || - ((ValueVT == MVT::i64) && !isPowerOf2_64(Val))) + if ((ValueVT == MVT::i32 && !isPowerOf2_32(Val)) || + (ValueVT == MVT::i64 && !isPowerOf2_64(Val))) return SelectCode(N); // Get the bit position. - bitpos = countTrailingZeros(uint64_t(Val)); + BitPos = countTrailingZeros(uint64_t(Val)); } else { // For fabs and fneg, it's always the 31st bit. - bitpos = 31; + BitPos = 31; } unsigned BitOpc = 0; // Set the right opcode for bitwise operations. - switch(Opc) { - default: llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); + switch (Opc) { + default: + llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); case ISD::AND: case ISD::FABS: BitOpc = Hexagon::S2_clrbit_i; @@ -1169,7 +1211,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { SDNode *Result; // Get the right SDVal for the opcode. 
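[Editor's note: the single-bit classification performed above can be tested standalone. This sketch mirrors the isPowerOf2_32(~Val) test for the 32-bit AND case; the helper name is made up.]

#include <cstdint>

// An AND mask whose complement has exactly one set bit clears that bit;
// the trailing-zero count of the complement is the position that the
// S2_clrbit_i form needs.
bool isBitClearingAnd(uint32_t Val, unsigned &BitPos) {
  uint32_t Inv = ~Val;
  if (Inv == 0 || (Inv & (Inv - 1)) != 0)
    return false;                  // complement is not a single bit
  BitPos = __builtin_ctz(Inv);     // e.g. Val == 0xFFFFFFFB -> BitPos == 2
  return true;
}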
- SDValue SDVal = CurDAG->getTargetConstant(bitpos, dl, MVT::i32); + SDValue SDVal = CurDAG->getTargetConstant(BitPos, dl, MVT::i32); if (ValueVT == MVT::i32 || ValueVT == MVT::f32) { Result = CurDAG->getMachineNode(BitOpc, dl, ValueVT, @@ -1198,7 +1240,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { MVT::i32, SDValue(Reg, 0)); // Clear/set/toggle hi or lo registers depending on the bit position. - if (SubValueVT != MVT::f32 && bitpos < 32) { + if (SubValueVT != MVT::f32 && BitPos < 32) { SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregLO, SDVal); const SDValue Ops[] = { RegClass, SubregHI, SubregHiIdx, @@ -1207,7 +1249,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { dl, ValueVT, Ops); } else { if (Opc != ISD::FABS && Opc != ISD::FNEG) - SDVal = CurDAG->getTargetConstant(bitpos - 32, dl, MVT::i32); + SDVal = CurDAG->getTargetConstant(BitPos-32, dl, MVT::i32); SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregHI, SDVal); const SDValue Ops[] = { RegClass, SDValue(Result0, 0), SubregHiIdx, @@ -1328,25 +1370,12 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const { - unsigned UseCount = 0; - unsigned CallCount = 0; - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { - // Ignore call instructions. - if (I->getOpcode() == ISD::CopyToReg) - ++CallCount; - UseCount++; - } - - return (UseCount <= 1) || (CallCount > 1); - -} void HexagonDAGToDAGISel::PreprocessISelDAG() { SelectionDAG &DAG = *CurDAG; std::vector<SDNode*> Nodes; - for (auto I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) - Nodes.push_back(I); + for (SDNode &Node : DAG.allnodes()) + Nodes.push_back(&Node); // Simplify: (or (select c x 0) z) -> (select c (or x z) z) // (or (select c 0 y) z) -> (select c z (or y z)) @@ -1397,11 +1426,10 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { return; MachineFrameInfo *MFI = MF->getFrameInfo(); - MachineBasicBlock *EntryBB = MF->begin(); + MachineBasicBlock *EntryBB = &MF->front(); unsigned AR = FuncInfo->CreateReg(MVT::i32); unsigned MaxA = MFI->getMaxAlignment(); - auto &HII = *HST.getInstrInfo(); - BuildMI(EntryBB, DebugLoc(), HII.get(Hexagon::ALIGNA), AR) + BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::ALIGNA), AR) .addImm(MaxA); MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index c739afb..0167090 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -41,8 +41,8 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-lowering" -static cl::opt<bool> -EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, +static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables", + cl::init(true), cl::Hidden, cl::desc("Control jump table emission on Hexagon target")); static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched", @@ -98,6 +98,9 @@ public: } // Implement calling convention for Hexagon. 
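[Editor's note: the calling-convention handlers declared here and defined in the chunks that follow share one rule for memory-passed vectors: the stack slot's size and its alignment both equal the vector's byte size. Condensed as a hypothetical helper:]

#include <cassert>

// v4i32 -> 16, v8i32 -> 32, v16i32/v512i1 -> 64, v32i32/v1024i1 -> 128,
// v64i32 -> 256; each value is passed to AllocateStack as both size and
// alignment.
unsigned stackSlotBytes(unsigned VectorBits) {
  unsigned Bytes = VectorBits / 8;
  assert(Bytes >= 16 && Bytes <= 256 && (Bytes & (Bytes - 1)) == 0);
  return Bytes;
}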
+ +static bool IsHvxVectorType(MVT ty); + static bool CC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -114,6 +117,11 @@ CC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); @@ -129,6 +137,11 @@ RetCC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -169,15 +182,43 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } + if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 || + LocVT == MVT::v16i8) { + ofst = State.AllocateStack(16, 16); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || + LocVT == MVT::v32i8) { + ofst = State.AllocateStack(32, 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1) { + ofst = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) { + ofst = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8) { + ofst = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + llvm_unreachable(nullptr); } -static bool -CC_Hexagon (unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - +static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { if (ArgFlags.isByVal()) { // Passed on stack. unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), @@ -213,6 +254,17 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, return false; } + if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) { + unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + + if (IsHvxVectorType(LocVT)) { + if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + return true; // CC didn't match. 
} @@ -260,10 +312,82 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1, + Hexagon::V2, Hexagon::V3, + Hexagon::V4, Hexagon::V5, + Hexagon::V6, Hexagon::V7, + Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, + Hexagon::V12, Hexagon::V13, + Hexagon::V14, Hexagon::V15}; + static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1, + Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, + Hexagon::W6, Hexagon::W7}; + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + // 128B Mode + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + return true; +} + static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); if (LocVT == MVT::i1 || LocVT == MVT::i8 || @@ -282,8 +406,24 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) { LocVT = MVT::i64; LocInfo = CCValAssign::BCvt; + } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 || + LocVT == MVT::v16i32 || LocVT == MVT::v8i64 || + LocVT == MVT::v512i1) { + LocVT = MVT::v16i32; + ValVT = MVT::v16i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 || + LocVT == MVT::v32i32 || LocVT == MVT::v16i64 || + (LocVT == MVT::v1024i1 && UseHVX && UseHVXDbl)) { + LocVT = MVT::v32i32; + ValVT = 
MVT::v32i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 || + LocVT == MVT::v64i32 || LocVT == MVT::v32i64) { + LocVT = MVT::v64i32; + ValVT = MVT::v64i32; + LocInfo = CCValAssign::Full; } - if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; @@ -293,7 +433,10 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - + if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) { + if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } return true; // CC didn't match. } @@ -328,6 +471,52 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + unsigned OffSiz = 64; + if (LocVT == MVT::v16i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::V0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } else if (LocVT == MVT::v32i32) { + unsigned Req = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0; + if (unsigned Reg = State.AllocateReg(Req)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 128; + } else if (LocVT == MVT::v64i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::W0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 256; + } + + unsigned Offset = State.AllocateStack(OffSiz, OffSiz); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + } +} + SDValue HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -351,6 +540,15 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } +static bool IsHvxVectorType(MVT ty) { + return (ty == MVT::v8i64 || ty == MVT::v16i32 || ty == MVT::v32i16 || + ty == MVT::v64i8 || + ty == MVT::v16i64 || ty == MVT::v32i32 || ty == MVT::v64i16 || + ty == MVT::v128i8 || + ty == MVT::v32i64 || ty == MVT::v64i32 || ty == MVT::v128i16 || + ty == MVT::v256i8 || + ty == MVT::v512i1 || ty == MVT::v1024i1); +} // LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is // passed by value, the function prototype is modified to return void and @@ -463,19 +661,15 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check for varargs. 
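[Editor's note: the logic just below only trusts a vararg signature when the callee is a known function declaring at least one named parameter. The same decision reduced to its inputs, as a hypothetical helper:]

// -1 means "unknown": either the callee is not a declared function, or it
// looks like a zero-arg vararg, which is disallowed and therefore treated
// as an undeclared function.
int namedVarArgParams(bool KnownFunction, bool IsVarArg, int NumParams) {
  if (KnownFunction && IsVarArg && NumParams != 0)
    return NumParams;
  return -1;
}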
int NumNamedVarArgParams = -1; - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) - { - const Function* CalleeFn = nullptr; - Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32); - if ((CalleeFn = dyn_cast<Function>(GA->getGlobal()))) - { + if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = GAN->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); + if (const Function* F = dyn_cast<Function>(GV)) { // If a function has zero args and is a vararg function, that's // disallowed so it must be an undeclared function. Do not assume // varargs if the callee is undefined. - if (CalleeFn->isVarArg() && - CalleeFn->getFunctionType()->getNumParams() != 0) { - NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams(); - } + if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0) + NumNamedVarArgParams = F->getFunctionType()->getNumParams(); } } @@ -519,11 +713,16 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); + bool NeedsArgAlign = false; + unsigned LargestAlignSeen = 0; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Record if we need > 8 byte alignment on an argument. + bool ArgAlign = IsHvxVectorType(VA.getValVT()); + NeedsArgAlign |= ArgAlign; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -549,13 +748,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue MemAddr = DAG.getConstant(LocMemOffset, dl, StackPtr.getValueType()); MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr); + if (ArgAlign) + LargestAlignSeen = std::max(LargestAlignSeen, + VA.getLocVT().getStoreSizeInBits() >> 3); if (Flags.isByVal()) { // The argument is a struct passed by value. According to LLVM, "Arg" // is a pointer. MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain, Flags, DAG, dl)); } else { - MachinePointerInfo LocPI = MachinePointerInfo::getStack(LocMemOffset); + MachinePointerInfo LocPI = MachinePointerInfo::getStack( + DAG.getMachineFunction(), LocMemOffset); SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI, false, false, 0); MemOpChains.push_back(S); @@ -569,6 +772,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } + if (NeedsArgAlign && Subtarget.hasV60TOps()) { + DEBUG(dbgs() << "Function needs byte stack align due to call args\n"); + MachineFrameInfo* MFI = DAG.getMachineFunction().getFrameInfo(); + // V6 vectors passed by value have 64- or 128-byte alignment depending + // on whether we are in 64-byte or 128-byte vector mode. + bool UseHVXDbl = Subtarget.useHVXDblOps(); + assert(Subtarget.useHVXOps()); + const unsigned ObjAlign = UseHVXDbl ? 128 : 64; + LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign); + MFI->ensureMaxAlignment(LargestAlignSeen); + } // Transform all store nodes into one single node because all store // nodes are independent of each other.
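[Editor's note: returning to the alignment bookkeeping added above, the final frame alignment is the larger of the biggest HVX argument seen, in bytes, and the mode's base vector alignment. The same arithmetic in isolation, under those assumptions:]

#include <algorithm>

// getStoreSizeInBits() >> 3 converts bits to bytes; the floor is 64 bytes
// in single-vector mode and 128 bytes in 128B (double) mode.
unsigned frameAlignForCall(unsigned LargestArgBits, bool UseHVXDbl) {
  unsigned LargestBytes = LargestArgBits >> 3;
  const unsigned ObjAlign = UseHVXDbl ? 128u : 64u;
  return std::max(LargestBytes, ObjAlign);
}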
if (!MemOpChains.empty()) @@ -613,12 +827,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. - if (flag_aligned_memcpy) { - const char *MemcpyName = - "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; - Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT); - flag_aligned_memcpy = false; - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { @@ -668,7 +877,19 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, if (Ptr->getOpcode() != ISD::ADD) return false; - if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget()); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + bool ValidHVXDblType = + (UseHVX && UseHVXDbl) && (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8); + bool ValidHVXType = + UseHVX && !UseHVXDbl && (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8); + + if (ValidHVXDblType || ValidHVXType || + VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { isInc = (Ptr->getOpcode() == ISD::ADD); Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); @@ -679,23 +900,6 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -// TODO: Put this function along with the other isS* functions in -// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the -// functions defined in HexagonOperands.td. -static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS4 predicate - True if the immediate fits in a 4-bit sign extended. - // field. - int64_t v = (int64_t)N->getSExtValue(); - int64_t m = 0; - if (ShiftAmount > 0) { - m = v % ShiftAmount; - v = v >> ShiftAmount; - } - return (v <= 7) && (v >= -8) && (m == 0); -} - /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. @@ -724,18 +928,20 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc = false; bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); - // ShiftAmount = number of left-shifted bits in the Hexagon instruction. - int ShiftAmount = VT.getSizeInBits() / 16; - if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) { - AM = isInc ? ISD::POST_INC : ISD::POST_DEC; - return true; + if (isLegal) { + auto &HII = *Subtarget.getInstrInfo(); + int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue(); + if (HII.isValidAutoIncImm(VT, OffsetVal)) { + AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; + return true; + } } return false; } -SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, - SelectionDAG &DAG) const { +SDValue +HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); MachineFunction &MF = DAG.getMachineFunction(); auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>(); @@ -784,47 +990,6 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, return Op; } - -// -// Taken from the XCore backend. -// -SDValue HexagonTargetLowering:: -LowerBR_JT(SDValue Op, SelectionDAG &DAG) const -{ - SDValue Chain = Op.getOperand(0); - SDValue Table = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - SDLoc dl(Op); - JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); - unsigned JTI = JT->getIndex(); - MachineFunction &MF = DAG.getMachineFunction(); - const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); - SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); - - // Mark all jump table targets as address taken. - const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables(); - const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs; - for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { - MachineBasicBlock *MBB = JTBBs[i]; - MBB->setHasAddressTaken(); - // This line is needed to set the hasAddressTaken flag on the BasicBlock - // object. - BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); - } - - SDValue JumpTableBase = DAG.getNode( - HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT); - SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, - DAG.getConstant(2, dl, MVT::i32)); - SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, - ShiftIndex); - SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress, - MachinePointerInfo(), false, false, false, - 0); - return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget); -} - - SDValue HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { @@ -850,7 +1015,10 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue AC = DAG.getConstant(A, dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - return DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + if (Op.getNode()->getHasDebugValue()) + DAG.TransferDbgValues(Op, AA); + return AA; } SDValue @@ -882,7 +1050,8 @@ const { // equal to) 8 bytes. If not, no address will be passed into the callee and // the callee returns the result directly through R0/R1.
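[Editor's note: the 8-byte threshold in that comment corresponds, at the C level, to which aggregates fit the R1:R0 return pair; a concrete pair of cases, illustrative only.]

// 8 bytes or less comes back directly in R1:R0; anything larger is
// returned through a hidden sret pointer supplied by the caller.
struct Small { int x, y; };          // 8 bytes: returned in R1:R0
struct Big   { long long a, b; };    // 16 bytes: returned via sret

Small makeSmall() { return {1, 2}; } // no hidden argument
Big   makeBig()   { return {1, 2}; } // caller passes the result address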
- SmallVector<SDValue, 4> MemOps; + SmallVector<SDValue, 8> MemOps; + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -908,6 +1077,42 @@ const { RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Single Vector + } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 || + RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Double Vector + } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 || + RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) { + assert(0 && "need to support VecPred regs"); + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else { assert (0); } @@ -1056,8 +1261,8 @@ SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) - const { +SDValue +HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue PredOp = Op.getOperand(0); SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); EVT OpVT = Op1.getValueType(); @@ -1163,16 +1368,33 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); - SDLoc dl(Op); - ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - SDValue Res; - if (CP->isMachineConstantPoolEntry()) - Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy, - CP->getAlignment()); + ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op); + unsigned Align = CPN->getAlignment(); + Reloc::Model RM = HTM.getRelocationModel(); + unsigned char TF = (RM == Reloc::PIC_) ? 
HexagonII::MO_PCREL : 0; + + SDValue T; + if (CPN->isMachineConstantPoolEntry()) + T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, TF); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy, - CP->getAlignment()); - return DAG.getNode(HexagonISD::CP, dl, ValTy, Res); + T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, TF); + if (RM == Reloc::PIC_) + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T); + return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T); +} + +SDValue +HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + int Idx = cast<JumpTableSDNode>(Op)->getIndex(); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::PIC_) { + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T); + } + + SDValue T = DAG.getTargetJumpTable(Idx, VT); + return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, T); } SDValue @@ -1219,52 +1441,70 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } -SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG& DAG) const { +SDValue +HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const { SDLoc dl(Op); return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0)); } -SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, - SelectionDAG &DAG) const { - SDValue Result; - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); +SDValue +HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + auto *GAN = cast<GlobalAddressSDNode>(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + auto *GV = GAN->getGlobal(); + int64_t Offset = GAN->getOffset(); + + auto &HLOF = *HTM.getObjFileLowering(); + Reloc::Model RM = HTM.getRelocationModel(); - const HexagonTargetObjectFile *TLOF = - static_cast<const HexagonTargetObjectFile *>( - getTargetMachine().getObjFileLowering()); - if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) { - return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result); + if (RM == Reloc::Static) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + if (HLOF.IsGlobalInSmallSection(GV, HTM)) + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA); + return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA); } - return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result); + bool UsePCRel = GV->hasInternalLinkage() || GV->hasHiddenVisibility() || + (GV->hasLocalLinkage() && !isa<Function>(GV)); + if (UsePCRel) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, GA); + } + + // Use GOT index. + SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, HexagonII::MO_GOT); + SDValue Off = DAG.getConstant(Offset, dl, MVT::i32); + return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, GA, Off); } // Specifies that for loads and stores VT can be promoted to PromotedLdStVT. 
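[Editor's note: promoteLdStType, whose old definition the next chunk removes in favor of the earlier one, makes loads and stores of a small vector type reuse the wider type's memory instructions. The effect, approximated at the source level in a standalone sketch:]

#include <cstdint>
#include <cstring>

// A v4i8 load promoted to i32: one word load plus a reinterpretation,
// rather than four separate byte loads.
void loadV4i8(const void *P, uint8_t Out[4]) {
  uint32_t W;
  std::memcpy(&W, P, sizeof(W));   // the promoted i32 load
  std::memcpy(Out, &W, sizeof(W)); // bitcast back to the vector elements
}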
-void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { - if (VT != PromotedLdStVT) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); +SDValue +HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::Static) { + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT); + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A); } + + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT, 0, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, A); } SDValue -HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32); - SDLoc dl(Op); - return DAG.getNode(HexagonISD::CONST32_GP, dl, - getPointerTy(DAG.getDataLayout()), BA_SD); +HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) + const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue GOTSym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym); } //===----------------------------------------------------------------------===// @@ -1272,18 +1512,19 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, - const HexagonSubtarget &STI) + const HexagonSubtarget &ST) : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)), - Subtarget(STI) { + Subtarget(ST) { bool IsV4 = !Subtarget.hasV5TOps(); auto &HRI = *Subtarget.getRegisterInfo(); + bool UseHVX = Subtarget.useHVXOps(); + bool UseHVXSgl = Subtarget.useHVXSglOps(); + bool UseHVXDbl = Subtarget.useHVXDblOps(); setPrefLoopAlignment(4); setPrefFunctionAlignment(4); setMinFunctionAlignment(2); setInsertFencesForAtomic(false); - setExceptionPointerRegister(Hexagon::R0); - setExceptionSelectorRegister(Hexagon::R1); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); if (EnableHexSDNodeSched) @@ -1320,6 +1561,31 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); } + if (Subtarget.hasV60TOps()) { + if (Subtarget.useHVXSglOps()) { + addRegisterClass(MVT::v64i8, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v8i64, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass); + } else if (Subtarget.useHVXDblOps()) { + addRegisterClass(MVT::v128i8, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v64i16, 
&Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v256i8, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v64i32, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v32i64, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass); + } + + } + // // Handling of scalar operations. // @@ -1336,10 +1602,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::INLINEASM, MVT::Other, Custom); setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Custom legalize GlobalAddress nodes into CONST32. @@ -1361,11 +1629,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); if (EmitJumpTables) - setOperationAction(ISD::BR_JT, MVT::Other, Custom); + setMinimumJumpTableEntries(2); else - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - // Increase jump tables cutover to 5, was 4. - setMinimumJumpTableEntries(MinimumJumpTables); + setMinimumJumpTableEntries(MinimumJumpTables); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); // Hexagon has instructions for add/sub with carry. The problem with // modeling these instructions is that they produce 2 results: Rdd and Px. @@ -1420,9 +1687,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i64, Expand); for (unsigned IntExpOp : - {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, - ISD::ROTL, ISD::ROTR, ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, - ISD::SRL_PARTS, ISD::SMUL_LOHI, ISD::UMUL_LOHI}) { + { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, + ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR, + ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS, + ISD::SMUL_LOHI, ISD::UMUL_LOHI }) { setOperationAction(IntExpOp, MVT::i32, Expand); setOperationAction(IntExpOp, MVT::i64, Expand); } @@ -1475,7 +1743,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. 
- static unsigned VectExpOps[] = { + static const unsigned VectExpOps[] = { // Integer arithmetic: ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC, @@ -1539,7 +1807,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - + if (UseHVX) { + if (UseHVXSgl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); + } else if (UseHVXDbl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + } else { + llvm_unreachable("Unrecognized HVX mode"); + } + } // Subtarget-specific operation actions. // if (Subtarget.hasV5TOps()) { @@ -1586,7 +1868,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, for (ISD::CondCode FPExpCCV4 : {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE, - ISD::SETUO, ISD::SETO}) { + ISD::SETUO, ISD::SETO}) { setCondCodeAction(FPExpCCV4, MVT::f32, Expand); setCondCodeAction(FPExpCCV4, MVT::f64, Expand); } @@ -1599,6 +1881,13 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setIndexedStoreAction(ISD::POST_INC, LSXTy, Legal); } + if (UseHVXDbl) { + for (MVT VT : {MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) { + setIndexedLoadAction(ISD::POST_INC, VT, Legal); + setIndexedStoreAction(ISD::POST_INC, VT, Legal); + } + } + computeRegisterProperties(&HRI); // @@ -1720,7 +2009,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT"; case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL"; case HexagonISD::BARRIER: return "HexagonISD::BARRIER"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; case HexagonISD::CALLR: return "HexagonISD::CALLR"; case HexagonISD::CALLv3nr: return "HexagonISD::CALLv3nr"; case HexagonISD::CALLv3: return "HexagonISD::CALLv3"; @@ -1737,7 +2025,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP"; case HexagonISD::JT: return "HexagonISD::JT"; case HexagonISD::PACKHL: return "HexagonISD::PACKHL"; - case HexagonISD::PIC_ADD: return "HexagonISD::PIC_ADD"; case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT"; case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB"; @@ -1754,6 +2041,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; + case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; @@ -1923,8 +2211,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Size = VT.getSizeInBits(); - // A vector larger than 64 
bits cannot be represented in Hexagon. - // Expand will split the vector. + // Only handle vectors of 64 bits or shorter. if (Size > 64) return SDValue(); @@ -2058,58 +2345,61 @@ SDValue HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); - SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); - SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64); - SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, - DAG.getConstant(32, dl, MVT::i64)); - SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64); - - ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width); - ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted); - - if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v2i16 to a single v4i16. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + SDValue Vec0 = Op.getOperand(0); + EVT VecVT = Vec0.getValueType(); + unsigned Width = VecVT.getSizeInBits(); + + if (NElts == 2) { + MVT ST = VecVT.getSimpleVT(); + // We are trying to concat two v2i16 to a single v4i16, or two v4i8 + // into a single v8i8. + if (ST == MVT::v2i16 || ST == MVT::v4i8) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0); + + if (UseHVX) { + assert((Width == 64*8 && Subtarget.useHVXSglOps()) || + (Width == 128*8 && Subtarget.useHVXDblOps())); + SDValue Vec1 = Op.getOperand(1); + MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32; + MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32; + SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0); + SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1); + SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0); + return DAG.getNode(ISD::BITCAST, dl, VT, VC); } } - if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v4i8 to a single v8i8. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); - } - } + if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64) + return SDValue(); + + SDValue C0 = DAG.getConstant(0, dl, MVT::i64); + SDValue C32 = DAG.getConstant(32, dl, MVT::i64); + SDValue W = DAG.getConstant(Width, dl, MVT::i64); + // Create the "width" part of the argument to insert_rp/insertp_rp. 
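+  // For example, concatenating four 16-bit pieces into a 64-bit result
+  // uses Width = 16, so element N is inserted with the control word
+  // (16 << 32) | (N * 16): the insert width in the upper 32 bits and
+  // the bit offset in the lower 32 bits.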
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32); + SDValue V = C0; for (unsigned i = 0, e = NElts; i != e; ++i) { - unsigned OpIdx = NElts - i - 1; - SDValue Operand = Op.getOperand(OpIdx); + unsigned N = NElts-i-1; + SDValue OpN = Op.getOperand(N); - if (VT.getSizeInBits() == 64 && - Operand.getValueType().getSizeInBits() == 32) { + if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) { SDValue C = DAG.getConstant(0, dl, MVT::i32); - Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN); } - - SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64); - SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); - SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); - const SDValue Ops[] = {ConstVal, Operand, Combined}; - + SDValue Idx = DAG.getConstant(N, dl, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset); if (VT.getSizeInBits() == 32) - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or}); else - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or}); } - return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); + return DAG.getNode(ISD::BITCAST, dl, VT, V); } SDValue @@ -2301,6 +2591,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL: case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); // Frame & Return address. Currently unimplemented. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -2308,8 +2599,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::BR_JT: return LowerBR_JT(Op, DAG); // Custom lower some vector loads. case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); @@ -2321,6 +2612,16 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +/// Returns relocation base for the given PIC jumptable. 
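+/// For Hexagon PIC this is the address of the table itself, computed
+/// PC-relatively, i.e. HexagonISD::AT_PCREL of the target jump table.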
+SDValue +HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + int Idx = cast<JumpTableSDNode>(Table)->getIndex(); + EVT VT = Table.getValueType(); + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T); +} + MachineBasicBlock * HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) @@ -2343,6 +2644,8 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, std::pair<unsigned, const TargetRegisterClass *> HexagonTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); + if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': // R0-R31 @@ -2358,6 +2661,42 @@ HexagonTargetLowering::getRegForInlineAsmConstraint( case MVT::f64: return std::make_pair(0U, &Hexagon::DoubleRegsRegClass); } + case 'q': // q0-q3 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v1024i1: + case MVT::v512i1: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VecPredRegsRegClass); + } + case 'v': // V0-V31 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VectorRegsRegClass); + case MVT::v32i32: + case MVT::v64i16: + case MVT::v16i64: + case MVT::v128i8: + if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl) + return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass); + else + return std::make_pair(0U, &Hexagon::VecDblRegsRegClass); + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass); + } + default: llvm_unreachable("Unknown asm register class"); } @@ -2397,6 +2736,14 @@ bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; } +/// Return true if folding a constant offset with the given GlobalAddress is +/// legal. It is frequently not legal in PIC relocation models. +bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) + const { + return HTM.getRelocationModel() == Reloc::Static; +} + + /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the @@ -2428,8 +2775,8 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization( // *************************************************************************** // If this is a tail call via a function pointer, then don't do it! 
- if (!(dyn_cast<GlobalAddressSDNode>(Callee)) - && !(dyn_cast<ExternalSymbolSDNode>(Callee))) { + if (!(isa<GlobalAddressSDNode>(Callee)) && + !(isa<ExternalSymbolSDNode>(Callee))) { return false; } @@ -2467,6 +2814,41 @@ bool llvm::isPositiveHalfWord(SDNode *N) { } } +std::pair<const TargetRegisterClass*, uint8_t> +HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { + const TargetRegisterClass *RRC = nullptr; + + uint8_t Cost = 1; + switch (VT.SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(TRI, VT); + case MVT::v64i8: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v8i64: + RRC = &Hexagon::VectorRegsRegClass; + break; + case MVT::v128i8: + case MVT::v64i16: + case MVT::v32i32: + case MVT::v16i64: + if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() && + Subtarget.useHVXDblOps()) + RRC = &Hexagon::VectorRegs128BRegClass; + else + RRC = &Hexagon::VecDblRegsRegClass; + break; + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + RRC = &Hexagon::VecDblRegs128BRegClass; + break; + } + return std::make_pair(RRC, Cost); +} + Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { BasicBlock *BB = Builder.GetInsertBlock(); @@ -2498,13 +2880,15 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, return Ext; } -bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // Do not expand loads and stores that don't exceed 64 bits. - return LI->getType()->getPrimitiveSizeInBits() > 64; + return LI->getType()->getPrimitiveSizeInBits() > 64 + ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Do not expand loads and stores that don't exceed 64 bits. return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 2642abf..bf378b9 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -35,16 +35,14 @@ bool isPositiveHalfWord(SDNode *N); ALLOCA, ARGEXTEND, - PIC_ADD, - AT_GOT, - AT_PCREL, + AT_GOT, // Index in GOT. + AT_PCREL, // Offset relative to PC. CALLv3, // A V3+ call instruction. CALLv3nr, // A V3+ call instruction that doesn't return. CALLR, RET_FLAG, // Return with a flag operand. - BR_JT, // Branch through jump table. BARRIER, // Memory barrier. JT, // Jump table. CP, // Constant pool. 
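A note on the atomic-load change above: shouldExpandAtomicLoadInIR now picks a value from the AtomicExpansionKind lattice instead of returning a bool. A minimal standalone sketch of the resulting policy, with the enum inlined for illustration and a hypothetical helper name (in tree the enum is TargetLowering::AtomicExpansionKind):

// Mirrors the AtomicExpansionKind values used by the hook above.
enum class AtomicExpansionKind { None, LLSC, LLOnly, CmpXChg };

// Hypothetical summary of the Hexagon policy: atomic loads wider than
// 64 bits are rewritten into load-locked sequences, narrower ones are
// selected normally.
AtomicExpansionKind loadExpansionPolicy(unsigned SizeInBits) {
  return SizeInBits > 64 ? AtomicExpansionKind::LLOnly
                         : AtomicExpansionKind::None;
}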
@@ -80,6 +78,7 @@ bool isPositiveHalfWord(SDNode *N); INSERTRP, EXTRACTU, EXTRACTURP, + VCOMBINE, TC_RETURN, EH_RETURN, DCFETCH, @@ -127,7 +126,6 @@ bool isPositiveHalfWord(SDNode *N); SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; @@ -137,6 +135,7 @@ bool isPositiveHalfWord(SDNode *N); SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -163,8 +162,23 @@ bool isPositiveHalfWord(SDNode *N); MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return Hexagon::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return Hexagon::R1; + } + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; EVT getSetCCResultType(const DataLayout &, LLVMContext &C, EVT VT) const override { if (!VT.isVector()) @@ -200,6 +214,10 @@ bool isPositiveHalfWord(SDNode *N); /// TODO: Handle pre/postinc as well. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// Return true if folding a constant offset with the given GlobalAddress + /// is legal. It is frequently not legal in PIC relocation models. + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal @@ -208,20 +226,26 @@ bool isPositiveHalfWord(SDNode *N); /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; + /// Returns relocation base for the given PIC jumptable. + SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) + const override; + // Handling of atomic RMW instructions. 
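+  // All atomic RMW operations expand to LL/SC loops built from the
+  // emitLoadLinked/emitStoreConditional callbacks declared below;
+  // only loads wider than 64 bits take the load-locked-only path.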
- bool hasLoadLinkedStoreConditional() const override { - return true; - } Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) - const override { - return AtomicRMWExpansionKind::LLSC; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override { + return AtomicExpansionKind::LLSC; } + + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) + const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td new file mode 100644 index 0000000..5a1a69b --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td @@ -0,0 +1,462 @@ +//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Hexagon Instruction Mappings +//===----------------------------------------------------------------------===// + + +def : InstAlias<"memb({GP}+#$addr) = $Nt.new", + (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.new", + (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt.new", + (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memb({GP}+#$addr) = $Nt", + (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt", + (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.h", + (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt", + (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memd({GP}+#$addr) = $Nt", + (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>; + +def : InstAlias<"$Nt = memb({GP}+#$addr)", + (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memub({GP}+#$addr)", + (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memh({GP}+#$addr)", + (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memuh({GP}+#$addr)", + (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memw({GP}+#$addr)", + (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>; +def : InstAlias<"$Nt = memd({GP}+#$addr)", + (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>; + +// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt +def : InstAlias<"memb($Rs) = $Rt", + (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt", + (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.h", + (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) = $Rt", + (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = $Rt.new", + (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.new", + (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) 
= $Rt.new", + (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = #$S8", + (S4_storeirb_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memh($Rs) = #$S8", + (S4_storeirh_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memw($Rs) = #$S8", + (S4_storeiri_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memd($Rs) = $Rtt", + (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"memb($Rs) = setbit(#$U5)", + (L4_ior_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = setbit(#$U5)", + (L4_ior_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = setbit(#$U5)", + (L4_ior_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memb($Rs) = clrbit(#$U5)", + (L4_iand_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = clrbit(#$U5)", + (L4_iand_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = clrbit(#$U5)", + (L4_iand_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs) +def : InstAlias<"$Rd = memb($Rs)", + (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memub($Rs)", + (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memh($Rs)", + (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memuh($Rs)", + (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memw($Rs)", + (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memd($Rs)", + (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memubh($Rs)", + (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memubh($Rs)", + (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = membh($Rs)", + (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = membh($Rs)", + (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memb_fifo($Rs)", + (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memh_fifo($Rs)", + (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X) +// to: if ($Pt) $Rd = memXX($Rs) +def : InstAlias<"if ($Pt) $Rd = memb($Rs)", + (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memub($Rs)", + (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memh($Rs)", + (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memuh($Rs)", + (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memw($Rs)", + (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rdd = memd($Rs)", + (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt +// to: if ($Pt) memXX($Rs) = $Rt +def : InstAlias<"if ($Pt) memb($Rs) = $Rt", + (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt", + (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h", + (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt", + (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memd($Rs) = $Rtt", 
+ (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + + +// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X) +// to: if (!$Pt) $Rd = memXX($Rs) +def : InstAlias<"if (!$Pt) $Rd = memb($Rs)", + (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memub($Rs)", + (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memh($Rs)", + (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)", + (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memw($Rs)", + (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)", + (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt +// to: if (!$Pt) memXX($Rs) = $Rt +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt", + (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt", + (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h", + (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt", + (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt", + (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = #$S6", + (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = #$S6", + (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = #$S6", + (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6", + (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6", + (S4_storeirhtnew_io 
PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6", + (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = #$S6", + (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = #$S6", + (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = #$S6", + (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6", + (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6", + (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6", + (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -= +// to: memXX($Rs) |= $Rt +def : InstAlias<"memb($Rs) &= $Rt", + (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) |= $Rt", + (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += $Rt", + (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= $Rt", + (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += #$U5", + (L4_iadd_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= #$U5", + (L4_isub_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) &= $Rt", + (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) |= $Rt", + (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += $Rt", + (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= $Rt", + (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += #$U5", + (L4_iadd_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= #$U5", + (L4_isub_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) &= $Rt", + (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) |= $Rt", + (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += $Rt", + (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= $Rt", + (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += #$U5", + (L4_iadd_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= #$U5", + (L4_isub_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +// +// Alias of: if ($Pv.new) memX($Rs) = $Rt +// to: if (p3.new) memX(r17 + #0) = $Rt +def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt", + (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt", + (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memw($Rs) = 
$Rt", + (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdtnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt", + (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt", + (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt", + (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdfnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +// +// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ... +// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0) +def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)", + (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)", + (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)", + (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)", + (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)", + (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)", + (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)", + (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)", + (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"dcfetch($Rs)", + (Y2_dcfetchbo IntRegs:$Rs, 0), 0>; + +// Alias of some insn mappings, others must be handled by the parser +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + +// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs) +def : InstAlias<"$Rd = neg($Rs)", + (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>; + +def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>; +def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>; + +def : InstAlias<"$Pd = $Ps", + (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>; + +def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)", + (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>; + +def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)", + (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"$Rd = mpyui($Rs,$Rt)", + (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>; + +// Assembler mapped 
insns: cmp.lt(a,b) -> cmp.gt(b,a) +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td new file mode 100644 index 0000000..280832f --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td @@ -0,0 +1,1019 @@ +class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src2}; + let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst}; +} + +class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>; +class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>; +class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>; +class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>; +class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>; +class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>; +class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>; +class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>; +class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>; +class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>; +class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>; +class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>; +class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>; +class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>; +class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>; +class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>; +class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>; +class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>; +class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>; +class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>; +class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>; +class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>; +class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>; +class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>; +class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>; +class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>; +class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>; +class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>; +class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>; +class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>; +class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>; +class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>; +class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>; +class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>; +class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>; +class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>; +class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>; +class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>; +class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>; +class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>; +class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>; +class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>; +class V6_vasrw_enc : 
Enc_COPROC_VX_3op_v<0b000110010110101>; +class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>; +class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>; +class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>; +class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>; +class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>; +class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>; +class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>; +class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>; +class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>; +class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>; +class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>; +class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>; +class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>; +class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>; +class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>; +class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>; +class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>; +class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>; +class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>; +class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>; +class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>; +class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>; +class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>; +class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>; +class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>; +class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>; +class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>; +class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>; +class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>; +class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>; +class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>; +class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>; +class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>; +class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>; +class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>; +class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>; +class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>; +class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>; +class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>; +class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>; +class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>; +class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>; +class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>; +class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>; +class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>; +class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>; +class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>; +class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>; +class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>; +class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>; +class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>; +class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>; +class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>; +class V6_vsububsat_enc : 
Enc_COPROC_VX_3op_v<0b000111000110000>; +class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>; +class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>; +class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>; +class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>; +class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>; +class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>; +class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>; +class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>; +class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>; +class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>; +class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>; +class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>; +class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>; +class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>; +class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>; +class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>; +class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>; +class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>; +class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>; +class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>; +class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>; +class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>; +class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>; +class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>; +class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>; +class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>; +class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>; +class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>; +class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>; +class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>; +class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>; +class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>; +class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>; +class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>; +class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>; +class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>; +class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>; +class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>; +class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>; +class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>; +class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>; +class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>; +class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>; +class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>; +class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>; +class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>; +class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>; +class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>; +class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>; +class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>; +class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>; +class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>; +class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>; +class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>; +class 
V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>; +class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>; +class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>; +class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>; +class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>; +class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>; +class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>; +class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>; +class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>; +class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>; +class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>; +class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>; +class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>; +class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>; +class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>; +class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>; +class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>; +class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>; +class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>; +class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>; +class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>; +class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>; +class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>; +class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>; +class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>; +class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>; +class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>; +class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>; +class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>; +class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>; +class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>; +class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>; +class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>; +class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>; + +class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} }; + let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} }; +} + +class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>; +class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>; +class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>; +class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>; +class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>; +class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>; +class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>; +class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>; +class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>; +class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>; +class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>; +class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>; +class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>; +class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>; +class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>; +class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>; +class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>; +class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>; +class V6_vgtuh_or_enc : 
Enc_COPROC_VX_cmp<0b1001001011001>; +class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>; +class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>; +class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>; +class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>; +class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>; +class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>; +class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>; +class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>; +class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>; +class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>; +class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>; +class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>; +class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>; +class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>; +class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>; +class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>; +class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>; +class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>; +class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>; + +class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} }; + let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>; +class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>; +class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>; +class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>; +class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>; +class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>; +class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>; +class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>; +class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>; +class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>; +class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>; +class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>; + +class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { 0b00011110000000, opc{5-4} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>; +class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>; +class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>; +class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>; +class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>; +class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>; +class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>; +class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>; +class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>; +class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>; +class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>; +class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>; +class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>; +class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>; +class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>; +class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>; +class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>; +class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>; +class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>; +class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>; +class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>; +class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>; +class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>; +class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>; +class V6_vassign_enc : 
Enc_COPROC_VX_2op<0b111111>; + +class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>; +class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>; +class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>; +class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>; +class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>; +class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>; +class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>; +class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>; +class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>; +class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>; +class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>; +class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>; +class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>; + +class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>; +class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>; +class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>; + +class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>; +class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>; +class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>; +class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>; 
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>; +class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>; +class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>; +class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>; +class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>; +class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>; +class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>; +class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>; +class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>; +class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>; + +class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>; +class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>; +class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>; +class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>; +class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>; +class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>; +class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>; +class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>; +class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>; +class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>; +class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>; +class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>; +class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>; +class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>; 
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>; +class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>; + +// TODO: Change script to generate dst, src1, src2 instead of +// dst, dst2, src1. +class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>; +class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>; +class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>; +class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>; +class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>; +class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>; +class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>; +class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>; +class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>; +class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>; +class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>; +class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>; +class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>; + + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. +class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>; +class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>; +class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>; + +class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>; +class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>; +class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>; + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. 
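+// For example, in Enc_COPROC_VMEM_vS32_b_pi above only the
+// vector-sized part src2{8-6} of the 9-bit offset is encoded: those
+// three bits go in Inst{10-8}, next to opc{2-0} in Inst{7-5} and the
+// source register in Inst{4-0}.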
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<9> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{8-6};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
+class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{9-7};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
+class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
+
+// TODO: Change script to generate src1, src2, src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<9> src3;
+  bits<3> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{8-6};
+  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
+class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
+class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
+class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
+class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
+class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
+class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
+class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
+class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
+class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
+
+// TODO: Change script to generate src1, src2, src3 and src4 instead of
+// dst, src1, src2, src3.
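(That note applies to the 128B counterpart defined after this sketch.) The 5-bit opc parameter threaded through the instantiations above acts as a small selector. Inferred purely from the class names and values in this diff: opc{4} marks the non-temporal (_nt_) variants, opc{3} picks a scalar predicate (Pv) over a vector predicate (Qv), opc{2-1} = 0b11 selects the unaligned (vS32Ub) forms, and opc{0} negates the predicate. A runnable decode sketch of that reading (illustrative only, not a Hexagon decoder):

#include <cstdio>
#include <string>

// Decodes the 5-bit opc used by Enc_COPROC_VMEM_vS32_b_pred_pi above,
// per the bit assignments inferred from the instantiations in this diff.
std::string describePredStoreOpc(unsigned opc) {
  std::string s;
  if (opc & 0x10)
    s += "non-temporal ";                       // opc{4}: *_nt_* classes
  if (opc & 0x08)
    s += (opc & 1) ? "if (!Pv) " : "if (Pv) ";  // opc{3}: scalar predicate
  else
    s += (opc & 1) ? "if (!Qv) " : "if (Qv) ";  // vector predicate
  s += ((opc >> 1) & 0x3) == 0x3 ? "unaligned vmem store" : "vmem store";
  return s;
}

int main() {
  // 0b01001 -> V6_vS32b_npred_pi:    "if (!Pv) vmem store"
  std::printf("%s\n", describePredStoreOpc(0b01001).c_str());
  // 0b10000 -> V6_vS32b_nt_qpred_pi: "non-temporal if (Qv) vmem store"
  std::printf("%s\n", describePredStoreOpc(0b10000).c_str());
}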
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>; +class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>; +class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>; +class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>; +class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>; +class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>; +class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>; +class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>; +class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>; +class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<9> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{8-6}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>; +class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>; +class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>; +class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>; +class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>; +class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>; +class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>; + +class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<1> src2; + + let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>; +class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>; +class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>; +class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>; +class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>; +class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>; +class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>; + +class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<5> src3; + + let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>; +class V6_vS32Ub_ppu_enc : 
Enc_COPROC_VMEM_vS32_b_ppu<0b0111>; +class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>; + +class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<3> src3; + + let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} }; +} + +class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>; +class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<5> src4; + + let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>; +class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>; +class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>; +class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>; +class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>; +class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>; +class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>; +class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>; +class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>; +class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<3> src4; + + let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>; +class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>; +class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>; +class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>; + + +class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<1> src3; + + let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} }; + let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} }; +} + +class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>; +class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>; +class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>; +class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>; +class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>; +class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>; + +class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} }; + let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} }; +} + +class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>; +class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>; + +class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b00011001111, src3{4-0} }; + let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} }; +} + +class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>; +class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>; + + +class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b0001101000, opc{0}, 
0b00000 }; + let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>; +class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>; + +class Enc_X_p3op<bits<8> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} }; + let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} }; +} + +class V6_vnccombine_enc : Enc_X_p3op<0b00001000>; +class V6_vccombine_enc : Enc_X_p3op<0b00001100>; + +class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>; +class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>; +class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>; +class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>; +class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>; +class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>; +class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>; +class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>; +class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>; +class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>; +class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>; +class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>; +class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>; +class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>; +class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>; + +class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} }; + let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} }; +} + +class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>; +class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>; +class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>; +class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>; +class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>; +class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>; +class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>; +class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>; + +class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + bits<2> src2; + + let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} }; +} + +class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>; +class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>; +class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>; +class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>; +class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>; + +class V6_pred_not_enc : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + + let Inst{31-16} = { 0b0001111000000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} }; +} + +class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} }; + let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>; +class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>; + +class Enc_X_2op<bits<16> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { opc{15-5}, src1{4-0} }; + let Inst{13-0} = { opc{4-3}, 
0b0000, opc{2-0}, dst{4-0} }; +} + +class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>; +class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>; +class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>; + + +class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + + let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} }; + let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} }; +} + +class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>; +class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>; + +class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<6> src2; + + let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>; +class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>; +class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>; +class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>; +class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>; +class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>; + +class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src1{4-0} }; + let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>; +class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>; +class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>; +class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>; +class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>; +class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>; +class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>; +class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>; + +class Enc_no_operands<bits<25> opc> : OpcodeHexagon { + + let Inst{31-16} = { opc{24-10}, 0 }; + let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 }; +} + +class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>; +class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>; +class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>; +class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>; + +class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 }; +} + +class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>; +class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>; + +class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 }; + let Inst{13-0} = { 0, src1{4-0}, 0b00000000 }; +} + +class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>; +class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>; + +class A5_ACS_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010101, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} + +class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<2> src3; + + let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} }; +} + +class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>; +class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>; +class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>; + +class V6_vhistq_enc : OpcodeHexagon { + bits<2> src1; + + 
let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 }; + let Inst{13-0} = { 0b10000010000000 }; +} + +// TODO: Change script to generate dst1 instead of dst. +class A6_vminub_RdP_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010111, src2{4-0} }; + let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index 44bab29..3c5ec17 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -34,6 +34,8 @@ class SubTarget<bits<6> value> { def HasAnySubT : SubTarget<0x3f>; // 111111 def HasV5SubT : SubTarget<0x3e>; // 111110 +def HasV55SubT : SubTarget<0x3c>; // 111100 +def HasV60SubT : SubTarget<0x38>; // 111000 // Addressing modes for load/store instructions class AddrModeType<bits<3> value> { @@ -57,6 +59,8 @@ def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb). def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh). def WordAccess : MemAccessSize<3>;// Word access instruction (memw). def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd) +def Vector64Access : MemAccessSize<7>;// Vector access instruction (memv) +def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv) //===----------------------------------------------------------------------===// @@ -167,14 +171,23 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> isFP = 0; let TSFlags {48} = isFP; // Floating-point. + bits<1> hasNewValue2 = 0; + let TSFlags{50} = hasNewValue2; // Second New-value producer insn. + bits<3> opNewValue2 = 0; + let TSFlags{53-51} = opNewValue2; // Second New-value produced operand. + + bits<1> isAccumulator = 0; + let TSFlags{54} = isAccumulator; + // Fields used for relation models. + bit isNonTemporal = 0; + string isNT = ""; // set to "true" for non-temporal vector stores. string BaseOpcode = ""; string CextOpcode = ""; string PredSense = ""; string PNewValue = ""; string NValueST = ""; // Set to "true" for new-value stores. string InputType = ""; // Input is "imm" or "reg" type. - string isMEMri = "false"; // Set to "true" for load/store with MEMri operand. string isFloat = "false"; // Set to "true" for the floating-point load/store. string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions @@ -182,6 +195,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, ""); let PNewValue = !if(isPredicatedNew, "new", ""); let NValueST = !if(isNVStore, "true", "false"); + let isNT = !if(isNonTemporal, "true", "false"); // *** Must match MCTargetDesc/HexagonBaseInfo.h *** } @@ -217,6 +231,11 @@ class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; +let mayLoad = 1 in +class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. 
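The hunk above grows the TSFlags bit vector: hasNewValue2 lands at bit 50, opNewValue2 occupies bits 53-51, and isAccumulator bit 54, alongside the existing isFP at bit 48. Consumers recover these fields by shift-and-mask, and the positions must mirror the TableGen layout exactly (hence the "Must match MCTargetDesc/HexagonBaseInfo.h" note above). A standalone sketch of the extraction side, with positions copied from the hunk (the names are illustrative, not the actual HexagonII enums):

#include <cstdint>

// Positions copied from the 'let TSFlags{...}' lines above; any mismatch
// with the real MCTargetDesc/HexagonBaseInfo.h would silently misdecode.
enum : unsigned {
  FPPos           = 48, FPMask           = 0x1,
  HasNewValue2Pos = 50, HasNewValue2Mask = 0x1,
  OpNewValue2Pos  = 51, OpNewValue2Mask  = 0x7, // 3 bits: TSFlags{53-51}
  AccumulatorPos  = 54, AccumulatorMask  = 0x1,
};

inline bool isFP(uint64_t TSFlags) {
  return (TSFlags >> FPPos) & FPMask;
}
inline bool hasNewValue2(uint64_t TSFlags) {
  return (TSFlags >> HasNewValue2Pos) & HasNewValue2Mask;
}
inline unsigned opNewValue2(uint64_t TSFlags) { // which operand is produced
  return (TSFlags >> OpNewValue2Pos) & OpNewValue2Mask;
}
inline bool isAccumulator(uint64_t TSFlags) {
  return (TSFlags >> AccumulatorPos) & AccumulatorMask;
}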
@@ -234,6 +253,12 @@ class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon; +// Same as ST0Inst but doesn't derive from OpcodeHexagon. +let mayStore = 1 in +class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. @@ -277,6 +302,11 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +// Same as above but doesn't derive from OpcodeHexagon +class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -294,6 +324,10 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -402,3 +436,13 @@ include "HexagonInstrFormatsV4.td" //===----------------------------------------------------------------------===// // V4 Instruction Format Definitions + //===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrFormatsV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td index db83ef6..2d1dea5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -21,8 +21,6 @@ def TypeMEMOP : IType<9>; def TypeNV : IType<10>; def TypeDUPLEX : IType<11>; def TypeCOMPOUND : IType<12>; -def TypeAG_VX : IType<28>; -def TypeAG_VM : IType<29>; def TypePREFIX : IType<30>; // Duplex Instruction Class Declaration diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td new file mode 100644 index 0000000..f3d43de --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td @@ -0,0 +1,238 @@ +//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                        Hexagon Instruction Flags +
+//
+//                     *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeCVI_VA          : IType<13>;
+def TypeCVI_VA_DV       : IType<14>;
+def TypeCVI_VX          : IType<15>;
+def TypeCVI_VX_DV       : IType<16>;
+def TypeCVI_VP          : IType<17>;
+def TypeCVI_VP_VS       : IType<18>;
+def TypeCVI_VS          : IType<19>;
+def TypeCVI_VINLANESAT  : IType<20>;
+def TypeCVI_VM_LD       : IType<21>;
+def TypeCVI_VM_TMP_LD   : IType<22>;
+def TypeCVI_VM_CUR_LD   : IType<23>;
+def TypeCVI_VM_VP_LDU   : IType<24>;
+def TypeCVI_VM_ST       : IType<25>;
+def TypeCVI_VM_NEW_ST   : IType<26>;
+def TypeCVI_VM_STU      : IType<27>;
+def TypeCVI_HIST        : IType<28>;
+//----------------------------------------------------------------------------//
+//                      Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource<dag outs, dag ins, string asmstr,
+                      list<dag> pattern = [], string cstr = "",
+                      InstrItinClass itin = CVI_VA>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VA_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VX_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VX_LATE>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource<dag outs, dag ins, string asmstr,
+                      list<dag> pattern = [], string cstr = "",
+                      InstrItinClass itin = CVI_VX>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VX_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
+                               list<dag> pattern = [], string cstr = "",
+                               InstrItinClass itin = CVI_VX_DV_SLOT2>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
+                              list<dag> pattern = [], string cstr = "",
+                              InstrItinClass itin = CVI_VX_DV_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VP_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
+    OpcodeHexagon, 
Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VINLANESAT> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_CUR_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource<dag outs, 
dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; +} + +let validSubTargets = HasV60SubT in +{ +class CVI_VA_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VA> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>, + Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + Requires<[HasV60T, UseHVX]>; +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3cb0823..eb3590c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -23,9 +23,11 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cctype> using namespace llvm; @@ -36,9 +38,41 @@ using namespace llvm; #include "HexagonGenInstrInfo.inc" #include "HexagonGenDFAPacketizer.inc" +using namespace llvm; + +cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden, + cl::init(false), cl::desc("Do not consider inline-asm a scheduling/" + "packetization boundary.")); + +static cl::opt<bool> 
EnableBranchPrediction("hexagon-enable-branch-prediction", + cl::Hidden, cl::init(true), cl::desc("Enable branch prediction")); + +static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable schedule adjustment for new value stores.")); + +static cl::opt<bool> EnableTimingClassLatency( + "enable-timing-class-latency", cl::Hidden, cl::init(false), + cl::desc("Enable timing class latency")); + +static cl::opt<bool> EnableALUForwarding( + "enable-alu-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec alu forwarding")); + +static cl::opt<bool> EnableACCForwarding( + "enable-acc-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec acc forwarding")); + +static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large", + cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm")); + /// /// Constants for Hexagon instructions. /// +const int Hexagon_MEMV_OFFSET_MAX_128B = 2047; // #s7 +const int Hexagon_MEMV_OFFSET_MIN_128B = -2048; // #s7 +const int Hexagon_MEMV_OFFSET_MAX = 1023; // #s6 +const int Hexagon_MEMV_OFFSET_MIN = -1024; // #s6 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -57,71 +91,49 @@ const int Hexagon_MEMH_AUTOINC_MAX = 14; const int Hexagon_MEMH_AUTOINC_MIN = -16; const int Hexagon_MEMB_AUTOINC_MAX = 7; const int Hexagon_MEMB_AUTOINC_MIN = -8; +const int Hexagon_MEMV_AUTOINC_MAX = 192; +const int Hexagon_MEMV_AUTOINC_MIN = -256; +const int Hexagon_MEMV_AUTOINC_MAX_128B = 384; +const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // Pin the vtable to this file. void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - RI(), Subtarget(ST) {} + RI() {} -/// isLoadFromStackSlot - If the specified machine instruction is a direct -/// load from a stack slot, return the virtual or physical register number of -/// the destination along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than loading from the stack slot. -unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { +static bool isIntRegForSubInst(unsigned Reg) { + return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) || + (Reg >= Hexagon::R16 && Reg <= Hexagon::R23); +} - switch (MI->getOpcode()) { - default: break; - case Hexagon::L2_loadri_io: - case Hexagon::L2_loadrd_io: - case Hexagon::L2_loadrh_io: - case Hexagon::L2_loadrb_io: - case Hexagon::L2_loadrub_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(2).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; + +static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) { + return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_loreg)) && + isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_hireg)); } -/// isStoreToStackSlot - If the specified machine instruction is a direct -/// store to a stack slot, return the virtual or physical register number of -/// the source reg along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than storing to the stack slot. 
-unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case Hexagon::S2_storeri_io: - case Hexagon::S2_storerd_io: - case Hexagon::S2_storerh_io: - case Hexagon::S2_storerb_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(0).getIndex(); - return MI->getOperand(2).getReg(); - } - break; +/// Calculate number of instructions excluding the debug instructions. +static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB, + MachineBasicBlock::const_instr_iterator MIE) { + unsigned Count = 0; + for (; MIB != MIE; ++MIB) { + if (!MIB->isDebugValue()) + ++Count; } - return 0; + return Count; } -// Find the hardware loop instruction used to set-up the specified loop. -// On Hexagon, we have two instructions used to set-up the hardware loop -// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions -// to indicate the end of a loop. -static MachineInstr * -findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, - SmallPtrSet<MachineBasicBlock *, 8> &Visited) { + +/// Find the hardware loop instruction used to set-up the specified loop. +/// On Hexagon, we have two instructions used to set-up the hardware loop +/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions +/// to indicate the end of a loop. +static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, + SmallPtrSet<MachineBasicBlock *, 8> &Visited) { int LOOPi; int LOOPr; if (EndLoopOp == Hexagon::ENDLOOP0) { @@ -157,100 +169,108 @@ findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, return 0; } -unsigned HexagonInstrInfo::InsertBranch( - MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { - Opcode_t BOpc = Hexagon::J2_jump; - Opcode_t BccOpc = Hexagon::J2_jumpt; +/// Gather register def/uses from MI. +/// This treats possible (predicated) defs as actually happening ones +/// (conservatively). +static inline void parseOperands(const MachineInstr *MI, + SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) { + Defs.clear(); + Uses.clear(); - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); - // Check if ReverseBranchCondition has asked to reverse this branch - // If we want to reverse the branch an odd number of times, we want - // J2_jumpf. - if (!Cond.empty() && Cond[0].isImm()) - BccOpc = Cond[0].getImm(); + if (!MO.isReg()) + continue; - if (!FBB) { - if (Cond.empty()) { - // Due to a bug in TailMerging/CFG Optimization, we need to add a - // special case handling of a predicated jump followed by an - // unconditional jump. If not, Tail Merging and CFG Optimization go - // into an infinite loop. 
- MachineBasicBlock *NewTBB, *NewFBB; - SmallVector<MachineOperand, 4> Cond; - MachineInstr *Term = MBB.getFirstTerminator(); - if (Term != MBB.end() && isPredicated(Term) && - !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { - MachineBasicBlock *NextBB = - std::next(MachineFunction::iterator(&MBB)); - if (NewTBB == NextBB) { - ReverseBranchCondition(Cond); - RemoveBranch(MBB); - return InsertBranch(MBB, TBB, nullptr, Cond, DL); - } - } - BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); - } else if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else if (isNewValueJump(Cond[0].getImm())) { - assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); - // New value jump - // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) - // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) - unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); - DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); - if (Cond[2].isReg()) { - unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addReg(Cond[2].getReg(), Flags2).addMBB(TBB); - } else if(Cond[2].isImm()) { - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addImm(Cond[2].getImm()).addMBB(TBB); - } else - llvm_unreachable("Invalid condition for branching"); - } else { - assert((Cond.size() == 2) && "Malformed cond vector"); - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); - } - return 1; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + + if (MO.isUse()) + Uses.push_back(MO.getReg()); + + if (MO.isDef()) + Defs.push_back(MO.getReg()); } - assert((!Cond.empty()) && - "Cond. cannot be empty when multiple branchings are required"); - assert((!isNewValueJump(Cond[0].getImm())) && - "NV-jump cannot be inserted with another branch"); - // Special case for hardware loops. The condition is a basic block. - if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else { - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); +} + + +// Position dependent, so check twice for swap. 
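That is, the helper defined next is directional: it asks whether a sub-instruction of group Ga can legally share a duplex with one of group Gb in that slot order, so a caller probing an unordered pair tries both orders, as the comment says. A one-line sketch of that symmetric use (hypothetical wrapper name; isDuplexPairMatch is the function below, so this is not self-contained LLVM code):

// Whether two sub-instruction groups can form a duplex in either slot
// order; isDuplexPairMatch is the directional helper defined below.
static bool canFormDuplexPair(unsigned Ga, unsigned Gb) {
  return isDuplexPairMatch(Ga, Gb) || isDuplexPairMatch(Gb, Ga);
}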
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) { + switch (Ga) { + case HexagonII::HSIG_None: + default: + return false; + case HexagonII::HSIG_L1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_L2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_A: + return (Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_Compound: + return (Gb == HexagonII::HSIG_Compound); } - BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + return false; +} - return 2; + + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. +unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::L2_loadri_io: + case Hexagon::L2_loadrd_io: + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadrb_io: + case Hexagon::L2_loadrub_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::S2_storeri_io: + case Hexagon::S2_storerd_io: + case Hexagon::S2_storerh_io: + case Hexagon::S2_storerb_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + } + return 0; } @@ -269,9 +289,6 @@ unsigned HexagonInstrInfo::InsertBranch( /// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode /// Cond[1] = R /// Cond[2] = Imm -/// @note Related function is \fn findInstrPredicate which fills in -/// Cond. vector when a predicated instruction is passed to it. -/// We follow same protocol in that case too. /// bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -314,7 +331,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - + bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump && I->getOperand(0).isMBB(); // Delete the J2_jump if it's equivalent to a fall-through. @@ -327,17 +344,17 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(&*I)) return false; // Get the last instruction in the block. 
- MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; MachineInstr *SecondLastInst = nullptr; // Find one more terminator if present. - do { - if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) { + for (;;) { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { if (!SecondLastInst) - SecondLastInst = I; + SecondLastInst = &*I; else // This is a third branch. return true; @@ -345,7 +362,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, if (I == MBB.instr_begin()) break; --I; - } while(I); + } int LastOpcode = LastInst->getOpcode(); int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0; @@ -418,7 +435,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // executed, so remove it. if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) { TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; + I = LastInst->getIterator(); if (AllowModify) I->eraseFromParent(); return false; @@ -438,6 +455,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } + unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber()); MachineBasicBlock::iterator I = MBB.end(); @@ -458,100 +476,127 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -/// \brief For a comparison instruction, return the source registers in -/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it -/// compares against in CmpValue. Return true if the comparison instruction -/// can be analyzed. -bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const { - unsigned Opc = MI->getOpcode(); - // Set mask and the first source register. - switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtui: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - SrcReg = MI->getOperand(1).getReg(); - Mask = ~0; - break; - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFF; - break; - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFFFF; - break; - } +unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { + unsigned BOpc = Hexagon::J2_jump; + unsigned BccOpc = Hexagon::J2_jumpt; + assert(validateBranchCond(Cond) && "Invalid branching condition"); + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - // Set the value/second source register. 
- switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - SrcReg2 = MI->getOperand(2).getReg(); - return true; + // Check if ReverseBranchCondition has asked to reverse this branch + // If we want to reverse the branch an odd number of times, we want + // J2_jumpf. + if (!Cond.empty() && Cond[0].isImm()) + BccOpc = Cond[0].getImm(); - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgtui: - case Hexagon::C2_cmpgti: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg2 = 0; - Value = MI->getOperand(2).getImm(); - return true; + if (!FBB) { + if (Cond.empty()) { + // Due to a bug in TailMerging/CFG Optimization, we need to add a + // special case handling of a predicated jump followed by an + // unconditional jump. If not, Tail Merging and CFG Optimization go + // into an infinite loop. + MachineBasicBlock *NewTBB, *NewFBB; + SmallVector<MachineOperand, 4> Cond; + MachineInstr *Term = MBB.getFirstTerminator(); + if (Term != MBB.end() && isPredicated(Term) && + !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { + MachineBasicBlock *NextBB = &*++MBB.getIterator(); + if (NewTBB == NextBB) { + ReverseBranchCondition(Cond); + RemoveBranch(MBB); + return InsertBranch(MBB, TBB, nullptr, Cond, DL); + } + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else if (isNewValueJump(Cond[0].getImm())) { + assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); + // New value jump + // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) + // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) + unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); + DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); + if (Cond[2].isReg()) { + unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addReg(Cond[2].getReg(), Flags2).addMBB(TBB); + } else if(Cond[2].isImm()) { + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addImm(Cond[2].getImm()).addMBB(TBB); + } else + llvm_unreachable("Invalid condition for branching"); + } else { + assert((Cond.size() == 2) && "Malformed cond vector"); + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + return 1; } + assert((!Cond.empty()) && + "Cond. 
cannot be empty when multiple branchings are required"); + assert((!isNewValueJump(Cond[0].getImm())) && + "NV-jump cannot be inserted with another branch"); + // Special case for hardware loops. The condition is a basic block. + if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else { + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); - return false; + return 2; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + BranchProbability Probability) const { + return nonDbgBBSize(&MBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability) + const { + return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs, BranchProbability Probability) const { + return NumInstrs <= 4; } void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { + auto &HRI = getRegisterInfo(); if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) { BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg); return; @@ -599,28 +644,74 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, addReg(SrcReg, getKillRegState(KillSrc)); return; } + if (Hexagon::PredRegsRegClass.contains(SrcReg) && + Hexagon::IntRegsRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg). + addReg(SrcReg). 
+ addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg) && + Hexagon::VectorRegsRegClass.contains(DestReg)) { + llvm_unreachable("Unimplemented pred to vec"); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(DestReg) && + Hexagon::VectorRegsRegClass.contains(SrcReg)) { + llvm_unreachable("Unimplemented vec to pred"); + return; + } + if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_hireg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)); + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_loreg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } +#ifndef NDEBUG + // Show the invalid registers to ease debugging. + dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber() + << ": " << PrintReg(DestReg, &HRI) + << " = " << PrintReg(SrcReg, &HRI) << '\n'; +#endif llvm_unreachable("Unimplemented"); } -void HexagonInstrInfo:: -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - +void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io)) @@ -640,33 +731,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const -{ - llvm_unreachable("Unimplemented"); -} - - -void HexagonInstrInfo:: -loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned DestReg, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); if (RC == &Hexagon::IntRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); @@ 
-682,27 +757,136 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - llvm_unreachable("Unimplemented"); -} -bool -HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - const HexagonRegisterInfo &TRI = getRegisterInfo(); +/// expandPostRAPseudo - This function is called for all pseudo instructions +/// that remain after register allocation. Many pseudo instructions are +/// created to help register allocation. This is the place to convert them +/// into real instructions. The target can edit MI in place, or it can insert +/// new instructions and erase MI. The function should return true if +/// anything was changed. +bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) + const { + const HexagonRegisterInfo &HRI = getRegisterInfo(); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); + const unsigned VecOffset = 1; + bool Is128B = false; switch (Opc) { case Hexagon::ALIGNA: BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI->getOperand(0).getReg()) - .addReg(TRI.getFrameRegister()) + .addReg(HRI.getFrameRegister()) .addImm(-MI->getOperand(1).getImm()); MBB.erase(MI); return true; + case Hexagon::HEXAGON_V6_vassignp_128B: + case Hexagon::HEXAGON_V6_vassignp: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + if (SrcReg != DstReg) + copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI->getOperand(1).isKill()); + MBB.erase(MI); + return true; + } + case Hexagon::HEXAGON_V6_lo_128B: + case Hexagon::HEXAGON_V6_lo: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubLo); + return true; + } + case Hexagon::HEXAGON_V6_hi_128B: + case Hexagon::HEXAGON_V6_hi: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubHi); + return true; + } + case Hexagon::STrivv_indexed_128B: + Is128B = true; + case Hexagon::STrivv_indexed: { + unsigned SrcReg = MI->getOperand(2).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + unsigned NewOpcd = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + .addImm(MI->getOperand(1).getImm()) + .addReg(SrcSubLo) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI1New->getOperand(0).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + // The Vectors are indexed in multiples of vector size. 
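The Offset computed above is the spacing between the two halves of a vector register pair: one full vector, i.e. VecOffset shifted by log2 of the vector size in bytes. A tiny standalone check of that arithmetic for the 64-byte (512-bit) and 128-byte (128B mode) HVX configurations; illustrative only, independent of the LLVM sources:

#include <cassert>

int main() {
  const unsigned VecOffset = 1;
  // 512-bit HVX: a vector is 64 bytes, so the high half of a pair is
  // stored one vector (1 << 6 bytes) past the low half.
  assert((VecOffset << 6) == 64);
  // 128B mode: 1024-bit vectors, so the halves are 128 bytes apart.
  assert((VecOffset << 7) == 128);
  return 0;
}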
+ .addImm(MI->getOperand(1).getImm()+Offset) + .addReg(SrcSubHi) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + Is128B = true; + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: { + unsigned NewOpcd = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)) + .addOperand(MI->getOperand(1)) + .addImm(MI->getOperand(2).getImm()); + MI1New->getOperand(1).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)) + .addOperand(MI->getOperand(1)) + // The Vectors are indexed in multiples of vector size. + .addImm(MI->getOperand(2).getImm() + Offset) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::LDriv_pseudo_V6: { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + int32_t Off = MI->getOperand(2).getImm(); + int32_t Idx = Off; + BuildMI(MBB, MI, DL, get(NewOpc), DstReg) + .addOperand(MI->getOperand(1)) + .addImm(Idx) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::STriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::STriv_pseudo_V6: { + unsigned NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + int32_t Off = MI->getOperand(1).getImm(); + int32_t Idx = Is128B ? (Off >> 7) : (Off >> 6); + BuildMI(MBB, MI, DL, get(NewOpc)) + .addOperand(MI->getOperand(0)) + .addImm(Idx) + .addOperand(MI->getOperand(2)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } case Hexagon::TFR_PdTrue: { unsigned Reg = MI->getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) @@ -724,15 +908,15 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned DstReg = MI->getOperand(0).getReg(); unsigned Src1Reg = MI->getOperand(1).getReg(); unsigned Src2Reg = MI->getOperand(2).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -747,17 +931,17 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned Src1Reg = MI->getOperand(1).getReg(); 
unsigned Src2Reg = MI->getOperand(2).getReg(); unsigned Src3Reg = MI->getOperand(3).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); - unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); - unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); + unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi).addReg(Src3SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo).addReg(Src3SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -768,104 +952,168 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MRI.clearKillFlags(Src3SubLo); return true; } + case Hexagon::MUX64_rr: { + const MachineOperand &Op0 = MI->getOperand(0); + const MachineOperand &Op1 = MI->getOperand(1); + const MachineOperand &Op2 = MI->getOperand(2); + const MachineOperand &Op3 = MI->getOperand(3); + unsigned Rd = Op0.getReg(); + unsigned Pu = Op1.getReg(); + unsigned Rs = Op2.getReg(); + unsigned Rt = Op3.getReg(); + DebugLoc DL = MI->getDebugLoc(); + unsigned K1 = getKillRegState(Op1.isKill()); + unsigned K2 = getKillRegState(Op2.isKill()); + unsigned K3 = getKillRegState(Op3.isKill()); + if (Rd != Rs) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd) + .addReg(Pu, (Rd == Rt) ? K1 : 0) + .addReg(Rs, K2); + if (Rd != Rt) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd) + .addReg(Pu, K1) + .addReg(Rt, K3); + MBB.erase(MI); + return true; + } case Hexagon::TCRETURNi: MI->setDesc(get(Hexagon::J2_jump)); return true; case Hexagon::TCRETURNr: MI->setDesc(get(Hexagon::J2_jumpr)); return true; + case Hexagon::TFRI_f: + case Hexagon::TFRI_cPt_f: + case Hexagon::TFRI_cNotPt_f: { + unsigned Opx = (Opc == Hexagon::TFRI_f) ? 1 : 2; + APFloat FVal = MI->getOperand(Opx).getFPImm()->getValueAPF(); + APInt IVal = FVal.bitcastToAPInt(); + MI->RemoveOperand(Opx); + unsigned NewOpc = (Opc == Hexagon::TFRI_f) ? Hexagon::A2_tfrsi : + (Opc == Hexagon::TFRI_cPt_f) ? Hexagon::C2_cmoveit : + Hexagon::C2_cmoveif; + MI->setDesc(get(NewOpc)); + MI->addOperand(MachineOperand::CreateImm(IVal.getZExtValue())); + return true; + } } return false; } -MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FI) const { - // Hexagon_TODO: Implement. - return nullptr; + +// We indicate that we want to reverse the branch by +// inserting the reversed branching opcode. 
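An aside on the mechanics this comment describes: reversing a Hexagon branch is a pure opcode substitution on Cond[0]. A minimal standalone sketch of the idea; the opcode names are real Hexagon jump mnemonics, but the lookup table is an illustrative stand-in for the TableGen-generated relation maps (Hexagon::getFalsePredOpcode / Hexagon::getTruePredOpcode) that the backend actually consults:

#include <cassert>
#include <map>

enum Opcode { J2_jumpt, J2_jumpf, J2_jumptnew, J2_jumpfnew };

// Swap a predicated jump for its opposite-sense twin.
Opcode invert(Opcode Opc) {
  static const std::map<Opcode, Opcode> Inv = {
      {J2_jumpt, J2_jumpf},       {J2_jumpf, J2_jumpt},
      {J2_jumptnew, J2_jumpfnew}, {J2_jumpfnew, J2_jumptnew}};
  auto It = Inv.find(Opc);
  assert(It != Inv.end() && "not an invertible branch");
  return It->second;
}

int main() {
  // "if (p0) jump" becomes "if (!p0) jump" and vice versa.
  assert(invert(J2_jumpt) == J2_jumpf);
  assert(invert(invert(J2_jumpfnew)) == J2_jumpfnew);
  return 0;
}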
+bool HexagonInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return true; + assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); + unsigned opcode = Cond[0].getImm(); + //unsigned temp; + assert(get(opcode).isBranch() && "Should be a branching condition."); + if (isEndLoopN(opcode)) + return true; + unsigned NewOpcode = getInvertedPredicatedOpcode(opcode); + Cond[0].setImm(NewOpcode); + return false; } -unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - const TargetRegisterClass *TRC; - if (VT == MVT::i1) { - TRC = &Hexagon::PredRegsRegClass; - } else if (VT == MVT::i32 || VT == MVT::f32) { - TRC = &Hexagon::IntRegsRegClass; - } else if (VT == MVT::i64 || VT == MVT::f64) { - TRC = &Hexagon::DoubleRegsRegClass; - } else { - llvm_unreachable("Cannot handle this register class"); - } +void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + DebugLoc DL; + BuildMI(MBB, MI, DL, get(Hexagon::A2_nop)); +} - unsigned NewReg = RegInfo.createVirtualRegister(TRC); - return NewReg; + +// Returns true if an instruction is predicated irrespective of the predicate +// sense. For example, all of the following will return true. +// if (p0) R1 = add(R2, R3) +// if (!p0) R1 = add(R2, R3) +// if (p0.new) R1 = add(R2, R3) +// if (!p0.new) R1 = add(R2, R3) +// Note: New-value stores are not included here as in the current +// implementation, we don't need to check their predicate sense. +bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; } -bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { - const MCInstrDesc &MID = MI->getDesc(); - const uint64_t F = MID.TSFlags; - if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask) - return true; - // TODO: This is largely obsolete now. Will need to be removed - // in consecutive patches. - switch(MI->getOpcode()) { - // TFR_FI Remains a special case. - case Hexagon::TFR_FI: - return true; - default: - return false; +bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || isNewValueJump(Cond[0].getImm()) || + isEndLoopN(Cond[0].getImm())) { + DEBUG(dbgs() << "\nCannot predicate:"; MI->dump();); + return false; } - return false; -} + int Opc = MI->getOpcode(); + assert (isPredicable(MI) && "Expected predicable instruction"); + bool invertJump = predOpcodeHasNot(Cond); -// This returns true in two cases: -// - The OP code itself indicates that this is an extended instruction. -// - One of MOs has been marked with HMOTF_ConstExtended flag. -bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { - // First check if this is permanently extended op code. - const uint64_t F = MI->getDesc().TSFlags; - if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask) - return true; - // Use MO operand flags to determine if one of MI's operands - // has HMOTF_ConstExtended flag set. - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; + // We have to predicate MI "in place", i.e. after this function returns, + // MI will need to be transformed into a predicated form. 
To avoid com-
+  // plicated manipulations with the operands (handling tied operands,
+  // etc.), build a new temporary instruction, then overwrite MI with it.
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned PredOpc = getCondOpcode(Opc, invertJump);
+  MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
+  unsigned NOp = 0, NumOps = MI->getNumOperands();
+  while (NOp < NumOps) {
+    MachineOperand &Op = MI->getOperand(NOp);
+    if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+      break;
+    T.addOperand(Op);
+    NOp++;
+  }
-  return false;
-}
-bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
-  return MI->getDesc().isBranch();
-}
+  unsigned PredReg, PredRegPos, PredRegFlags;
+  bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
+  (void)GotPredReg;
+  assert(GotPredReg);
+  T.addReg(PredReg, PredRegFlags);
+  while (NOp < NumOps)
+    T.addOperand(MI->getOperand(NOp++));
-bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
-  if (isNewValueJump(MI))
-    return true;
+  MI->setDesc(get(PredOpc));
+  while (unsigned n = MI->getNumOperands())
+    MI->RemoveOperand(n-1);
+  for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
+    MI->addOperand(T->getOperand(i));
-  if (isNewValueStore(MI))
-    return true;
+  MachineBasicBlock::instr_iterator TI = T->getIterator();
+  B.erase(TI);
-  return false;
+  MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+  MRI.clearKillFlags(PredReg);
+  return true;
 }
-bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
-  const uint64_t F = MI->getDesc().TSFlags;
-  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
-}
-bool HexagonInstrInfo::isNewValue(Opcode_t Opcode) const {
-  const uint64_t F = get(Opcode).TSFlags;
-  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+
+bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+      ArrayRef<MachineOperand> Pred2) const {
+  // TODO: Fix this
+  return false;
 }
-bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
-  return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+
+bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
+      std::vector<MachineOperand> &Pred) const {
+  auto &HRI = getRegisterInfo();
+  for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
+    MachineOperand MO = MI->getOperand(oper);
+    if (MO.isReg() && MO.isDef()) {
+      const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
+      if (RC == &Hexagon::PredRegsRegClass) {
+        Pred.push_back(MO);
+        return true;
+      }
+    }
+  }
+  return false;
 }
 
 bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
@@ -875,10 +1123,21 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
     return false;
 
   const int Opc = MI->getOpcode();
+  int NumOperands = MI->getNumOperands();
+
+  // Keep a flag for up to 4 operands in the instructions, to indicate if
+  // that operand has been constant extended.
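The predicates used in the switch below (isInt<8>, isUInt<6>, isShiftedUInt<6,3>, ...) are plain bit-width tests from llvm/Support/MathExtras.h, and OpCExtended simply lets a constant-extended operand bypass them. A self-contained sketch of the same checks, with the templates re-implemented so the example compiles on its own:

#include <cassert>
#include <cstdint>

// Re-implementations of the MathExtras-style predicates used here.
template <unsigned N> bool isIntN(int64_t X) {
  return X >= -(int64_t(1) << (N - 1)) && X < (int64_t(1) << (N - 1));
}
template <unsigned N> bool isUIntN(uint64_t X) {
  return X < (uint64_t(1) << N);
}
// An unsigned N-bit value scaled by 2^S, e.g. isShiftedUInt<6,3> for
// the 8-byte-aligned offset of S2_storerd_io.
template <unsigned N, unsigned S> bool isShiftedUIntN(uint64_t X) {
  return isUIntN<N + S>(X) && (X % (uint64_t(1) << S)) == 0;
}

int main() {
  assert(isIntN<8>(-128) && !isIntN<8>(128)); // A2_addi operand range
  assert(isShiftedUIntN<6, 3>(504));          // 63 * 8: in range, aligned
  assert(!isShiftedUIntN<6, 3>(505));         // not 8-byte aligned
  return 0;
}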
+  bool OpCExtended[4];
+  if (NumOperands > 4)
+    NumOperands = 4;
+
+  for (int i = 0; i < NumOperands; i++)
+    OpCExtended[i] = (isOperandExtended(MI, i) && isConstExtended(MI));
 
   switch(Opc) {
   case Hexagon::A2_tfrsi:
-    return (isOperandExtended(MI, 1) && isConstExtended(MI)) || isInt<12>(MI->getOperand(1).getImm());
+    return (isOperandExtended(MI, 1) && isConstExtended(MI)) ||
+           isInt<12>(MI->getOperand(1).getImm());
 
   case Hexagon::S2_storerd_io:
     return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
@@ -926,8 +1185,8 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
   case Hexagon::S4_storeirb_io:
   case Hexagon::S4_storeirh_io:
   case Hexagon::S4_storeiri_io:
-    return (isUInt<6>(MI->getOperand(1).getImm()) &&
-            isInt<6>(MI->getOperand(2).getImm()));
+    return (OpCExtended[1] || isUInt<6>(MI->getOperand(1).getImm())) &&
+           (OpCExtended[2] || isInt<6>(MI->getOperand(2).getImm()));
 
   case Hexagon::A2_addi:
     return isInt<8>(MI->getOperand(2).getImm());
@@ -944,269 +1203,1117 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
   return true;
 }
 
-// This function performs the following inversions:
-//
-// cPt    --->  cNotPt
-// cNotPt --->  cPt
-//
-unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
-  int InvPredOpcode;
-  InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
-                                        : Hexagon::getTruePredOpcode(Opc);
-  if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
-    return InvPredOpcode;
-
-  switch(Opc) {
-  default: llvm_unreachable("Unexpected predicated instruction");
-  case Hexagon::C2_ccombinewt:
-    return Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+      const MachineBasicBlock *MBB, const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Throwing call is a boundary.
+  if (MI->isCall()) {
+    // If any of the block's successors is a landing pad, this could be a
+    // throwing call.
+    for (auto I : MBB->successors())
+      if (I->isEHPad())
+        return true;
+  }
+
+  // Don't mess around with no return calls.
+  if (MI->getOpcode() == Hexagon::CALLv3nr)
+    return true;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->getDesc().isTerminator() || MI->isPosition())
+    return true;
+
+  if (MI->isInlineAsm() && !ScheduleInlineAsm)
+    return true;
+
+  return false;
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// Hexagon counts the number of ##'s and adjusts for that many
+/// constant extenders.
+unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
+      const MCAsmInfo &MAI) const {
+  StringRef AStr(Str);
+  // Count the number of instructions in the asm.
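A worked example of the accounting described above, as a standalone sketch. It assumes a 4-byte maximum instruction length (the Hexagon word size), newline-separated statements, and "//" comments; the real implementation below takes all three from MCAsmInfo:

#include <cassert>
#include <sstream>
#include <string>

unsigned inlineAsmLength(const std::string &Asm) {
  const unsigned MaxInstLength = 4;
  unsigned Len = 0;
  std::string Stmt;
  std::istringstream SS(Asm);
  // Charge one maximum-length instruction per non-blank, non-comment line.
  while (std::getline(SS, Stmt)) {
    size_t First = Stmt.find_first_not_of(" \t");
    if (First != std::string::npos && Stmt.compare(First, 2, "//") != 0)
      Len += MaxInstLength;
  }
  // Each "##" marks an immediate carried by a 4-byte constant extender.
  for (size_t Pos = Asm.find("##"); Pos != std::string::npos;
       Pos = Asm.find("##", Pos + 2))
    Len += 4;
  return Len;
}

int main() {
  // Two instructions plus one constant extender: 2*4 + 4 = 12 bytes.
  assert(inlineAsmLength("r0 = ##12345\nr1 = add(r1, r0)") == 12);
  return 0;
}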
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  // Add 4 bytes for each constant extender (##) seen.
+  StringRef Occ("##");
+  Length += AStr.count(Occ)*4;
+  return Length;
+}
+
+
+ScheduleHazardRecognizer*
+HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
+      const InstrItineraryData *II, const ScheduleDAG *DAG) const {
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if it has two register operands, and the value it
+/// compares against in \p Value. Return true if the comparison instruction
+/// can be analyzed.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
+      unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  // Set mask and the first source register.
+  switch (Opc) {
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmplte:
+  case Hexagon::C4_cmplteu:
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C4_cmpneqi:
+  case Hexagon::C4_cmplteui:
+  case Hexagon::C4_cmpltei:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = ~0;
+    break;
+  case Hexagon::A4_cmpbeq:
+  case Hexagon::A4_cmpbgt:
+  case Hexagon::A4_cmpbgtu:
+  case Hexagon::A4_cmpbeqi:
+  case Hexagon::A4_cmpbgti:
+  case Hexagon::A4_cmpbgtui:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = 0xFF;
+    break;
+  case Hexagon::A4_cmpheq:
+  case Hexagon::A4_cmphgt:
+  case Hexagon::A4_cmphgtu:
+  case Hexagon::A4_cmpheqi:
+  case Hexagon::A4_cmphgti:
+  case Hexagon::A4_cmphgtui:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = 0xFFFF;
+    break;
+  }
+
+  // Set the value/second source register.
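For a concrete feel of the contract before the second switch below: a word compare such as p0 = cmp.eq(r5, #10) (C2_cmpeqi) should come back with SrcReg = r5, SrcReg2 = 0, Mask = ~0 and Value = 10, while the byte forms report Mask = 0xFF. A simplified standalone model of the two operand shapes; the Inst struct is a stand-in for MachineInstr invented for the example:

#include <cassert>
#include <cstdint>

enum Opc { C2_cmpeq, C2_cmpeqi, A4_cmpbeqi };
struct Inst { Opc Op; unsigned Reg1; unsigned Reg2; int64_t Imm; };

bool analyzeCompare(const Inst &MI, unsigned &SrcReg, unsigned &SrcReg2,
                    int &Mask, int &Value) {
  SrcReg = MI.Reg1;
  switch (MI.Op) {
  case C2_cmpeq:      // reg-reg word compare
    Mask = ~0;
    SrcReg2 = MI.Reg2;
    return true;
  case C2_cmpeqi:     // reg-imm word compare
    Mask = ~0;
    SrcReg2 = 0;
    Value = (int)MI.Imm;
    return true;
  case A4_cmpbeqi:    // reg-imm byte compare: only the low 8 bits matter
    Mask = 0xFF;
    SrcReg2 = 0;
    Value = (int)MI.Imm;
    return true;
  }
  return false;
}

int main() {
  unsigned R1, R2;
  int M, V;
  Inst Cmp{A4_cmpbeqi, /*Reg1=*/5, /*Reg2=*/0, /*Imm=*/10};
  assert(analyzeCompare(Cmp, R1, R2, M, V));
  assert(R1 == 5 && R2 == 0 && M == 0xFF && V == 10);
  return 0;
}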
+  switch (Opc) {
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+  case Hexagon::A4_cmpbeq:
+  case Hexagon::A4_cmpbgt:
+  case Hexagon::A4_cmpbgtu:
+  case Hexagon::A4_cmpheq:
+  case Hexagon::A4_cmphgt:
+  case Hexagon::A4_cmphgtu:
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmplte:
+  case Hexagon::C4_cmplteu:
+    SrcReg2 = MI->getOperand(2).getReg();
+    return true;
+
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C4_cmpneqi:
+  case Hexagon::C4_cmplteui:
+  case Hexagon::C4_cmpltei:
+  case Hexagon::A4_cmpbeqi:
+  case Hexagon::A4_cmpbgti:
+  case Hexagon::A4_cmpbgtui:
+  case Hexagon::A4_cmpheqi:
+  case Hexagon::A4_cmphgti:
+  case Hexagon::A4_cmphgtui:
+    SrcReg2 = 0;
+    Value = MI->getOperand(2).getImm();
+    return true;
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+      const MachineInstr *MI, unsigned *PredCost) const {
+  return getInstrTimingClassLatency(ItinData, MI);
+}
+
+
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+      const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
+}
+
+
+// Inspired by this pair:
+//   %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+//   S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+      MachineInstr *MIb, AliasAnalysis *AA) const {
+  int OffsetA = 0, OffsetB = 0;
+  unsigned SizeA = 0, SizeB = 0;
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+      MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // Instructions that are pure loads, not loads and stores like memops are not
+  // dependent.
+  if (MIa->mayLoad() && !isMemOp(MIa) && MIb->mayLoad() && !isMemOp(MIb))
+    return true;
+
+  // Get base, offset, and access size in MIa.
+  unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+  if (!BaseRegA || !SizeA)
+    return false;
+
+  // Get base, offset, and access size in MIb.
+  unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+  if (!BaseRegB || !SizeB)
+    return false;
+
+  if (BaseRegA != BaseRegB)
+    return false;
+
+  // This is a mem access with the same base register and known offsets from it.
+  // Reason about it.
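The interval reasoning that follows is simple: given a common base register, the two accesses are disjoint exactly when the lower one ends at or before the higher one starts. A standalone check, exercised with the pair from the comment above (a 4-byte load at r29+136 against a 4-byte store at r29+132):

#include <cassert>
#include <cstdint>

bool disjoint(int64_t OffA, uint64_t SizeA, int64_t OffB, uint64_t SizeB) {
  if (OffA > OffB)
    return SizeB <= (uint64_t)(OffA - OffB);
  if (OffA < OffB)
    return SizeA <= (uint64_t)(OffB - OffA);
  return false; // identical offsets always overlap
}

int main() {
  // LD4 at r29+136 vs. ST4 at r29+132: 136 - 132 = 4 >= 4, so no overlap.
  assert(disjoint(136, 4, 132, 4));
  // An 8-byte store at r29+132 reaches up to r29+140 and does overlap.
  assert(!disjoint(136, 4, 132, 8));
  return 0;
}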
+  if (OffsetA > OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetA - (int64_t)OffsetB);
+    return (SizeB <= offDiff);
+  } else if (OffsetA < OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetB - (int64_t)OffsetA);
+    return (SizeA <= offDiff);
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = MRI.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr* MI) const {
+  return (getAddrMode(MI) == HexagonII::AbsoluteSet);
+}
+
+
+bool HexagonInstrInfo::isAccumulator(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
+
+bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (!(isTC1(MI))
+      && !(QII->isTC2Early(MI))
+      && !(MI->getDesc().mayLoad())
+      && !(MI->getDesc().mayStore())
+      && (MI->getDesc().getOpcode() != Hexagon::S2_allocframe)
+      && (MI->getDesc().getOpcode() != Hexagon::L2_deallocframe)
+      && !(QII->isMemOp(MI))
+      && !(MI->isBranch())
+      && !(MI->isReturn())
+      && !MI->isCall())
+    return true;
+
+  return false;
+}
+
+
+// Return true if the instruction is a compound branch instruction.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const {
+  return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch());
+}
+
+
+bool HexagonInstrInfo::isCondInst(const MachineInstr *MI) const {
+  return (MI->isBranch() && isPredicated(MI)) ||
+          isConditionalTransfer(MI) ||
+          isConditionalALU32(MI) ||
+          isConditionalLoad(MI) ||
+          // Predicated stores which don't have a .new on any operands.
+          (MI->mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
+           !isPredicatedNew(MI));
+}
+
+
+bool HexagonInstrInfo::isConditionalALU32(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::A2_paddf:
+  case Hexagon::A2_paddfnew:
+  case Hexagon::A2_paddif:
+  case Hexagon::A2_paddifnew:
+  case Hexagon::A2_paddit:
+  case Hexagon::A2_padditnew:
+  case Hexagon::A2_paddt:
+  case Hexagon::A2_paddtnew:
+  case Hexagon::A2_pandf:
+  case Hexagon::A2_pandfnew:
+  case Hexagon::A2_pandt:
+  case Hexagon::A2_pandtnew:
+  case Hexagon::A2_porf:
+  case Hexagon::A2_porfnew:
+  case Hexagon::A2_port:
+  case Hexagon::A2_portnew:
+  case Hexagon::A2_psubf:
+  case Hexagon::A2_psubfnew:
+  case Hexagon::A2_psubt:
+  case Hexagon::A2_psubtnew:
+  case Hexagon::A2_pxorf:
+  case Hexagon::A2_pxorfnew:
+  case Hexagon::A2_pxort:
+  case Hexagon::A2_pxortnew:
+  case Hexagon::A4_paslhf:
+  case Hexagon::A4_paslhfnew:
+  case Hexagon::A4_paslht:
+  case Hexagon::A4_paslhtnew:
+  case Hexagon::A4_pasrhf:
+  case Hexagon::A4_pasrhfnew:
+  case Hexagon::A4_pasrht:
+  case Hexagon::A4_pasrhtnew:
+  case Hexagon::A4_psxtbf:
+  case Hexagon::A4_psxtbfnew:
+  case Hexagon::A4_psxtbt:
+  case Hexagon::A4_psxtbtnew:
+  case Hexagon::A4_psxthf:
+  case Hexagon::A4_psxthfnew:
+  case Hexagon::A4_psxtht:
+  case Hexagon::A4_psxthtnew:
+  case Hexagon::A4_pzxtbf:
+  case Hexagon::A4_pzxtbfnew:
+  case Hexagon::A4_pzxtbt:
+  case Hexagon::A4_pzxtbtnew:
+  case Hexagon::A4_pzxthf:
+  case Hexagon::A4_pzxthfnew:
+  case Hexagon::A4_pzxtht:
+  case Hexagon::A4_pzxthtnew:
   case Hexagon::C2_ccombinewf:
-    return Hexagon::C2_ccombinewt;
+  case Hexagon::C2_ccombinewt:
+    return true;
+  }
+  return false;
+}
+
+
+// FIXME - Function name and its functionality don't match.
+// It should be renamed to hasPredNewOpcode()
+bool HexagonInstrInfo::isConditionalLoad(const MachineInstr* MI) const {
+  if (!MI->getDesc().mayLoad() || !isPredicated(MI))
+    return false;
+
+  int PNewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+  // Instruction with valid predicated-new opcode can be promoted to .new.
+  return PNewOpcode >= 0;
+}
+
-  // Dealloc_return.
-  case Hexagon::L4_return_t:
-    return Hexagon::L4_return_f;
-  case Hexagon::L4_return_f:
-    return Hexagon::L4_return_t;
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
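A side note on the opcode lists in this family of predicates: the mnemonics encode the predicate sense with a trailing 't'/'f' and a .new predicate with a trailing "new" (A2_paddt, A2_paddf, A2_padditnew, ...). A throwaway sketch decoding that convention from the name; purely illustrative, since the backend reads TSFlags bits rather than strings:

#include <cassert>
#include <string>

struct PredInfo { bool Predicated; bool SenseTrue; bool DotNew; };

PredInfo decode(std::string Name) {
  PredInfo P{false, false, false};
  // Strip a trailing "new" first, then read the sense letter.
  if (Name.size() > 3 && Name.compare(Name.size() - 3, 3, "new") == 0) {
    P.DotNew = true;
    Name.erase(Name.size() - 3);
  }
  char Sense = Name.empty() ? 0 : Name.back();
  if (Sense == 't' || Sense == 'f') {
    P.Predicated = true;
    P.SenseTrue = (Sense == 't');
  }
  return P;
}

int main() {
  assert(decode("A2_paddt").SenseTrue && !decode("A2_paddt").DotNew);
  assert(!decode("A2_paddf").SenseTrue);
  assert(decode("A2_padditnew").DotNew && decode("A2_padditnew").SenseTrue);
  return 0;
}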
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case Hexagon::S4_storeirbt_io:
+  case Hexagon::S4_storeirbf_io:
+  case Hexagon::S4_pstorerbt_rr:
+  case Hexagon::S4_pstorerbf_rr:
+  case Hexagon::S2_pstorerbt_io:
+  case Hexagon::S2_pstorerbf_io:
+  case Hexagon::S2_pstorerbt_pi:
+  case Hexagon::S2_pstorerbf_pi:
+  case Hexagon::S2_pstorerdt_io:
+  case Hexagon::S2_pstorerdf_io:
+  case Hexagon::S4_pstorerdt_rr:
+  case Hexagon::S4_pstorerdf_rr:
+  case Hexagon::S2_pstorerdt_pi:
+  case Hexagon::S2_pstorerdf_pi:
+  case Hexagon::S2_pstorerht_io:
+  case Hexagon::S2_pstorerhf_io:
+  case Hexagon::S4_storeirht_io:
+  case Hexagon::S4_storeirhf_io:
+  case Hexagon::S4_pstorerht_rr:
+  case Hexagon::S4_pstorerhf_rr:
+  case Hexagon::S2_pstorerht_pi:
+  case Hexagon::S2_pstorerhf_pi:
+  case Hexagon::S2_pstorerit_io:
+  case Hexagon::S2_pstorerif_io:
+  case Hexagon::S4_storeirit_io:
+  case Hexagon::S4_storeirif_io:
+  case Hexagon::S4_pstorerit_rr:
+  case Hexagon::S4_pstorerif_rr:
+  case Hexagon::S2_pstorerit_pi:
+  case Hexagon::S2_pstorerif_pi:
+
+  // V4 global address store before promoting to dot new.
+  case Hexagon::S4_pstorerdt_abs:
+  case Hexagon::S4_pstorerdf_abs:
+  case Hexagon::S4_pstorerbt_abs:
+  case Hexagon::S4_pstorerbf_abs:
+  case Hexagon::S4_pstorerht_abs:
+  case Hexagon::S4_pstorerhf_abs:
+  case Hexagon::S4_pstorerit_abs:
+  case Hexagon::S4_pstorerif_abs:
+    return true;
+
+  // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+  // from the "Conditional Store" list, because a predicated new value store
+  // would NOT be promoted to a double dot new store.
+  // This function returns true for those stores that are predicated but not
+  // yet promoted to predicate dot new instructions.
+  }
 }
 
-// New Value Store instructions.
-bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+
+bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::A2_tfrt:
+  case Hexagon::A2_tfrf:
+  case Hexagon::C2_cmoveit:
+  case Hexagon::C2_cmoveif:
+  case Hexagon::A2_tfrtnew:
+  case Hexagon::A2_tfrfnew:
+  case Hexagon::C2_cmovenewit:
+  case Hexagon::C2_cmovenewif:
+  case Hexagon::A2_tfrpt:
+  case Hexagon::A2_tfrpf:
+    return true;
+
+  default:
+    return false;
+  }
+  return false;
+}
+
+
+// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
+// isFPImm and later getFPImm as well.
+bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const {
   const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable =
+    (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  if (MI->isCall())
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
+      MO.isJTI() || MO.isCPI())
+    return true;
-  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
 }
 
-bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+  case Hexagon::L4_return_fnew_pt :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+bool HexagonInstrInfo::isDependent(const MachineInstr *ProdMI,
+      const MachineInstr *ConsMI) const {
+  const MCInstrDesc &ProdMCID = ProdMI->getDesc();
+  if (!ProdMCID.getNumDefs())
+    return false;
+
+  auto &HRI = getRegisterInfo();
+
+  SmallVector<unsigned, 4> DefsA;
+  SmallVector<unsigned, 4> DefsB;
+  SmallVector<unsigned, 8> UsesA;
+  SmallVector<unsigned, 8> UsesB;
+
+  parseOperands(ProdMI, DefsA, UsesA);
+  parseOperands(ConsMI, DefsB, UsesB);
+
+  for (auto &RegA : DefsA)
+    for (auto &RegB : UsesB) {
+      // True data dependency.
+      if (RegA == RegB)
+        return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegA))
+        for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegB == *SubRegs)
+            return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegB))
+        for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegA == *SubRegs)
+            return true;
+    }
+
+  return false;
+}
+
+
+// Returns true if the instruction is already a .cur.
+bool HexagonInstrInfo::isDotCurInst(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::V6_vL32b_cur_pi:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_cur_pi_128B:
+  case Hexagon::V6_vL32b_cur_ai_128B:
+    return true;
+  }
+  return false;
+}
+
+
+// Returns true if any one of the operands is a dot new
+// insn, whether it is predicated dot new or register dot new.
+bool HexagonInstrInfo::isDotNewInst(const MachineInstr* MI) const {
+  if (isNewValueInst(MI) ||
+      (isPredicated(MI) && isPredicatedNew(MI)))
+    return true;
+
+  return false;
+}
+
+
+/// Symmetrical. See if these two instructions are fit for a duplex pair.
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr *MIa,
+      const MachineInstr *MIb) const {
+  HexagonII::SubInstructionGroup MIaG = getDuplexCandidateGroup(MIa);
+  HexagonII::SubInstructionGroup MIbG = getDuplexCandidateGroup(MIb);
+  return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
+}
+
+
+bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr *MI) const {
+  if (!MI)
+    return false;
+
+  if (MI->mayLoad() || MI->mayStore() || MI->isCompare())
+    return true;
+
+  // Multiply
+  unsigned SchedClass = MI->getDesc().getSchedClass();
+  if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23)
+    return true;
+  return false;
+}
+
+
+bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
+  return (Opcode == Hexagon::ENDLOOP0 ||
+          Opcode == Hexagon::ENDLOOP1);
+}
+
+
+bool HexagonInstrInfo::isExpr(unsigned OpType) const {
+  switch(OpType) {
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+  const MCInstrDesc &MID = MI->getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch(MI->getOpcode()) {
+  // TFR_FI Remains a special case.
+  case Hexagon::TFR_FI:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI->getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+       E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isFloat(const MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
   const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::FPPos) & HexagonII::FPMask;
+}
-  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+
+// No V60 HVX VMEM with A_INDIRECT.
+bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr *I,
+      const MachineInstr *J) const {
+  if (!isV60VectorInstruction(I))
+    return false;
+  if (!I->mayLoad() && !I->mayStore())
+    return false;
+  return J->isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J);
 }
 
-int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
-  enum Hexagon::PredSense inPredSense;
-  inPredSense = invertPredicate ? Hexagon::PredSense_false :
-                                  Hexagon::PredSense_true;
-  int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
-  if (CondOpcode >= 0) // Valid Conditional opcode/instruction
-    return CondOpcode;
-
-  // This switch case will be removed once all the instructions have been
-  // modified to use relation maps.
-  switch(Opc) {
-  case Hexagon::TFRI_f:
-    return !invertPredicate ? Hexagon::TFRI_cPt_f :
-                              Hexagon::TFRI_cNotPt_f;
-  case Hexagon::A2_combinew:
-    return !invertPredicate ? Hexagon::C2_ccombinewt :
-                              Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_callr :
+  case Hexagon::J2_callrf :
+  case Hexagon::J2_callrt :
+    return true;
+  }
+  return false;
+}
 
-  // DEALLOC_RETURN.
-  case Hexagon::L4_return:
-    return !invertPredicate ? Hexagon::L4_return_t:
-                              Hexagon::L4_return_f;
+
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_fnew_pt :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+    return true;
   }
-  llvm_unreachable("Unexpected predicable instruction");
+  return false;
 }
 
-bool HexagonInstrInfo::
-PredicateInstruction(MachineInstr *MI,
-                     ArrayRef<MachineOperand> Cond) const {
-  if (Cond.empty() || isEndLoopN(Cond[0].getImm())) {
-    DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+
+bool HexagonInstrInfo::isJumpR(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_jumpr :
+  case Hexagon::J2_jumprt :
+  case Hexagon::J2_jumprf :
+  case Hexagon::J2_jumprtnewpt :
+  case Hexagon::J2_jumprfnewpt :
+  case Hexagon::J2_jumprtnew :
+  case Hexagon::J2_jumprfnew :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true if a given MI can accommodate the given offset.
+// Use an abs estimate as opposed to the exact number.
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr *MI,
+      unsigned offset) const {
+  // This selection of jump instructions matches what
+  // AnalyzeBranch can parse, plus NVJ.
+  if (isNewValueJump(MI)) // r9:2
+    return isInt<11>(offset);
+
+  switch (MI->getOpcode()) {
+  // Still missing Jump to address condition on register value.
+  default: return false;
+  case Hexagon::J2_jump: // bits<24> dst; // r22:2
+  case Hexagon::J2_call:
+  case Hexagon::CALLv3nr:
+    return isInt<24>(offset);
+  case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+  case Hexagon::J2_jumpf:
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumpfnewpt:
+  case Hexagon::J2_callt:
+  case Hexagon::J2_callf:
+    return isInt<17>(offset);
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop0iext:
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0rext:
+  case Hexagon::J2_loop1i:
+  case Hexagon::J2_loop1iext:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1rext:
+    return isInt<9>(offset);
+  // TODO: Add all the compound branches here. Can we do this in Relation model?
+  case Hexagon::J4_cmpeqi_tp0_jump_nt:
+  case Hexagon::J4_cmpeqi_tp1_jump_nt:
+    return isInt<11>(offset);
   }
-  int Opc = MI->getOpcode();
-  assert (isPredicable(MI) && "Expected predicable instruction");
-  bool invertJump = predOpcodeHasNot(Cond);
+}
 
-  // We have to predicate MI "in place", i.e. after this function returns,
-  // MI will need to be transformed into a predicated form. To avoid com-
-  // plicated manipulations with the operands (handling tied operands,
-  // etc.), build a new temporary instruction, then overwrite MI with it.
- MachineBasicBlock &B = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned PredOpc = getCondOpcode(Opc, invertJump); - MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc)); - unsigned NOp = 0, NumOps = MI->getNumOperands(); - while (NOp < NumOps) { - MachineOperand &Op = MI->getOperand(NOp); - if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) - break; - T.addOperand(Op); - NOp++; +bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const { + if (!LRMI || !ESMI) + return false; + + bool isLate = isLateResultInstr(LRMI); + bool isEarly = isEarlySourceInstr(ESMI); + + DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- ")); + DEBUG(LRMI->dump()); + DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- ")); + DEBUG(ESMI->dump()); + + if (isLate && isEarly) { + DEBUG(dbgs() << "++Is Late Result feeding Early Source\n"); + return true; } - unsigned PredReg, PredRegPos, PredRegFlags; - bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags); - (void)GotPredReg; - assert(GotPredReg); - T.addReg(PredReg, PredRegFlags); - while (NOp < NumOps) - T.addOperand(MI->getOperand(NOp++)); + return false; +} - MI->setDesc(get(PredOpc)); - while (unsigned n = MI->getNumOperands()) - MI->RemoveOperand(n-1); - for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i) - MI->addOperand(T->getOperand(i)); - MachineBasicBlock::instr_iterator TI = &*T; - B.erase(TI); +bool HexagonInstrInfo::isLateResultInstr(const MachineInstr *MI) const { + if (!MI) + return false; - MachineRegisterInfo &MRI = B.getParent()->getRegInfo(); - MRI.clearKillFlags(PredReg); + switch (MI->getOpcode()) { + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::PHI: + return false; + default: + break; + } + unsigned SchedClass = MI->getDesc().getSchedClass(); + + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + case Hexagon::Sched::V2LDST_tc_ld_SLOT01: + case Hexagon::Sched::V2LDST_tc_st_SLOT0: + case Hexagon::Sched::V2LDST_tc_st_SLOT01: + case Hexagon::Sched::V4LDST_tc_ld_SLOT01: + case Hexagon::Sched::V4LDST_tc_st_SLOT0: + case Hexagon::Sched::V4LDST_tc_st_SLOT01: + return false; + } return true; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr *MI) const { + if (!MI) + return false; + + // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply + // resource, but all operands can be received late like an ALU instruction. 
+ return MI->getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, - unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, - unsigned ExtraFCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLoopN(const MachineInstr *MI) const { + unsigned Opcode = MI->getOpcode(); + return Opcode == Hexagon::J2_loop0i || + Opcode == Hexagon::J2_loop0r || + Opcode == Hexagon::J2_loop0iext || + Opcode == Hexagon::J2_loop0rext || + Opcode == Hexagon::J2_loop1i || + Opcode == Hexagon::J2_loop1r || + Opcode == Hexagon::J2_loop1iext || + Opcode == Hexagon::J2_loop1rext; +} + + +bool HexagonInstrInfo::isMemOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : + case Hexagon::L4_ior_memopb_io: + case Hexagon::L4_ior_memoph_io: + case Hexagon::L4_ior_memopw_io: + case Hexagon::L4_iand_memopb_io: + case Hexagon::L4_iand_memoph_io: + case Hexagon::L4_iand_memopw_io: + return true; + } + return false; } -// Returns true if an instruction is predicated irrespective of the predicate -// sense. For example, all of the following will return true. 
-// if (p0) R1 = add(R2, R3) -// if (!p0) R1 = add(R2, R3) -// if (p0.new) R1 = add(R2, R3) -// if (!p0.new) R1 = add(R2, R3) -bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; } -bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValue(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; +} + - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const { + return isNewValueJump(MI) || isNewValueStore(MI); } -bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); +bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { + return isNewValue(MI) && MI->isBranch(); } -bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const { + return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); +} + + +bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} + + +bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} - // Make sure that the instruction is predicated. - assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); + +// Returns true if a particular operand is extendable for an instruction. +bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, + unsigned OperandNum) const { + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) + == OperandNum; } + +bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const { + return getAddrMode(MI) == HexagonII::PostInc; +} + + bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } + bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; - assert(isPredicated(Opcode)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } -// Returns true, if a ST insn can be promoted to a new-value store. 
-bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + +bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); +} + - return ((F >> HexagonII::mayNVStorePos) & - HexagonII::mayNVStoreMask); +bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + // Make sure that the instruction is predicated. + assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); } -bool -HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) { - MachineOperand MO = MI->getOperand(oper); - if (MO.isReg() && MO.isDef()) { - const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); - if (RC == &Hexagon::PredRegsRegClass) { - Pred.push_back(MO); - return true; - } - } + +bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; +} + + +bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask; +} + + +bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + assert(get(Opcode).isBranch() && + (isPredicatedNew(Opcode) || isNewValue(Opcode))); + return (F >> HexagonII::TakenPos) & HexagonII::TakenMask; +} + + +bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { + return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 || + MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT; +} + + +bool HexagonInstrInfo::isSolo(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::SoloPos) & HexagonII::SoloMask; +} + + +bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::STriw_pred : + case Hexagon::LDriw_pred : + return true; + default: + return false; } - return false; } -bool -HexagonInstrInfo:: -SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const { - // TODO: Fix this - return false; +// Returns true when SU has a timing class TC1. +bool HexagonInstrInfo::isTC1(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + //case Hexagon::Sched::M_tc_1_SLOT23: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + return true; + + default: + return false; + } } -// -// We indicate that we want to reverse the branch by -// inserting the reversed branching opcode. 
-// -bool HexagonInstrInfo::ReverseBranchCondition( - SmallVectorImpl<MachineOperand> &Cond) const { - if (Cond.empty()) +bool HexagonInstrInfo::isTC2(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123: + case Hexagon::Sched::ALU64_tc_2_SLOT23: + case Hexagon::Sched::CR_tc_2_SLOT3: + case Hexagon::Sched::M_tc_2_SLOT23: + case Hexagon::Sched::S_2op_tc_2_SLOT23: + case Hexagon::Sched::S_3op_tc_2_SLOT23: return true; - assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); - Opcode_t opcode = Cond[0].getImm(); - //unsigned temp; - assert(get(opcode).isBranch() && "Should be a branching condition."); - if (isEndLoopN(opcode)) + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC2Early(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU64_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT3: + case Hexagon::Sched::J_tc_2early_SLOT0123: + case Hexagon::Sched::J_tc_2early_SLOT2: + case Hexagon::Sched::J_tc_2early_SLOT23: + case Hexagon::Sched::S_2op_tc_2early_SLOT23: + case Hexagon::Sched::S_3op_tc_2early_SLOT23: return true; - Opcode_t NewOpcode = getInvertedPredicatedOpcode(opcode); - Cond[0].setImm(NewOpcode); - return false; + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const { + if (!MI) + return false; + + unsigned SchedClass = MI->getDesc().getSchedClass(); + return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23; } -bool HexagonInstrInfo:: -isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, - const BranchProbability &Probability) const { - return (NumInstrs <= 4); +bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const { + if (!MI) + return false; + + const uint64_t V = getType(MI); + return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST; } -bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::L4_return: - case Hexagon::L4_return_t: - case Hexagon::L4_return_f: - case Hexagon::L4_return_tnew_pnt: - case Hexagon::L4_return_fnew_pnt: - case Hexagon::L4_return_tnew_pt: - case Hexagon::L4_return_fnew_pt: - return true; + +// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
+// +bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const { + if (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN && + Offset <= Hexagon_MEMV_AUTOINC_MAX && + (Offset & 0x3f) == 0); + } + // 128B + if (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN_128B && + Offset <= Hexagon_MEMV_AUTOINC_MAX_128B && + (Offset & 0x7f) == 0); + } + if (VT == MVT::i64) { + return (Offset >= Hexagon_MEMD_AUTOINC_MIN && + Offset <= Hexagon_MEMD_AUTOINC_MAX && + (Offset & 0x7) == 0); + } + if (VT == MVT::i32) { + return (Offset >= Hexagon_MEMW_AUTOINC_MIN && + Offset <= Hexagon_MEMW_AUTOINC_MAX && + (Offset & 0x3) == 0); + } + if (VT == MVT::i16) { + return (Offset >= Hexagon_MEMH_AUTOINC_MIN && + Offset <= Hexagon_MEMH_AUTOINC_MAX && + (Offset & 0x1) == 0); + } + if (VT == MVT::i8) { + return (Offset >= Hexagon_MEMB_AUTOINC_MIN && + Offset <= Hexagon_MEMB_AUTOINC_MAX); } + llvm_unreachable("Not an auto-inc opc!"); } @@ -1222,6 +2329,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, // misaligns with respect to load size. switch (Opcode) { + case Hexagon::STriq_pred_V6: + case Hexagon::STriq_pred_vec_V6: + case Hexagon::STriv_pseudo_V6: + case Hexagon::STrivv_pseudo_V6: + case Hexagon::LDriq_pred_V6: + case Hexagon::LDriq_pred_vec_V6: + case Hexagon::LDriv_pseudo_V6: + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: + case Hexagon::STrivv_indexed: + case Hexagon::V6_vL32b_ai: + case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vL32Ub_ai: + case Hexagon::V6_vS32Ub_ai: + return (Offset >= Hexagon_MEMV_OFFSET_MIN) && + (Offset <= Hexagon_MEMV_OFFSET_MAX); + + case Hexagon::STriq_pred_V6_128B: + case Hexagon::STriq_pred_vec_V6_128B: + case Hexagon::STriv_pseudo_V6_128B: + case Hexagon::STrivv_pseudo_V6_128B: + case Hexagon::LDriq_pred_V6_128B: + case Hexagon::LDriq_pred_vec_V6_128B: + case Hexagon::LDriv_pseudo_V6_128B: + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + case Hexagon::STrivv_indexed_128B: + case Hexagon::V6_vL32b_ai_128B: + case Hexagon::V6_vS32b_ai_128B: + case Hexagon::V6_vL32Ub_ai_128B: + case Hexagon::V6_vS32Ub_ai_128B: + return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && + (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: return isUInt<10>(Offset); @@ -1248,8 +2389,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, (Offset <= Hexagon_MEMH_OFFSET_MAX); case Hexagon::L2_loadrb_io: - case Hexagon::S2_storerb_io: case Hexagon::L2_loadrub_io: + case Hexagon::S2_storerb_io: return (Offset >= Hexagon_MEMB_OFFSET_MIN) && (Offset <= Hexagon_MEMB_OFFSET_MAX); @@ -1257,28 +2398,28 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, return (Offset >= Hexagon_ADDI_OFFSET_MIN) && (Offset <= Hexagon_ADDI_OFFSET_MAX); - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : return (0 <= Offset && Offset <= 255); - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case 
Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : return (0 <= Offset && Offset <= 127); - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : return (0 <= Offset && Offset <= 63); // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of @@ -1291,223 +2432,556 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::TFR_FIA: case Hexagon::INLINEASM: return true; - } + + case Hexagon::L2_ploadrbt_io: + case Hexagon::L2_ploadrbf_io: + case Hexagon::L2_ploadrubt_io: + case Hexagon::L2_ploadrubf_io: + case Hexagon::S2_pstorerbt_io: + case Hexagon::S2_pstorerbf_io: + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + return isUInt<6>(Offset); + + case Hexagon::L2_ploadrht_io: + case Hexagon::L2_ploadrhf_io: + case Hexagon::L2_ploadruht_io: + case Hexagon::L2_ploadruhf_io: + case Hexagon::S2_pstorerht_io: + case Hexagon::S2_pstorerhf_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + return isShiftedUInt<6,1>(Offset); + + case Hexagon::L2_ploadrit_io: + case Hexagon::L2_ploadrif_io: + case Hexagon::S2_pstorerit_io: + case Hexagon::S2_pstorerif_io: + case Hexagon::S4_storeiri_io: + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + return isShiftedUInt<6,2>(Offset); + + case Hexagon::L2_ploadrdt_io: + case Hexagon::L2_ploadrdf_io: + case Hexagon::S2_pstorerdt_io: + case Hexagon::S2_pstorerdf_io: + return isShiftedUInt<6,3>(Offset); + } // switch llvm_unreachable("No offset range is defined for this opcode. " "Please define it in the above switch statement!"); } -// -// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
-// -bool HexagonInstrInfo:: -isValidAutoIncImm(const EVT VT, const int Offset) const { +bool HexagonInstrInfo::isVecAcc(const MachineInstr *MI) const { + return MI && isV60VectorInstruction(MI) && isAccumulator(MI); +} - if (VT == MVT::i64) { - return (Offset >= Hexagon_MEMD_AUTOINC_MIN && - Offset <= Hexagon_MEMD_AUTOINC_MAX && - (Offset & 0x7) == 0); - } - if (VT == MVT::i32) { - return (Offset >= Hexagon_MEMW_AUTOINC_MIN && - Offset <= Hexagon_MEMW_AUTOINC_MAX && - (Offset & 0x3) == 0); - } - if (VT == MVT::i16) { - return (Offset >= Hexagon_MEMH_AUTOINC_MIN && - Offset <= Hexagon_MEMH_AUTOINC_MAX && - (Offset & 0x1) == 0); - } - if (VT == MVT::i8) { - return (Offset >= Hexagon_MEMB_AUTOINC_MIN && - Offset <= Hexagon_MEMB_AUTOINC_MAX); + +bool HexagonInstrInfo::isVecALU(const MachineInstr *MI) const { + if (!MI) + return false; + const uint64_t F = get(MI->getOpcode()).TSFlags; + const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask); + return + V == HexagonII::TypeCVI_VA || + V == HexagonII::TypeCVI_VA_DV; +} + + +bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI)) + return true; + + if (EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI))) + return true; + + if (mayBeNewStore(ConsMI)) + return true; + + return false; +} + + +/// \brief Can these instructions execute at the same time in a bundle. +bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const { + if (DisableNVSchedule) + return false; + if (mayBeNewStore(Second)) { + // Make sure the definition of the first instruction is the value being + // stored. + const MachineOperand &Stored = + Second->getOperand(Second->getNumOperands() - 1); + if (!Stored.isReg()) + return false; + for (unsigned i = 0, e = First->getNumOperands(); i < e; ++i) { + const MachineOperand &Op = First->getOperand(i); + if (Op.isReg() && Op.isDef() && Op.getReg() == Stored.getReg()) + return true; + } } - llvm_unreachable("Not an auto-inc opc!"); + return false; +} + + +bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; } -bool HexagonInstrInfo:: -isMemOp(const MachineInstr *MI) const { -// return MI->getDesc().mayLoad() && MI->getDesc().mayStore(); - - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: - case Hexagon::L4_ior_memopb_io: - case Hexagon::L4_ior_memoph_io: - case Hexagon::L4_ior_memopw_io: - case Hexagon::L4_iand_memopb_io: - case Hexagon::L4_iand_memoph_io: - case Hexagon::L4_iand_memopw_io: +// Returns true if an instruction can be converted into a non-extended +// equivalent instruction. 
+bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr *MI) const { + short NonExtOpcode; + // Check if the instruction has a register form that uses register in place + // of the extended operand, if so return that as the non-extended form. + if (Hexagon::getRegForm(MI->getOpcode()) >= 0) + return true; + + if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { + // Check addressing mode and retrieve non-ext equivalent instruction. + + switch (getAddrMode(MI)) { + case HexagonII::Absolute : + // Load/store with absolute addressing mode can be converted into + // base+offset mode. + NonExtOpcode = Hexagon::getBaseWithImmOffset(MI->getOpcode()); + break; + case HexagonII::BaseImmOffset : + // Load/store with base+offset addressing mode can be converted into + // base+register offset addressing mode. However left shift operand should + // be set to 0. + NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); + break; + case HexagonII::BaseLongOffset: + NonExtOpcode = Hexagon::getRegShlForm(MI->getOpcode()); + break; + default: + return false; + } + if (NonExtOpcode < 0) + return false; return true; } return false; } -bool HexagonInstrInfo:: -isSpillPredRegOp(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::STriw_pred : - case Hexagon::LDriw_pred : +bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), + Hexagon::InstrType_Pseudo) >= 0; +} + + +bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) return true; + ++I; } + return false; } -bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtui: + +// Returns true, if a LD insn can be promoted to a cur load. +bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr *MI) const { + auto &HST = MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(); + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) && + HST.hasV60TOps(); +} + + +// Returns true, if a ST insn can be promoted to a new-value store. +bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + // There is no stall when ProdMI is not a V60 vector. + if (!isV60VectorInstruction(ProdMI)) + return false; + + // There is no stall when ProdMI and ConsMI are not dependent. + if (!isDependent(ProdMI, ConsMI)) + return false; + + // When Forward Scheduling is enabled, there is no stall if ProdMI and ConsMI + // are scheduled in consecutive packets. + if (isVecUsableNextPacket(ProdMI, ConsMI)) + return false; + + return true; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator BII) const { + // There is no stall when I is not a V60 vector. 
+ if (!isV60VectorInstruction(MI)) + return false; + + MachineBasicBlock::const_instr_iterator MII = BII; + MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end(); + + if (!(*MII).isBundle()) { + const MachineInstr *J = &*MII; + if (!isV60VectorInstruction(J)) + return false; + else if (isVecUsableNextPacket(J, MI)) + return false; + return true; + } + + for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) { + const MachineInstr *J = &*MII; + if (producesStall(J, MI)) return true; } + return false; +} + + +bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr *MI, + unsigned PredReg) const { + for (unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg)) + return false; // Predicate register must be explicitly defined. + } + + // Hexagon Programmer's Reference says that decbin, memw_locked, and + // memd_locked cannot be used as .new as well, + // but we don't seem to have these instructions defined. + return MI->getOpcode() != Hexagon::A4_tlbmatch; +} + + +bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const { + return (Opcode == Hexagon::J2_jumpt) || + (Opcode == Hexagon::J2_jumpf) || + (Opcode == Hexagon::J2_jumptnew) || + (Opcode == Hexagon::J2_jumpfnew) || + (Opcode == Hexagon::J2_jumptnewpt) || + (Opcode == Hexagon::J2_jumpfnewpt); +} + + +bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || !isPredicated(Cond[0].getImm())) + return false; + return !isPredicatedTrue(Cond[0].getImm()); +} + + +unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask; +} + + +// Returns the base register in a memory access (load/store). The offset is +// returned in Offset and the access size is returned in AccessSize. +unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr *MI, + int &Offset, unsigned &AccessSize) const { + // Return if it is not a base+offset type instruction or a MemOp. + if (getAddrMode(MI) != HexagonII::BaseImmOffset && + getAddrMode(MI) != HexagonII::BaseLongOffset && + !isMemOp(MI) && !isPostIncrement(MI)) + return 0; + + // Since it is a memory access instruction, getMemAccessSize() should never + // return 0. + assert (getMemAccessSize(MI) && + "BaseImmOffset or BaseLongOffset or MemOp without accessSize"); + + // Return Values of getMemAccessSize() are + // 0 - Checked in the assert above. + // 1, 2, 3, 4 & 7, 8 - The statement below is correct for all these. + // MemAccessSize is represented as 1+log2(N) where N is size in bits. + AccessSize = (1U << (getMemAccessSize(MI) - 1)); + + unsigned basePos = 0, offsetPos = 0; + if (!getBaseAndOffsetPosition(MI, basePos, offsetPos)) + return 0; + + // Post increment updates its EA after the mem access, + // so we need to treat its offset as zero. + if (isPostIncrement(MI)) + Offset = 0; + else { + Offset = MI->getOperand(offsetPos).getImm(); + } + + return MI->getOperand(basePos).getReg(); +} + + +/// Return the position of the base and offset operands for this instruction. +bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI, + unsigned &BasePos, unsigned &OffsetPos) const { + // Deal with memops first. 
+ if (isMemOp(MI)) { + assert (MI->getOperand(0).isReg() && MI->getOperand(1).isImm() && + "Bad Memop."); + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayStore()) { + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayLoad()) { + BasePos = 1; + OffsetPos = 2; + } else + return false; + + if (isPredicated(MI)) { + BasePos++; + OffsetPos++; + } + if (isPostIncrement(MI)) { + BasePos++; + OffsetPos++; + } + + if (!MI->getOperand(BasePos).isReg() || !MI->getOperand(OffsetPos).isImm()) + return false; + + return true; +} + + +// Inserts branching instructions in reverse order of their occurence. +// e.g. jump_t t1 (i1) +// jump t2 (i2) +// Jumpers = {i2, i1} +SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs( + MachineBasicBlock& MBB) const { + SmallVector<MachineInstr*, 2> Jumpers; + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + if (I == MBB.instr_begin()) + return Jumpers; + + // A basic block may looks like this: + // + // [ insn + // EH_LABEL + // insn + // insn + // insn + // EH_LABEL + // insn ] + // + // It has two succs but does not have a terminator + // Don't know how to handle it. + do { + --I; + if (I->isEHLabel()) + return Jumpers; + } while (I != MBB.instr_begin()); + + I = MBB.instr_end(); + --I; + + while (I->isDebugValue()) { + if (I == MBB.instr_begin()) + return Jumpers; + --I; + } + if (!isUnpredicatedTerminator(&*I)) + return Jumpers; + + // Get the last instruction in the block. + MachineInstr *LastInst = &*I; + Jumpers.push_back(LastInst); + MachineInstr *SecondLastInst = nullptr; + // Find one more terminator if present. + do { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { + if (!SecondLastInst) { + SecondLastInst = &*I; + Jumpers.push_back(SecondLastInst); + } else // This is a third branch. + return Jumpers; + } + if (I == MBB.instr_begin()) + break; + --I; + } while (true); + return Jumpers; } -bool HexagonInstrInfo:: -isConditionalTransfer (const MachineInstr *MI) const { + +// Returns Operand Index for the constant extended instruction. +unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask; +} + +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + switch (MI->getOpcode()) { - default: return false; - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - return true; + default: + return HexagonII::HCG_None; + // + // Compound pairs. 
+ // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2" + // "Rd16=#U6 ; jump #r9:2" + // "Rd16=Rs16 ; jump #r9:2" + // + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgtu: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg)) + return HexagonII::HCG_A; + break; + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtui: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + ((isUInt<5>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HCG_A; + break; + case Hexagon::S2_tstbit_i: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + MI->getOperand(2).isImm() && + isIntRegForSubInst(Src1Reg) && (MI->getOperand(2).getImm() == 0)) + return HexagonII::HCG_A; + break; + // The fact that .new form is used pretty much guarantees + // that predicate register will match. Nevertheless, + // there could be some false positives without additional + // checking. + case Hexagon::J2_jumptnew: + case Hexagon::J2_jumpfnew: + case Hexagon::J2_jumptnewpt: + case Hexagon::J2_jumpfnewpt: + Src1Reg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(Src1Reg) && + (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)) + return HexagonII::HCG_B; + break; + // Transfer and jump: + // Rd=#U6 ; jump #r9:2 + // Rd=Rs ; jump #r9:2 + // Do not test for jump range here. + case Hexagon::J2_jump: + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + return HexagonII::HCG_C; + break; } + + return HexagonII::HCG_None; +} + + +// Returns -1 when there is no opcode found. 
+unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const { + assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A); + assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B); + if ((GA->getOpcode() != Hexagon::C2_cmpeqi) || + (GB->getOpcode() != Hexagon::J2_jumptnew)) + return -1; + unsigned DestReg = GA->getOperand(0).getReg(); + if (!GB->readsRegister(DestReg)) + return -1; + if (DestReg == Hexagon::P0) + return Hexagon::J4_cmpeqi_tp0_jump_nt; + if (DestReg == Hexagon::P1) + return Hexagon::J4_cmpeqi_tp1_jump_nt; + return -1; } -bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::A2_paddit: - case Hexagon::A2_paddif: - case Hexagon::C2_ccombinewt: - case Hexagon::C2_ccombinewf: - return true; + +int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const { + enum Hexagon::PredSense inPredSense; + inPredSense = invertPredicate ? Hexagon::PredSense_false : + Hexagon::PredSense_true; + int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense); + if (CondOpcode >= 0) // Valid Conditional opcode/instruction + return CondOpcode; + + // This switch case will be removed once all the instructions have been + // modified to use relation maps. + switch(Opc) { + case Hexagon::TFRI_f: + return !invertPredicate ? 
Hexagon::TFRI_cPt_f : + Hexagon::TFRI_cNotPt_f; } + + llvm_unreachable("Unexpected predicable instruction"); } -bool HexagonInstrInfo:: -isConditionalLoad (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L2_ploadrdt_io : - case Hexagon::L2_ploadrdf_io: - case Hexagon::L2_ploadrit_io: - case Hexagon::L2_ploadrif_io: - case Hexagon::L2_ploadrht_io: - case Hexagon::L2_ploadrhf_io: - case Hexagon::L2_ploadrbt_io: - case Hexagon::L2_ploadrbf_io: - case Hexagon::L2_ploadruht_io: - case Hexagon::L2_ploadruhf_io: - case Hexagon::L2_ploadrubt_io: - case Hexagon::L2_ploadrubf_io: - case Hexagon::L2_ploadrdt_pi: - case Hexagon::L2_ploadrdf_pi: - case Hexagon::L2_ploadrit_pi: - case Hexagon::L2_ploadrif_pi: - case Hexagon::L2_ploadrht_pi: - case Hexagon::L2_ploadrhf_pi: - case Hexagon::L2_ploadrbt_pi: - case Hexagon::L2_ploadrbf_pi: - case Hexagon::L2_ploadruht_pi: - case Hexagon::L2_ploadruhf_pi: - case Hexagon::L2_ploadrubt_pi: - case Hexagon::L2_ploadrubf_pi: - case Hexagon::L4_ploadrdt_rr: - case Hexagon::L4_ploadrdf_rr: - case Hexagon::L4_ploadrbt_rr: - case Hexagon::L4_ploadrbf_rr: - case Hexagon::L4_ploadrubt_rr: - case Hexagon::L4_ploadrubf_rr: - case Hexagon::L4_ploadrht_rr: - case Hexagon::L4_ploadrhf_rr: - case Hexagon::L4_ploadruht_rr: - case Hexagon::L4_ploadruhf_rr: - case Hexagon::L4_ploadrit_rr: - case Hexagon::L4_ploadrif_rr: - return true; + +// Return the cur value instruction for a given store. +int HexagonInstrInfo::getDotCurOp(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown .cur type"); + case Hexagon::V6_vL32b_pi: + return Hexagon::V6_vL32b_cur_pi; + case Hexagon::V6_vL32b_ai: + return Hexagon::V6_vL32b_cur_ai; + //128B + case Hexagon::V6_vL32b_pi_128B: + return Hexagon::V6_vL32b_cur_pi_128B; + case Hexagon::V6_vL32b_ai_128B: + return Hexagon::V6_vL32b_cur_ai_128B; } + return 0; } -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. + + +// The diagram below shows the steps involved in the conversion of a predicated +// store instruction to its .new predicated new-value form. // // p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ] // ^ ^ @@ -1524,8 +2998,6 @@ isConditionalLoad (const MachineInstr* MI) const { // p.old store // [if (p0)memw(R0+#0)=R2] // -// The above diagram shows the steps involoved in the conversion of a predicated -// store instruction to its .new predicated new-value form. // // The following set of instructions further explains the scenario where // conditional new-value store becomes invalid when promoted to .new predicate @@ -1538,105 +3010,33 @@ isConditionalLoad (const MachineInstr* MI) const { // the first two instructions because in instr 1, r0 is conditional on old value // of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which // is not valid for new-value stores. 
-bool HexagonInstrInfo:: -isConditionalStore (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. - case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. See diagram below: - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - // - // +---------------------+ - // /-----| if (p0) memw(..)=r0 |---------\~ - // || +---------------------+ || - // promote || /\ /\ || promote - // || /||\ /||\ || - // \||/ demote || \||/ - // \/ || || \/ - // +-------------------------+ || +-------------------------+ - // | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | - // +-------------------------+ || +-------------------------+ - // || || || - // || demote \||/ - // promote || \/ NOT possible - // || || /\~ - // \||/ || /||\~ - // \/ || || - // +-----------------------------+ - // | if (p0.new) memw(..)=r0.new | - // +-----------------------------+ - // Double Dot New Store - // - } -} - - -bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { - if (isNewValue(MI) && isBranch(MI)) - return true; - return false; -} - -bool HexagonInstrInfo::isNewValueJump(Opcode_t Opcode) const { - return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); -} - -bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const { - return (getAddrMode(MI) == HexagonII::PostInc); -} - -// Returns true, if any one of the operands is a dot new -// insn, whether it is predicated dot new or register dot new. -bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { - return (isNewValueInst(MI) || - (isPredicated(MI) && isPredicatedNew(MI))); -} - +// Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded +// from the "Conditional Store" list. Because a predicated new value store +// would NOT be promoted to a double dot new store. See diagram below: +// This function returns yes for those stores that are predicated but not +// yet promoted to predicate dot new instructions. 
+// +// +---------------------+ +// /-----| if (p0) memw(..)=r0 |---------\~ +// || +---------------------+ || +// promote || /\ /\ || promote +// || /||\ /||\ || +// \||/ demote || \||/ +// \/ || || \/ +// +-------------------------+ || +-------------------------+ +// | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | +// +-------------------------+ || +-------------------------+ +// || || || +// || demote \||/ +// promote || \/ NOT possible +// || || /\~ +// \||/ || /||\~ +// \/ || || +// +-----------------------------+ +// | if (p0.new) memw(..)=r0.new | +// +-----------------------------+ +// Double Dot New Store +// // Returns the most basic instruction for the .new predicated instructions and // new-value stores. // For example, all of the following instructions will be converted back to the @@ -1645,24 +3045,23 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { // 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1 // 3) if (p0.new) memw(R0+#0) = R1 ---> // +// To understand the translation of instruction 1 to its original form, consider +// a packet with 3 instructions. +// { p0 = cmp.eq(R0,R1) +// if (p0.new) R2 = add(R3, R4) +// R5 = add (R3, R1) +// } +// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet +// +// This instruction can be part of the previous packet only if both p0 and R2 +// are promoted to .new values. This promotion happens in steps, first +// predicate register is promoted to .new and in the next iteration R2 is +// promoted. Therefore, in case of dependence check failure (due to R5) during +// next iteration, it should be converted back to its most basic form. -int HexagonInstrInfo::GetDotOldOp(const int opc) const { - int NewOp = opc; - if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form - NewOp = Hexagon::getPredOldOpcode(NewOp); - assert(NewOp >= 0 && - "Couldn't change predicate new instruction to its old form."); - } - - if (isNewValueStore(NewOp)) { // Convert into non-new-value format - NewOp = Hexagon::getNonNVStore(NewOp); - assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); - } - return NewOp; -} // Return the new value instruction for a given store. -int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { +int HexagonInstrInfo::getDotNewOp(const MachineInstr* MI) const { int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode()); if (NVOpcode >= 0) // Valid new-value store instruction. return NVOpcode; @@ -1672,12 +3071,6 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S4_storerb_ur: return Hexagon::S4_storerbnew_ur; - case Hexagon::S4_storerh_ur: - return Hexagon::S4_storerhnew_ur; - - case Hexagon::S4_storeri_ur: - return Hexagon::S4_storerinew_ur; - case Hexagon::S2_storerb_pci: return Hexagon::S2_storerb_pci; @@ -1692,203 +3085,496 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S2_storerf_pci: return Hexagon::S2_storerf_pci; + + case Hexagon::V6_vS32b_ai: + return Hexagon::V6_vS32b_new_ai; + + case Hexagon::V6_vS32b_pi: + return Hexagon::V6_vS32b_new_pi; + + // 128B + case Hexagon::V6_vS32b_ai_128B: + return Hexagon::V6_vS32b_new_ai_128B; + + case Hexagon::V6_vS32b_pi_128B: + return Hexagon::V6_vS32b_new_pi_128B; } return 0; } -// Return .new predicate version for an instruction. 
-int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const { +// Returns the opcode to use when converting MI, which is a conditional jump, +// into a conditional instruction which uses the .new value of the predicate. +// We also use branch probabilities to add a hint to the jump. +int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { + // We assume that block can have at most two successors. + bool taken = false; + const MachineBasicBlock *Src = MI->getParent(); + const MachineOperand *BrTarget = &MI->getOperand(1); + const MachineBasicBlock *Dst = BrTarget->getMBB(); + const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); + if (Prediction >= BranchProbability(1,2)) + taken = true; + + switch (MI->getOpcode()) { + case Hexagon::J2_jumpt: + return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; + case Hexagon::J2_jumpf: + return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; + + default: + llvm_unreachable("Unexpected jump instruction."); + } +} + + +// Return .new predicate version for an instruction. +int HexagonInstrInfo::getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode()); if (NewOpcode >= 0) // Valid predicate new instruction return NewOpcode; switch (MI->getOpcode()) { - default: llvm_unreachable("Unknown .new type"); // Condtional Jumps case Hexagon::J2_jumpt: case Hexagon::J2_jumpf: return getDotNewPredJumpOp(MI, MBPI); - case Hexagon::J2_jumprt: - return Hexagon::J2_jumptnewpt; - - case Hexagon::J2_jumprf: - return Hexagon::J2_jumprfnewpt; - - case Hexagon::JMPrett: - return Hexagon::J2_jumprtnewpt; - - case Hexagon::JMPretf: - return Hexagon::J2_jumprfnewpt; - - - // Conditional combine - case Hexagon::C2_ccombinewt: - return Hexagon::C2_ccombinewnewt; - case Hexagon::C2_ccombinewf: - return Hexagon::C2_ccombinewnewf; + default: + assert(0 && "Unknown .new type"); } + return 0; } -unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { - const uint64_t F = MI->getDesc().TSFlags; +int HexagonInstrInfo::getDotOldOp(const int opc) const { + int NewOp = opc; + if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form + NewOp = Hexagon::getPredOldOpcode(NewOp); + assert(NewOp >= 0 && + "Couldn't change predicate new instruction to its old form."); + } - return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask); + if (isNewValueStore(NewOp)) { // Convert into non-new-value format + NewOp = Hexagon::getNonNVStore(NewOp); + assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); + } + return NewOp; } -/// immediateExtend - Changes the instruction in place to one using an immediate -/// extender. -void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { - assert((isExtendable(MI)||isConstExtended(MI)) && - "Instruction must be extendable"); - // Find which operand is extendable. - short ExtOpNum = getCExtOpNum(MI); - MachineOperand &MO = MI->getOperand(ExtOpNum); - // This needs to be something we understand. - assert((MO.isMBB() || MO.isImm()) && - "Branch with unknown extendable field type"); - // Mark given operand as extended. 
- MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); -} -DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( - const TargetSubtargetInfo &STI) const { - const InstrItineraryData *II = STI.getInstrItineraryData(); - return static_cast<const HexagonSubtarget &>(STI).createDFAPacketizer(II); -} - -bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - // Debug info is never a scheduling boundary. It's necessary to be explicit - // due to the special treatment of IT instructions below, otherwise a - // dbg_value followed by an IT will result in the IT instruction being - // considered a scheduling hazard, which is wrong. It should be the actual - // instruction preceding the dbg_value instruction(s), just like it is - // when debug info is not present. - if (MI->isDebugValue()) - return false; +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + auto &HRI = getRegisterInfo(); - // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm()) - return true; + switch (MI->getOpcode()) { + default: + return HexagonII::HSIG_None; + // + // Group L1: + // + // Rd = memw(Rs+#u4:2) + // Rd = memub(Rs+#u4:0) + case Hexagon::L2_loadri_io: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + // Special case this one from Group L2. + // Rd = memw(r29+#u5:2) + if (isIntRegForSubInst(DstReg)) { + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + // Rd = memw(Rs+#u4:2) + if (isIntRegForSubInst(SrcReg) && + (MI->getOperand(2).isImm() && + isShiftedUInt<4,2>(MI->getOperand(2).getImm()))) + return HexagonII::HSIG_L1; + } + break; + case Hexagon::L2_loadrub_io: + // Rd = memub(Rs+#u4:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<4>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L1; + break; + // + // Group L2: + // + // Rd = memh/memuh(Rs+#u3:1) + // Rd = memb(Rs+#u3:0) + // Rd = memw(r29+#u5:2) - Handled above. 
+ // Rdd = memd(r29+#u5:3) + // deallocframe + // [if ([!]p0[.new])] dealloc_return + // [if ([!]p0[.new])] jumpr r31 + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadruh_io: + // Rd = memh/memuh(Rs+#u3:1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isShiftedUInt<3,1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrb_io: + // Rd = memb(Rs+#u3:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isUInt<3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrd_io: + // Rdd = memd(r29+#u5:3) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + // dealloc_return is not documented in Hexagon Manual, but marked + // with A_SUBINSN attribute in iset_v4classic.py. + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + case Hexagon::L4_return: + case Hexagon::L2_deallocframe: + return HexagonII::HSIG_L2; + case Hexagon::EH_RETURN_JMPR: + case Hexagon::JMPret : + // jumpr r31 + // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>. + DstReg = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)) + return HexagonII::HSIG_L2; + break; + case Hexagon::JMPrett: + case Hexagon::JMPretf: + case Hexagon::JMPrettnewpt: + case Hexagon::JMPretfnewpt : + case Hexagon::JMPrettnew : + case Hexagon::JMPretfnew : + DstReg = MI->getOperand(1).getReg(); + SrcReg = MI->getOperand(0).getReg(); + // [if ([!]p0[.new])] jumpr r31 + if ((Hexagon::PredRegsRegClass.contains(SrcReg) && + (Hexagon::P0 == SrcReg)) && + (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))) + return HexagonII::HSIG_L2; + break; + case Hexagon::L4_return_t : + case Hexagon::L4_return_f : + case Hexagon::L4_return_tnew_pnt : + case Hexagon::L4_return_fnew_pnt : + case Hexagon::L4_return_tnew_pt : + case Hexagon::L4_return_fnew_pt : + // [if ([!]p0[.new])] dealloc_return + SrcReg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg)) + return HexagonII::HSIG_L2; + break; + // + // Group S1: + // + // memw(Rs+#u4:2) = Rt + // memb(Rs+#u4:0) = Rt + case Hexagon::S2_storeri_io: + // Special case this one from Group S2. 
+ // memw(r29+#u5:2) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::IntRegsRegClass.contains(Src1Reg) && + isIntRegForSubInst(Src2Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedUInt<5,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + // memw(Rs+#u4:2) = Rt + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerb_io: + // memb(Rs+#u4:0) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && isUInt<4>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group S2: + // + // memh(Rs+#u3:1) = Rt + // memw(r29+#u5:2) = Rt + // memd(r29+#s6:3) = Rtt + // memw(Rs+#u4:2) = #U1 + // memb(Rs+#u4) = #U1 + // allocframe(#u5:3) + case Hexagon::S2_storerh_io: + // memh(Rs+#u3:1) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<3,1>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerd_io: + // memd(r29+#s6:3) = Rtt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(Src2Reg, HRI) && + Hexagon::IntRegsRegClass.contains(Src1Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedInt<6,3>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeiri_io: + // memw(Rs+#u4:2) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm()) && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeirb_io: + // memb(Rs+#u4) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isUInt<4>(MI->getOperand(1).getImm()) && MI->getOperand(2).isImm() && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S2_allocframe: + if (MI->getOperand(0).isImm() && + isShiftedUInt<5,3>(MI->getOperand(0).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group A: + // + // Rx = add(Rx,#s7) + // Rd = Rs + // Rd = #u6 + // Rd = #-1 + // if ([!]P0[.new]) Rd = #0 + // Rd = add(r29,#u6:2) + // Rx = add(Rx,Rs) + // P0 = cmp.eq(Rs,#u2) + // Rdd = combine(#0,Rs) + // Rdd = combine(Rs,#0) + // Rdd = combine(#u2,#U2) + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + // Rd = sxth/sxtb/zxtb/zxth(Rs) + // Rd = and(Rs,#1) + case Hexagon::A2_addi: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg)) { + // Rd = add(r29,#u6:2) + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && MI->getOperand(2).isImm() && + isShiftedUInt<6,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rx = add(Rx,#s7) + if ((DstReg == SrcReg) && MI->getOperand(2).isImm() && + isInt<7>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + if (isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + 
((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HSIG_A; + } + break; + case Hexagon::A2_add: + // Rx = add(Rx,Rs) + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) && + isIntRegForSubInst(Src2Reg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_andir: + // Same as zxtb. + // Rd16=and(Rs16,#255) + // Rd16=and(Rs16,#1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + ((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == 255))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + // Rd = #-1 + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmovenewit: + case Hexagon::C2_cmoveif: + case Hexagon::C2_cmovenewif: + // if ([!]P0[.new]) Rd = #0 + // Actual form: + // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>; + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && + Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmpeqi: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + // Rdd = combine(#u2,#U2) + DstReg = MI->getOperand(0).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + ((MI->getOperand(1).isImm() && isUInt<2>(MI->getOperand(1).getImm())) || + (MI->getOperand(1).isGlobal() && + isUInt<2>(MI->getOperand(1).getOffset()))) && + ((MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).isGlobal() && + isUInt<2>(MI->getOperand(2).getOffset())))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineri: + // Rdd = combine(Rs,#0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) || + (MI->getOperand(2).isGlobal() && MI->getOperand(2).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineir: + // Rdd = combine(#0,Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) || + (MI->getOperand(1).isGlobal() && MI->getOperand(1).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_sxtb: + case Hexagon::A2_sxth: + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + // Rd = 
sxth/sxtb/zxtb/zxth(Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + } - return false; + return HexagonII::HSIG_None; } -bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; - if (isExtended) // Instruction must be extended. - return true; - unsigned isExtendable = - (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; - if (!isExtendable) - return false; - - short ExtOpNum = getCExtOpNum(MI); - const MachineOperand &MO = MI->getOperand(ExtOpNum); - // Use MO operand flags to determine if MO - // has the HMOTF_ConstExtended flag set. - if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; - // If this is a Machine BB address we are talking about, and it is - // not marked as extended, say so. - if (MO.isMBB()) - return false; - - // We could be using an instruction with an extendable immediate and shoehorn - // a global address into it. If it is a global address it will be constant - // extended. We do this for COMBINE. - // We currently only handle isGlobal() because it is the only kind of - // object we are going to end up with here for now. - // In the future we probably should add isSymbol(), etc. - if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() || - MO.isJTI() || MO.isCPI()) - return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); +short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Real); +} - int MinValue = getMinValue(MI); - int MaxValue = getMaxValue(MI); - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); +// Return first non-debug instruction in the basic block. +MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB) + const { + for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) { + MachineInstr *MI = &*MII; + if (MI->isDebugValue()) + continue; + return MI; + } + return nullptr; } -// Return the number of bytes required to encode the instruction. -// Hexagon instructions are fixed length, 4 bytes, unless they -// use a constant extender, which requires another 4 bytes. -// For debug instructions and prolog labels, return 0. -unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { - if (MI->isDebugValue() || MI->isPosition()) - return 0; +unsigned HexagonInstrInfo::getInstrTimingClassLatency( + const InstrItineraryData *ItinData, const MachineInstr *MI) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) + return getInstrLatency(ItinData, MI); - unsigned Size = MI->getDesc().getSize(); - if (!Size) - // Assume the default insn size in case it cannot be determined - // for whatever reason. - Size = HEXAGON_INSTR_SIZE; - - if (isConstExtended(MI) || isExtended(MI)) - Size += HEXAGON_INSTR_SIZE; - - return Size; + // Get the latency embedded in the itinerary. If we're not using timing class + // latencies or if we using BSB scheduling, then restrict the maximum latency + // to 1 (that is, either 0 or 1). 
+ if (MI->isTransient()) + return 0; + unsigned Latency = ItinData->getStageLatency(MI->getDesc().getSchedClass()); + if (!EnableTimingClassLatency || + MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(). + useBSBScheduling()) + if (Latency > 1) + Latency = 1; + return Latency; } -// Returns the opcode to use when converting MI, which is a conditional jump, -// into a conditional instruction which uses the .new value of the predicate. -// We also use branch probabilities to add a hint to the jump. -int -HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI, - const - MachineBranchProbabilityInfo *MBPI) const { - - // We assume that block can have at most two successors. - bool taken = false; - MachineBasicBlock *Src = MI->getParent(); - MachineOperand *BrTarget = &MI->getOperand(1); - MachineBasicBlock *Dst = BrTarget->getMBB(); - const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); - if (Prediction >= BranchProbability(1,2)) - taken = true; +// inverts the predication logic. +// p -> NotP +// NotP -> P +bool HexagonInstrInfo::getInvertedPredSense( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return false; + unsigned Opc = getInvertedPredicatedOpcode(Cond[0].getImm()); + Cond[0].setImm(Opc); + return true; +} - switch (MI->getOpcode()) { - case Hexagon::J2_jumpt: - return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; - case Hexagon::J2_jumpf: - return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; - default: - llvm_unreachable("Unexpected jump instruction."); - } -} -// Returns true if a particular operand is extendable for an instruction. -bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const { - const uint64_t F = MI->getDesc().TSFlags; +unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { + int InvPredOpcode; + InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc) + : Hexagon::getTruePredOpcode(Opc); + if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate. + return InvPredOpcode; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) - == OperandNum; + llvm_unreachable("Unexpected predicated instruction"); } -// Returns Operand Index for the constant extended instruction. -unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask); -} -// Returns the min value that doesn't need to be extended. -int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { +// Returns the max value that doesn't need to be extended. +int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1896,13 +3582,20 @@ int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return -1U << (bits - 1); + return ~(-1U << (bits - 1)); else - return 0; + return ~(-1U << bits); } -// Returns the max value that doesn't need to be extended. -int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { + +unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask; +} + + +// Returns the min value that doesn't need to be extended. 
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1910,49 +3603,14 @@ int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return ~(-1U << (bits - 1)); + return -1U << (bits - 1); else - return ~(-1U << bits); + return 0; } -// Returns true if an instruction can be converted into a non-extended -// equivalent instruction. -bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const { - - short NonExtOpcode; - // Check if the instruction has a register form that uses register in place - // of the extended operand, if so return that as the non-extended form. - if (Hexagon::getRegForm(MI->getOpcode()) >= 0) - return true; - - if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { - // Check addressing mode and retrieve non-ext equivalent instruction. - - switch (getAddrMode(MI)) { - case HexagonII::Absolute : - // Load/store with absolute addressing mode can be converted into - // base+offset mode. - NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode()); - break; - case HexagonII::BaseImmOffset : - // Load/store with base+offset addressing mode can be converted into - // base+register offset addressing mode. However left shift operand should - // be set to 0. - NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); - break; - default: - return false; - } - if (NonExtOpcode < 0) - return false; - return true; - } - return false; -} // Returns opcode of the non-extended equivalent instruction. -short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { - +short HexagonInstrInfo::getNonExtOpcode(const MachineInstr *MI) const { // Check if the instruction has a register form that uses register in place // of the extended operand, if so return that as the non-extended form. short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode()); @@ -1963,9 +3621,12 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { // Check addressing mode and retrieve non-ext equivalent instruction. 
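// --- Editorial aside (not part of this commit) -------------------------------
// A hedged sketch of how a caller might pair this lookup with the range
// helpers above; a real conversion must also rewrite the operands to match
// the new addressing mode:
//   short NewOpc = TII.getNonExtOpcode(MI);
//   int64_t Imm = MI->getOperand(TII.getCExtOpNum(MI)).getImm();
//   if (NewOpc >= 0 && Imm >= TII.getMinValue(MI) && Imm <= TII.getMaxValue(MI))
//     MI->setDesc(TII.get(NewOpc)); // drop the constant extender
// ------------------------------------------------------------------------------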
switch (getAddrMode(MI)) { case HexagonII::Absolute : - return Hexagon::getBasedWithImmOffset(MI->getOpcode()); + return Hexagon::getBaseWithImmOffset(MI->getOpcode()); case HexagonII::BaseImmOffset : return Hexagon::getBaseWithRegOffset(MI->getOpcode()); + case HexagonII::BaseLongOffset: + return Hexagon::getRegShlForm(MI->getOpcode()); + default: return -1; } @@ -1973,29 +3634,9 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { return -1; } -bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const { - return (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf) || - (Opcode == Hexagon::J2_jumptnewpt) || - (Opcode == Hexagon::J2_jumpfnewpt) || - (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf); -} - -bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { - if (Cond.empty() || !isPredicated(Cond[0].getImm())) - return false; - return !isPredicatedTrue(Cond[0].getImm()); -} - -bool HexagonInstrInfo::isEndLoopN(Opcode_t Opcode) const { - return (Opcode == Hexagon::ENDLOOP0 || - Opcode == Hexagon::ENDLOOP1); -} bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, - unsigned &PredReg, unsigned &PredRegPos, - unsigned &PredRegFlags) const { + unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const { if (Cond.empty()) return false; assert(Cond.size() == 2); @@ -2014,3 +3655,174 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, return true; } + +short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Pseudo); +} + + +short HexagonInstrInfo::getRegForm(const MachineInstr *MI) const { + return Hexagon::getRegForm(MI->getOpcode()); +} + + +// Return the number of bytes required to encode the instruction. +// Hexagon instructions are fixed length, 4 bytes, unless they +// use a constant extender, which requires another 4 bytes. +// For debug instructions and prolog labels, return 0. +unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { + if (MI->isDebugValue() || MI->isPosition()) + return 0; + + unsigned Size = MI->getDesc().getSize(); + if (!Size) + // Assume the default insn size in case it cannot be determined + // for whatever reason. + Size = HEXAGON_INSTR_SIZE; + + if (isConstExtended(MI) || isExtended(MI)) + Size += HEXAGON_INSTR_SIZE; + + // Try and compute number of instructions in asm. + if (BranchRelaxAsmLarge && MI->getOpcode() == Hexagon::INLINEASM) { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + // Count the number of register definitions to find the asm string. + unsigned NumDefs = 0; + for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef(); + ++NumDefs) + assert(NumDefs != MI->getNumOperands()-2 && "No asm string?"); + + assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?"); + // Disassemble the AsmStr and approximate number of instructions. 
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName(); + Size = getInlineAsmLength(AsmStr, *MAI); + } + + return Size; +} + + +uint64_t HexagonInstrInfo::getType(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::TypePos) & HexagonII::TypeMask; +} + + +unsigned HexagonInstrInfo::getUnits(const MachineInstr* MI) const { + const TargetSubtargetInfo &ST = MI->getParent()->getParent()->getSubtarget(); + const InstrItineraryData &II = *ST.getInstrItineraryData(); + const InstrStage &IS = *II.beginStage(MI->getDesc().getSchedClass()); + + return IS.getUnits(); +} + + +unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask; +} + + +// Calculate size of the basic block without debug instructions. +unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const { + return nonDbgMICount(BB->instr_begin(), BB->instr_end()); +} + + +unsigned HexagonInstrInfo::nonDbgBundleSize( + MachineBasicBlock::const_iterator BundleHead) const { + assert(BundleHead->isBundle() && "Not a bundle header"); + auto MII = BundleHead.getInstrIterator(); + // Skip the bundle header. + return nonDbgMICount(++MII, getBundleEnd(BundleHead)); +} + + +/// immediateExtend - Changes the instruction in place to one using an immediate +/// extender. +void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { + assert((isExtendable(MI)||isConstExtended(MI)) && + "Instruction must be extendable"); + // Find which operand is extendable. + short ExtOpNum = getCExtOpNum(MI); + MachineOperand &MO = MI->getOperand(ExtOpNum); + // This needs to be something we understand. + assert((MO.isMBB() || MO.isImm()) && + "Branch with unknown extendable field type"); + // Mark given operand as extended. + MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); +} + + +bool HexagonInstrInfo::invertAndChangeJumpTarget( + MachineInstr* MI, MachineBasicBlock* NewTarget) const { + DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#" + << NewTarget->getNumber(); MI->dump();); + assert(MI->isBranch()); + unsigned NewOpcode = getInvertedPredicatedOpcode(MI->getOpcode()); + int TargetPos = MI->getNumOperands() - 1; + // In general branch target is the last operand, + // but some implicit defs added at the end might change it. + while ((TargetPos > -1) && !MI->getOperand(TargetPos).isMBB()) + --TargetPos; + assert((TargetPos >= 0) && MI->getOperand(TargetPos).isMBB()); + MI->getOperand(TargetPos).setMBB(NewTarget); + if (EnableBranchPrediction && isPredicatedNew(MI)) { + NewOpcode = reversePrediction(NewOpcode); + } + MI->setDesc(get(NewOpcode)); + return true; +} + + +void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const { + /* +++ The code below is used to generate complete set of Hexagon Insn +++ */ + MachineFunction::iterator A = MF.begin(); + MachineBasicBlock &B = *A; + MachineBasicBlock::iterator I = B.begin(); + MachineInstr *MI = &*I; + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewMI; + + for (unsigned insn = TargetOpcode::GENERIC_OP_END+1; + insn < Hexagon::INSTRUCTION_LIST_END; ++insn) { + NewMI = BuildMI(B, MI, DL, get(insn)); + DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) << + " Class: " << NewMI->getDesc().getSchedClass()); + NewMI->eraseFromParent(); + } + /* --- The code above is used to generate complete set of Hexagon Insn --- */ +} + + +// inverts the predication logic. 
+// p -> NotP +// NotP -> P +bool HexagonInstrInfo::reversePredSense(MachineInstr* MI) const { + DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI->dump()); + MI->setDesc(get(getInvertedPredicatedOpcode(MI->getOpcode()))); + return true; +} + + +// Reverse the branch prediction. +unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const { + int PredRevOpcode = -1; + if (isPredictedTaken(Opcode)) + PredRevOpcode = Hexagon::notTakenBranchPrediction(Opcode); + else + PredRevOpcode = Hexagon::takenBranchPrediction(Opcode); + assert(PredRevOpcode > 0); + return PredRevOpcode; +} + + +// TODO: Add more rigorous validation. +bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond) + const { + return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1)); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index d0b8a46..9530d9f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -1,4 +1,3 @@ - //===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===// // // The LLVM Compiler Infrastructure @@ -28,23 +27,18 @@ namespace llvm { struct EVT; class HexagonSubtarget; + class HexagonInstrInfo : public HexagonGenInstrInfo { virtual void anchor(); const HexagonRegisterInfo RI; - const HexagonSubtarget &Subtarget; public: - typedef unsigned Opcode_t; - explicit HexagonInstrInfo(HexagonSubtarget &ST); - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). + /// TargetInstrInfo overrides. /// - const HexagonRegisterInfo &getRegisterInfo() const { return RI; } - /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -52,7 +46,7 @@ public: unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - /// isStoreToStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -60,50 +54,118 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - + /// Analyze the branching code at the end of MBB, returning + /// true if it cannot be understood (e.g. it's a switch dispatch or isn't + /// implemented for a target). Upon success, this returns false and returns + /// with the following information in various cases: + /// + /// 1. If this block ends with no branches (it just falls through to its succ) + /// just return false, leaving TBB/FBB null. + /// 2. If this block ends with only an unconditional branch, it sets TBB to be + /// the destination block. + /// 3. If this block ends with a conditional branch and it falls through to a + /// successor block, it sets TBB to be the branch destination block and a + /// list of operands that evaluate the condition. 
These operands can be + /// passed to other TargetInstrInfo methods to create new branches. + /// 4. If this block ends with a conditional branch followed by an + /// unconditional branch, it returns the 'true' destination in TBB, the + /// 'false' destination in FBB, and a list of operands that evaluate the + /// condition. These operands can be passed to other TargetInstrInfo + /// methods to create new branches. + /// + /// Note that RemoveBranch and InsertBranch must be implemented to support + /// cases where this method returns success. + /// + /// If AllowModify is true, then this routine is allowed to modify the basic + /// block (e.g. delete instructions after the unconditional branch). + /// bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + /// Remove the branching code at the end of the specific MBB. + /// This is only invoked in cases where AnalyzeBranch returns success. It + /// returns the number of instructions that were removed. unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + /// Insert branch code into the end of the specified MachineBasicBlock. + /// The operands to this method are the same as those + /// returned by AnalyzeBranch. This is only invoked in cases where + /// AnalyzeBranch returns success. It returns the number of instructions + /// inserted. + /// + /// It is also invoked by tail merging to add unconditional branches in + /// cases where AnalyzeBranch doesn't apply because there was no original + /// branch to analyze. At least this much must be implemented, else tail + /// merging needs to be disabled. unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; - bool analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const override; + /// Return true if it's profitable to predicate + /// instructions with accumulated instruction latency of "NumCycles" + /// of the specified basic block, where the probability of the instructions + /// being executed is given by Probability, and Confidence is a measure + /// of our confidence that it will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + unsigned ExtraPredCycles, + BranchProbability Probability) const override; + + /// Second variant of isProfitableToIfCvt. This one + /// checks for the case where two basic blocks from true and false path + /// of an if-then-else (diamond) are predicated on mutually exclusive + /// predicates, where the probability of the true path being taken is given + /// by Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; + + /// Return true if it's profitable for if-converter to duplicate instructions + /// of specified accumulated instruction latencies in the specified MBB to + /// enable if-conversion. + /// The probability of the instructions being executed is given by + /// Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted.
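// --- Editorial aside (not part of this commit) -------------------------------
// A hedged sketch of how generic code typically drives the branch hooks
// documented above (illustrative only; both hooks return false on success):
//   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
//   SmallVector<MachineOperand, 4> Cond;
//   if (!TII.AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false) &&
//       FBB && !TII.ReverseBranchCondition(Cond)) {
//     TII.RemoveBranch(MBB);                             // strip old terminators
//     TII.InsertBranch(MBB, FBB, TBB, Cond, DebugLoc()); // swapped targets
//   }
// ------------------------------------------------------------------------------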
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override; + /// Emit instructions to copy a pair of physical registers. + /// + /// This function should support copies within any legal register class as + /// well as any cross-class copies created during instruction selection. + /// + /// The source and destination registers may overlap, which may require a + /// careful implementation when multiple copy instructions are required for + /// large registers. See for example the ARM target. void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + /// Store the specified register of the given register class to the specified + /// stack frame index. The store instruction is to be added to the given + /// machine basic block before the specified machine instruction. If isKill + /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - + /// Load the specified register of the given register class from the specified + /// stack frame index. The load instruction is to be added to the given + /// machine basic block before the specified machine instruction. void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - - /// expandPostRAPseudo - This function is called for all pseudo instructions + /// This function is called for all pseudo instructions /// that remain after register allocation. Many pseudo instructions are /// created to help register allocation. This is the place to convert them /// into real instructions. The target can edit MI in place, or it can insert @@ -111,122 +173,228 @@ public: /// anything was changed. bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + /// Reverses the branch condition of the specified condition list, + /// returning false on success and true if it cannot be reversed. + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override { - return nullptr; - } + /// Insert a noop into the instruction stream at the specified point. + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; - unsigned createVR(MachineFunction* MF, MVT VT) const; + /// Returns true if the instruction is already predicated. 
+ bool isPredicated(const MachineInstr *MI) const override; - bool isBranch(const MachineInstr *MI) const; - bool isPredicable(MachineInstr *MI) const override; + /// Convert the instruction into a predicated instruction. + /// It returns true if the operation was successful. bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Cond) const override; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; - - bool isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + /// Returns true if the first specified predicate + /// subsumes the second, e.g. GE subsumes GT. + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool isPredicated(unsigned Opcode) const; - bool isPredicatedTrue(const MachineInstr *MI) const; - bool isPredicatedTrue(unsigned Opcode) const; - bool isPredicatedNew(const MachineInstr *MI) const; - bool isPredicatedNew(unsigned Opcode) const; + /// If the specified instruction defines any predicate + /// or condition code register(s) used for predication, returns true as well + /// as the definition predicate(s) by reference. bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; - bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const override; - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + /// Return true if the specified instruction can be predicated. + /// By default, this returns true for every instruction with a + /// PredicateOperand. + bool isPredicable(MachineInstr *MI) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override; + /// Test if the given instruction should be considered a scheduling boundary. + /// This primarily includes labels and terminators. + bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + /// Measure the specified inline asm to determine an approximation of its + /// length. + unsigned getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const override; + + /// Allocate and return a hazard recognizer to use for this target when + /// scheduling the machine instructions after register allocation. + ScheduleHazardRecognizer* + CreateTargetPostRAHazardRecognizer(const InstrItineraryData*, + const ScheduleDAG *DAG) const override; + + /// For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; + + /// Compute the instruction latency of a given instruction. + /// If the instruction has higher cost when predicated, it's returned via + /// PredCost. + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const override; + + /// Create machine specific model for scheduling. 
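// --- Editorial aside (not part of this commit) -------------------------------
// Hedged usage sketch for the latency hook declared above: callers that care
// about the extra cost of predicated execution pass the optional out-param:
//   unsigned PredCost = 0;
//   unsigned Latency =
//       TII.getInstrLatency(STI.getInstrItineraryData(), MI, &PredCost);
// ------------------------------------------------------------------------------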
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override; - bool isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; - bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; - bool isValidAutoIncImm(const EVT VT, const int Offset) const; - bool isMemOp(const MachineInstr *MI) const; - bool isSpillPredRegOp(const MachineInstr *MI) const; - bool isU6_3Immediate(const int value) const; - bool isU6_2Immediate(const int value) const; - bool isU6_1Immediate(const int value) const; - bool isU6_0Immediate(const int value) const; - bool isS4_3Immediate(const int value) const; - bool isS4_2Immediate(const int value) const; - bool isS4_1Immediate(const int value) const; - bool isS4_0Immediate(const int value) const; - bool isS12_Immediate(const int value) const; - bool isU6_Immediate(const int value) const; - bool isS8_Immediate(const int value) const; - bool isS6_Immediate(const int value) const; - - bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const; - bool isConditionalTransfer(const MachineInstr* MI) const; + // Sometimes, it is possible for the target + // to tell, even without aliasing information, that two MIs access different + // memory addresses. This function returns true if two MIs access different + // memory addresses and false otherwise. + bool areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) + const override; + + + /// HexagonInstrInfo specifics. + /// + + const HexagonRegisterInfo &getRegisterInfo() const { return RI; } + + unsigned createVR(MachineFunction* MF, MVT VT) const; + + bool isAbsoluteSet(const MachineInstr* MI) const; + bool isAccumulator(const MachineInstr *MI) const; + bool isComplex(const MachineInstr *MI) const; + bool isCompoundBranchInstr(const MachineInstr *MI) const; + bool isCondInst(const MachineInstr *MI) const; bool isConditionalALU32 (const MachineInstr* MI) const; - bool isConditionalLoad (const MachineInstr* MI) const; + bool isConditionalLoad(const MachineInstr* MI) const; bool isConditionalStore(const MachineInstr* MI) const; - bool isNewValueInst(const MachineInstr* MI) const; - bool isNewValue(const MachineInstr* MI) const; - bool isNewValue(Opcode_t Opcode) const; - bool isDotNewInst(const MachineInstr* MI) const; - int GetDotOldOp(const int opc) const; - int GetDotNewOp(const MachineInstr* MI) const; - int GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const; - bool mayBeNewStore(const MachineInstr* MI) const; + bool isConditionalTransfer(const MachineInstr* MI) const; + bool isConstExtended(const MachineInstr *MI) const; bool isDeallocRet(const MachineInstr *MI) const; - unsigned getInvertedPredicatedOpcode(const int Opc) const; + bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool isDotCurInst(const MachineInstr* MI) const; + bool isDotNewInst(const MachineInstr* MI) const; + bool isDuplexPair(const MachineInstr *MIa, const MachineInstr *MIb) const; + bool isEarlySourceInstr(const MachineInstr *MI) const; + bool isEndLoopN(unsigned Opcode) const; + bool isExpr(unsigned OpType) const; bool isExtendable(const MachineInstr* MI) const; bool isExtended(const MachineInstr* MI) const; - bool isPostIncrement(const MachineInstr* MI) const; + bool isFloat(const MachineInstr *MI) const; + bool isHVXMemWithAIndirect(const MachineInstr *I, + const MachineInstr *J) const; + bool isIndirectCall(const MachineInstr 
*MI) const; + bool isIndirectL4Return(const MachineInstr *MI) const; + bool isJumpR(const MachineInstr *MI) const; + bool isJumpWithinBranchRange(const MachineInstr *MI, unsigned offset) const; + bool isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const; + bool isLateResultInstr(const MachineInstr *MI) const; + bool isLateSourceInstr(const MachineInstr *MI) const; + bool isLoopN(const MachineInstr *MI) const; + bool isMemOp(const MachineInstr *MI) const; + bool isNewValue(const MachineInstr* MI) const; + bool isNewValue(unsigned Opcode) const; + bool isNewValueInst(const MachineInstr* MI) const; + bool isNewValueJump(const MachineInstr* MI) const; + bool isNewValueJump(unsigned Opcode) const; bool isNewValueStore(const MachineInstr* MI) const; bool isNewValueStore(unsigned Opcode) const; - bool isNewValueJump(const MachineInstr* MI) const; - bool isNewValueJump(Opcode_t Opcode) const; - bool isNewValueJumpCandidate(const MachineInstr *MI) const; + bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const; + bool isPostIncrement(const MachineInstr* MI) const; + bool isPredicatedNew(const MachineInstr *MI) const; + bool isPredicatedNew(unsigned Opcode) const; + bool isPredicatedTrue(const MachineInstr *MI) const; + bool isPredicatedTrue(unsigned Opcode) const; + bool isPredicated(unsigned Opcode) const; + bool isPredicateLate(unsigned Opcode) const; + bool isPredictedTaken(unsigned Opcode) const; + bool isSaveCalleeSavedRegsCall(const MachineInstr *MI) const; + bool isSolo(const MachineInstr* MI) const; + bool isSpillPredRegOp(const MachineInstr *MI) const; + bool isTC1(const MachineInstr *MI) const; + bool isTC2(const MachineInstr *MI) const; + bool isTC2Early(const MachineInstr *MI) const; + bool isTC4x(const MachineInstr *MI) const; + bool isV60VectorInstruction(const MachineInstr *MI) const; + bool isValidAutoIncImm(const EVT VT, const int Offset) const; + bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; + bool isVecAcc(const MachineInstr *MI) const; + bool isVecALU(const MachineInstr *MI) const; + bool isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + + + bool canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const; + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasNonExtEquivalent(const MachineInstr *MI) const; + bool hasPseudoInstrPair(const MachineInstr *MI) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool mayBeCurLoad(const MachineInstr* MI) const; + bool mayBeNewStore(const MachineInstr* MI) const; + bool producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator MII) const; + bool predCanBeUsedAsDotNew(const MachineInstr *MI, unsigned PredReg) const; + bool PredOpcodeHasJMP_c(unsigned Opcode) const; + bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - void immediateExtend(MachineInstr *MI) const; - bool isConstExtended(const MachineInstr *MI) const; - unsigned getSize(const MachineInstr *MI) const; - int getDotNewPredJumpOp(MachineInstr *MI, - const MachineBranchProbabilityInfo *MBPI) const; unsigned getAddrMode(const MachineInstr* MI) const; - bool isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const; - unsigned short getCExtOpNum(const MachineInstr *MI) const; - int getMinValue(const MachineInstr *MI) const; + unsigned getBaseAndOffset(const MachineInstr *MI, int 
&Offset, + unsigned &AccessSize) const; + bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos, + unsigned &OffsetPos) const; + SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const; + unsigned getCExtOpNum(const MachineInstr *MI) const; + HexagonII::CompoundGroup + getCompoundCandidateGroup(const MachineInstr *MI) const; + unsigned getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const; + int getCondOpcode(int Opc, bool sense) const; + int getDotCurOp(const MachineInstr* MI) const; + int getDotNewOp(const MachineInstr* MI) const; + int getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotOldOp(const int opc) const; + HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr *MI) + const; + short getEquivalentHWInstr(const MachineInstr *MI) const; + MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const; + /// getInstrTimingClassLatency - Compute the instruction latency of a given + /// instruction using Timing Class information, if available. + unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI) const; + bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const; + unsigned getInvertedPredicatedOpcode(const int Opc) const; int getMaxValue(const MachineInstr *MI) const; - bool NonExtEquivalentExists (const MachineInstr *MI) const; + unsigned getMemAccessSize(const MachineInstr* MI) const; + int getMinValue(const MachineInstr *MI) const; short getNonExtOpcode(const MachineInstr *MI) const; - bool PredOpcodeHasJMP_c(Opcode_t Opcode) const; - bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - bool isEndLoopN(Opcode_t Opcode) const; bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const; - int getCondOpcode(int Opc, bool sense) const; + short getPseudoInstrPair(const MachineInstr *MI) const; + short getRegForm(const MachineInstr *MI) const; + unsigned getSize(const MachineInstr *MI) const; + uint64_t getType(const MachineInstr* MI) const; + unsigned getUnits(const MachineInstr* MI) const; + unsigned getValidSubTargets(const unsigned Opcode) const; + + unsigned nonDbgBBSize(const MachineBasicBlock *BB) const; + unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const; + + + void immediateExtend(MachineInstr *MI) const; + bool invertAndChangeJumpTarget(MachineInstr* MI, + MachineBasicBlock* NewTarget) const; + void genAllInsnTimingClasses(MachineFunction &MF) const; + bool reversePredSense(MachineInstr* MI) const; + unsigned reversePrediction(unsigned Opcode) const; + bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const; }; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td index 3b32c10..421403f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td @@ -13,7 +13,7 @@ include "HexagonInstrFormats.td" include "HexagonOperands.td" - +include "HexagonInstrEnc.td" // Pattern fragment that combines the value type and the register class // into a single parameter.
// The pat frags in the definitions below need to have a named register, @@ -1426,9 +1426,6 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>; -def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; -def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; - class CondStr<string CReg, bit True, bit New> { string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") "; } @@ -1606,8 +1603,6 @@ def EH_RETURN_JMPR : T_JMPr; def: Pat<(eh_return), (EH_RETURN_JMPR (i32 R31))>; -def: Pat<(HexagonBR_JT (i32 IntRegs:$dst)), - (J2_jumpr IntRegs:$dst)>; def: Pat<(brind (i32 IntRegs:$dst)), (J2_jumpr IntRegs:$dst)>; @@ -2825,7 +2820,7 @@ let CextOpcode = "ADD_acc" in { let isExtentSigned = 1 in def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext, [(set (i32 IntRegs:$dst), - (add (add (i32 IntRegs:$src2), s16_16ImmPred:$src3), + (add (add (i32 IntRegs:$src2), s32ImmPred:$src3), (i32 IntRegs:$src1)))]>, ImmRegRel; def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, @@ -2859,7 +2854,7 @@ class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp> def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>; def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32ImmPred>; -def : T_MType_acc_pat1 <M2_naccii, add, sub, s16_16ImmPred>; +def : T_MType_acc_pat1 <M2_naccii, add, sub, s32ImmPred>; def : T_MType_acc_pat2 <M2_nacci, add, sub>; //===----------------------------------------------------------------------===// @@ -3303,7 +3298,8 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2}, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3322,7 +3318,7 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, //===----------------------------------------------------------------------===// let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew > + bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew> : STInst <(outs IntRegs:$_dst_), (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", @@ -3341,7 +3337,8 @@ class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let isPredicatedNew = isPredNew; let isPredicatedFalse = isPredNot; @@ -3404,7 +3401,6 @@ def: Storepi_pat<post_store, I64, s4_3ImmPred, S2_storerd_pi>; //===----------------------------------------------------------------------===// // Template class for post increment stores with register offset. 
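// --- Editorial aside (not part of this commit) -------------------------------
// The nested !if pattern this patch threads through the store templates
// encodes a single rule: a store may take the new-value (NV) form unless it
// is a double-word store (memd) or an upper-half store (.h). As plain C++,
// the predicate would read (hypothetical helper):
//   bool isNVStorable(StringRef Mnemonic, bool IsHalf) {
//     return Mnemonic != "memd" && !IsHalf;
//   }
// ------------------------------------------------------------------------------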
//===----------------------------------------------------------------------===// -let isNVStorable = 1 in class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst <(outs IntRegs:$_dst_), @@ -3416,6 +3412,9 @@ class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, bits<5> src3; let accessSize = AccessSz; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1101; @@ -3430,12 +3429,11 @@ def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>; def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>; def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>; def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>; - def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>; let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3>MajOp, bit isH = 0> + bits<3> MajOp, bit isH = 0> : STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>, @@ -3455,6 +3453,8 @@ class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2}, !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1}, /* s11_0Ext */ src2{10-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b1010; let Inst{27} = 0b0; @@ -3494,7 +3494,10 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2}, !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1}, /* u6_0Ext */ src3{5-0}))); - let IClass = 0b0100; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + + let IClass = 0b0100; let Inst{27} = 0b0; let Inst{26} = PredNot; @@ -3508,7 +3511,7 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, let Inst{1-0} = src1; } -let isExtendable = 1, isNVStorable = 1, hasSideEffects = 0 in +let isExtendable = 1, hasSideEffects = 0 in multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC, Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> { let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in { @@ -3665,7 +3668,7 @@ def S2_allocframe: ST0Inst < // S2_storer[bhwdf]_pci: Store byte/half/word/double. 
// S2_storer[bhwdf]_pci -> S2_storerbnew_pci -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pci <string mnemonic, RegisterClass RC, Operand Imm, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> @@ -3679,6 +3682,8 @@ class T_store_pci <string mnemonic, RegisterClass RC, bits<1> Mu; bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -3696,15 +3701,15 @@ class T_store_pci <string mnemonic, RegisterClass RC, } def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000, - ByteAccess>; + ByteAccess>; def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010, - HalfWordAccess>; + HalfWordAccess>; def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011, - HalfWordAccess, "Rt.h">; + HalfWordAccess, "Rt.h">; def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100, - WordAccess>; + WordAccess>; def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110, - DoubleWordAccess>; + DoubleWordAccess>; let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in class T_storenew_pci <string mnemonic, Operand Imm, @@ -3762,7 +3767,7 @@ def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>; //===----------------------------------------------------------------------===// // Circular stores with auto-increment register //===----------------------------------------------------------------------===// -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> : STInst <(outs IntRegs:$_dst_), @@ -3775,6 +3780,8 @@ class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -5784,7 +5791,21 @@ include "HexagonInstrInfoV5.td" //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// V60 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // ALU32/64/Vector + //===----------------------------------------------------------------------===/// include "HexagonInstrInfoVector.td" + +include "HexagonInstrAlias.td" +include "HexagonSystemInst.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td index 65b0f49..37c2042 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -684,7 +684,7 @@ def: Pat<(i64 (zext (i32 IntRegs:$src1))), // Template class for store instructions with Absolute set addressing mode. 
//===----------------------------------------------------------------------===// let isExtended = 1, opExtendable = 1, opExtentBits = 6, - addrMode = AbsoluteSet, isNVStorable = 1 in + addrMode = AbsoluteSet in class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs IntRegs:$dst), @@ -696,6 +696,9 @@ class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, let accessSize = AccessSz; let BaseOpcode = BaseOp#"_AbsSet"; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1011; @@ -750,7 +753,7 @@ let mayStore = 1, addrMode = AbsoluteSet in { } let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm", -addrMode = BaseLongOffset, AddedComplexity = 40 in + addrMode = BaseLongOffset, AddedComplexity = 40 in class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs), @@ -766,6 +769,10 @@ class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, let accessSize = AccessSz; let CextOpcode = CextOp; let BaseOpcode = CextOp#"_shl"; + + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} =0b1101; @@ -856,6 +863,9 @@ class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH> bits<2> u2; bits<5> Rt; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + let IClass = 0b0011; let Inst{27-24} = 0b1011; @@ -888,6 +898,8 @@ class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, let isPredicatedFalse = isNot; let isPredicatedNew = isPredNew; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b0011; @@ -1826,43 +1838,22 @@ def: LogLogNot_pat<or, or, C4_or_orn>; // below are needed to support code generation for PIC //===----------------------------------------------------------------------===// -def SDT_HexagonPICAdd +def SDT_HexagonAtGot + : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; +def SDT_HexagonAtPcrel : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDT_HexagonGOTAdd - : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def SDT_HexagonGOTAddInternal : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalJT : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalBA : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; - -def Hexagonpic_add : SDNode<"HexagonISD::PIC_ADD", SDT_HexagonPICAdd>; -def Hexagonat_got : SDNode<"HexagonISD::AT_GOT", SDT_HexagonGOTAdd>; -def Hexagongat_pcrel : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternal>; -def Hexagongat_pcrel_jt : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalJT>; -def Hexagongat_pcrel_ba : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalBA>; - -// PIC: Map from a block address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_ba tblockaddress:$src1), - (C4_addipc u32ImmPred:$src1)>; - -// PIC: Map from the computation to generate a GOT pointer to a PC-relative add -def: Pat<(Hexagonpic_add texternalsym:$src1), - (C4_addipc u32ImmPred:$src1)>; - -// PIC: Map from a jump table address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_jt tjumptable:$src1), - (C4_addipc u32ImmPred:$src1)>; -// PIC: Map from a GOT-relative symbol reference to a load -def: Pat<(Hexagonat_got (i32 IntRegs:$src1), tglobaladdr:$src2), - (L2_loadri_io IntRegs:$src1, s30_2ImmPred:$src2)>; +// AT_GOT address-of-GOT, address-of-global, offset-in-global +def HexagonAtGot : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>; +// AT_PCREL address-of-global +def HexagonAtPcrel : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>; -// PIC: Map from a static symbol reference to a PC-relative add -def: Pat<(Hexagongat_pcrel tglobaladdr:$src1), - (C4_addipc u32ImmPred:$src1)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)), + (L2_loadri_io I32:$got, imm:$addr)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off), + (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>; +def: Pat<(HexagonAtPcrel I32:$addr), + (C4_addipc imm:$addr)>; //===----------------------------------------------------------------------===// // CR - @@ -1903,7 +1894,7 @@ def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6), "$Rd = add($Rs, add($Ru, #$s6))" , [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs), - (add (i32 IntRegs:$Ru), s16_16ImmPred:$s6)))], + (add (i32 IntRegs:$Ru), s32ImmPred:$s6)))], "", ALU64_tc_2_SLOT23> { bits<5> Rd; bits<5> Rs; @@ -3290,27 +3281,33 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel; let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in { def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">; } // Restore registers and dealloc frame before a tail call. 
let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in { def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel; } // Save registers function call. let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in { def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel; } //===----------------------------------------------------------------------===// // Template class for non predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicable = 1, isNVStorable = 1 in +let hasSideEffects = 0, isPredicable = 1 in class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<2>MajOp, Operand AddrOp, bit isAbs, bit isHalf> - : STInst<(outs), (ins AddrOp:$addr, RC:$src), - mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src"#!if(isHalf, ".h",""), + bits<2>MajOp, bit isAbs, bit isHalf> + : STInst<(outs), (ins ImmOp:$addr, RC:$src), + mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""), [], "", V2LDST_tc_st_SLOT01> { bits<19> addr; bits<5> src; @@ -3321,6 +3318,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let Uses = !if (isAbs, [], [GP]); + let IClass = 0b0100; let Inst{27} = 1; let Inst{26-25} = offsetBits{15-14}; @@ -3337,11 +3338,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicated = 1, isNVStorable = 1, opExtentBits = 6, - opExtendable = 1 in +let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, bit isHalf, bit isNot, bit isNew> - : STInst<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, RC: $src2), + : STInst<(outs), (ins PredRegs:$src1, u32MustExt:$absaddr, RC: $src2), !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ", ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""), [], "", ST_tc_st_SLOT01>, AddrModeRel { @@ -3351,6 +3351,8 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, let isPredicatedNew = isNew; let isPredicatedFalse = isNot; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3371,7 +3373,7 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, //===----------------------------------------------------------------------===// class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1, isHalf>, + : T_StoreAbsGP <mnemonic, RC, u32MustExt, MajOp, 1, isHalf>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3424,6 +3426,7 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs> !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3538,7 +3541,7 @@ defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>; let isAsmParserOnly = 1 in class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf = 0> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0, isHalf> { + : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> { // Set BaseOpcode same as absolute addressing instructions so that // non-predicated GP-Rel instructions can have relate with predicated // Absolute instruction. @@ -3553,7 +3556,7 @@ multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp, // Absolute instruction. let BaseOpcode = BaseOp#_abs in { def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp, - globaladdress, 0, isHalf>; + 0, isHalf>; // New-value store def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ; } @@ -3615,9 +3618,9 @@ let AddedComplexity = 100 in { //===----------------------------------------------------------------------===// let isPredicable = 1, hasSideEffects = 0 in class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3> MajOp, Operand AddrOp, bit isAbs> - : LDInst <(outs RC:$dst), (ins AddrOp:$addr), - "$dst = "#mnemonic# !if(isAbs, "(##", "(#")#"$addr)", + bits<3> MajOp> + : LDInst <(outs RC:$dst), (ins ImmOp:$addr), + "$dst = "#mnemonic# "(#$addr)", [], "", V2LDST_tc_ld_SLOT01> { bits<5> dst; bits<19> addr; @@ -3642,7 +3645,7 @@ class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1>, AddrModeRel { + : T_LoadAbsGP <mnemonic, RC, u32MustExt, MajOp>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3660,10 +3663,11 @@ class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated load instructions with // absolute addressing mode. 
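// --- Editorial aside (not part of this commit) -------------------------------
// The isAbs bit threaded through the surrounding templates selects between
// two forms that share an encoding class: absolute addressing (the effective
// address is the constant-extended ##imm itself) and GP-relative addressing
// (the effective address is GP + #imm, which is why this patch adds a GP use
// to the GP-relative variants). As arithmetic: EA = isAbs ? imm : GP + imm.
// ------------------------------------------------------------------------------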
//===----------------------------------------------------------------------===// -let isPredicated = 1, opExtentBits = 6, opExtendable = 2 in +let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6, + opExtendable = 2 in class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isPredNot, bit isPredNew> - : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u6Ext:$absaddr), + : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32MustExt:$absaddr), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel { bits<5> dst; @@ -3734,10 +3738,10 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>; // if ([!]Pv[.new]) Rx=mem[bhwd](##global) //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Uses = [GP] in class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0>, PredNewRel { + : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel { let BaseOpcode = BaseOp#_abs; } @@ -3841,26 +3845,6 @@ let AddedComplexity = 100 in { def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>; } -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd -let AddedComplexity = 100 in -def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>; - -// Transfer global address into a register -let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1, -isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in -def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1), - "$dst = #$src1", - [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>; - -// Transfer a block address into a register -def : Pat<(HexagonCONST32_GP tblockaddress:$src1), - (TFRI_V4 tblockaddress:$src1)>; - -let AddedComplexity = 50 in -def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), - (TFRI_V4 tglobaladdr:$src1)>; - // i8/i16/i32 -> i64 loads // We need a complexity of 120 here to override preceding handling of // zextload. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td index 337f4ea..823961f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -98,21 +98,21 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1), // HexagonInstrInfo.td patterns. 
let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1, isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, isPseudo = 1 in def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1), "$dst = #$src1", [(set F32:$dst, fpimm:$src1)]>, Requires<[HasV5T]>; -let isExtended = 1, opExtendable = 2, isPredicated = 1, - hasSideEffects = 0, validSubTargets = HasV5SubT, isCodeGenOnly = 1 in +let isExtended = 1, opExtendable = 2, isPredicated = 1, hasSideEffects = 0, + validSubTargets = HasV5SubT, isCodeGenOnly = 1, isPseudo = 1 in def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if ($src1) $dst = #$src2", []>, Requires<[HasV5T]>; -let isPseudo = 1, isExtended = 1, opExtendable = 2, isPredicated = 1, - isPredicatedFalse = 1, hasSideEffects = 0, validSubTargets = HasV5SubT in +let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1, + hasSideEffects = 0, validSubTargets = HasV5SubT, isPseudo = 1 in def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if (!$src1) $dst = #$src2", []>, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td new file mode 100644 index 0000000..897ada0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td @@ -0,0 +1,2241 @@ +//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 instructions in TableGen format. 
+// +//===----------------------------------------------------------------------===// + + +// Vector store +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +{ + class VSTInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>, OpcodeHexagon; + +} + +// Vector load +let Predicates = [HasV60T, UseHVX] in +let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in + class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_LD, + IType type = TypeCVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let Predicates = [HasV60T, UseHVX] in +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> +: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +//===----------------------------------------------------------------------===// +// Vector loads with base + immediate offset +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vload_ai<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2), + asmStr>; + +let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in +class T_vload_ai_128B<string asmStr> + : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2), + asmStr>; + +let isCVLoadable = 1, hasNewValue = 1 in { + def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_enc; + def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_enc; + // 128B + def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_128B_enc; + def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in { + def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_enc; + def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_128B_enc; +} + +let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1, + hasNewValue = 1 in { + def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_enc; + def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_enc; + // 128B + def V6_vL32b_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_128B_enc; + def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_128B_enc; +} + + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in { + def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_enc; + def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">, + V6_vL32b_nt_tmp_ai_enc; + // 128B + def V6_vL32b_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_128B_enc; + def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_nt_tmp_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with 
base + immediate offset - unconditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_ai_enc; + def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_ai_128B_enc; +} + +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_enc; + def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_enc; + def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_128B_enc; +} +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - unconditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1, + Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in +class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_ai_64B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_ai_128B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc; +def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">, + V6_vS32b_new_ai_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_enc; + def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1 in +class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) " + #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access 
in +class T_vstore_pred_ai_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_ai_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_enc; + def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_enc; + // 128B + def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_128B_enc; + def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_enc; + def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_enc; + // 128B + def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_128B_enc; + def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_enc; + def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_enc; + // 128B + def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_128B_enc; + def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset in +class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC, + bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4"> { + let isPredicatedFalse = isPredNot; +} + +let accessSize = Vector64Access in +class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc; +def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>, + V6_vS32b_nqpred_ai_enc; +def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>, + V6_vS32b_nt_qpred_ai_enc; +def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>, + V6_vS32b_nt_nqpred_ai_enc; +// 128B +def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc; +def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>, + V6_vS32b_nqpred_ai_128B_enc; +def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>, + V6_vS32b_nt_qpred_ai_128B_enc; +def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>, + V6_vS32b_nt_nqpred_ai_128B_enc; + + 
+//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3, + isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in +class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + + +def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">, + V6_vS32b_new_pred_ai_enc; +def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_enc; +// 128B +def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">, + V6_vS32b_new_pred_ai_128B_enc; +def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_enc; + def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_enc; + // 128B + def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_128B_enc; + def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc, hasNewValue = 1 in +class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2), asmStr, [], + "$src1 = $_dst_">; + +let accessSize = Vector64Access in +class T_vload_pi_64B <string asmStr> + : T_vload_pi <asmStr, s3_6Imm, VectorRegs>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vload_pi_128B <string asmStr> + : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>; + +let isCVLoadable = 1 in { + def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_enc; + def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_enc; + // 128B + def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_128B_enc; + def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in { + def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_enc; + // 128B + def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_128B_enc; +} + +let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in { + def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_enc; + def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_enc; + // 128B + def V6_vL32b_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_128B_enc; + def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_128B_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_enc; + def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_enc; + //128B + def V6_vL32b_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_128B_enc; + def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let accessSize = Vector64Access in +class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc; + def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pi_128B_enc; +} + +let isNVStorable = 1 , isNonTemporal = 1 in { + def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_enc; + def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_128B_enc; +} + + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_enc; + def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment unconditional .new vector stores with immediate offset. +//===----------------------------------------------------------------------===// +let addrMode = PostInc, isNVStore = 1 in +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pi_64B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pi_128B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>; + + +def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">, + V6_vS32b_new_pi_enc; +def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">, + V6_vS32b_new_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_enc; + def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional vector stores with immediate offset +//===----------------------------------------------------------------------===// +let isPredicated = 1, addrMode = PostInc in +class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot, bit isNT> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + 
+let accessSize = Vector64Access in +class T_vstore_pred_pi_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_pi_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_enc; + def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_enc; + // 128B + def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_128B_enc; + def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_enc; + def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_enc; + // 128B + def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_128B_enc; + def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_enc; + def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_enc; + // 128B + def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_128B_enc; + def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0, + bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">; + +let accessSize = Vector64Access in +class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc; +def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc; +// 128B +def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B, + V6_vS32b_qpred_pi_128B_enc; +def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>, + V6_vS32b_nqpred_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>, + V6_vS32b_nt_qpred_pi_enc; + def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>, + V6_vS32b_nt_nqpred_pi_enc; + // 128B + def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>, + V6_vS32b_nt_qpred_pi_128B_enc; + def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>, + 
V6_vS32b_nt_nqpred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with immediate offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in +class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_"> , NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">, + V6_vS32b_new_pred_pi_enc; +def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_enc; +// 128B +def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">, + V6_vS32b_new_pred_pi_128B_enc; +def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_enc; + def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_enc; + // 128B + def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_128B_enc; + def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with register offset +//===----------------------------------------------------------------------===// +let hasNewValue = 1 in +class T_vload_ppu<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2), asmStr, [], + "$src1 = $_dst_">, NewValueRel; + +let isCVLoadable = 1 in { + def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">, + V6_vL32b_ppu_enc; + def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">, + V6_vL32b_nt_ppu_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in +def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">, + V6_vL32Ub_ppu_enc; + +let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in { + def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">, + V6_vL32b_cur_ppu_enc; + def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">, + V6_vL32b_nt_cur_ppu_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">, + V6_vL32b_tmp_ppu_enc; + def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">, + V6_vL32b_nt_tmp_ppu_enc; +} + 
+//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset +//===----------------------------------------------------------------------===// +class T_vstore_ppu <string mnemonic, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_ppu : T_vstore_ppu <"vmem">, + V6_vS32b_ppu_enc; + let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in + def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>, + V6_vS32b_nt_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in +def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_ppu <bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel; + +let BaseOpcode = "vS32b_ppu" in +def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc; + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in +def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let isPredicated = 1 in +class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc; + def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>, + V6_vS32b_nt_pred_ppu_enc; + def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>, + V6_vS32b_nt_npred_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, + Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">, + V6_vS32Ub_pred_ppu_enc; + def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>, + V6_vS32Ub_npred_ppu_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, 
NewValueRel; + +def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc; +def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc; +def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>, + V6_vS32b_nt_qpred_ppu_enc; +def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>, + V6_vS32b_nt_nqpred_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, isNVStore = 1 in +class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu, + V6_vS32b_new_pred_ppu_enc; + def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>, + V6_vS32b_new_npred_ppu_enc; +} + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { +def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>, + V6_vS32b_nt_new_pred_ppu_enc; +def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>, + V6_vS32b_nt_new_npred_ppu_enc; +} + +let isPseudo = 1, validSubTargets = HasV60SubT in +class STrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC>: + VSTInst<(outs), (ins IntRegs:$addr, ImmOp:$off, RC:$src), + #mnemonic#"($addr+#$off) = $src", []>; + +def STrivv_indexed: STrivv_template<"vvmem", s4_6Imm, VecDblRegs>, + Requires<[HasV60T, UseHVXSgl]>; +def STrivv_indexed_128B: STrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>, + Requires<[HasV60T, UseHVXDbl]>; + +multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat<(store (VTSgl VecDblRegs:$src1), IntRegs:$addr), + (STrivv_indexed IntRegs:$addr, #0, (VTSgl VecDblRegs:$src1))>, + Requires<[UseHVXSgl]>; + + def : Pat<(store (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), + (STrivv_indexed_128B IntRegs:$addr, #0, + (VTDbl VecDblRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : STrivv_pats <v128i8, v256i8>; +defm : STrivv_pats <v64i16, v128i16>; +defm : STrivv_pats <v32i32, v64i32>; +defm : STrivv_pats <v16i64, v32i64>; + + +multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned stores + def : Pat<(store (VTSgl VectorRegs:$src1), IntRegs:$addr), + (V6_vS32b_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // 128B Aligned stores + def : Pat<(store (VTDbl VectorRegs128B:$src1), IntRegs:$addr), + (V6_vS32b_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector store. + let AddedComplexity = 10 in + def : Pat<(store (VTSgl VectorRegs:$src1), + (add IntRegs:$src2, s4_6ImmPred:$offset)), + (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset, + (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // Fold Add R+IFF into vector store 128B. 
+ let AddedComplexity = 10 in + def : Pat<(store (VTDbl VectorRegs128B:$src1), + (add IntRegs:$src2, s4_7ImmPred:$offset)), + (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset, + (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : vS32b_ai_pats <v64i8, v128i8>; +defm : vS32b_ai_pats <v32i16, v64i16>; +defm : vS32b_ai_pats <v16i32, v32i32>; +defm : vS32b_ai_pats <v8i64, v16i64>; + +let isPseudo = 1, validSubTargets = HasV60SubT in +class LDrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst), (ins IntRegs:$addr, ImmOp:$off), + "$dst="#mnemonic#"($addr+#$off)", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def LDrivv_indexed: LDrivv_template<"vvmem", s4_6Imm, VecDblRegs>; +def LDrivv_indexed_128B: LDrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>; + +multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat < (VTSgl (load IntRegs:$addr)), + (LDrivv_indexed IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + def : Pat < (VTDbl (load IntRegs:$addr)), + (LDrivv_indexed_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; +} + +defm : LDrivv_pats <v128i8, v256i8>; +defm : LDrivv_pats <v64i16, v128i16>; +defm : LDrivv_pats <v32i32, v64i32>; +defm : LDrivv_pats <v16i64, v32i64>; + +multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned loads + def : Pat < (VTSgl (load IntRegs:$addr)), + (V6_vL32b_ai IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + // 128B Load + def : Pat < (VTDbl (load IntRegs:$addr)), + (V6_vL32b_ai_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector load. + let AddedComplexity = 10 in + def : Pat<(VTDbl (load (add IntRegs:$src2, s4_7ImmPred:$offset))), + (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>, + Requires<[UseHVXDbl]>; + + let AddedComplexity = 10 in + def : Pat<(VTSgl (load (add IntRegs:$src2, s4_6ImmPred:$offset))), + (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>, + Requires<[UseHVXSgl]>; +} + +defm : vL32b_ai_pats <v64i8, v128i8>; +defm : vL32b_ai_pats <v32i16, v64i16>; +defm : vL32b_ai_pats <v16i32, v32i32>; +defm : vL32b_ai_pats <v8i64, v16i64>; + +// Store vector predicate pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriq_pred_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_vec_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; + +def STriq_pred_vec_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector predicate pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriq_pred_V6 : LDInst<(outs VecPredRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_vec_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_V6_128B : LDInst<(outs VecPredRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +def LDriq_pred_vec_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Store vector pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STriv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STrivv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STrivv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriv_pseudo_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriv_pseudo_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDrivv_pseudo_V6 : LDInst<(outs VecDblRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDrivv_pseudo_V6_128B : LDInst<(outs VecDblRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VA_DV, + IType type = TypeCVI_VA_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def VSelectPseudo_V6 : VSELInst<(outs VectorRegs:$dst), + (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def VSelectDblPseudo_V6 : VSELInst<(outs VecDblRegs:$dst), + (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +} + +def : Pat <(v16i32 (selectcc (i32 IntRegs:$lhs), (i32 IntRegs:$rhs), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval), SETEQ)), + (v16i32 (VSelectPseudo_V6 (i32 (C2_cmpeq (i32 IntRegs:$lhs), + (i32 IntRegs:$rhs))), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval)))>; + + +let hasNewValue = 1 in +class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + asmString >; + +multiclass T_vmpy <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_vmpy <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_vmpy_VV <string asmString>: + T_vmpy <asmString, VectorRegs, VectorRegs>; + +multiclass T_vmpy_WW <string asmString>: + T_vmpy <asmString, VecDblRegs, VecDblRegs>; + +multiclass T_vmpy_VW <string asmString>: + T_vmpy <asmString, VectorRegs, VecDblRegs>; + +multiclass T_vmpy_WV <string asmString>: + T_vmpy <asmString, VecDblRegs, VectorRegs>; + +defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc; +defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc; +defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc; +defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc; +defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc; +defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc; +defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc; +defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc; +defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc; +defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = 
vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc; +defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc; + +let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in +defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc; + +defm V6_vdmpybus_dv : + T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc; +defm V6_vdmpyhsusat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc; +defm V6_vdmpyhsuisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc; +defm V6_vdmpyhsat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc; +defm V6_vdmpyhisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc; +defm V6_vdmpyhb_dv : + T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc; +defm V6_vmpyhss : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc; +defm V6_vmpyhsrs : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in +defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc; + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc; +defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc; +defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc; +defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc; +defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc; +defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc; +defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc; +defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc; +defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc; +defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc; +} + +let hasNewValue = 1 in +class T_HVX_alu <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_alu <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_alu_VV <string asmString>: + T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_WW <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>; + +multiclass T_HVX_alu_WV <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>; + + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vrmpyubv : + T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc; +defm V6_vrmpybv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc; +defm V6_vrmpybusv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc; +defm V6_vabsdiffub : + T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc; +defm V6_vabsdiffh : + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc; +defm V6_vabsdiffuh 
: + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc; +defm V6_vabsdiffw : + T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc; +} + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhvsat : + T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc; +defm V6_vmpyhvsrs : + T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc; +defm V6_vmpyih : + T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc; +} + +defm V6_vand : + T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc; +defm V6_vor : + T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc; +defm V6_vxor : + T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc; +defm V6_vaddw : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc; +defm V6_vaddubsat : + T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc; +defm V6_vadduhsat : + T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc; +defm V6_vaddhsat : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc; +defm V6_vaddwsat : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc; +defm V6_vsubb : + T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc; +defm V6_vsubh : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc; +defm V6_vsubw : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc; +defm V6_vsububsat : + T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc; +defm V6_vsubuhsat : + T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc; +defm V6_vsubhsat : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc; +defm V6_vsubwsat : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc; +defm V6_vavgub : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc; +defm V6_vavguh : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc; +defm V6_vavgh : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc; +defm V6_vavgw : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc; +defm V6_vnavgub : + T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc; +defm V6_vnavgh : + T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc; +defm V6_vnavgw : + T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc; +defm V6_vavgubrnd : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc; +defm V6_vavguhrnd : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc; +defm V6_vavghrnd : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc; +defm V6_vavgwrnd : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc; + +defm V6_vmpybv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc; +defm V6_vmpyubv : + T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc; +defm V6_vmpybusv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc; +defm V6_vmpyhv : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc; +defm V6_vmpyuhv : + T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc; +defm V6_vmpyhus : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc; +defm V6_vaddubh : + T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc; +defm V6_vadduhw : + T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc; +defm V6_vaddhw : + T_HVX_alu_WV <"$dst.w = 
vadd($src1.h,$src2.h)">, V6_vaddhw_enc; +defm V6_vsububh : + T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc; +defm V6_vsubuhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc; +defm V6_vsubhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc; + +defm V6_vaddb_dv : + T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc; +defm V6_vaddh_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc; +defm V6_vaddw_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc; +defm V6_vaddubsat_dv : + T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc; +defm V6_vadduhsat_dv : + T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc; +defm V6_vaddhsat_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc; +defm V6_vaddwsat_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc; +defm V6_vsubb_dv : + T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc; +defm V6_vsubh_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc; +defm V6_vsubw_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc; +defm V6_vsububsat_dv : + T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc; +defm V6_vsubuhsat_dv : + T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc; +defm V6_vsubhsat_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc; +defm V6_vsubwsat_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc; + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in { +defm V6_vmpabusv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc; +defm V6_vmpabuuv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc; +} + +let isAccumulator = 1, hasNewValue = 1 in +class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > { + def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin1#"128B"), + !cast<RegisterClass>(RCin2# + !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>; +} + +multiclass T_HVX_vmpyacc_VVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>; + +multiclass T_HVX_vmpyacc_VWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_VVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + + +defm V6_vtmpyb_acc : + T_HVX_vmpyacc_WWR <"$dst.h += 
vtmpy($src1.b,$src2.b)">, + V6_vtmpyb_acc_enc; +defm V6_vtmpybus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">, + V6_vtmpybus_acc_enc; +defm V6_vtmpyhb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">, + V6_vtmpyhb_acc_enc; +defm V6_vdmpyhb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_acc_enc; +defm V6_vrmpyub_acc : + T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyub_acc_enc; +defm V6_vrmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybus_acc_enc; +defm V6_vdmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_acc_enc; +defm V6_vdmpybus_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_dv_acc_enc; +defm V6_vdmpyhsuisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">, + V6_vdmpyhsuisat_acc_enc; +defm V6_vdmpyhisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhisat_acc_enc; +defm V6_vdmpyhb_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_dv_acc_enc; +defm V6_vmpybus_acc : + T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybus_acc_enc; +defm V6_vmpabus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">, + V6_vmpabus_acc_enc; +defm V6_vmpahb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">, + V6_vmpahb_acc_enc; +defm V6_vmpyhsat_acc : + T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">, + V6_vmpyhsat_acc_enc; +defm V6_vmpyuh_acc : + T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuh_acc_enc; +defm V6_vmpyiwb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">, + V6_vmpyiwb_acc_enc; +defm V6_vdsaduh_acc : + T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">, + V6_vdsaduh_acc_enc; +defm V6_vmpyihb_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">, + V6_vmpyihb_acc_enc; +defm V6_vmpyub_acc : + T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyub_acc_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhsusat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">, + V6_vdmpyhsusat_acc_enc; +defm V6_vdmpyhsat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhsat_acc_enc; +defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR + <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vaslw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc; +defm V6_vasrw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc; +} + +defm V6_vdmpyhvsat_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhvsat_acc_enc; +defm V6_vmpybusv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybusv_acc_enc; +defm V6_vmpybv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc; +defm V6_vmpyhus_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc; +defm V6_vmpyhv_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc; +defm V6_vmpyiewh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">, + V6_vmpyiewh_acc_enc; +defm V6_vmpyiewuh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">, + V6_vmpyiewuh_acc_enc; +defm V6_vmpyih_acc : + T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc; +defm V6_vmpyowh_rnd_sacc : + 
T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">, + V6_vmpyowh_rnd_sacc_enc; +defm V6_vmpyowh_sacc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">, + V6_vmpyowh_sacc_enc; +defm V6_vmpyubv_acc : + T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyubv_acc_enc; +defm V6_vmpyuhv_acc : + T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuhv_acc_enc; +defm V6_vrmpybusv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybusv_acc_enc; +defm V6_vrmpybv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc; +defm V6_vrmpyubv_acc : + T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyubv_acc_enc; + + +class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_vcmp <string asmString> { + def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc; +defm V6_veqh_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc; +defm V6_veqw_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc; +defm V6_vgtb_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc; +defm V6_vgth_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc; +defm V6_vgtw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc; +defm V6_vgtub_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc; +defm V6_vgtuh_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc; +defm V6_vgtuw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc; +defm V6_veqb_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc; +defm V6_veqh_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc; +defm V6_veqw_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc; +defm V6_vgtb_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc; +defm V6_vgth_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc; +defm V6_vgtw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc; +defm V6_vgtub_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc; +defm V6_vgtuh_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc; +defm V6_vgtuw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc; +defm V6_veqb_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc; +defm V6_veqh_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc; +defm V6_veqw_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc; +defm V6_vgtb_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc; +defm V6_vgth_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc; +defm V6_vgtw_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc; +defm V6_vgtub_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc; +defm V6_vgtuh_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc; +defm V6_vgtuw_xor : + 
T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc; + +defm V6_vminub : + T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc; +defm V6_vminuh : + T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc; +defm V6_vminh : + T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc; +defm V6_vminw : + T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc; +defm V6_vmaxub : + T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc; +defm V6_vmaxuh : + T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc; +defm V6_vmaxh : + T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc; +defm V6_vmaxw : + T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc; +defm V6_vshuffeb : + T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc; +defm V6_vshuffob : + T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc; +defm V6_vshufeh : + T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc; +defm V6_vshufoh : + T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vmpyowh_rnd : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">, + V6_vmpyowh_rnd_enc; +defm V6_vmpyiewuh : + T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc; +defm V6_vmpyewuh : + T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc; +defm V6_vmpyowh : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc; +defm V6_vmpyiowh : + T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc; +} +let Itinerary = CVI_VX, Type = TypeCVI_VX in +defm V6_vmpyieoh : + T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in { +defm V6_vshufoeh : + T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc; +defm V6_vshufoeb : + T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc; +} + +let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +defm V6_vcombine : + T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc; + +def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, + SDTCisSubVecOfVec<1, 0>]>; + +def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>; + +def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs), + (v16i32 VectorRegs:$Vt))), + (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>, + Requires<[UseHVXSgl]>; +def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), + (v32i32 VecDblRegs:$Vt))), + (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, + Requires<[UseHVXDbl]>; + +let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in { +defm V6_vsathub : + T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc; +defm V6_vsatwh : + T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vroundwh : + T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc; +defm V6_vroundwuh : + T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc; +defm V6_vroundhb : + T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc; +defm V6_vroundhub : + T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc; +defm V6_vasrwv : + T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc; +defm V6_vlsrwv : + T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc; +defm V6_vlsrhv : + 
T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc; +defm V6_vasrhv : + T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc; +defm V6_vaslwv : + T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc; +defm V6_vaslhv : + T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc; +} + +defm V6_vaddb : + T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc; +defm V6_vaddh : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdelta : + T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc; +defm V6_vrdelta : + T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc; +defm V6_vdealb4w : + T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc; +defm V6_vpackeb : + T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc; +defm V6_vpackeh : + T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc; +defm V6_vpackhub_sat : + T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc; +defm V6_vpackhb_sat : + T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc; +defm V6_vpackwuh_sat : + T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc; +defm V6_vpackwh_sat : + T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc; +defm V6_vpackob : + T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc; +defm V6_vpackoh : + T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc; +} + +let hasNewValue = 1, hasSideEffects = 0 in +class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2> + : CVI_VA_Resource1 <(outs RC2:$dst), + (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_condALU <string asmString> { + def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">, + V6_vaddbq_enc; +defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">, + V6_vaddhq_enc; +defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">, + V6_vaddwq_enc; +defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">, + V6_vsubbq_enc; +defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">, + V6_vsubhq_enc; +defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">, + V6_vsubwq_enc; +defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">, + V6_vaddbnq_enc; +defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">, + V6_vaddhnq_enc; +defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">, + V6_vaddwnq_enc; +defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">, + V6_vsubbnq_enc; +defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">, + V6_vsubhnq_enc; +defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">, + V6_vsubwnq_enc; + +let hasNewValue = 1 in +class T_HVX_alu_2op <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + 
def NAME#_128B : T_HVX_alu_2op <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +let hasNewValue = 1 in +multiclass T_HVX_alu_2op_VV <string asmString>: + T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_2op_WV <string asmString>: + T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>; + + +defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">, + V6_vabsh_enc; +defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">, + V6_vabsw_enc; +defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">, + V6_vabsh_sat_enc; +defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">, + V6_vabsw_sat_enc; +defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">, + V6_vnot_enc; +defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">, + V6_vassign_enc; + +defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">, + V6_vzb_enc; +defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">, + V6_vzh_enc; +defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">, + V6_vsb_enc; +defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">, + V6_vsh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">, + V6_vdealh_enc; +defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">, + V6_vdealb_enc; +defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">, + V6_vshuffh_enc; +defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">, + V6_vshuffb_enc; +} + +let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in { +defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">, + V6_vunpackub_enc; +defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">, + V6_vunpackuh_enc; +defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">, + V6_vunpackb_enc; +defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">, + V6_vunpackh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">, + V6_vcl0w_enc; +defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">, + V6_vcl0h_enc; +defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">, + V6_vnormamtw_enc; +defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">, + V6_vnormamth_enc; +defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">, + V6_vpopcounth_enc; +} + +let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG, + Type = TypeCVI_VX_DV in +class T_HVX_vmpyacc2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1 <(outs RC:$dst), + (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString, [], "$dst = $_src_" > ; + + +multiclass T_HVX_vmpyacc2 <string asmString> { + def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>; +} + +defm V6_vrmpybusi_acc : + T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">, + V6_vrmpybusi_acc_enc; +defm V6_vrsadubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">, + V6_vrsadubi_acc_enc; +defm V6_vrmpyubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">, + V6_vrmpyubi_acc_enc; + + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in +class T_HVX_vmpy2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString>; + + +multiclass T_HVX_vmpy2 <string asmString> { + 
def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi :
+  T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
+defm V6_vrsadubi :
+  T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
+defm V6_vrmpyubi :
+  T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
+
+
+let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
+    hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
+class T_HVX_perm <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
+    (ins RC:$src1, RC:$src2, IntRegs:$src3),
+    asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
+
+multiclass T_HVX_perm <string asmString> {
+  def NAME : T_HVX_perm <asmString, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
+  defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
+  defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
+}
+
+// Conditional vector move.
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_cmov <bit isPredNot, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
+    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
+  let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_cmov <bit isPredNot = 0> {
+  def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
+}
+
+defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
+defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
+
+// Conditional vector combine.
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1, + hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in +class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 < (outs RCout:$dst), + (ins PredRegs:$src1, RCin:$src2, RCin:$src3), + "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> { + let isPredicatedFalse = isPredNot; +} + +multiclass T_HVX_ccombine <bit isPredNot = 0> { + def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc; +defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc; + +let hasNewValue = 1 in +class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), + (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), + asmString >; + +multiclass T_HVX_shift <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_HVX_shift <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_shift_VV <string asmString>: + T_HVX_shift <asmString, VectorRegs, VectorRegs>; + +multiclass T_HVX_shift_WV <string asmString>: + T_HVX_shift <asmString, VecDblRegs, VectorRegs>; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in { +defm V6_valignb : + T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc; +defm V6_vlalignb : + T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrwh : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc; +defm V6_vasrwhsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwhsat_enc; +defm V6_vasrwhrndsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">, + V6_vasrwhrndsat_enc; +defm V6_vasrwuhsat : + T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwuhsat_enc; +defm V6_vasrhubsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">, + V6_vasrhubsat_enc; +defm V6_vasrhubrndsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhubrndsat_enc; +defm V6_vasrhbrndsat : + T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhbrndsat_enc; +} + +// Assembler mapped -- alias? 
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc; +let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in { +defm V6_vshuffvdd : + T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc; +defm V6_vdealvdd : + T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc; +} + +let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in +class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1), + asmString, [], "$dst = $_src_">; + +multiclass T_HVX_unpack <string asmString> { + def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc; +defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1, + hasSideEffects = 0 in +class T_HVX_valign <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3Imm:$src3), + asmString>; + +multiclass T_HVX_valign <string asmString> { + def NAME : T_HVX_valign <asmString, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>; +} + +defm V6_valignbi : + T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc; +defm V6_vlalignbi : + T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +class T_HVX_predAlu <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2), + asmString>; + +multiclass T_HVX_predAlu <string asmString> { + def NAME : T_HVX_predAlu <asmString, VecPredRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>; +} + +defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc; +defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc; +defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc; +defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc; +defm V6_pred_and_n : + T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_prednot <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1), + "$dst = not($src1)">, V6_pred_not_enc; + +def V6_pred_not : T_HVX_prednot <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >; + +multiclass T_HVX_vcmp2 <string asmString> { + def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc; +defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc; +defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc; +defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc; +defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc; +defm V6_vgtw : T_HVX_vcmp2 <"$dst = 
vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc; +defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc; +defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc; +defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc; + +let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc; + +def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>; + +let isAccumulator = 1 in +class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc; + +def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)" >, V6_vandqrt_enc; + +def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_lvsplatw <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsplat($src1)" >, V6_lvsplatw_enc; + +def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>; + + +let hasNewValue = 1 in +class T_V6_vinsertwr <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1), + "$dst.w = vinsert($src1)", [], "$dst = $_src_">, + V6_vinsertwr_enc; + +def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>; + + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in +class T_V6_pred_scalar2 <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsetq($src1)">, V6_pred_scalar2_enc; + +def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>; + +class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)">, V6_vandvrt_enc; + +def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>; + +let validSubTargets = HasV60SubT in +class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp > + : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>; + +class T_HVX_rol_R <string asmString> + : T_HVX_rol <asmString, IntRegs, u5Imm>; +class T_HVX_rol_P <string asmString> + : T_HVX_rol <asmString, DoubleRegs, u6Imm>; + +def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc; +let hasNewValue = 1, opNewValue = 0 in +def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc; + +let validSubTargets = HasV60SubT in +class T_HVX_rol_acc 
<string asmString, RegisterClass RC, Operand ImmOp>
+  : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
+    asmString, [], "$dst = $_src_" >;
+
+class T_HVX_rol_acc_P <string asmString>
+  : T_HVX_rol_acc <asmString, DoubleRegs, u6Imm>;
+
+class T_HVX_rol_acc_R <string asmString>
+  : T_HVX_rol_acc <asmString, IntRegs, u5Imm>;
+
+def S6_rol_i_p_nac :
+  T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
+def S6_rol_i_p_acc :
+  T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
+def S6_rol_i_p_and :
+  T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
+def S6_rol_i_p_or :
+  T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
+def S6_rol_i_p_xacc :
+  T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
+
+let hasNewValue = 1, opNewValue = 0 in {
+def S6_rol_i_r_nac :
+  T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
+def S6_rol_i_r_acc :
+  T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
+def S6_rol_i_r_and :
+  T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
+def S6_rol_i_r_or :
+  T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
+def S6_rol_i_r_xacc :
+  T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
+}
+
+let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
+class T_V6_extractw <RegisterClass RC>
+  : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
+    "$dst = vextract($src1,$src2)">, V6_extractw_enc;
+
+def V6_extractw : T_V6_extractw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
+
+let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
+class T_sys0op <string asmString>
+  : ST1Inst <(outs), (ins), asmString>;
+
+let isSolo = 1, validSubTargets = HasV55SubT in {
+def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
+def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
+def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
+}
+
+class T_sys1op <string asmString, RegisterClass RC>
+  : ST1Inst <(outs), (ins RC:$src1), asmString>;
+
+class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
+class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
+
+let isSoloAX = 1, validSubTargets = HasV55SubT in
+def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
+
+let isSolo = 1, validSubTargets = HasV60SubT in {
+def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
+def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
+}
+
+let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
+    validSubTargets = HasV55SubT in
+def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
+    "$dst = l2locka($src1)">, Y5_l2locka_enc;
+
+// Not defined on the etc side. Why?
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
+
+let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
+    hasSideEffects = 0,
+validSubTargets = HasV55SubT in
+def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
+    (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
+    "$dst1,$dst2 = vacsh($src1,$src2)", [],
+    "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
+    hasSideEffects = 0 in
+class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
+                  RegisterClass RCin2>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
+
+multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
+  def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
+                               VecPredRegs128B, VectorRegs128B>;
+}
+
+multiclass T_HVX_alu2_V <string asmString> :
+  T_HVX_alu2 <asmString, VectorRegs>;
+
+multiclass T_HVX_alu2_W <string asmString> :
+  T_HVX_alu2 <asmString, VecDblRegs>;
+
+defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
+    hasSideEffects = 0 in
+defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
+
+class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
+
+multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
+                        RegisterClass RCin> {
+  def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
+                                !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_V <string asmString> :
+  T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_W <string asmString> :
+  T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
+class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                       RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+    asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                            RegisterClass RCin> {
+  def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb_acc<asmString,
+                                   !cast<RegisterClass>(RCout#"128B"),
+                                   !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_acc_V <string asmString> :
+  T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_acc_W <string asmString> :
+  T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
+defm V6_vlutvvb:
+  T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
+defm V6_vlutvwh:
+  T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
+
+let hasNewValue = 1 in {
+  defm V6_vlutvvb_oracc:
+    T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
+    V6_vlutvvb_oracc_enc;
+  defm V6_vlutvwh_oracc:
+    T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
+    V6_vlutvwh_oracc_enc;
+}
+
+// This is a fake instruction; should it be defined at all?
+def S2_cabacencbin
+  : SInst2<(outs DoubleRegs:$dst),
+    (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+    "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
+
+// Vhist instructions
+def V6_vhistq
+  : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
+    "vhist($src1)">, V6_vhistq_enc;
+
+def V6_vhist
+  : CVI_HIST_Resource1 <(outs), (ins),
+    "vhist" >, V6_vhist_enc;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
index f4fb946..96dd531 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -35,6 +35,34 @@ multiclass bitconvert_64<ValueType a, ValueType b> {
             (a DoubleRegs:$src)>;
 }
 
+multiclass bitconvert_vec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VectorRegs:$src))),
+             (b VectorRegs:$src)>;
+  def : Pat <(a (bitconvert (b VectorRegs:$src))),
+             (a VectorRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecDblRegs:$src))),
+             (b VecDblRegs:$src)>;
+  def : Pat <(a (bitconvert (b VecDblRegs:$src))),
+             (a VecDblRegs:$src)>;
+}
+
+multiclass bitconvert_predvec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecPredRegs:$src))),
+             (b VectorRegs:$src)>;
+  def : Pat <(a (bitconvert (b VectorRegs:$src))),
+             (a VecPredRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec128B<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecDblRegs128B:$src))),
+             (b VecDblRegs128B:$src)>;
+  def : Pat <(a (bitconvert (b VecDblRegs128B:$src))),
+             (a VecDblRegs128B:$src)>;
+}
+
 // Bit convert vector types.
 defm : bitconvert_32<v4i8, i32>;
 defm : bitconvert_32<v2i16, i32>;
@@ -47,6 +75,21 @@ defm : bitconvert_64<v8i8, v4i16>;
 defm : bitconvert_64<v8i8, v2i32>;
 defm : bitconvert_64<v4i16, v2i32>;
 
+defm : bitconvert_vec<v64i8, v16i32>;
+defm : bitconvert_vec<v8i64, v16i32>;
+defm : bitconvert_vec<v32i16, v16i32>;
+
+defm : bitconvert_dblvec<v16i64, v128i8>;
+defm : bitconvert_dblvec<v32i32, v128i8>;
+defm : bitconvert_dblvec<v64i16, v128i8>;
+
+defm : bitconvert_dblvec128B<v64i32, v128i16>;
+defm : bitconvert_dblvec128B<v256i8, v128i16>;
+defm : bitconvert_dblvec128B<v32i64, v128i16>;
+
+defm : bitconvert_dblvec128B<v64i32, v256i8>;
+defm : bitconvert_dblvec128B<v32i64, v256i8>;
+defm : bitconvert_dblvec128B<v128i16, v256i8>;
 
 // Vector shift support. Vector shifting in Hexagon is rather different
 // from LLVM's internal representation.
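For readers following the bitconvert hunks above: each of those `defm` lines expands, by ordinary TableGen multiclass instantiation, into a pair of selection patterns that treat a same-width vector bitcast as a plain register reuse. A minimal sketch of the expansion, assuming only the multiclass definitions shown in the hunk (the spelled-out records below are illustrative, not lines of the patch):

  // What `defm : bitconvert_vec<v64i8, v16i32>;` instantiates: two Pat
  // records that select the bitconvert to the unchanged source register.
  def : Pat <(v16i32 (bitconvert (v64i8 VectorRegs:$src))),
             (v16i32 VectorRegs:$src)>;
  def : Pat <(v64i8 (bitconvert (v16i32 VectorRegs:$src))),
             (v64i8 VectorRegs:$src)>;

No instruction is emitted for either direction: both value types occupy the same HVX register class, so the cast is free at selection time.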
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 1d0d015..b207aaf 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -691,15 +691,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>; def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>; def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>; -def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>; +def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>; def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))), (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>; // Mux -def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>; -def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>; -def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>; +def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>; +def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>; +def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>; // Shift halfword def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>; @@ -720,17 +720,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>; def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>; def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>; -def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>; -def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>; -def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>; +def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>; +def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>; +def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>; -def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)), (i32 (C2_cmpgti (I32:$src1), - (DEC_CONST_SIGNED s8ExtPred:$src2)))>; + (DEC_CONST_SIGNED s32ImmPred:$src2)))>; -def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)), (i32 (C2_cmpgtui (I32:$src1), - (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>; + (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>; // The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0. def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)), @@ -1289,3 +1289,5 @@ def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>; include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" +include "HexagonIntrinsicsV60.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td new file mode 100644 index 0000000..24a3e4d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td @@ -0,0 +1,836 @@ +//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format. 
+// +//===----------------------------------------------------------------------===// + +let isCodeGenOnly = 1 in { +def HEXAGON_V6_vd0_pseudo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs:$dst, (int_hexagon_V6_vd0 ))]>; + +def HEXAGON_V6_vd0_pseudo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs128B:$dst, (int_hexagon_V6_vd0_128B ))]>; +} +let isPseudo = 1 in +def HEXAGON_V6_vassignp : CVI_VA_Resource<(outs VecDblRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=vassignp_W($src1)", + [(set VecDblRegs:$dst, (int_hexagon_V6_vassignp VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=vassignp_W_128B($src1)", + [(set VecDblRegs128B:$dst, (int_hexagon_V6_vassignp_128B + VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_lo VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_hi VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_lo_128B VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_hi_128B VecDblRegs128B:$src1))]>; + +let AddedComplexity = 100 in { +def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_loreg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_hireg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_loreg)) >, + Requires<[UseHVXDbl]>; + +def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_hireg)) >, + Requires<[UseHVXDbl]>; +} + +def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v64i8 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v64i8 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v8i64 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v8i64 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))), + (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))), + (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v64i8 (bitconvert (v512i1 VecPredRegs:$src1))), + (v64i8 (V6_vandqrt(v512i1 
VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v8i64 (bitconvert (v512i1 VecPredRegs:$src1))), + (v8i64 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v128i8 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v128i8 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v16i64 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v16i64 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v128i8 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v128i8 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v16i64 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v16i64 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +let AddedComplexity = 140 in { +def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai IntRegs:$addr, 0, + (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (load (i32 IntRegs:$addr))), + (v512i1 (V6_vandvrt + (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai_128B IntRegs:$addr, 0, + (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (load (i32 IntRegs:$addr))), + (v1024i1 (V6_vandvrt_128B + (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; +} + +multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>, + Requires<[UseHVXSgl]>; + def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1), + (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1), + (MI VectorRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1), + (MI VecPredRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2), + (MI VecDblRegs:$src1, IntRegs:$src2)>, + 
Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2), + (MI VectorRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2), + (MI VecDblRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2), + (MI VecDblRegs:$src1, VecDblRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2), + (MI VectorRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2), + (MI VecPredRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2), + (MI VecPredRegs:$src1, VecPredRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, 
VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + + +multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WRI_pat <InstHexagon MI, 
Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3), + (MI VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +defm : T_WR_pat<V6_vtmpyb, int_hexagon_V6_vtmpyb>; +defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>; +defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>; +defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>; +defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>; +defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>; +defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>; +defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>; +defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>; +defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>; +defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>; +defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>; +defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>; +defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>; +defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>; +defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>; +defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>; +defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>; +defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>; +defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>; +defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>; +defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>; +defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>; +defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>; +defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>; +defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>; +defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>; +defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>; +defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>; 
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>; +defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>; +defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>; + +defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>; +defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>; +defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>; +defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>; +defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>; +defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>; +defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>; +defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>; +defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>; +defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>; +defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>; +defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>; +defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>; +defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>; +defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>; +defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>; +defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>; +defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>; +defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>; +defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>; +defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>; +defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>; +defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>; +defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>; +defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>; +defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>; +defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>; +defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>; +defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>; +defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>; +defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>; +defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>; +defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>; +defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>; +defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>; +defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>; +defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>; +defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>; +defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>; +defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>; +defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>; +defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>; +defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>; +defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>; +defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>; +defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>; +defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>; +defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>; +defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>; +defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>; +defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>; +defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>; +defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>; +defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>; +defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>; +defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>; +defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>; +defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>; +defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>; +defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>; 
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>; +defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>; +defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>; +defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>; + +defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>; +defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>; +defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>; +defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>; +defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>; +defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>; +defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>; +defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>; +defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>; +defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>; +defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>; + +defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>; +defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>; + +defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>; +defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>; +defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>; +defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>; + +defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>; +defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>; +defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>; +defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>; +defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>; +defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>; +defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>; +defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>; + +defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>; +defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>; +defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>; +defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>; +defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>; +defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>; +defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>; +defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>; +defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>; +defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>; +defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>; +defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>; +defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>; +defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>; +defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>; + +// Compare instructions +defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>; +defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>; +defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>; +defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>; +defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>; +defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>; +defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>; +defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>; +defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>; +defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>; +defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>; +defm : T_QVV_pat 
<V6_veqw_or, int_hexagon_V6_veqw_or>; +defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>; +defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>; +defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>; +defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>; +defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>; +defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>; +defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>; +defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>; +defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>; +defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>; +defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>; +defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>; +defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>; +defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>; +defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>; + +defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>; +defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>; +defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>; +defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>; +defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>; +defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>; +defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>; +defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>; +defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>; +defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>; +defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>; +defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>; +defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>; +defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>; +defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>; +defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>; +defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>; +defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>; +defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>; +defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>; +defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>; +defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>; +defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>; +defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>; +defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>; +defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>; +defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>; +defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>; +defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>; +defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>; +defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>; +defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>; +defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>; +defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>; +defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>; +defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>; +defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>; +defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>; +defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>; +defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>; +defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>; +defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>; +defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>; +defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>; +defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>; +defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>; + +defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>; +defm : T_QVV_pat 
<V6_vaddhq, int_hexagon_V6_vaddhq>; +defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>; +defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>; +defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>; +defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>; +defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>; +defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>; +defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>; +defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>; +defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>; +defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>; + +defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>; +defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>; +defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>; +defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>; +defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>; +defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>; +defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>; +defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>; +defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>; +defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>; +defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>; +defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>; +defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>; +defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>; +defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>; +defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>; +defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>; +defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>; +defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>; +defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>; +defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>; +defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>; +defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>; + +defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>; +defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>; +defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>; + +defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>; +defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>; +defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>; + +// Assembler mapped. +//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>; +// Not present earlier; the intrinsic needs to be added. +defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>; +defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>; +defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>; +defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>; +defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>; +defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>; +defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>; +defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>; +defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>; + +defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>; +defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>; + +defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>; +defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>; +defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>; +defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>; + +defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>; +defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>; +defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>; +defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>; +defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>; +defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>; +defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>; +defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>; +defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>; +defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>; +defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>; +defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>; +defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>; +defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>; +defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>; +defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>; +defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>; + +defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>; +defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; +defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>; +defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>; +defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>; +defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>; + +defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>; +defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>; +defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>; +defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>; + +def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>; +def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>; +def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>; +def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>; +def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>; +def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>; +def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>; +def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>; +def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>; +def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>; +def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>; +def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>; + +defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>; +defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>; + +def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>; + +def: Pat<(v64i16 (trunc v64i32:$Vdd)), + (v64i16 (V6_vpackwh_sat_128B + (v32i32 (HEXAGON_V6_hi_128B VecDblRegs128B:$Vdd)), + 
(v32i32 (HEXAGON_V6_lo_128B VecDblRegs128B:$Vdd))))>, + Requires<[UseHVXDbl]>; + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 75189b6..624c0f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -26,39 +26,71 @@ using namespace llvm; -static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, - HexagonAsmPrinter& Printer) { +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + HexagonAsmPrinter &Printer) { MCContext &MC = Printer.OutContext; const MCExpr *ME; - ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC); + // Populate the relocation type based on Hexagon target flags + // set on an operand + MCSymbolRefExpr::VariantKind RelocationType; + switch (MO.getTargetFlags()) { + default: + RelocationType = MCSymbolRefExpr::VK_None; + break; + case HexagonII::MO_PCREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL; + break; + case HexagonII::MO_GOT: + RelocationType = MCSymbolRefExpr::VK_GOT; + break; + case HexagonII::MO_LO16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16; + break; + case HexagonII::MO_HI16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16; + break; + case HexagonII::MO_GPREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL; + break; + } + + ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC); if (!MO.isJTI() && MO.getOffset()) ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC), MC); - return (MCOperand::createExpr(ME)); + return MCOperand::createExpr(ME); } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, - HexagonAsmPrinter& AP) { - if(MI->getOpcode() == Hexagon::ENDLOOP0){ +void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP) { + if (MI->getOpcode() == Hexagon::ENDLOOP0) { HexagonMCInstrInfo::setInnerLoop(MCB); return; } - if(MI->getOpcode() == Hexagon::ENDLOOP1){ + if (MI->getOpcode() == Hexagon::ENDLOOP1) { HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst* MCI = new (AP.OutContext) MCInst; + MCInst *MCI = new (AP.OutContext) MCInst; MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && "MCI opcode should have been set on construction"); + bool MustExtend = false; for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCO; + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) + MustExtend = true; switch (MO.getType()) { default: @@ -73,11 +105,14 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, APFloat Val = MO.getFPImm()->getValueAPF(); // FP immediates are used only when setting GPRs, so they may be dealt // with like regular immediates from this point on. 
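// A minimal standalone sketch, assuming IEEE-754 single precision, of the
// bit reinterpretation this relies on (plain C++ with <cstdint> and
// <cstring>, not the LLVM API):
//
//   uint32_t floatBits(float F) {            // raw bit pattern of F
//     uint32_t Bits;
//     std::memcpy(&Bits, &F, sizeof(Bits));  // well-defined type punning
//     return Bits;                           // 1.0f -> 0x3f800000
//   }
//
// Val.bitcastToAPInt().getRawData() below performs the same
// reinterpretation for an arbitrary-width APFloat.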
- MCO = MCOperand::createImm(*Val.bitcastToAPInt().getRawData()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(), + AP.OutContext)); break; } case MachineOperand::MO_Immediate: - MCO = MCOperand::createImm(MO.getImm()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(MO.getImm(), AP.OutContext)); break; case MachineOperand::MO_MachineBasicBlock: MCO = MCOperand::createExpr @@ -104,5 +139,8 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, MCI->addOperand(MCO); } + AP.HexagonProcessInstruction(*MCI, *MI); + HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI, + MustExtend); MCB.addOperand(MCOperand::createInst(MCI)); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 35f732c..7a52d68 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -179,7 +179,11 @@ void VLIWMachineScheduler::schedule() { initQueues(TopRoots, BotRoots); bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + if (!checkSchedLimit()) break; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 707bfdb..20c4ab1 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -92,6 +92,7 @@ namespace { /// \brief A handle to the branch probability pass. const MachineBranchProbabilityInfo *MBPI; + bool isNewValueJumpCandidate(const MachineInstr *MI) const; }; } // end of anonymous namespace @@ -280,9 +281,9 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, return true; } -// Given a compare operator, return a matching New Value Jump -// compare operator. Make sure that MI here is included in -// HexagonInstrInfo.cpp::isNewValueJumpCandidate + +// Given a compare operator, return a matching New Value Jump compare operator. +// Make sure that MI here is included in isNewValueJumpCandidate. static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, bool secondRegNewified, MachineBasicBlock *jmpTarget, @@ -341,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t : Hexagon::J4_cmpgtui_t_jumpnv_nt; + case Hexagon::C4_cmpneq: + return taken ? Hexagon::J4_cmpeq_f_jumpnv_t + : Hexagon::J4_cmpeq_f_jumpnv_nt; + + case Hexagon::C4_cmplte: + if (secondRegNewified) + return taken ? Hexagon::J4_cmplt_f_jumpnv_t + : Hexagon::J4_cmplt_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgt_f_jumpnv_t + : Hexagon::J4_cmpgt_f_jumpnv_nt; + + case Hexagon::C4_cmplteu: + if (secondRegNewified) + return taken ? Hexagon::J4_cmpltu_f_jumpnv_t + : Hexagon::J4_cmpltu_f_jumpnv_nt; + return taken ? 
Hexagon::J4_cmpgtu_f_jumpnv_t + : Hexagon::J4_cmpgtu_f_jumpnv_nt; + default: llvm_unreachable("Could not find matching New Value Jump instruction."); } @@ -348,6 +367,26 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return 0; } +bool HexagonNewValueJump::isNewValueJumpCandidate(const MachineInstr *MI) + const { + switch (MI->getOpcode()) { + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtu: + case Hexagon::C2_cmpgtui: + case Hexagon::C4_cmpneq: + case Hexagon::C4_cmplte: + case Hexagon::C4_cmplteu: + return true; + + default: + return false; + } +} + + bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n" @@ -372,7 +411,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // Loop through all the bb's of the function for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n"); @@ -468,7 +507,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { MI->getOperand(0).getReg() == predReg) { // Not all compares can be new value compare. Arch Spec: 7.6.1.1 - if (QII->isNewValueJumpCandidate(MI)) { + if (isNewValueJumpCandidate(MI)) { assert((MI->getDesc().isCompare()) && "Only compare instruction can be collapsed into New Value Jump"); @@ -591,8 +630,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DebugLoc dl = MI->getDebugLoc(); MachineInstr *NewMI; - assert((QII->isNewValueJumpCandidate(cmpInstr)) && - "This compare is not a New Value Jump candidate."); + assert((isNewValueJumpCandidate(cmpInstr)) && + "This compare is not a New Value Jump candidate."); unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2, isSecondOpNewified, jmpTarget, MBPI); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td index 2bece8f..fbd29cd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td @@ -1,4 +1,4 @@ -//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===// +//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,59 +7,114 @@ // //===----------------------------------------------------------------------===// +def s32ImmOperand : AsmOperandClass { let Name = "s32Imm"; } +def s8ImmOperand : AsmOperandClass { let Name = "s8Imm"; } +def s8Imm64Operand : AsmOperandClass { let Name = "s8Imm64"; } +def s6ImmOperand : AsmOperandClass { let Name = "s6Imm"; } +def s4ImmOperand : AsmOperandClass { let Name = "s4Imm"; } def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; } def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; } def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; } def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; } - +def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; } +def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; } +def u64ImmOperand : AsmOperandClass { let Name = "u64Imm"; } +def u32ImmOperand : AsmOperandClass { let Name = "u32Imm"; } +def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; } +def u16ImmOperand : AsmOperandClass { let Name = "u16Imm"; } +def u16_0ImmOperand : AsmOperandClass { let 
Name = "u16_0Imm"; } +def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; } +def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; } +def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; } +def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; } +def u10ImmOperand : AsmOperandClass { let Name = "u10Imm"; } +def u9ImmOperand : AsmOperandClass { let Name = "u9Imm"; } +def u8ImmOperand : AsmOperandClass { let Name = "u8Imm"; } +def u7ImmOperand : AsmOperandClass { let Name = "u7Imm"; } +def u6ImmOperand : AsmOperandClass { let Name = "u6Imm"; } +def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; } +def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; } +def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; } +def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; } +def u5ImmOperand : AsmOperandClass { let Name = "u5Imm"; } +def u4ImmOperand : AsmOperandClass { let Name = "u4Imm"; } +def u3ImmOperand : AsmOperandClass { let Name = "u3Imm"; } +def u2ImmOperand : AsmOperandClass { let Name = "u2Imm"; } +def u1ImmOperand : AsmOperandClass { let Name = "u1Imm"; } +def n8ImmOperand : AsmOperandClass { let Name = "n8Imm"; } // Immediate operands. -let PrintMethod = "printImmOperand" in { - def s32Imm : Operand<i32>; - def s8Imm : Operand<i32>; - def s8Imm64 : Operand<i64>; - def s6Imm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE", + DecoderMethod = "unsignedImmDecoder" in { + def s32Imm : Operand<i32> { let ParserMatchClass = s32ImmOperand; + let DecoderMethod = "s32ImmDecoder"; } + def s8Imm : Operand<i32> { let ParserMatchClass = s8ImmOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s8Imm64 : Operand<i64> { let ParserMatchClass = s8Imm64Operand; + let DecoderMethod = "s8ImmDecoder"; } + def s6Imm : Operand<i32> { let ParserMatchClass = s6ImmOperand; + let DecoderMethod = "s6_0ImmDecoder"; } def s6_3Imm : Operand<i32>; - def s4Imm : Operand<i32>; - def s4_0Imm : Operand<i32> { let DecoderMethod = "s4_0ImmDecoder"; } - def s4_1Imm : Operand<i32> { let DecoderMethod = "s4_1ImmDecoder"; } - def s4_2Imm : Operand<i32> { let DecoderMethod = "s4_2ImmDecoder"; } - def s4_3Imm : Operand<i32> { let DecoderMethod = "s4_3ImmDecoder"; } - def u64Imm : Operand<i64>; - def u32Imm : Operand<i32>; - def u26_6Imm : Operand<i32>; - def u16Imm : Operand<i32>; - def u16_0Imm : Operand<i32>; - def u16_1Imm : Operand<i32>; - def u16_2Imm : Operand<i32>; - def u16_3Imm : Operand<i32>; - def u11_3Imm : Operand<i32>; - def u10Imm : Operand<i32>; - def u9Imm : Operand<i32>; - def u8Imm : Operand<i32>; - def u7Imm : Operand<i32>; - def u6Imm : Operand<i32>; - def u6_0Imm : Operand<i32>; - def u6_1Imm : Operand<i32>; - def u6_2Imm : Operand<i32>; - def u6_3Imm : Operand<i32>; - def u5Imm : Operand<i32>; + def s4Imm : Operand<i32> { let ParserMatchClass = s4ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; + let DecoderMethod = "s4_1ImmDecoder"; } + def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; + let DecoderMethod = "s4_2ImmDecoder"; } + def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; + let DecoderMethod = "s4_3ImmDecoder"; } + def u64Imm : Operand<i64> { let ParserMatchClass = u64ImmOperand; } + def u32Imm : Operand<i32> { let ParserMatchClass = u32ImmOperand; } + def u26_6Imm : Operand<i32> { let ParserMatchClass = 
u26_6ImmOperand; } + def u16Imm : Operand<i32> { let ParserMatchClass = u16ImmOperand; } + def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; } + def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; } + def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; } + def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; } + def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; } + def u10Imm : Operand<i32> { let ParserMatchClass = u10ImmOperand; } + def u9Imm : Operand<i32> { let ParserMatchClass = u9ImmOperand; } + def u8Imm : Operand<i32> { let ParserMatchClass = u8ImmOperand; } + def u7Imm : Operand<i32> { let ParserMatchClass = u7ImmOperand; } + def u6Imm : Operand<i32> { let ParserMatchClass = u6ImmOperand; } + def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; } + def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; } + def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; } + def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; } + def u5Imm : Operand<i32> { let ParserMatchClass = u5ImmOperand; } + def u5_0Imm : Operand<i32>; + def u5_1Imm : Operand<i32>; def u5_2Imm : Operand<i32>; def u5_3Imm : Operand<i32>; - def u4Imm : Operand<i32>; + def u4Imm : Operand<i32> { let ParserMatchClass = u4ImmOperand; } def u4_0Imm : Operand<i32>; + def u4_1Imm : Operand<i32>; def u4_2Imm : Operand<i32>; - def u3Imm : Operand<i32>; + def u4_3Imm : Operand<i32>; + def u3Imm : Operand<i32> { let ParserMatchClass = u3ImmOperand; } def u3_0Imm : Operand<i32>; def u3_1Imm : Operand<i32>; - def u2Imm : Operand<i32>; - def u1Imm : Operand<i32>; - def n8Imm : Operand<i32>; - def m6Imm : Operand<i32>; + def u3_2Imm : Operand<i32>; + def u3_3Imm : Operand<i32>; + def u2Imm : Operand<i32> { let ParserMatchClass = u2ImmOperand; } + def u1Imm : Operand<i32> { let ParserMatchClass = u1ImmOperand; } + def n8Imm : Operand<i32> { let ParserMatchClass = n8ImmOperand; } } -let PrintMethod = "printNOneImmOperand" in -def nOneImm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE" in { + def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand; + let PrintMethod = "prints4_6ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand; + let PrintMethod = "prints3_6ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} + def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} +} // // Immediate predicates @@ -81,32 +136,12 @@ def s31_1ImmPred : PatLeaf<(i32 imm), [{ def s30_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); + return isShiftedInt<30,2>(v); }]>; def s29_3ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); -}]>; - -def s22_10ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<22,10>(v); -}]>; - -def s8_24ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<8,24>(v); -}]>; - -def s16_16ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<16,16>(v); -}]>; - -def s26_6ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<26,6>(v); + return isShiftedInt<29,3>(v); 
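// isShiftedInt<N, S>(x) is equivalent to isInt<N + S>(x) combined with
// x % (1 << S) == 0, i.e. an S-bit-aligned signed immediate with N
// significant bits above the alignment; here, a multiple of 8 that fits
// in 32 signed bits.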
}]>; def s16ImmPred : PatLeaf<(i32 imm), [{ @@ -114,16 +149,6 @@ def s16ImmPred : PatLeaf<(i32 imm), [{ return isInt<16>(v); }]>; -def s13ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<13>(v); -}]>; - -def s12ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<12>(v); -}]>; - def s11_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<11>(v); @@ -149,16 +174,6 @@ def s10ImmPred : PatLeaf<(i32 imm), [{ return isInt<10>(v); }]>; -def s9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v); -}]>; - -def m9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v) && (v != -256); -}]>; - def s8ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<8>(v); @@ -194,7 +209,6 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4,3>(v); }]>; - def u64ImmPred : PatLeaf<(i64 imm), [{ // Adding "N ||" to suppress gcc unused warning. return (N || true); @@ -230,19 +244,19 @@ def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26,6>(v); }]>; -def u16ImmPred : PatLeaf<(i32 imm), [{ +def u16_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<16>(v); }]>; -def u16_s8ImmPred : PatLeaf<(i32 imm), [{ +def u16_1ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedUInt<16,8>(v); + return isShiftedUInt<16,1>(v); }]>; -def u16_0ImmPred : PatLeaf<(i32 imm), [{ +def u16_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isUInt<16>(v); + return isShiftedUInt<16,2>(v); }]>; def u11_3ImmPred : PatLeaf<(i32 imm), [{ @@ -250,6 +264,11 @@ def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11,3>(v); }]>; +def u10ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<10>(v); +}]>; + def u9ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<9>(v); @@ -321,6 +340,11 @@ def u1ImmPred : PatLeaf<(i1 imm), [{ return isUInt<1>(v); }]>; +def u1ImmPred32 : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<1>(v); +}]>; + def m5BImmPred : PatLeaf<(i32 imm), [{ // m5BImmPred predicate - True if the (char) number is in range -1 .. -31 // and will fit in a 5 bit field when made positive, for use in memops. @@ -379,7 +403,7 @@ def Clr5ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr5ImmPred : PatLeaf<(i32 imm), [{ - // SetClr5ImmPred predicate - True if the immediate is in range 0..31. + // True if the immediate is in range 0..31. int32_t v = (int32_t)N->getSExtValue(); return (v >= 0 && v <= 31); }]>; @@ -404,14 +428,13 @@ def Clr4ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr4ImmPred : PatLeaf<(i32 imm), [{ - // SetClr4ImmPred predicate - True if the immediate is in the range 0..15. + // True if the immediate is in the range 0..15. int16_t v = (int16_t)N->getSExtValue(); return (v >= 0 && v <= 15); }]>; def Set3ImmPred : PatLeaf<(i32 imm), [{ - // Set3ImmPred predicate - True if the number is in the series of values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit immediate. uint8_t v = (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. 
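The single-bit test behind these Set*/Clr* predicates is the usual
power-of-two check. A self-contained C++ sketch of the same logic (the
function name is illustrative, not part of the patch):

    #include <cstdint>

    // True iff exactly one bit of V is set, i.e. V is a power of two.
    // Mirrors "constrain to 8 bits, then check for single bit" above.
    static bool isSingleBit(uint8_t V) {
      return V != 0 && (V & (V - 1)) == 0;
    }

    int main() {
      return (isSingleBit(0x40) && !isSingleBit(0x41)) ? 0 : 1;
    }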
@@ -419,9 +442,7 @@ def Set3ImmPred : PatLeaf<(i32 imm), [{ }]>; def Clr3ImmPred : PatLeaf<(i32 imm), [{ - // Clr3ImmPred predicate - True if the number is in the series of - // bit negated values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of bit negated values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit and clrbit immediate. uint8_t v = ~ (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. @@ -429,76 +450,109 @@ def Clr3ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr3ImmPred : PatLeaf<(i32 imm), [{ - // SetClr3ImmPred predicate - True if the immediate is in the range 0..7. + // True if the immediate is in the range 0..7. int8_t v = (int8_t)N->getSExtValue(); return (v >= 0 && v <= 7); }]>; // Extendable immediate operands. - -let PrintMethod = "printExtOperand" in { - def f32Ext : Operand<f32>; - def s16Ext : Operand<i32> { let DecoderMethod = "s16ImmDecoder"; } - def s12Ext : Operand<i32> { let DecoderMethod = "s12ImmDecoder"; } - def s11_0Ext : Operand<i32> { let DecoderMethod = "s11_0ImmDecoder"; } - def s11_1Ext : Operand<i32> { let DecoderMethod = "s11_1ImmDecoder"; } - def s11_2Ext : Operand<i32> { let DecoderMethod = "s11_2ImmDecoder"; } - def s11_3Ext : Operand<i32> { let DecoderMethod = "s11_3ImmDecoder"; } - def s10Ext : Operand<i32> { let DecoderMethod = "s10ImmDecoder"; } - def s9Ext : Operand<i32> { let DecoderMethod = "s90ImmDecoder"; } - def s8Ext : Operand<i32> { let DecoderMethod = "s8ImmDecoder"; } - def s7Ext : Operand<i32>; - def s6Ext : Operand<i32> { let DecoderMethod = "s6_0ImmDecoder"; } - def u6Ext : Operand<i32>; - def u7Ext : Operand<i32>; - def u8Ext : Operand<i32>; - def u9Ext : Operand<i32>; - def u10Ext : Operand<i32>; - def u6_0Ext : Operand<i32>; - def u6_1Ext : Operand<i32>; - def u6_2Ext : Operand<i32>; - def u6_3Ext : Operand<i32>; +def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; } +def s16ExtOperand : AsmOperandClass { let Name = "s16Ext"; } +def s12ExtOperand : AsmOperandClass { let Name = "s12Ext"; } +def s10ExtOperand : AsmOperandClass { let Name = "s10Ext"; } +def s9ExtOperand : AsmOperandClass { let Name = "s9Ext"; } +def s8ExtOperand : AsmOperandClass { let Name = "s8Ext"; } +def s7ExtOperand : AsmOperandClass { let Name = "s7Ext"; } +def s6ExtOperand : AsmOperandClass { let Name = "s6Ext"; } +def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; } +def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; } +def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; } +def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; } +def u6ExtOperand : AsmOperandClass { let Name = "u6Ext"; } +def u7ExtOperand : AsmOperandClass { let Name = "u7Ext"; } +def u8ExtOperand : AsmOperandClass { let Name = "u8Ext"; } +def u9ExtOperand : AsmOperandClass { let Name = "u9Ext"; } +def u10ExtOperand : AsmOperandClass { let Name = "u10Ext"; } +def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; } +def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; } +def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; } +def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; } +def u32MustExtOperand : AsmOperandClass { let Name = "u32MustExt"; } + + + +let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand", + DecoderMethod = "unsignedImmDecoder" in { + def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; } + def s16Ext : Operand<i32> { let ParserMatchClass = s16ExtOperand; + let DecoderMethod = "s16ImmDecoder"; } + def s12Ext : 
Operand<i32> { let ParserMatchClass = s12ExtOperand; + let DecoderMethod = "s12ImmDecoder"; } + def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand; + let DecoderMethod = "s11_0ImmDecoder"; } + def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand; + let DecoderMethod = "s11_1ImmDecoder"; } + def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand; + let DecoderMethod = "s11_2ImmDecoder"; } + def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand; + let DecoderMethod = "s11_3ImmDecoder"; } + def s10Ext : Operand<i32> { let ParserMatchClass = s10ExtOperand; + let DecoderMethod = "s10ImmDecoder"; } + def s9Ext : Operand<i32> { let ParserMatchClass = s9ExtOperand; + let DecoderMethod = "s90ImmDecoder"; } + def s8Ext : Operand<i32> { let ParserMatchClass = s8ExtOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s7Ext : Operand<i32> { let ParserMatchClass = s7ExtOperand; } + def s6Ext : Operand<i32> { let ParserMatchClass = s6ExtOperand; + let DecoderMethod = "s6_0ImmDecoder"; } + def u6Ext : Operand<i32> { let ParserMatchClass = u6ExtOperand; } + def u7Ext : Operand<i32> { let ParserMatchClass = u7ExtOperand; } + def u8Ext : Operand<i32> { let ParserMatchClass = u8ExtOperand; } + def u9Ext : Operand<i32> { let ParserMatchClass = u9ExtOperand; } + def u10Ext : Operand<i32> { let ParserMatchClass = u10ExtOperand; } + def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; } + def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; } + def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; } + def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; } + def u32MustExt : Operand<i32> { let ParserMatchClass = u32MustExtOperand; } } -def s10ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<10>(v)) - return true; - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); +def s4_7ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in an 11-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<4,7>(v); + return false; }]>; -def s8ExtPred : PatLeaf<(i32 imm), [{ +def s3_7ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<3,7>(v); + return false; }]>; -def u8ExtPred : PatLeaf<(i32 imm), [{ +def s4_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 64-byte aligned. 
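// For instance, isShiftedInt<4,6> accepts exactly the multiples of 64 in
// [-512, 448]: v = -512 and v = 448 pass, while v = 96 is rejected
// because it is not 64-byte aligned.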
+ return isShiftedInt<4,6>(v); + return false; }]>; -def u9ExtPred : PatLeaf<(i32 imm), [{ +def s3_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<9>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 9-bit sign extended field and + // is 64-byte aligned. + return isShiftedInt<3,6>(v); + return false; }]>; @@ -523,21 +577,21 @@ let PrintMethod = "printGlobalOperand" in { let PrintMethod = "printJumpTable" in def jumptablebase : Operand<i32>; -def brtarget : Operand<OtherVT>; +def brtarget : Operand<OtherVT> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} def brtargetExt : Operand<OtherVT> { - let PrintMethod = "printExtBrtarget"; + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} +def calltarget : Operand<i32> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; } -def calltarget : Operand<i32>; def bblabel : Operand<i32>; -def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf , [], "BasicBlockSDNode">; - -def symbolHi32 : Operand<i32> { - let PrintMethod = "printSymbolHi"; -} -def symbolLo32 : Operand<i32> { - let PrintMethod = "printSymbolLo"; -} +def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">; // Return true if for a 32 to 64-bit sign-extended load. def is_sext_i32 : PatLeaf<(i64 DoubleRegs:$src1), [{ diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp new file mode 100644 index 0000000..1723771 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -0,0 +1,150 @@ +//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Pass that removes sign extends for function parameters. 
These parameters +// are already sign extended by the caller per Hexagon's ABI. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" + +#include "Hexagon.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonOptimizeSZextends(); + void initializeHexagonOptimizeSZextendsPass(PassRegistry&); +} + +namespace { + struct HexagonOptimizeSZextends : public FunctionPass { + public: + static char ID; + HexagonOptimizeSZextends() : FunctionPass(ID) { + initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "Remove sign extends"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineFunctionAnalysis>(); + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addPreserved<StackProtector>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool intrinsicAlreadySextended(Intrinsic::ID IntID); + }; +} + +char HexagonOptimizeSZextends::ID = 0; + +INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs", + "Remove Sign and Zero Extends for Args", false, false) + +bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) { + switch(IntID) { + case llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll: + return true; + default: + break; + } + return false; +} + +bool HexagonOptimizeSZextends::runOnFunction(Function &F) { + unsigned Idx = 1; + // Try to optimize sign extends in formal parameters. This relies on the + // caller having already sign extended the values, as the Hexagon ABI + // requires for arguments marked 'sext'. + for (auto &Arg : F.args()) { + if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) { + if (!isa<PointerType>(Arg.getType())) { + for (auto UI = Arg.use_begin(); UI != Arg.use_end();) { + if (isa<SExtInst>(*UI)) { + Instruction* Use = cast<Instruction>(*UI); + SExtInst* SI = new SExtInst(&Arg, Use->getType()); + assert (EVT::getEVT(SI->getType()) == + (EVT::getEVT(Use->getType()))); + ++UI; + Use->replaceAllUsesWith(SI); + Instruction* First = &F.getEntryBlock().front(); + SI->insertBefore(First); + Use->eraseFromParent(); + } else { + ++UI; + } + } + } + } + ++Idx; + } + + // Try to remove redundant sext operations on Hexagon. The hardware + // already sign extends many 16 bit intrinsic operations to 32 bits. + // For example: + // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y) + // %sext233 = shl i32 %34, 16 + // %conv52 = ashr exact i32 %sext233, 16 + for (auto &B : F) { + for (auto &I : B) { + // Look for arithmetic shift right by 16. + BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I); + if (!(Ashr && Ashr->getOpcode() == Instruction::AShr)) + continue; + Value *AshrOp1 = Ashr->getOperand(1); + ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1); + // Right shifted by 16. + if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Ashr comes from logical shift left. + Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0)); + if (!(Shl && Shl->getOpcode() == Instruction::Shl)) + continue; + Value *Intr = Shl->getOperand(0); + Value *ShlOp1 = Shl->getOperand(1); + C = dyn_cast<ConstantInt>(ShlOp1); + // Left shifted by 16. 
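// Together with the right-shift check above, this matches the canonical
// sext-of-truncate idiom: (x << 16) >> 16 with an arithmetic right shift
// sign extends the low 16 bits of x, so the shift pair is redundant when
// x already comes from an intrinsic that sign-extends its 16-bit result.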
+ if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Shl comes from an intrinsic. + if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) { + if (!intrinsicAlreadySextended(I->getIntrinsicID())) + continue; + // All is well. Replace all uses of AShr with I. + for (auto UI = Ashr->user_begin(), UE = Ashr->user_end(); + UI != UE; ++UI) { + const Use &TheUse = UI.getUse(); + if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) { + J->replaceUsesOfWith(Ashr, I); + } + } + } + } + } + + return true; +} + + +FunctionPass *llvm::createHexagonOptimizeSZextends() { + return new HexagonOptimizeSZextends(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index 93dcbe2..e68ff85 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -124,7 +124,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; PeepholeMap.clear(); PeepholeDoubleRegsMap.clear(); @@ -180,7 +180,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { unsigned DstReg = Dst.getReg(); unsigned SrcReg = Src1.getReg(); PeepholeDoubleRegsMap[DstReg] = - std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/); + std::make_pair(*&SrcReg, Hexagon::subreg_hireg); } // Look for P=NOT(P). diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp new file mode 100644 index 0000000..06719cd --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp @@ -0,0 +1,60 @@ +//===--- HexagonRDF.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonRDF.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" + +#include "llvm/CodeGen/MachineInstr.h" + +using namespace llvm; +using namespace rdf; + +bool HexagonRegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + + if (TargetRegisterInfo::isVirtualRegister(RA.Reg) && + TargetRegisterInfo::isVirtualRegister(RB.Reg)) { + // Hexagon-specific cases. + if (RA.Reg == RB.Reg) { + if (RA.Sub == 0) + return true; + if (RB.Sub == 0) + return false; + } + } + + return RegisterAliasInfo::covers(RA, RB); +} + +bool HexagonRegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) + const { + if (RRs.count(RR)) + return true; + + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // Check if both covering subregisters are present. + bool HasLo = RRs.count({RR.Reg, Hexagon::subreg_loreg}); + bool HasHi = RRs.count({RR.Reg, Hexagon::subreg_hireg}); + if (HasLo && HasHi) + return true; + } + + if (RR.Sub == 0) { + // Check if both covering subregisters are present. 
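// "Covers" means the halves jointly define the whole register: for a
// register pair such as D0 = R1:R0, having both R0 and R1 in RRs covers
// a reference to D0 that carries no subregister index.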
+ unsigned Lo = TRI.getSubReg(RR.Reg, Hexagon::subreg_loreg); + unsigned Hi = TRI.getSubReg(RR.Reg, Hexagon::subreg_hireg); + if (RRs.count({Lo, 0}) && RRs.count({Hi, 0})) + return true; + } + + return RegisterAliasInfo::covers(RRs, RR); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h new file mode 100644 index 0000000..00c1889 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h @@ -0,0 +1,28 @@ +//===--- HexagonRDF.h -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGON_RDF_H +#define HEXAGON_RDF_H +#include "RDFGraph.h" + +namespace llvm { + class TargetRegisterInfo; +} + +namespace rdf { + struct HexagonRegisterAliasInfo : public RegisterAliasInfo { + HexagonRegisterAliasInfo(const TargetRegisterInfo &TRI) + : RegisterAliasInfo(TRI) {} + bool covers(RegisterRef RA, RegisterRef RR) const override; + bool covers(const RegisterSet &RRs, RegisterRef RR) const override; + }; +} + +#endif + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp new file mode 100644 index 0000000..3fcda984 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -0,0 +1,272 @@ +//===--- HexagonRDFOpt.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonInstrInfo.h" +#include "HexagonRDF.h" +#include "HexagonSubtarget.h" +#include "RDFCopy.h" +#include "RDFDeadCode.h" +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace llvm { + void initializeHexagonRDFOptPass(PassRegistry&); + FunctionPass *createHexagonRDFOpt(); +} + +namespace { + cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX)); + unsigned RDFCount = 0; + cl::opt<bool> RDFDump("rdf-dump", cl::init(false)); + + class HexagonRDFOpt : public MachineFunctionPass { + public: + HexagonRDFOpt() : MachineFunctionPass(ID) { + initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + const char *getPassName() const override { + return "Hexagon RDF optimizations"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + + private: + MachineDominatorTree *MDT; + MachineRegisterInfo *MRI; + }; + + char HexagonRDFOpt::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, 
false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) + + +struct HexagonDCE : public DeadCodeElimination { + HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI) + : DeadCodeElimination(G, MRI) {} + bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove); + void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum); + + bool run(); +}; + + +bool HexagonDCE::run() { + bool Collected = collect(); + if (!Collected) + return false; + + const SetVector<NodeId> &DeadNodes = getDeadNodes(); + const SetVector<NodeId> &DeadInstrs = getDeadInstrs(); + + typedef DenseMap<NodeId,NodeId> RefToInstrMap; + RefToInstrMap R2I; + SetVector<NodeId> PartlyDead; + DataFlowGraph &DFG = getDFG(); + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) { + NodeAddr<StmtNode*> SA = TA; + for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) { + R2I.insert(std::make_pair(RA.Id, SA.Id)); + if (DFG.IsDef(RA) && DeadNodes.count(RA.Id)) + if (!DeadInstrs.count(SA.Id)) + PartlyDead.insert(SA.Id); + } + } + } + + // Nodes to remove. + SetVector<NodeId> Remove = DeadInstrs; + + bool Changed = false; + for (NodeId N : PartlyDead) { + auto SA = DFG.addr<StmtNode*>(N); + if (trace()) + dbgs() << "Partly dead: " << *SA.Addr->getCode(); + Changed |= rewrite(SA, Remove); + } + + return erase(Remove) || Changed; +} + + +void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) { + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + + auto getOpNum = [MI] (MachineOperand &Op) -> unsigned { + for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) + if (&MI->getOperand(i) == &Op) + return i; + llvm_unreachable("Invalid operand"); + }; + DenseMap<NodeId,unsigned> OpMap; + NodeList Refs = IA.Addr->members(getDFG()); + for (NodeAddr<RefNode*> RA : Refs) + OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp()))); + + MI->RemoveOperand(OpNum); + + for (NodeAddr<RefNode*> RA : Refs) { + unsigned N = OpMap[RA.Id]; + if (N < OpNum) + RA.Addr->setRegRef(&MI->getOperand(N)); + else if (N > OpNum) + RA.Addr->setRegRef(&MI->getOperand(N-1)); + } +} + + +bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) { + if (!getDFG().IsCode<NodeAttrs::Stmt>(IA)) + return false; + DataFlowGraph &DFG = getDFG(); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII()); + if (HII.getAddrMode(MI) != HexagonII::PostInc) + return false; + unsigned Opc = MI->getOpcode(); + unsigned OpNum, NewOpc; + switch (Opc) { + case Hexagon::L2_loadri_pi: + NewOpc = Hexagon::L2_loadri_io; + OpNum = 1; + break; + case Hexagon::L2_loadrd_pi: + NewOpc = Hexagon::L2_loadrd_io; + OpNum = 1; + break; + case Hexagon::V6_vL32b_pi: + NewOpc = Hexagon::V6_vL32b_ai; + OpNum = 1; + break; + case Hexagon::S2_storeri_pi: + NewOpc = Hexagon::S2_storeri_io; + OpNum = 0; + break; + case Hexagon::S2_storerd_pi: + NewOpc = Hexagon::S2_storerd_io; + OpNum = 0; + break; + case Hexagon::V6_vS32b_pi: + NewOpc = Hexagon::V6_vS32b_ai; + OpNum = 0; + break; + default: + return false; + } + auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool { + return getDeadNodes().count(DA.Id); + }; + NodeList Defs; + MachineOperand &Op = MI->getOperand(OpNum); + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) { + if (&DA.Addr->getOp() != 
&Op) + continue; + Defs = DFG.getRelatedRefs(IA, DA); + if (!std::all_of(Defs.begin(), Defs.end(), IsDead)) + return false; + break; + } + + // Mark all nodes in Defs for removal. + for (auto D : Defs) + Remove.insert(D.Id); + + if (trace()) + dbgs() << "Rewriting: " << *MI; + MI->setDesc(HII.get(NewOpc)); + MI->getOperand(OpNum+2).setImm(0); + removeOperand(IA, OpNum); + if (trace()) + dbgs() << " to: " << *MI; + + return true; +} + + +bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { + if (RDFLimit.getPosition()) { + if (RDFCount >= RDFLimit) + return false; + RDFCount++; + } + + MDT = &getAnalysis<MachineDominatorTree>(); + const auto &MDF = getAnalysis<MachineDominanceFrontier>(); + const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + HexagonRegisterAliasInfo HAI(HRI); + TargetOperandInfo TOI(HII); + + if (RDFDump) + MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr); + DataFlowGraph G(MF, HII, HRI, *MDT, MDF, HAI, TOI); + G.build(); + if (RDFDump) { + dbgs() << PrintNode<FuncNode*>(G.getFunc(), G) << '\n'; + dbgs() << MF.getName() << '\n'; + } + + bool Changed; + CopyPropagation CP(G); + CP.trace(RDFDump); + Changed = CP.run(); + if (Changed) + G.build(); + + HexagonDCE DCE(G, *MRI); + DCE.trace(RDFDump); + Changed |= DCE.run(); + + if (Changed) { + Liveness LV(*MRI, G); + LV.trace(RDFDump); + LV.computeLiveIns(); + LV.resetLiveIns(); + LV.resetKills(); + } + + if (RDFDump) + MF.print(dbgs() << "After " << getPassName() << "\n", nullptr); + return false; +} + + +FunctionPass *llvm::createHexagonRDFOpt() { + return new HexagonRDFOpt(); +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index f6bb4a0..6e5f732 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -66,6 +66,8 @@ HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF) const { switch (HST.getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CallerSavedRegsV4; } llvm_unreachable( @@ -84,6 +86,8 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CalleeSavedRegsV3; } llvm_unreachable("Callee saved registers requested for unknown architecture " @@ -98,6 +102,8 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::R29); Reserved.set(Hexagon::R30); Reserved.set(Hexagon::R31); + Reserved.set(Hexagon::PC); + Reserved.set(Hexagon::GP); Reserved.set(Hexagon::D14); Reserved.set(Hexagon::D15); Reserved.set(Hexagon::LC0); @@ -116,62 +122,21 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; - MachineBasicBlock &MB = *MI.getParent(); MachineFunction &MF = *MB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HFI = *HST.getFrameLowering(); + unsigned BP = 0; int FI = MI.getOperand(FIOp).getIndex(); - int Offset 
= MFI.getObjectOffset(FI) + MI.getOperand(FIOp+1).getImm(); - bool HasAlloca = MFI.hasVarSizedObjects(); - bool HasAlign = needsStackRealignment(MF); - - // XXX: Fixed objects cannot be accessed through SP if there are aligned - // objects in the local frame, or if there are dynamically allocated objects. - // In such cases, there has to be FP available. - if (!HFI.hasFP(MF)) { - assert(!HasAlloca && !HasAlign && "This function must have frame pointer"); - // We will not reserve space on the stack for the lr and fp registers. - Offset -= 8; - } - - unsigned SP = getStackRegister(), FP = getFrameRegister(); - unsigned AP = 0; - if (MachineInstr *AI = HFI.getAlignaInstr(MF)) - AP = AI->getOperand(0).getReg(); - unsigned FrameSize = MFI.getStackSize(); - - // Special handling of dbg_value instructions and INLINEASM. - if (MI.isDebugValue() || MI.isInlineAsm()) { - MI.getOperand(FIOp).ChangeToRegister(SP, false /*isDef*/); - MI.getOperand(FIOp+1).ChangeToImmediate(Offset+FrameSize); - return; - } - - bool UseFP = false, UseAP = false; // Default: use SP. - if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { - UseFP = HasAlloca || HasAlign; - } else { - if (HasAlloca) { - if (HasAlign) - UseAP = true; - else - UseFP = true; - } - } + // Select the base pointer (BP) and calculate the actual offset from BP + // to the beginning of the object at index FI. + int Offset = HFI.getFrameIndexReference(MF, FI, BP); + // Add the offset from the instruction. + int RealOffset = Offset + MI.getOperand(FIOp+1).getImm(); unsigned Opc = MI.getOpcode(); - bool ValidSP = HII.isValidOffset(Opc, FrameSize+Offset); - bool ValidFP = HII.isValidOffset(Opc, Offset); - - // Calculate the actual offset in the instruction. - int64_t RealOffset = Offset; - if (!UseFP && !UseAP) - RealOffset = FrameSize+Offset; - switch (Opc) { case Hexagon::TFR_FIA: MI.setDesc(HII.get(Hexagon::A2_addi)); @@ -184,20 +149,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, break; } - unsigned BP = 0; - bool Valid = false; - if (UseFP) { - BP = FP; - Valid = ValidFP; - } else if (UseAP) { - BP = AP; - Valid = ValidFP; - } else { - BP = SP; - Valid = ValidSP; - } - - if (Valid) { + if (HII.isValidOffset(Opc, RealOffset)) { MI.getOperand(FIOp).ChangeToRegister(BP, false); MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset); return; @@ -223,8 +175,8 @@ unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) - return Hexagon::R30; - return Hexagon::R29; + return getFrameRegister(); + return getStackRegister(); } @@ -238,17 +190,9 @@ unsigned HexagonRegisterInfo::getStackRegister() const { } -bool -HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const HexagonFrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF); -} - - -bool -HexagonRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getMaxAlignment() > 8; +bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) + const { + return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h index 7edefee..db7e0f2 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -63,8 +63,6 @@ public: return 
true; } - bool needsStackRealignment(const MachineFunction &MF) const override; - /// Returns true if the frame pointer is valid. bool useFPForScavengingIndex(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index edf1c25..81629dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -53,6 +53,12 @@ let Namespace = "Hexagon" in { let Num = num; } + + // Rq - vector predicate registers + class Rq<bits<3> num, string n> : Register<n, []> { + let HWEncoding{2-0} = num; + } + // Rc - control registers class Rc<bits<5> num, string n, list<string> alt = [], list<Register> alias = []> : @@ -131,20 +137,21 @@ let Namespace = "Hexagon" in { def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>; def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>, DwarfRegNum<[71]>; - def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[72]>; - def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[73]>; + def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use + def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>; + def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>; - def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[74]> { + def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> { let SubRegIndices = [subreg_overflow]; let SubRegs = [USR_OVF]; } - def PC : Rc<9, "pc">, DwarfRegNum<[75]>; - def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[76]>; - def GP : Rc<11, "gp">, DwarfRegNum<[77]>; - def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[78]>; - def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[79]>; - def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[80]>; - def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[81]>; + def PC : Rc<9, "pc">, DwarfRegNum<[76]>; + def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>; + def GP : Rc<11, "gp">, DwarfRegNum<[78]>; + def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>; + def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>; + def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>; + def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>; } // Control registers pairs. @@ -158,6 +165,36 @@ let Namespace = "Hexagon" in { def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>; } + foreach i = 0-31 in { + def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>; + } + + // Aliases of the V* registers used to hold double vec values. + let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in { + def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>; + def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>; + def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>; + def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>; + def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; + } + + // Vector Predicate registers. 
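+  // (Encoding sketch, for illustration: Rq<n, name> above stores n in
+  //  HWEncoding{2-0}, so q0..q3 encode as 0..3. The VecPredRegs and
+  //  VecPredRegs128B classes further below model these registers as
+  //  v512i1 and v1024i1 for the 64-byte and 128-byte HVX modes.)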
+ def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>; + def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>; + def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>; + def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -169,10 +206,34 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, R10, R11, R29, R30, R31)> { } +// Registers are listed in reverse order for allocation preference reasons. +def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, + (add R7, R6, R5, R4, R3, R2, R1, R0)> ; + def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; +def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "W%u", 0, 15))>; + +def VectorRegs128B : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs128B : RegisterClass<"Hexagon", + [v256i8,v128i16,v64i32,v32i64], 2048, + (add (sequence "W%u", 0, 15))>; + +def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512, + (add (sequence "Q%u", 0, 3))>; + +def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024, + (add (sequence "Q%u", 0, 3))>; def PredRegs : RegisterClass<"Hexagon", [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp deleted file mode 100644 index 7069ad3..0000000 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends // -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Pass that removes sign extends for function parameters. 
These parameters
-// are already sign extended by the caller per Hexagon's ABI
-//
-//===----------------------------------------------------------------------===//
-
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-namespace llvm {
-  FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
-  void initializeHexagonRemoveExtendArgsPass(PassRegistry&);
-}
-
-namespace {
-  struct HexagonRemoveExtendArgs : public FunctionPass {
-  public:
-    static char ID;
-    HexagonRemoveExtendArgs() : FunctionPass(ID) {
-      initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
-    }
-    bool runOnFunction(Function &F) override;
-
-    const char *getPassName() const override {
-      return "Remove sign extends";
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineFunctionAnalysis>();
-      AU.addPreserved<MachineFunctionAnalysis>();
-      AU.addPreserved<StackProtector>();
-      FunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-char HexagonRemoveExtendArgs::ID = 0;
-
-INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs",
-                "Remove Sign and Zero Extends for Args", false, false)
-
-bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
-  unsigned Idx = 1;
-  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
-       ++AI, ++Idx) {
-    if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
-      Argument* Arg = AI;
-      if (!isa<PointerType>(Arg->getType())) {
-        for (auto UI = Arg->user_begin(); UI != Arg->user_end();) {
-          if (isa<SExtInst>(*UI)) {
-            Instruction* I = cast<Instruction>(*UI);
-            SExtInst* SI = new SExtInst(Arg, I->getType());
-            assert (EVT::getEVT(SI->getType()) ==
-                    (EVT::getEVT(I->getType())));
-            ++UI;
-            I->replaceAllUsesWith(SI);
-            Instruction* First = F.getEntryBlock().begin();
-            SI->insertBefore(First);
-            I->eraseFromParent();
-          } else {
-            ++UI;
-          }
-        }
-      }
-    }
-  }
-  return true;
-}
-
-
-
-FunctionPass*
-llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) {
-  return new HexagonRemoveExtendArgs();
-}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
index 528cafc..6e4987b 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -13,6 +13,12 @@
 include "HexagonScheduleV4.td"
 
+// V55 Machine Info
+
+include "HexagonScheduleV55.td"
+
 //===----------------------------------------------------------------------===//
-// V4 Machine Info -
+// V60 Machine Info -
 //===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV60.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
index a7d2d47..67af147 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -35,10 +35,11 @@ def SLOT_ENDLOOP: FuncUnit;
 
 // Itinerary classes.
 def PSEUDO : InstrItinClass;
-def PSEUDOM : InstrItinClass;
+def PSEUDOM : InstrItinClass; // ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
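+// (Reading aid: an InstrItinClass is only a named handle. A
+// ProcessorItineraries definition later binds each class to concrete
+// pipeline stages, e.g.
+//   InstrItinData<COMPOUND, [InstrStage<1, [SLOT2, SLOT3]>]>
+// says a COMPOUND instruction occupies slot 2 or 3 for one cycle.)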
 def DUPLEX : InstrItinClass;
 def PREFIX : InstrItinClass;
+def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass;
 def COMPOUND : InstrItinClass;
 
 def ALU32_2op_tc_1_SLOT0123 : InstrItinClass;
@@ -58,6 +59,7 @@ def CR_tc_2early_SLOT3 : InstrItinClass;
 def CR_tc_3x_SLOT23 : InstrItinClass;
 def CR_tc_3x_SLOT3 : InstrItinClass;
 def J_tc_2early_SLOT23 : InstrItinClass;
+def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
 def J_tc_2early_SLOT2 : InstrItinClass;
 def LD_tc_ld_SLOT01 : InstrItinClass;
 def LD_tc_ld_SLOT0 : InstrItinClass;
@@ -91,6 +93,7 @@ def V4LDST_tc_st_SLOT0 : InstrItinClass;
 def V4LDST_tc_st_SLOT01 : InstrItinClass;
 def J_tc_2early_SLOT0123 : InstrItinClass;
 def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
+def S_3op_tc_3stall_SLOT23 : InstrItinClass;
 
 def HexagonItinerariesV4 :
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
new file mode 100644
index 0000000..d9ad25d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -0,0 +1,170 @@
+//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in the Hexagon V55 machine.
+// This file describes that machine information.
+
+//
+// |===========|==================================================|
+// | PIPELINE  | Instruction Classes                              |
+// |===========|==================================================|
+// | SLOT0     | LD  ST  ALU32  MEMOP  NV  SYSTEM                 |
+// |-----------|--------------------------------------------------|
+// | SLOT1     | LD  ST  ALU32                                    |
+// |-----------|--------------------------------------------------|
+// | SLOT2     | XTYPE  ALU32  J  JR                              |
+// |-----------|--------------------------------------------------|
+// | SLOT3     | XTYPE  ALU32  J  CR                              |
+// |===========|==================================================|
+
+def CJ_tc_1_SLOT23 : InstrItinClass;
+def CJ_tc_2early_SLOT23 : InstrItinClass;
+def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
+def COPROC_VX_vtc_long_SLOT23 : InstrItinClass;
+def COPROC_VX_vtc_SLOT23 : InstrItinClass;
+def J_tc_3stall_SLOT2 : InstrItinClass;
+def MAPPING_tc_1_SLOT0123 : InstrItinClass;
+def M_tc_3stall_SLOT23 : InstrItinClass;
+def SUBINSN_tc_1_SLOT01 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT0 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT01 : InstrItinClass;
+def SUBINSN_tc_3stall_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT01 : InstrItinClass;
+def SUBINSN_tc_st_SLOT01 : InstrItinClass;
+
+def HexagonItinerariesV55 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , 
[InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        // Endloop
+        InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+        // Vector
+        InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+                      [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+                      [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<COPROC_VX_vtc_SLOT23 ,
+                      [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<MAPPING_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // Misc
+        InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>]>
+
+      ]>;
+
+def HexagonModelV55 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV55;
+  let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V55 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
new file mode 100644
index 0000000..2ccff82
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -0,0 +1,310 @@
+//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+def CVI_ST : FuncUnit;
+def CVI_XLANE : FuncUnit;
+def CVI_SHIFT : FuncUnit;
+def CVI_MPY0 : FuncUnit;
+def CVI_MPY1 : FuncUnit;
+def CVI_LD : FuncUnit;
+
+// Combined functional units.
+def CVI_XLSHF : FuncUnit;
+def CVI_MPY01 : FuncUnit;
+def CVI_ALL : FuncUnit;
+
+// Combined functional unit data.
+def HexagonComboFuncsV60 :
+    ComboFuncUnits<[
+      ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
+      ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
+      ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+                               CVI_MPY0, CVI_MPY1, CVI_LD]>
+    ]>;
+
+// Note: When adding additional vector scheduling classes, add the
+// corresponding methods to the class HexagonInstrInfo.
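+// (Naming sketch, inferred from the units these classes map to below:
+// VA = vector ALU, VX = vector multiply, VP = permute/lane-crossing
+// (XLANE), VS = shift, VM = vector memory; the _DV suffix marks
+// double-vector operations that occupy a combined unit such as
+// CVI_MPY01 or CVI_XLSHF.)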
+def CVI_VA : InstrItinClass; +def CVI_VA_DV : InstrItinClass; +def CVI_VX_LONG : InstrItinClass; +def CVI_VX_LATE : InstrItinClass; +def CVI_VX : InstrItinClass; +def CVI_VX_DV_LONG : InstrItinClass; +def CVI_VX_DV : InstrItinClass; +def CVI_VX_DV_SLOT2 : InstrItinClass; +def CVI_VP : InstrItinClass; +def CVI_VP_LONG : InstrItinClass; +def CVI_VP_VS_EARLY : InstrItinClass; +def CVI_VP_VS_LONG_EARLY : InstrItinClass; +def CVI_VP_VS_LONG : InstrItinClass; +def CVI_VP_VS : InstrItinClass; +def CVI_VP_DV : InstrItinClass; +def CVI_VS : InstrItinClass; +def CVI_VINLANESAT : InstrItinClass; +def CVI_VM_LD : InstrItinClass; +def CVI_VM_TMP_LD : InstrItinClass; +def CVI_VM_CUR_LD : InstrItinClass; +def CVI_VM_VP_LDU : InstrItinClass; +def CVI_VM_ST : InstrItinClass; +def CVI_VM_NEW_ST : InstrItinClass; +def CVI_VM_STU : InstrItinClass; +def CVI_HIST : InstrItinClass; +def CVI_VA_EXT : InstrItinClass; + +// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine. +// This file describes that machine information. +// +// |===========|==================================================| +// | PIPELINE | Instruction Classes | +// |===========|==================================================| +// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM | +// |-----------|--------------------------------------------------| +// | SLOT1 | LD ST ALU32 | +// |-----------|--------------------------------------------------| +// | SLOT2 | XTYPE ALU32 J JR | +// |-----------|--------------------------------------------------| +// | SLOT3 | XTYPE ALU32 J CR | +// |===========|==================================================| +// +// +// In addition to using the above SLOTS, there are also six vector pipelines +// in the CVI co-processor in the Hexagon V60 machine. +// +// |=========| |=========| |=========| |=========| |=========| |=========| +// SLOT | CVI_LD | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST | +// ==== |=========| |=========| |=========| |=========| |=========| |=========| +// S0-3 | | | CVI_VA | | CVI_VA | | CVI_VA | | CVI_VA | | | +// S2-3 | | | CVI_VX | | CVI_VX | | | | | | | +// S0-3 | | | | | | | | | CVI_VP | | | +// S0-3 | | | | | | | CVI_VS | | | | | +// S0-1 |(CVI_LD) | | CVI_LD | | CVI_LD | | CVI_LD | | CVI_LD | | | +// S0-1 |(C*TMP_LD) | | | | | | | | | | +// S01 |(C*_LDU) | | | | | | | | C*_LDU | | | +// S0 | | | CVI_ST | | CVI_ST | | CVI_ST | | CVI_ST | |(CVI_ST) | +// S0 | | | | | | | | | | |(C*TMP_ST) +// S01 | | | | | | | | | VSTU | |(C*_STU) | +// |=========| |=========| |=========| |=========| |=========| |=========| +// |=====================| |=====================| +// | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT| +// |=====================| |=====================| +// S0-3 | CVI_VA_DV | | CVI_VA_DV | +// S0-3 | | | CVI_VP_DV | +// S2-3 | CVI_VX_DV | | | +// |=====================| |=====================| +// |=====================================================================| +// S0-3 | CVI_HIST Histogram | +// S0123| CVI_VA_EXT Extract | +// |=====================================================================| + +def HexagonItinerariesV60 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [ + // ALU32 + InstrItinData<ALU32_2op_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_2op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_1_SLOT0123 , + [InstrStage<1, 
[SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2_SLOT0123 , + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_ADDI_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // ALU64 + InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1, + [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60. 
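+  // (Concretely: the entry below uses InstrStage<4, [SLOT2, SLOT3]>,
+  //  where the V55 itinerary earlier in this patch models the same
+  //  class with a 3-cycle stage.)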
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // Endloop + InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>, + + // Vector + InstrItinData<COPROC_VMEM_vtc_long_SLOT01, + [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<COPROC_VX_vtc_long_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<COPROC_VX_vtc_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<MAPPING_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Duplex and Compound + InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>, + InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>, + // Misc + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [SLOT2, SLOT3]>]>, + + // Latest CVI spec definitions. 
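+  // (Reading note on the entries below: each lists an InstrStage that
+  //  reserves a packet slot with a cycle increment of 0, so the next
+  //  InstrStage claims a CVI functional unit in the same cycle. CVI_VA,
+  //  for example, may issue from any slot and run on any of the four
+  //  VA-capable vector pipes.)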
+ InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE,CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VA_DV, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>, + InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_DV_LONG, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV_SLOT2, + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_VS_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VINLANESAT, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>]>, + InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [CVI_ST]>]>, + InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_ALL]>]> + ]>; + +def HexagonModelV60 : SchedMachineModel { + // Max issue per cycle == bundle width. 
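+  // (These settings match the V55 model above: four packet slots per
+  //  cycle, with per-class latencies carried by the itinerary stages
+  //  rather than by LoadLatency.)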
+ let IssueWidth = 4; + let Itineraries = HexagonItinerariesV60; + let LoadLatency = 1; +} + +//===----------------------------------------------------------------------===// +// Hexagon V60 Resource Definitions - +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 276cc69..239dbda 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -12,12 +12,11 @@ //===----------------------------------------------------------------------===// #include "HexagonTargetMachine.h" +#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; #define DEBUG_TYPE "hexagon-selectiondag-info" -bool llvm::flag_aligned_memcpy; - SDValue HexagonSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -25,15 +24,40 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - flag_aligned_memcpy = false; - if ((Align & 0x3) == 0) { - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - if (ConstantSize) { - uint64_t SizeVal = ConstantSize->getZExtValue(); - if ((SizeVal > 32) && ((SizeVal % 8) == 0)) - flag_aligned_memcpy = true; - } - } - - return SDValue(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize) + return SDValue(); + + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (SizeVal < 32 || (SizeVal % 8) != 0) + return SDValue(); + + // Special case aligned memcpys with size >= 32 bytes and a multiple of 8. 
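+  // (What follows builds a libcall instead of expanding the copy
+  //  inline: the memcpy operands are packed into an ArgListTy and
+  //  lowered with TLI.LowerCallTo as a call to the runtime routine
+  //  named below, which presumably has a memcpy-like C signature along
+  //  the lines of
+  //    void __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes(
+  //        void *dest, const void *src, unsigned long n);
+  //  The call result is discarded, so only the output chain is used.)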
+ // + const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Args.push_back(Entry); + Entry.Node = Src; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + + const char *SpecialMemcpyName = + "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), + Type::getVoidTy(*DAG.getContext()), + DAG.getTargetExternalSymbol( + SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index d3eb56f..10fe606 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -81,7 +81,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block MachineBasicBlock::iterator MII = MBB->begin(); MachineBasicBlock::iterator MIE = MBB->end (); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp new file mode 100644 index 0000000..d4e95b0d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -0,0 +1,1209 @@ +//===--- HexagonSplitDouble.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hsdr" + +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" + +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonSplitDoubleRegs(); + void initializeHexagonSplitDoubleRegsPass(PassRegistry&); +} + +namespace { + static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1), + cl::desc("Maximum number of split partitions")); + static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true), + cl::desc("Do not split loads or stores")); + + class HexagonSplitDoubleRegs : public MachineFunctionPass { + public: + static char ID; + HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr), + TII(nullptr) { + initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon Split Double Registers"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + static const TargetRegisterClass *const DoubleRC; + + const HexagonRegisterInfo *TRI; + const HexagonInstrInfo *TII; + const MachineLoopInfo *MLI; + MachineRegisterInfo *MRI; + + typedef std::set<unsigned> USet; + typedef std::map<unsigned,USet> UUSetMap; + typedef std::pair<unsigned,unsigned> UUPair; + typedef std::map<unsigned,UUPair> UUPairMap; + typedef std::map<const MachineLoop*,USet> LoopRegMap; + + bool isInduction(unsigned Reg, LoopRegMap &IRM) const; + bool isVolatileInstr(const MachineInstr *MI) const; + bool isFixedInstr(const MachineInstr *MI) const; + void partitionRegisters(UUSetMap &P2Rs); + int32_t profit(const MachineInstr *MI) const; + bool isProfitable(const USet &Part, LoopRegMap &IRM) const; + + void collectIndRegsForLoop(const MachineLoop *L, USet &Rs); + void collectIndRegs(LoopRegMap &IRM); + + void createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR); + void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap); + void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap); + void splitCombine(MachineInstr *MI, const UUPairMap &PairMap); + void splitExt(MachineInstr *MI, const UUPairMap &PairMap); + void splitShift(MachineInstr *MI, const UUPairMap &PairMap); + void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap); + bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap); + void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap); + void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap); + bool splitPartition(const USet &Part); + + static int Counter; + static void dump_partition(raw_ostream&, const USet&, + const TargetRegisterInfo&); + }; + char HexagonSplitDoubleRegs::ID; + int HexagonSplitDoubleRegs::Counter = 0; + const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC + = &Hexagon::DoubleRegsRegClass; +} + 
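+// (High-level flow, as a reading aid: collectIndRegs finds 64-bit loop
+// induction registers, partitionRegisters groups double vregs into
+// connected partitions, profit/isProfitable score each partition, and
+// splitPartition rewrites profitable partitions into 32-bit halves via
+// the split* helpers, charging for any REG_SEQUENCE needed at the
+// remaining fixed uses.)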
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double", + "Hexagon Split Double Registers", false, false) + + +static inline uint32_t getRegState(const MachineOperand &R) { + assert(R.isReg()); + return getDefRegState(R.isDef()) | + getImplRegState(R.isImplicit()) | + getKillRegState(R.isKill()) | + getDeadRegState(R.isDead()) | + getUndefRegState(R.isUndef()) | + getInternalReadRegState(R.isInternalRead()) | + (R.isDebug() ? RegState::Debug : 0); +} + + +void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os, + const USet &Part, const TargetRegisterInfo &TRI) { + dbgs() << '{'; + for (auto I : Part) + dbgs() << ' ' << PrintReg(I, &TRI); + dbgs() << " }"; +} + + +bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const { + for (auto I : IRM) { + const USet &Rs = I.second; + if (Rs.find(Reg) != Rs.end()) + return true; + } + return false; +} + + +bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const { + for (auto &I : MI->memoperands()) + if (I->isVolatile()) + return true; + return false; +} + + +bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const { + if (MI->mayLoad() || MI->mayStore()) + if (MemRefsFixed || isVolatileInstr(MI)) + return true; + if (MI->isDebugValue()) + return false; + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + return true; + + case TargetOpcode::PHI: + case TargetOpcode::COPY: + break; + + case Hexagon::L2_loadrd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(1).isReg()) + break; + return true; + case Hexagon::S2_storerd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(0).isReg()) + break; + return true; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + + case Hexagon::A2_tfrpi: + case Hexagon::A2_combineii: + case Hexagon::A4_combineir: + case Hexagon::A4_combineii: + case Hexagon::A4_combineri: + case Hexagon::A2_combinew: + case Hexagon::CONST64_Int_Real: + + case Hexagon::A2_sxtw: + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + case Hexagon::S2_asl_i_p_or: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + break; + } + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + return true; + } + return false; +} + + +void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { + typedef std::map<unsigned,unsigned> UUMap; + typedef std::vector<unsigned> UVect; + + unsigned NumRegs = MRI->getNumVirtRegs(); + BitVector DoubleRegs(NumRegs); + for (unsigned i = 0; i < NumRegs; ++i) { + unsigned R = TargetRegisterInfo::index2VirtReg(i); + if (MRI->getRegClass(R) == DoubleRC) + DoubleRegs.set(i); + } + + BitVector FixedRegs(NumRegs); + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + MachineInstr *DefI = MRI->getVRegDef(R); + // In some cases a register may exist, but never be defined or used. + // It should never appear anywhere, but mark it as "fixed", just to be + // safe. 
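+    // ("Fixed" here and below means the register is defined or used by
+    //  an instruction this pass will not rewrite; fixed registers land
+    //  in partition 0, which is never considered for splitting.)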
+ if (!DefI || isFixedInstr(DefI)) + FixedRegs.set(x); + } + + UUSetMap AssocMap; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + if (FixedRegs[x]) + continue; + unsigned R = TargetRegisterInfo::index2VirtReg(x); + DEBUG(dbgs() << PrintReg(R, TRI) << " ~~"); + USet &Asc = AssocMap[R]; + for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); + U != Z; ++U) { + MachineOperand &Op = *U; + MachineInstr *UseI = Op.getParent(); + if (isFixedInstr(UseI)) + continue; + for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) { + MachineOperand &MO = UseI->getOperand(i); + // Skip non-registers or registers with subregisters. + if (&MO == &Op || !MO.isReg() || MO.getSubReg()) + continue; + unsigned T = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(T)) { + FixedRegs.set(x); + continue; + } + if (MRI->getRegClass(T) != DoubleRC) + continue; + unsigned u = TargetRegisterInfo::virtReg2Index(T); + if (FixedRegs[u]) + continue; + DEBUG(dbgs() << ' ' << PrintReg(T, TRI)); + Asc.insert(T); + // Make it symmetric. + AssocMap[T].insert(R); + } + } + DEBUG(dbgs() << '\n'); + } + + UUMap R2P; + unsigned NextP = 1; + USet Visited; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + if (Visited.count(R)) + continue; + // Create a new partition for R. + unsigned ThisP = FixedRegs[x] ? 0 : NextP++; + UVect WorkQ; + WorkQ.push_back(R); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + unsigned T = WorkQ[i]; + if (Visited.count(T)) + continue; + R2P[T] = ThisP; + Visited.insert(T); + // Add all registers associated with T. + USet &Asc = AssocMap[T]; + for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J) + WorkQ.push_back(*J); + } + } + + for (auto I : R2P) + P2Rs[I.second].insert(I.first); +} + + +static inline int32_t profitImm(unsigned Lo, unsigned Hi) { + int32_t P = 0; + bool LoZ1 = false, HiZ1 = false; + if (Lo == 0 || Lo == 0xFFFFFFFF) + P += 10, LoZ1 = true; + if (Hi == 0 || Hi == 0xFFFFFFFF) + P += 10, HiZ1 = true; + if (!LoZ1 && !HiZ1 && Lo == Hi) + P += 3; + return P; +} + + +int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { + unsigned ImmX = 0; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::PHI: + for (const auto &Op : MI->operands()) + if (!Op.getSubReg()) + return 0; + return 10; + case TargetOpcode::COPY: + if (MI->getOperand(1).getSubReg() != 0) + return 10; + return 0; + + case Hexagon::L2_loadrd_io: + case Hexagon::S2_storerd_io: + return -1; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + return 2; + + case Hexagon::A2_tfrpi: + case Hexagon::CONST64_Int_Real: { + uint64_t D = MI->getOperand(1).getImm(); + unsigned Lo = D & 0xFFFFFFFFULL; + unsigned Hi = D >> 32; + return profitImm(Lo, Hi); + } + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + return profitImm(MI->getOperand(1).getImm(), + MI->getOperand(2).getImm()); + case Hexagon::A4_combineri: + ImmX++; + case Hexagon::A4_combineir: { + ImmX++; + int64_t V = MI->getOperand(ImmX).getImm(); + if (V == 0 || V == -1) + return 10; + // Fall through into A2_combinew. 
+ } + case Hexagon::A2_combinew: + return 2; + + case Hexagon::A2_sxtw: + return 3; + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + return 1; + + case Hexagon::S2_asl_i_p_or: { + unsigned S = MI->getOperand(3).getImm(); + if (S == 0 || S == 32) + return 10; + return -1; + } + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + unsigned S = MI->getOperand(2).getImm(); + if (S == 0 || S == 32) + return 10; + if (S == 16) + return 5; + if (S == 48) + return 7; + return -10; + } + + return 0; +} + + +bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM) + const { + unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0; + int32_t TotalP = 0; + + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + int32_t P = profit(DefI); + if (P == INT_MIN) + return false; + TotalP += P; + // Reduce the profitability of splitting induction registers. + if (isInduction(DR, IRM)) + TotalP -= 30; + + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) { + MachineInstr *UseI = U->getParent(); + if (isFixedInstr(UseI)) { + FixedNum++; + // Calculate the cost of generating REG_SEQUENCE instructions. + for (auto &Op : UseI->operands()) { + if (Op.isReg() && Part.count(Op.getReg())) + if (Op.getSubReg()) + TotalP -= 2; + } + continue; + } + // If a register from this partition is used in a fixed instruction, + // and there is also a register in this partition that is used in + // a loop phi node, then decrease the splitting profit as this can + // confuse the modulo scheduler. + if (UseI->isPHI()) { + const MachineBasicBlock *PB = UseI->getParent(); + const MachineLoop *L = MLI->getLoopFor(PB); + if (L && L->getHeader() == PB) + LoopPhiNum++; + } + // Splittable instruction. + SplitNum++; + int32_t P = profit(UseI); + if (P == INT_MIN) + return false; + TotalP += P; + } + } + + if (FixedNum > 0 && LoopPhiNum > 0) + TotalP -= 20*LoopPhiNum; + + DEBUG(dbgs() << "Partition profit: " << TotalP << '\n'); + return TotalP > 0; +} + + +void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, + USet &Rs) { + const MachineBasicBlock *HB = L->getHeader(); + const MachineBasicBlock *LB = L->getLoopLatch(); + if (!HB || !LB) + return; + + // Examine the latch branch. Expect it to be a conditional branch to + // the header (either "br-cond header" or "br-cond exit; br header"). + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB); + SmallVector<MachineOperand,2> Cond; + bool BadLB = TII->AnalyzeBranch(*TmpLB, TB, FB, Cond, false); + // Only analyzable conditional branches. HII::AnalyzeBranch will put + // the branch opcode as the first element of Cond, and the predicate + // operand as the second. + if (BadLB || Cond.size() != 2) + return; + // Only simple jump-conditional (with or without negation). + if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm())) + return; + // Must go to the header. + if (TB != HB && FB != HB) + return; + assert(Cond[1].isReg() && "Unexpected Cond vector from AnalyzeBranch"); + // Expect a predicate register. + unsigned PR = Cond[1].getReg(); + assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass); + + // Get the registers on which the loop controlling compare instruction + // depends. 
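+  // (The expected shape, roughly:
+  //     vN = A2_addp phi, ...     ; 64-bit induction update
+  //     pM = cmp(vN, ...)         ; loop-controlling compare
+  //     if (pM) jump header
+  //  CmpR1/CmpR2 below receive the compare's register operands.)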
+ unsigned CmpR1 = 0, CmpR2 = 0; + const MachineInstr *CmpI = MRI->getVRegDef(PR); + while (CmpI->getOpcode() == Hexagon::C2_not) + CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg()); + + int Mask = 0, Val = 0; + bool OkCI = TII->analyzeCompare(CmpI, CmpR1, CmpR2, Mask, Val); + if (!OkCI) + return; + // Eliminate non-double input registers. + if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC) + CmpR1 = 0; + if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC) + CmpR2 = 0; + if (!CmpR1 && !CmpR2) + return; + + // Now examine the top of the loop: the phi nodes that could poten- + // tially define loop induction registers. The registers defined by + // such a phi node would be used in a 64-bit add, which then would + // be used in the loop compare instruction. + + // Get the set of all double registers defined by phi nodes in the + // loop header. + typedef std::vector<unsigned> UVect; + UVect DP; + for (auto &MI : *HB) { + if (!MI.isPHI()) + break; + const MachineOperand &MD = MI.getOperand(0); + unsigned R = MD.getReg(); + if (MRI->getRegClass(R) == DoubleRC) + DP.push_back(R); + } + if (DP.empty()) + return; + + auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool { + for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end(); + I != E; ++I) { + const MachineInstr *UseI = I->getParent(); + if (UseI->getOpcode() != Hexagon::A2_addp) + continue; + // Get the output from the add. If it is one of the inputs to the + // loop-controlling compare instruction, then R is likely an induc- + // tion register. + unsigned T = UseI->getOperand(0).getReg(); + if (T == CmpR1 || T == CmpR2) + return false; + } + return true; + }; + UVect::iterator End = std::remove_if(DP.begin(), DP.end(), NoIndOp); + Rs.insert(DP.begin(), End); + Rs.insert(CmpR1); + Rs.insert(CmpR2); + + DEBUG({ + dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: "; + dump_partition(dbgs(), Rs, *TRI); + dbgs() << '\n'; + }); +} + + +void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) { + typedef std::vector<MachineLoop*> LoopVector; + LoopVector WorkQ; + + for (auto I : *MLI) + WorkQ.push_back(I); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + for (auto I : *WorkQ[i]) + WorkQ.push_back(I); + } + + USet Rs; + for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) { + MachineLoop *L = WorkQ[i]; + Rs.clear(); + collectIndRegsForLoop(L, Rs); + if (!Rs.empty()) + IRM.insert(std::make_pair(L, Rs)); + } +} + + +void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewI = BuildMI(B, MI, DL, TII->get(Opc)); + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) { + NewI->addOperand(Op); + continue; + } + // For register operands, set the subregister. + unsigned R = Op.getReg(); + unsigned SR = Op.getSubReg(); + bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R); + bool isKill = Op.isKill(); + if (isVirtReg && MRI->getRegClass(R) == DoubleRC) { + isKill = false; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) { + SR = SubR; + } else { + const UUPair &P = F->second; + R = (SubR == Hexagon::subreg_loreg) ? 
P.first : P.second; + SR = 0; + } + } + auto CO = MachineOperand::CreateReg(R, Op.isDef(), Op.isImplicit(), isKill, + Op.isDead(), Op.isUndef(), Op.isEarlyClobber(), SR, Op.isDebug(), + Op.isInternalRead()); + NewI->addOperand(CO); + } +} + + +void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI, + const UUPairMap &PairMap) { + bool Load = MI->mayLoad(); + unsigned OrigOpc = MI->getOpcode(); + bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi || + OrigOpc == Hexagon::S2_storerd_pi); + MachineInstr *LowI, *HighI; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Index of the base-address-register operand. + unsigned AdrX = PostInc ? (Load ? 2 : 1) + : (Load ? 1 : 0); + MachineOperand &AdrOp = MI->getOperand(AdrX); + unsigned RSA = getRegState(AdrOp); + MachineOperand &ValOp = Load ? MI->getOperand(0) + : (PostInc ? MI->getOperand(3) + : MI->getOperand(2)); + UUPairMap::const_iterator F = PairMap.find(ValOp.getReg()); + assert(F != PairMap.end()); + + if (Load) { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4); + } else { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off) + .addReg(P.first); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4) + .addReg(P.second); + } + + if (PostInc) { + // Create the increment of the address register. + int64_t Inc = Load ? MI->getOperand(3).getImm() + : MI->getOperand(2).getImm(); + MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0); + const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg()); + unsigned NewR = MRI->createVirtualRegister(RC); + assert(!UpdOp.getSubReg() && "Def operand with subreg"); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR) + .addReg(AdrOp.getReg(), RSA) + .addImm(Inc); + MRI->replaceRegWith(UpdOp.getReg(), NewR); + // The original instruction will be deleted later. + } + + // Generate a new pair of memory-operands. + MachineFunction &MF = *B.getParent(); + for (auto &MO : MI->memoperands()) { + const MachinePointerInfo &Ptr = MO->getPointerInfo(); + unsigned F = MO->getFlags(); + int A = MO->getAlignment(); + + auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A); + LowI->addMemOperand(MF, Tmp1); + auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4)); + HighI->addMemOperand(MF, Tmp2); + } +} + + +void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isImm()); + uint64_t V = Op1.getImm(); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + // The operand to A2_tfrsi can only have 32 significant bits. 
Immediate + // values in MachineOperand are stored as 64-bit integers, and so the + // value -1 may be represented either as 64-bit -1, or 4294967295. Both + // will have the 32 higher bits truncated in the end, but -1 will remain + // as -1, while the latter may appear to be a large unsigned value + // requiring a constant extender. The casting to int32_t will select the + // former representation. (The same reasoning applies to all 32-bit + // values.) + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(int32_t(V & 0xFFFFFFFFULL)); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(int32_t(V >> 32)); +} + + +void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + if (Op1.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(Op1.getImm()); + } else if (Op1.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second) + .addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); + + if (Op2.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(Op2.getImm()); + } else if (Op2.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); +} + + +void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned RS = getRegState(Op1); + + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op1.getReg(), RS & ~RegState::Kill, Op1.getSubReg()); + BuildMI(B, MI, DL, TII->get(Hexagon::S2_asr_i_r), P.second) + .addReg(Op1.getReg(), RS, Op1.getSubReg()) + .addImm(31); +} + + +void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg() && Op1.isReg() && Op2.isImm()); + int64_t Sh64 = Op2.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + unsigned Opc = MI->getOpcode(); + bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p); + bool Left = !Right; + bool Signed = (Opc == S2_asr_i_p); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS = getRegState(Op1); + unsigned ShiftOpc = Left ? S2_asl_i_r + : (Signed ? S2_asr_i_r : S2_lsr_i_r); + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + if (S == 0) { + // No shift, subregister copy. 
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR) + .addReg(Op1.getReg(), RS, HiSR); + } else if (S < 32) { + const TargetRegisterClass *IntRC = &IntRegsRegClass; + unsigned TmpR = MRI->createVirtualRegister(IntRC); + // Expansion: + // Shift left: DR = shl R, #s + // LoR = shl R.lo, #s + // TmpR = extractu R.lo, #s, #32-s + // HiR = or (TmpR, asl(R.hi, #s)) + // Shift right: DR = shr R, #s + // HiR = shr R.hi, #s + // TmpR = shr R.lo, #s + // LoR = insert TmpR, R.hi, #s, #32-s + + // Shift left: + // LoR = shl R.lo, #s + // Shift right: + // TmpR = shr R.lo, #s + + // Make a special case for A2_aslh and A2_asrh (they are predicable as + // opposed to S2_asl_i_r/S2_asr_i_r). + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S); + + if (Left) { + // TmpR = extractu R.lo, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + // HiR = or (TmpR, asl(R.hi, #s)) + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S); + } else { + // HiR = shr R.hi, #s + BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR) + .addImm(S); + // LoR = insert TmpR, R.hi, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_insert), LoR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S) + .addImm(32-S); + } + } else if (S == 32) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)); + if (!Signed) + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR)) + .addImm(0); + else // Must be right shift. + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + } else if (S < 64) { + S -= 32; + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)) + .addImm(S); + + if (Signed) + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + else + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? 
LoR : HiR)) + .addImm(0); + } +} + + +void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op3 = MI->getOperand(3); + assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm()); + int64_t Sh64 = Op3.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS1 = getRegState(Op1); + unsigned RS2 = getRegState(Op2); + const TargetRegisterClass *IntRC = &IntRegsRegClass; + + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + // Op0 = S2_asl_i_p_or Op1, Op2, Op3 + // means: Op0 = or (Op1, asl(Op2, Op3)) + + // Expansion of + // DR = or (R1, asl(R2, #s)) + // + // LoR = or (R1.lo, asl(R2.lo, #s)) + // Tmp1 = extractu R2.lo, #s, #32-s + // Tmp2 = or R1.hi, Tmp1 + // HiR = or (Tmp2, asl(R2.hi, #s)) + + if (S == 0) { + // DR = or (R1, asl(R2, #0)) + // -> or (R1, R2) + // i.e. LoR = or R1.lo, R2.lo + // HiR = or R1.hi, R2.hi + BuildMI(B, MI, DL, TII->get(A2_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, HiSR); + } else if (S < 32) { + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S); + unsigned TmpR1 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + unsigned TmpR2 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(A2_or), TmpR2) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(TmpR1); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR2) + .addReg(Op2.getReg(), RS2, HiSR) + .addImm(S); + } else if (S == 32) { + // DR = or (R1, asl(R2, #32)) + // -> or R1, R2.lo + // LoR = R1.lo + // HiR = or R1.hi, R2.lo + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR); + } else if (S < 64) { + // DR = or (R1, asl(R2, #s)) + // + // LoR = R1:lo + // HiR = or (R1:hi, asl(R2:lo, #s-32)) + S -= 32; + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR) + .addImm(S); + } +} + + +bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI, + const UUPairMap &PairMap) { + DEBUG(dbgs() << "Splitting: " << *MI); + bool Split = false; + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + + switch (Opc) { + case TargetOpcode::PHI: + case TargetOpcode::COPY: { + unsigned DstR = MI->getOperand(0).getReg(); + if (MRI->getRegClass(DstR) == DoubleRC) { + createHalfInstr(Opc, MI, PairMap, subreg_loreg); + createHalfInstr(Opc, MI, PairMap, subreg_hireg); + Split = true; + } + break; + } + case A2_andp: + createHalfInstr(A2_and, MI, PairMap, 
subreg_loreg); + createHalfInstr(A2_and, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_orp: + createHalfInstr(A2_or, MI, PairMap, subreg_loreg); + createHalfInstr(A2_or, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_xorp: + createHalfInstr(A2_xor, MI, PairMap, subreg_loreg); + createHalfInstr(A2_xor, MI, PairMap, subreg_hireg); + Split = true; + break; + + case L2_loadrd_io: + case L2_loadrd_pi: + case S2_storerd_io: + case S2_storerd_pi: + splitMemRef(MI, PairMap); + Split = true; + break; + + case A2_tfrpi: + case CONST64_Int_Real: + splitImmediate(MI, PairMap); + Split = true; + break; + + case A2_combineii: + case A4_combineir: + case A4_combineii: + case A4_combineri: + case A2_combinew: + splitCombine(MI, PairMap); + Split = true; + break; + + case A2_sxtw: + splitExt(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p: + case S2_asr_i_p: + case S2_lsr_i_p: + splitShift(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p_or: + splitAslOr(MI, PairMap); + Split = true; + break; + + default: + llvm_unreachable("Instruction not splitable"); + return false; + } + + return Split; +} + + +void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI, + const UUPairMap &PairMap) { + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse() || !Op.getSubReg()) + continue; + unsigned R = Op.getReg(); + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &P = F->second; + switch (Op.getSubReg()) { + case Hexagon::subreg_loreg: + Op.setReg(P.first); + break; + case Hexagon::subreg_hireg: + Op.setReg(P.second); + break; + } + Op.setSubReg(0); + } +} + + +void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg()) + continue; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &Pr = F->second; + unsigned NewDR = MRI->createVirtualRegister(DoubleRC); + BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR) + .addReg(Pr.first) + .addImm(Hexagon::subreg_loreg) + .addReg(Pr.second) + .addImm(Hexagon::subreg_hireg); + Op.setReg(NewDR); + } +} + + +bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) { + const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass; + typedef std::set<MachineInstr*> MISet; + bool Changed = false; + + DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI); + dbgs() << '\n'); + + UUPairMap PairMap; + + MISet SplitIns; + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + SplitIns.insert(DefI); + + // Collect all instructions, including fixed ones. We won't split them, + // but we need to visit them again to insert the REG_SEQUENCE instructions. 
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + SplitIns.insert(U->getParent()); + + unsigned LoR = MRI->createVirtualRegister(IntRC); + unsigned HiR = MRI->createVirtualRegister(IntRC); + DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> " + << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n'); + PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR))); + } + + MISet Erase; + for (auto MI : SplitIns) { + if (isFixedInstr(MI)) { + collapseRegPairs(MI, PairMap); + } else { + bool Done = splitInstr(MI, PairMap); + if (Done) + Erase.insert(MI); + Changed |= Done; + } + } + + for (unsigned DR : Part) { + // Before erasing "double" instructions, revisit all uses of the double + // registers in this partition, and replace all uses of them with subre- + // gisters, with the corresponding single registers. + MISet Uses; + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + Uses.insert(U->getParent()); + for (auto M : Uses) + replaceSubregUses(M, PairMap); + } + + for (auto MI : Erase) { + MachineBasicBlock *B = MI->getParent(); + B->erase(MI); + } + + return Changed; +} + + +bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "Splitting double registers in function: " + << MF.getName() << '\n'); + + auto &ST = MF.getSubtarget<HexagonSubtarget>(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + + UUSetMap P2Rs; + LoopRegMap IRM; + + collectIndRegs(IRM); + partitionRegisters(P2Rs); + + DEBUG({ + dbgs() << "Register partitioning: (partition #0 is fixed)\n"; + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + dbgs() << '#' << I->first << " -> "; + dump_partition(dbgs(), I->second, *TRI); + dbgs() << '\n'; + } + }); + + bool Changed = false; + int Limit = MaxHSDR; + + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + if (I->first == 0) + continue; + if (Limit >= 0 && Counter >= Limit) + break; + USet &Part = I->second; + DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n'); + if (!isProfitable(Part, IRM)) + continue; + Counter++; + Changed |= splitPartition(Part); + } + + return Changed; +} + +FunctionPass *llvm::createHexagonSplitDoubleRegs() { + return new HexagonSplitDoubleRegs(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp new file mode 100644 index 0000000..b5339ff --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -0,0 +1,616 @@ +//===--- HexagonStoreWidening.cpp------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Replace sequences of "narrow" stores to adjacent memory locations with +// a fewer "wide" stores that have the same effect. +// For example, replace: +// S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte +// S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte +// with +// S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword +// The above is the general idea. The actual cases handled by the code +// may be a bit more complex. +// The purpose of this pass is to reduce the number of outstanding stores, +// or as one could say, "reduce store queue pressure". 
Also, wide stores +// mean fewer stores, and since there are only two memory instructions allowed +// per packet, it also means fewer packets, and ultimately fewer cycles. +//===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-widen-stores" + +#include "HexagonTargetMachine.h" + +#include "llvm/PassSupport.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#include <algorithm> + + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonStoreWidening(); + void initializeHexagonStoreWideningPass(PassRegistry&); +} + +namespace { + struct HexagonStoreWidening : public MachineFunctionPass { + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + AliasAnalysis *AA; + MachineFunction *MF; + + public: + static char ID; + HexagonStoreWidening() : MachineFunctionPass(ID) { + initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "Hexagon Store Widening"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static bool handledStoreType(const MachineInstr *MI); + + private: + static const int MaxWideSize = 4; + + typedef std::vector<MachineInstr*> InstrGroup; + typedef std::vector<InstrGroup> InstrGroupList; + + bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO); + bool instrAliased(InstrGroup &Stores, const MachineInstr *MI); + void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &Group); + void createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups); + bool processBasicBlock(MachineBasicBlock &MBB); + bool processStoreGroup(InstrGroup &Group); + bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End, + InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize); + bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize); + bool replaceStores(InstrGroup &OG, InstrGroup &NG); + bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2); + }; + +} // namespace + + +namespace { + +// Some local helper functions... 
+unsigned getBaseAddressRegister(const MachineInstr *MI) { + const MachineOperand &MO = MI->getOperand(0); + assert(MO.isReg() && "Expecting register operand"); + return MO.getReg(); +} + +int64_t getStoreOffset(const MachineInstr *MI) { + unsigned OpC = MI->getOpcode(); + assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode"); + + switch (OpC) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: { + const MachineOperand &MO = MI->getOperand(1); + assert(MO.isImm() && "Expecting immediate offset"); + return MO.getImm(); + } + } + dbgs() << *MI; + llvm_unreachable("Store offset calculation missing for a handled opcode"); + return 0; +} + +const MachineMemOperand &getStoreTarget(const MachineInstr *MI) { + assert(!MI->memoperands_empty() && "Expecting memory operands"); + return **MI->memoperands_begin(); +} + +} // namespace + + +char HexagonStoreWidening::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores", + "Hexason Store Widening", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores", + "Hexagon Store Widening", false, false) + + +// Filtering function: any stores whose opcodes are not "approved" of by +// this function will not be subjected to widening. +inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) { + // For now, only handle stores of immediate values. + // Also, reject stores to stack slots. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: + // Base address must be a register. (Implement FI later.) + return MI->getOperand(0).isReg(); + default: + return false; + } +} + + +// Check if the machine memory operand MMO is aliased with any of the +// stores in the store group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineMemOperand &MMO) { + if (!MMO.getValue()) + return true; + + MemoryLocation L(MMO.getValue(), MMO.getSize(), MMO.getAAInfo()); + + for (auto SI : Stores) { + const MachineMemOperand &SMO = getStoreTarget(SI); + if (!SMO.getValue()) + return true; + + MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo()); + if (AA->alias(L, SL)) + return true; + } + + return false; +} + + +// Check if the machine instruction MI accesses any storage aliased with +// any store in the group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineInstr *MI) { + for (auto &I : MI->memoperands()) + if (instrAliased(Stores, *I)) + return true; + return false; +} + + +// Inspect a machine basic block, and generate store groups out of stores +// encountered in the block. +// +// A store group is a group of stores that use the same base register, +// and which can be reordered within that group without altering the +// semantics of the program. A single store group could be widened as +// a whole, if there existed a single store instruction with the same +// semantics as the entire group. In many cases, a single store group +// may need more than one wide store. +void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups) { + InstrGroup AllInsns; + + // Copy all instruction pointers from the basic block to a temporary + // list. This will allow operating on the list, and modifying its + // elements without affecting the basic block. 
+ for (auto &I : MBB) + AllInsns.push_back(&I); + + // Traverse all instructions in the AllInsns list, and if we encounter + // a store, then try to create a store group starting at that instruction + // i.e. a sequence of independent stores that can be widened. + for (auto I = AllInsns.begin(), E = AllInsns.end(); I != E; ++I) { + MachineInstr *MI = *I; + // Skip null pointers (processed instructions). + if (!MI || !handledStoreType(MI)) + continue; + + // Found a store. Try to create a store group. + InstrGroup G; + createStoreGroup(MI, I+1, E, G); + if (G.size() > 1) + StoreGroups.push_back(G); + } +} + + +// Create a single store group. The stores need to be independent between +// themselves, and also there cannot be other instructions between them +// that could read or modify storage being stored into. +void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore, + InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) { + assert(handledStoreType(BaseStore) && "Unexpected instruction"); + unsigned BaseReg = getBaseAddressRegister(BaseStore); + InstrGroup Other; + + Group.push_back(BaseStore); + + for (auto I = Begin; I != End; ++I) { + MachineInstr *MI = *I; + if (!MI) + continue; + + if (handledStoreType(MI)) { + // If this store instruction is aliased with anything already in the + // group, terminate the group now. + if (instrAliased(Group, getStoreTarget(MI))) + return; + // If this store is aliased to any of the memory instructions we have + // seen so far (that are not a part of this group), terminate the group. + if (instrAliased(Other, getStoreTarget(MI))) + return; + + unsigned BR = getBaseAddressRegister(MI); + if (BR == BaseReg) { + Group.push_back(MI); + *I = 0; + continue; + } + } + + // Assume calls are aliased to everything. + if (MI->isCall() || MI->hasUnmodeledSideEffects()) + return; + + if (MI->mayLoad() || MI->mayStore()) { + if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI)) + return; + Other.push_back(MI); + } + } // for +} + + +// Check if store instructions S1 and S2 are adjacent. More precisely, +// S2 has to access memory immediately following that accessed by S1. +bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1, + const MachineInstr *S2) { + if (!handledStoreType(S1) || !handledStoreType(S2)) + return false; + + const MachineMemOperand &S1MO = getStoreTarget(S1); + + // Currently only handling immediate stores. + int Off1 = S1->getOperand(1).getImm(); + int Off2 = S2->getOperand(1).getImm(); + + return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2) + : int(Off1+S1MO.getSize()) == Off2; +} + + +/// Given a sequence of adjacent stores, and a maximum size of a single wide +/// store, pick a group of stores that can be replaced by a single store +/// of size not exceeding MaxSize. The selected sequence will be recorded +/// in OG ("old group" of instructions). +/// OG should be empty on entry, and should be left empty if the function +/// fails. 
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize, + unsigned MaxSize) { + assert(Begin != End && "No instructions to analyze"); + assert(OG.empty() && "Old group not empty on entry"); + + if (std::distance(Begin, End) <= 1) + return false; + + MachineInstr *FirstMI = *Begin; + assert(!FirstMI->memoperands_empty() && "Expecting some memory operands"); + const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI); + unsigned Alignment = FirstMMO.getAlignment(); + unsigned SizeAccum = FirstMMO.getSize(); + unsigned FirstOffset = getStoreOffset(FirstMI); + + // The initial value of SizeAccum should always be a power of 2. + assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2"); + + // If the size of the first store equals to or exceeds the limit, do nothing. + if (SizeAccum >= MaxSize) + return false; + + // If the size of the first store is greater than or equal to the address + // stored to, then the store cannot be made any wider. + if (SizeAccum >= Alignment) + return false; + + // The offset of a store will put restrictions on how wide the store can be. + // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0. + // If the first store already exhausts the offset limits, quit. Test this + // by checking if the next wider size would exceed the limit. + if ((2*SizeAccum-1) & FirstOffset) + return false; + + OG.push_back(FirstMI); + MachineInstr *S1 = FirstMI, *S2 = *(Begin+1); + InstrGroup::iterator I = Begin+1; + + // Pow2Num will be the largest number of elements in OG such that the sum + // of sizes of stores 0...Pow2Num-1 will be a power of 2. + unsigned Pow2Num = 1; + unsigned Pow2Size = SizeAccum; + + // Be greedy: keep accumulating stores as long as they are to adjacent + // memory locations, and as long as the total number of bytes stored + // does not exceed the limit (MaxSize). + // Keep track of when the total size covered is a power of 2, since + // this is a size a single store can cover. + while (I != End) { + S2 = *I; + // Stores are sorted, so if S1 and S2 are not adjacent, there won't be + // any other store to fill the "hole". + if (!storesAreAdjacent(S1, S2)) + break; + + unsigned S2Size = getStoreTarget(S2).getSize(); + if (SizeAccum + S2Size > std::min(MaxSize, Alignment)) + break; + + OG.push_back(S2); + SizeAccum += S2Size; + if (isPowerOf2_32(SizeAccum)) { + Pow2Num = OG.size(); + Pow2Size = SizeAccum; + } + if ((2*Pow2Size-1) & FirstOffset) + break; + + S1 = S2; + ++I; + } + + // The stores don't add up to anything that can be widened. Clean up. + if (Pow2Num <= 1) { + OG.clear(); + return false; + } + + // Only leave the stored being widened. + OG.resize(Pow2Num); + TotalSize = Pow2Size; + return true; +} + + +/// Given an "old group" OG of stores, create a "new group" NG of instructions +/// to replace them. Ideally, NG would only have a single instruction in it, +/// but that may only be possible for store-immediate. +bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, + unsigned TotalSize) { + // XXX Current limitations: + // - only expect stores of immediate values in OG, + // - only handle a TotalSize of up to 4. + + if (TotalSize > 4) + return false; + + unsigned Acc = 0; // Value accumulator. + unsigned Shift = 0; + + for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) { + MachineInstr *MI = *I; + const MachineMemOperand &MMO = getStoreTarget(MI); + MachineOperand &SO = MI->getOperand(2); // Source. 
+ assert(SO.isImm() && "Expecting an immediate operand"); + + unsigned NBits = MMO.getSize()*8; + unsigned Mask = (0xFFFFFFFFU >> (32-NBits)); + unsigned Val = (SO.getImm() & Mask) << Shift; + Acc |= Val; + Shift += NBits; + } + + + MachineInstr *FirstSt = OG.front(); + DebugLoc DL = OG.back()->getDebugLoc(); + const MachineMemOperand &OldM = getStoreTarget(FirstSt); + MachineMemOperand *NewM = + MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(), + TotalSize, OldM.getAlignment(), + OldM.getAAInfo()); + + if (Acc < 0x10000) { + // Create mem[hw] = #Acc + unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io : + (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0; + assert(WOpc && "Unexpected size"); + + int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc); + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addImm(Val); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } else { + // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg + const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); + unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg) + .addImm(int(Acc)); + NG.push_back(TfrI); + + unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io : + (TotalSize == 4) ? Hexagon::S2_storeri_io : 0; + assert(WOpc && "Unexpected size"); + + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addReg(VReg, RegState::Kill); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } + + return true; +} + + +// Replace instructions from the old group OG with instructions from the +// new group NG. Conceptually, remove all instructions in OG, and then +// insert all instructions in NG, starting at where the first instruction +// from OG was (in the order in which they appeared in the basic block). +// (The ordering in OG does not have to match the order in the basic block.) +bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) { + DEBUG({ + dbgs() << "Replacing:\n"; + for (auto I : OG) + dbgs() << " " << *I; + dbgs() << "with\n"; + for (auto I : NG) + dbgs() << " " << *I; + }); + + MachineBasicBlock *MBB = OG.back()->getParent(); + MachineBasicBlock::iterator InsertAt = MBB->end(); + + // Need to establish the insertion point. The best one is right before + // the first store in the OG, but in the order in which the stores occur + // in the program list. Since the ordering in OG does not correspond + // to the order in the program list, we need to do some work to find + // the insertion point. + + // Create a set of all instructions in OG (for quick lookup). + SmallPtrSet<MachineInstr*, 4> InstrSet; + for (auto I : OG) + InstrSet.insert(I); + + // Traverse the block, until we hit an instruction from OG. + for (auto &I : *MBB) { + if (InstrSet.count(&I)) { + InsertAt = I; + break; + } + } + + assert((InsertAt != MBB->end()) && "Cannot locate any store from the group"); + + bool AtBBStart = false; + + // InsertAt points at the first instruction that will be removed. 
We need + // to move it out of the way, so it remains valid after removing all the + // old stores, and so we are able to recover it back to the proper insertion + // position. + if (InsertAt != MBB->begin()) + --InsertAt; + else + AtBBStart = true; + + for (auto I : OG) + I->eraseFromParent(); + + if (!AtBBStart) + ++InsertAt; + else + InsertAt = MBB->begin(); + + for (auto I : NG) + MBB->insert(InsertAt, I); + + return true; +} + + +// Break up the group into smaller groups, each of which can be replaced by +// a single wide store. Widen each such smaller group and replace the old +// instructions with the widened ones. +bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) { + bool Changed = false; + InstrGroup::iterator I = Group.begin(), E = Group.end(); + InstrGroup OG, NG; // Old and new groups. + unsigned CollectedSize; + + while (I != E) { + OG.clear(); + NG.clear(); + + bool Succ = selectStores(I++, E, OG, CollectedSize, MaxWideSize) && + createWideStores(OG, NG, CollectedSize) && + replaceStores(OG, NG); + if (!Succ) + continue; + + assert(OG.size() > 1 && "Created invalid group"); + assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements"); + I += OG.size()-1; + + Changed = true; + } + + return Changed; +} + + +// Process a single basic block: create the store groups, and replace them +// with the widened stores, if possible. Processing of each basic block +// is independent from processing of any other basic block. This transfor- +// mation could be stopped after having processed any basic block without +// any ill effects (other than not having performed widening in the unpro- +// cessed blocks). Also, the basic blocks can be processed in any order. +bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) { + InstrGroupList SGs; + bool Changed = false; + + createStoreGroups(MBB, SGs); + + auto Less = [] (const MachineInstr *A, const MachineInstr *B) -> bool { + return getStoreOffset(A) < getStoreOffset(B); + }; + for (auto &G : SGs) { + assert(G.size() > 1 && "Store group with fewer than 2 elements"); + std::sort(G.begin(), G.end(), Less); + + Changed |= processStoreGroup(G); + } + + return Changed; +} + + +bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) { + MF = &MFn; + auto &ST = MFn.getSubtarget<HexagonSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &MFn.getRegInfo(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + + bool Changed = false; + + for (auto &B : MFn) + Changed |= processBasicBlock(B); + + return Changed; +} + + +FunctionPass *llvm::createHexagonStoreWidening() { + return new HexagonStoreWidening(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index cd482b3..aa0efd4 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -16,6 +16,8 @@ #include "HexagonRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <map> + using namespace llvm; #define DEBUG_TYPE "hexagon-subtarget" @@ -24,49 +26,65 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #include "HexagonGenSubtargetInfo.inc" -static cl::opt<bool> -EnableV3("enable-hexagon-v3", cl::Hidden, - cl::desc("Enable Hexagon V3 instructions.")); - -static cl::opt<bool> -EnableMemOps( - "enable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), - cl::desc( - "Generate V4 MEMOP in 
code generation for Hexagon target")); - -static cl::opt<bool> -DisableMemOps( - "disable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), - cl::desc( - "Do not generate V4 MEMOP in code generation for Hexagon target")); - -static cl::opt<bool> -EnableIEEERndNear( - "enable-hexagon-ieee-rnd-near", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Generate non-chopped conversion from fp to int.")); +static cl::opt<bool> EnableMemOps("enable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), + cl::desc("Generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> DisableMemOps("disable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), + cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Generate non-chopped conversion from fp to int.")); + +static cl::opt<bool> EnableBSBSched("enable-bsb-sched", + cl::Hidden, cl::ZeroOrMore, cl::init(true)); + +static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Double Vector eXtensions")); + +static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Vector eXtensions")); static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon MI Scheduling")); + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon MI Scheduling")); + +void HexagonSubtarget::initializeEnvironment() { + UseMemOps = false; + ModeIEEERndNear = false; + UseBSBScheduling = false; +} HexagonSubtarget & HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - // If the programmer has not specified a Hexagon version, default to -mv4. - if (CPUString.empty()) - CPUString = "hexagonv4"; - - if (CPUString == "hexagonv4") { - HexagonArchVersion = V4; - } else if (CPUString == "hexagonv5") { - HexagonArchVersion = V5; - } else { + CPUString = HEXAGON_MC::selectHexagonCPU(getTargetTriple(), CPU); + + static std::map<StringRef, HexagonArchEnum> CpuTable { + { "hexagonv4", V4 }, + { "hexagonv5", V5 }, + { "hexagonv55", V55 }, + { "hexagonv60", V60 }, + }; + + auto foundIt = CpuTable.find(CPUString); + if (foundIt != CpuTable.end()) + HexagonArchVersion = foundIt->second; + else llvm_unreachable("Unrecognized Hexagon processor version"); - } + UseHVXOps = false; + UseHVXDblOps = false; ParseSubtargetFeatures(CPUString, FS); + + if (EnableHexagonHVX.getPosition()) + UseHVXOps = EnableHexagonHVX; + if (EnableHexagonHVXDouble.getPosition()) + UseHVXDblOps = EnableHexagonHVXDouble; + return *this; } @@ -76,6 +94,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering() { + initializeEnvironment(); + // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); @@ -91,6 +111,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, ModeIEEERndNear = true; else ModeIEEERndNear = false; + + UseBSBScheduling = hasV60TOps() && EnableBSBSched; } // Pin the vtable to this file. 
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 34cdad7..c7ae139 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -34,15 +34,19 @@ namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); - bool UseMemOps; + bool UseMemOps, UseHVXOps, UseHVXDblOps; bool ModeIEEERndNear; public: enum HexagonArchEnum { - V4, V5 + V4, V5, V55, V60 }; HexagonArchEnum HexagonArchVersion; + /// True if the target should use Back-Skip-Back scheduling. This is the + /// default for V60. + bool UseBSBScheduling; + private: std::string CPUString; HexagonInstrInfo InstrInfo; @@ -50,6 +54,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, @@ -84,7 +89,16 @@ public: bool useMemOps() const { return UseMemOps; } bool hasV5TOps() const { return getHexagonArchVersion() >= V5; } bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; } + bool hasV55TOps() const { return getHexagonArchVersion() >= V55; } + bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; } + bool hasV60TOps() const { return getHexagonArchVersion() >= V60; } + bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; } bool modeIEEERndNear() const { return ModeIEEERndNear; } + bool useHVXOps() const { return UseHVXOps; } + bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; } + bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; } + + bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting @@ -98,7 +112,7 @@ public: return Hexagon_SMALL_DATA_THRESHOLD; } const HexagonArchEnum &getHexagonArchVersion() const { - return HexagonArchVersion; + return HexagonArchVersion; } }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td new file mode 100644 index 0000000..784686a --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td @@ -0,0 +1,113 @@ +//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Cache manipulation instructions. 
+//===----------------------------------------------------------------------===// +let mayStore = 1 in +class ST_MISC_CACHEOP<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + +let mayStore = 1 in +class ST_MISC_CACHEOP_SYS<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : SYSInst<outs, ins, asmstr, pattern, ""> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + + +let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in { +def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins), + "syncht" , [], 0b100, 0b001, 0b0>; +} + +let Rt = 0, Rd = 0 in { +let isSoloAin1 = 1 in { + def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleana($Rs)", [], 0b000, 0b000, 0b0>; + def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dcinva($Rs)", [], 0b000, 0b000, 0b1>; + def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>; + } +} + +let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in { + def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>; + def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>; +} + +let hasSideEffects = 0, isSolo = 1 in +class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp> + : JRInst < + (outs), (ins IntRegs:$Rs), + #mnemonic#"($Rs)" > { + bits<5> Rs; + + let IClass = 0b0101; + let Inst{27-21} = 0b0110110; + let Inst{20-16} = Rs; + let Inst{13-12} = 0b00; + let Inst{11} = MajOp; + } +// Instruction cache invalidate +def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>; + +// Zero an aligned 32-byte cacheline. +let isSoloAin1 = 1 in +def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs), + "dczeroa($Rs)"> { + bits<5> Rs; + let IClass = 0b1010; + let Inst{27-21} = 0b0000110; + let Inst{13} = 0b0; + let Inst{20-16} = Rs; + } + +// Memory synchronization. 
+let hasSideEffects = 0, isSolo = 1 in +def Y2_isync: JRInst <(outs), (ins), + "isync"> { + let IClass = 0b0101; + let Inst{27-16} = 0b011111000000; + let Inst{13} = 0b0; + let Inst{9-0} = 0b0000000010; + } + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index b504429..34b03fb 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -16,27 +16,37 @@ #include "HexagonISelLowering.h" #include "HexagonMachineScheduler.h" #include "HexagonTargetObjectFile.h" +#include "HexagonTargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; -static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", + +static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore, + cl::init(true), cl::desc("Enable RDF-based optimizations")); + +static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon CFG Optimization")); +static cl::opt<bool> DisableStoreWidening("disable-store-widen", + cl::Hidden, cl::init(false), cl::desc("Disable store widening")); + static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable early if-conversion")); + static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden, cl::desc("Generate \"insert\" instructions")); @@ -46,10 +56,22 @@ static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true), static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden, cl::desc("Generate \"extract\" instructions")); +static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden, + cl::desc("Enable converting conditional transfers into MUX instructions")); + static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " "predicate instructions")); +static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, + cl::desc("Disable splitting double registers")); + +static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true), + cl::Hidden, cl::desc("Bit simplification")); + +static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true), + cl::Hidden, cl::desc("Loop rescheduling")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. 
In particular, it seems that it is not possible to get @@ -72,23 +94,31 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", createVLIWMachineSched); namespace llvm { + FunctionPass *createHexagonBitSimplify(); + FunctionPass *createHexagonCallFrameInformation(); FunctionPass *createHexagonCFGOptimizer(); FunctionPass *createHexagonCommonGEP(); FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonEarlyIfConversion(); FunctionPass *createHexagonExpandCondsets(); FunctionPass *createHexagonExpandPredSpillCode(); FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOpt::Level OptLevel); + FunctionPass *createHexagonLoopRescheduling(); FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(); FunctionPass *createHexagonPeephole(); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); + FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonSplitDoubleRegs(); + FunctionPass *createHexagonStoreWidening(); } // end namespace llvm; /// HexagonTargetMachine ctor - Create an ILP32 architecture model. @@ -101,13 +131,46 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:32:32-i1:32-i64:64-a:0-n32", TT, CPU, FS, - Options, RM, CM, OL), - TLOF(make_unique<HexagonTargetObjectFile>()), - Subtarget(TT, CPU, FS, *this) { - initAsmInfo(); + : LLVMTargetMachine(T, "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-" + "i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-" + "n16:32", TT, CPU, FS, Options, RM, CM, OL), + TLOF(make_unique<HexagonTargetObjectFile>()) { + initAsmInfo(); +} + +const HexagonSubtarget * +HexagonTargetMachine::getSubtargetImpl(const Function &F) const { + AttributeSet FnAttrs = F.getAttributes(); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
+ resetTargetOptions(F); + I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this); + } + return I.get(); +} + +TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(HexagonTTIImpl(this, F)); + }); } + HexagonTargetMachine::~HexagonTargetMachine() {} namespace { @@ -166,7 +229,7 @@ bool HexagonPassConfig::addInstSelector() { bool NoOpt = (getOptLevel() == CodeGenOpt::None); if (!NoOpt) - addPass(createHexagonRemoveExtendArgs(TM)); + addPass(createHexagonOptimizeSZextends()); addPass(createHexagonISelDag(TM, getOptLevel())); @@ -174,25 +237,42 @@ bool HexagonPassConfig::addInstSelector() { // Create logical operations on predicate registers. if (EnableGenPred) addPass(createHexagonGenPredicate(), false); + // Rotate loops to expose bit-simplification opportunities. + if (EnableLoopResched) + addPass(createHexagonLoopRescheduling(), false); + // Split double registers. + if (!DisableHSDR) + addPass(createHexagonSplitDoubleRegs()); + // Bit simplification. + if (EnableBitSimplify) + addPass(createHexagonBitSimplify(), false); addPass(createHexagonPeephole()); printAndVerify("After hexagon peephole pass"); if (EnableGenInsert) addPass(createHexagonGenInsert(), false); + if (EnableEarlyIf) + addPass(createHexagonEarlyIfConversion(), false); } return false; } void HexagonPassConfig::addPreRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (!DisableStoreWidening) + addPass(createHexagonStoreWidening(), false); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops(), false); + } } void HexagonPassConfig::addPostRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (EnableRDFOpt) + addPass(createHexagonRDFOpt()); if (!DisableHexagonCFGOpt) addPass(createHexagonCFGOptimizer(), false); + } } void HexagonPassConfig::addPreSched2() { @@ -215,6 +295,13 @@ void HexagonPassConfig::addPreEmitPass() { if (!NoOpt) { if (!DisableHardwareLoops) addPass(createHexagonFixupHwLoops(), false); + // Generate MUX from pairs of conditional transfers. + if (EnableGenMux) + addPass(createHexagonGenMux(), false); + addPass(createHexagonPacketizer(), false); } + + // Add CFI instructions if necessary. 
+ addPass(createHexagonCallFrameInformation(), false); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 115eadb..968814b 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -16,6 +16,7 @@ #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" +#include "HexagonTargetObjectFile.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -24,7 +25,7 @@ class Module; class HexagonTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - HexagonSubtarget Subtarget; + mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap; public: HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU, @@ -32,20 +33,18 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~HexagonTargetMachine() override; - const HexagonSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } + const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; + static unsigned getModuleMatchQuality(const Module &M); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF.get(); + HexagonTargetObjectFile *getObjFileLowering() const override { + return static_cast<HexagonTargetObjectFile*>(TLOF.get()); } }; -extern bool flag_aligned_memcpy; - } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 4ea0e0d..ccca620 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -73,9 +73,10 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, if (!GVA) return false; - if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) { + if (Kind.isBSS() || Kind.isData() || Kind.isCommon()) { Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } return false; @@ -89,7 +90,7 @@ HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp new file mode 100644 index 0000000..a05443e --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -0,0 +1,38 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. 
It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetTransformInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagontti" + +TargetTransformInfo::PopcntSupportKind +HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { + // Return Fast Hardware support as every input < 64 bits will be promoted + // to 64 bits. + return TargetTransformInfo::PSK_FastHardware; +} + +// The Hexagon target can unroll loops with run-time trip counts. +void HexagonTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Runtime = UP.Partial = true; +} + +unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const { + return vector ? 0 : 32; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h new file mode 100644 index 0000000..71ae17a --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -0,0 +1,70 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H + +#include "Hexagon.h" +#include "HexagonTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> { + typedef BasicTTIImplBase<HexagonTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const HexagonSubtarget *ST; + const HexagonTargetLowering *TLI; + + const HexagonSubtarget *getST() const { return ST; } + const HexagonTargetLowering *getTLI() const { return TLI; } + +public: + explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + HexagonTTIImpl(const HexagonTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + HexagonTTIImpl(HexagonTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + + TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; + + // The Hexagon target can unroll loops with run-time trip counts. 
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool vector) const; + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index b91a3f6..8185054 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -16,35 +16,19 @@ // prune the dependence. // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/DFAPacketizer.h" -#include "Hexagon.h" -#include "HexagonMachineFunctionInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LatencyPriorityQueue.h" +#include "HexagonVLIWPacketizer.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/SchedulerRegistry.h" -#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <map> #include <vector> @@ -52,9 +36,22 @@ using namespace llvm; #define DEBUG_TYPE "packets" +static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden, + cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon packetizer pass")); + static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles", - cl::ZeroOrMore, cl::Hidden, cl::init(true), - cl::desc("Allow non-solo packetization of volatile memory references")); + cl::ZeroOrMore, cl::Hidden, cl::init(true), + cl::desc("Allow non-solo packetization of volatile memory references")); + +static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false), + cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC")); + +static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores", + cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Disable vector double new-value-stores")); + +extern cl::opt<bool> ScheduleInlineAsm; namespace llvm { FunctionPass *createHexagonPacketizer(); @@ -64,7 +61,6 @@ namespace llvm { namespace { class HexagonPacketizer : public MachineFunctionPass { - public: static char ID; HexagonPacketizer() : MachineFunctionPass(ID) { @@ -73,103 +69,25 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<MachineLoopInfo>(); 
MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const override { return "Hexagon Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn) override; - }; - char HexagonPacketizer::ID = 0; - - class HexagonPacketizerList : public VLIWPacketizerList { private: - - // Has the instruction been promoted to a dot-new instruction. - bool PromotedToDotNew; - - // Has the instruction been glued to allocframe. - bool GlueAllocframeStore; - - // Has the feeder instruction been glued to new value jump. - bool GlueToNewValueJump; - - // Check if there is a dependence between some instruction already in this - // packet and this instruction. - bool Dependence; - - // Only check for dependence if there are resources available to - // schedule this instruction. - bool FoundSequentialDependence; - - /// \brief A handle to the branch probability pass. - const MachineBranchProbabilityInfo *MBPI; - - // Track MIs with ignored dependece. - std::vector<MachineInstr*> IgnoreDepMIs; - - public: - // Ctor. - HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI); - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override; - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override; - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override; - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override; - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. 
- bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; - private: - bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg); - bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC); - bool CanPromoteToDotNew(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass *RC); - bool - CanPromoteToNewValue(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII); - bool CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool DemoteToDotOld(MachineInstr *MI); - bool ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool RestrictingDepExistInPacket(MachineInstr *, unsigned, - const std::map<MachineInstr *, SUnit *> &); - bool isNewifiable(MachineInstr* MI); - bool isCondInst(MachineInstr* MI); - bool tryAllocateResourcesForConstExt(MachineInstr* MI); - bool canReserveResourcesForConstExt(MachineInstr *MI); - void reserveResourcesForConstExt(MachineInstr* MI); - bool isNewValueInst(MachineInstr* MI); + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; }; + + char HexagonPacketizer::ID = 0; } INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", @@ -177,26 +95,93 @@ INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer", false, false) -// HexagonPacketizerList Ctor. -HexagonPacketizerList::HexagonPacketizerList( - MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI) - : VLIWPacketizerList(MF, MLI, true) { - this->MBPI = MBPI; +HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, + MachineLoopInfo &MLI, AliasAnalysis *AA, + const MachineBranchProbabilityInfo *MBPI) + : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); } -bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - const MachineBranchProbabilityInfo *MBPI = - &getAnalysis<MachineBranchProbabilityInfo>(); +// Check if FirstI modifies a register that SecondI reads. 
+static bool hasWriteToReadDep(const MachineInstr *FirstI, + const MachineInstr *SecondI, const TargetRegisterInfo *TRI) { + for (auto &MO : FirstI->operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (SecondI->readsRegister(R, TRI)) + return true; + } + return false; +} + + +static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI, + MachineBasicBlock::iterator BundleIt, bool Before) { + MachineBasicBlock::instr_iterator InsertPt; + if (Before) + InsertPt = BundleIt.getInstrIterator(); + else + InsertPt = std::next(BundleIt).getInstrIterator(); + + MachineBasicBlock &B = *MI->getParent(); + // The instruction should at least be bundled with the preceding instruction + // (there will always be one, i.e. BUNDLE, if nothing else). + assert(MI->isBundledWithPred()); + if (MI->isBundledWithSucc()) { + MI->clearFlag(MachineInstr::BundledSucc); + MI->clearFlag(MachineInstr::BundledPred); + } else { + // If it's not bundled with the successor (i.e. it is the last one + // in the bundle), then we can simply unbundle it from the predecessor, + // which will take care of updating the predecessor's flag. + MI->unbundleFromPred(); + } + B.splice(InsertPt, &B, MI); + + // Get the size of the bundle without asserting. + MachineBasicBlock::const_instr_iterator I(BundleIt); + MachineBasicBlock::const_instr_iterator E = B.instr_end(); + unsigned Size = 0; + for (++I; I != E && I->isBundledWithPred(); ++I) + ++Size; + + // If there are still two or more instructions, then there is nothing + // else to be done. + if (Size > 1) + return BundleIt; + + // Otherwise, extract the single instruction out and delete the bundle. + MachineBasicBlock::iterator NextIt = std::next(BundleIt); + MachineInstr *SingleI = BundleIt->getNextNode(); + SingleI->unbundleFromPred(); + assert(!SingleI->isBundledWithSucc()); + BundleIt->eraseFromParent(); + return NextIt; +} + + +bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) { + if (DisablePacketizer) + return false; + + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + auto &MLI = getAnalysis<MachineLoopInfo>(); + auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); + + if (EnableGenAllInsnClass) + HII->genAllInsnTimingClasses(MF); + // Instantiate the packetizer. - HexagonPacketizerList Packetizer(Fn, MLI, MBPI); + HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI); // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); @@ -211,162 +196,107 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { // dependence between Insn 0 and Insn 2. This can lead to incorrect // packetization // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); + for (auto &MB : MF) { + auto End = MB.end(); + auto MI = MB.begin(); while (MI != End) { + auto NextI = std::next(MI); if (MI->isKill()) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; + MB.erase(MI); + End = MB.end(); } - ++MI; + MI = NextI; } } // Loop over all of the basic blocks. - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. 
- unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. - if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; + for (auto &MB : MF) { + auto Begin = MB.begin(), End = MB.end(); + while (Begin != End) { + // First the first non-boundary starting from the end of the last + // scheduling region. + MachineBasicBlock::iterator RB = Begin; + while (RB != End && HII->isSchedulingBoundary(RB, &MB, MF)) + ++RB; + // First the first boundary starting from the beginning of the new + // region. + MachineBasicBlock::iterator RE = RB; + while (RE != End && !HII->isSchedulingBoundary(RE, &MB, MF)) + ++RE; + // Add the scheduling boundary if it's not block end. + if (RE != End) + ++RE; + // If RB == End, then RE == End. + if (RB != End) + Packetizer.PacketizeMIs(&MB, RB, RE); + + Begin = RE; } } + Packetizer.unpacketizeSoloInstrs(MF); return true; } -static bool IsIndirectCall(MachineInstr* MI) { - return MI->getOpcode() == Hexagon::J2_callr; +// Reserve resources for a constant extender. Trigger an assertion if the +// reservation fails. +void HexagonPacketizerList::reserveResourcesForConstExt() { + if (!tryAllocateResourcesForConstExt(true)) + llvm_unreachable("Resources not available"); } -// Reserve resources for constant extender. Trigure an assertion if -// reservation fail. -void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - llvm_unreachable("can not reserve resources for constant extender."); - } - return; +bool HexagonPacketizerList::canReserveResourcesForConstExt() { + return tryAllocateResourcesForConstExt(false); } -bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - assert((QII->isExtended(MI) || QII->isConstExtended(MI)) && - "Should only be called for constant extended instructions"); - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - bool CanReserve = ResourceTracker->canReserveResources(PseudoMI); - MF.DeleteMachineInstr(PseudoMI); - return CanReserve; +// Allocate resources (i.e. 4 bytes) for constant extender. If succeeded, +// return true, otherwise, return false. 
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) { + auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc()); + bool Avail = ResourceTracker->canReserveResources(ExtMI); + if (Reserve && Avail) + ResourceTracker->reserveResources(ExtMI); + MF.DeleteMachineInstr(ExtMI); + return Avail; } -// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return -// true, otherwise, return false. -bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); +bool HexagonPacketizerList::isCallDependent(const MachineInstr* MI, + SDep::Kind DepType, unsigned DepReg) { + // Check for LR dependence. + if (DepReg == HRI->getRARegister()) return true; - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - return false; - } -} - - -bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI, - SDep::Kind DepType, - unsigned DepReg) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - - // Check for lr dependence - if (DepReg == QRI->getRARegister()) { - return true; - } - if (QII->isDeallocRet(MI)) { - if (DepReg == QRI->getFrameRegister() || - DepReg == QRI->getStackRegister()) + if (HII->isDeallocRet(MI)) + if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - } - // Check if this is a predicate dependence - const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) { + // Check if this is a predicate dependence. + const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); + if (RC == &Hexagon::PredRegsRegClass) return true; - } - // - // Lastly check for an operand used in an indirect call - // If we had an attribute for checking if an instruction is an indirect call, - // then we could have avoided this relatively brittle implementation of - // IsIndirectCall() - // - // Assumes that the first operand of the CALLr is the function address - // - if (IsIndirectCall(MI) && (DepType == SDep::Data)) { + // Assumes that the first operand of the CALLr is the function address. 
+ if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { MachineOperand MO = MI->getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) { + if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) return true; - } } return false; } -static bool IsRegDependence(const SDep::Kind DepType) { - return (DepType == SDep::Data || DepType == SDep::Anti || - DepType == SDep::Output); +static bool isRegDependence(const SDep::Kind DepType) { + return DepType == SDep::Data || DepType == SDep::Anti || + DepType == SDep::Output; } -static bool IsDirectJump(MachineInstr* MI) { - return (MI->getOpcode() == Hexagon::J2_jump); +static bool isDirectJump(const MachineInstr* MI) { + return MI->getOpcode() == Hexagon::J2_jump; } -static bool IsSchedBarrier(MachineInstr* MI) { +static bool isSchedBarrier(const MachineInstr* MI) { switch (MI->getOpcode()) { case Hexagon::Y2_barrier: return true; @@ -374,76 +304,127 @@ static bool IsSchedBarrier(MachineInstr* MI) { return false; } -static bool IsControlFlow(MachineInstr* MI) { +static bool isControlFlow(const MachineInstr* MI) { return (MI->getDesc().isTerminator() || MI->getDesc().isCall()); } -static bool IsLoopN(MachineInstr *MI) { - return (MI->getOpcode() == Hexagon::J2_loop0i || - MI->getOpcode() == Hexagon::J2_loop0r); -} -/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a -/// callee-saved register. -static bool DoesModifyCalleeSavedReg(MachineInstr *MI, +/// Returns true if the instruction modifies a callee-saved register. +static bool doesModifyCalleeSavedReg(const MachineInstr *MI, const TargetRegisterInfo *TRI) { - for (const MCPhysReg *CSR = - TRI->getCalleeSavedRegs(MI->getParent()->getParent()); - *CSR; ++CSR) { - unsigned CalleeSavedReg = *CSR; - if (MI->modifiesRegister(CalleeSavedReg, TRI)) + const MachineFunction &MF = *MI->getParent()->getParent(); + for (auto *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) + if (MI->modifiesRegister(*CSR, TRI)) return true; - } return false; } -// Returns true if an instruction can be promoted to .new predicate -// or new-value store. -bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - return isCondInst(MI) || QII->mayBeNewStore(MI); +// TODO: MI->isIndirectBranch() and IsRegisterJump(MI) +// Returns true if an instruction can be promoted to .new predicate or +// new-value store. +bool HexagonPacketizerList::isNewifiable(const MachineInstr* MI) { + return HII->isCondInst(MI) || MI->isReturn() || HII->mayBeNewStore(MI); } -bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const MCInstrDesc& TID = MI->getDesc(); - // bug 5670: until that is fixed, - // this portion is disabled. - if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) || - || QII->isConditionalTransfer(MI) - || QII->isConditionalALU32(MI) - || QII->isConditionalLoad(MI) - || QII->isConditionalStore(MI)) { - return true; +// Promote an instructiont to its .cur form. +// At this time, we have already made a call to canPromoteToDotCur and made +// sure that it can *indeed* be promoted. 
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert(DepType == SDep::Data); + int CurOpcode = HII->getDotCurOp(MI); + MI->setDesc(HII->get(CurOpcode)); + return true; +} + +void HexagonPacketizerList::cleanUpDotCur() { + MachineInstr *MI = NULL; + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "Cleanup packet has "; BI->dump();); + if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) { + MI = BI; + continue; + } + if (MI) { + for (auto &MO : BI->operands()) + if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg()) + return; + } } - return false; + if (!MI) + return; + // We did not find a use of the CUR, so de-cur it. + MI->setDesc(HII->get(Hexagon::V6_vL32b_ai)); + DEBUG(dbgs() << "Demoted CUR "; MI->dump();); } +// Check to see if an instruction can be dot cur. +bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass *RC) { + if (!HII->isV60VectorInstruction(MI)) + return false; + if (!HII->isV60VectorInstruction(MII)) + return false; -// Promote an instructiont to its .new form. -// At this time, we have already made a call to CanPromoteToDotNew -// and made sure that it can *indeed* be promoted. -bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI, - SDep::Kind DepType, MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC) { + // Already a dot new instruction. + if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI)) + return false; - assert (DepType == SDep::Data); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + if (!HII->mayBeCurLoad(MI)) + return false; + + // The "cur value" cannot come from inline asm. + if (PacketSU->getInstr()->isInlineAsm()) + return false; + + // Make sure candidate instruction uses cur. + DEBUG(dbgs() << "Can we DOT Cur Vector MI\n"; + MI->dump(); + dbgs() << "in packet\n";); + MachineInstr *MJ = MII; + DEBUG(dbgs() << "Checking CUR against "; MJ->dump();); + unsigned DestReg = MI->getOperand(0).getReg(); + bool FoundMatch = false; + for (auto &MO : MJ->operands()) + if (MO.isReg() && MO.getReg() == DestReg) + FoundMatch = true; + if (!FoundMatch) + return false; + + // Check for existing uses of a vector register within the packet which + // would be affected by converting a vector load into .cur formt. + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "packet has "; BI->dump();); + if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo())) + return false; + } + + DEBUG(dbgs() << "Can Dot CUR MI\n"; MI->dump();); + // We can convert the opcode into a .cur. + return true; +} +// Promote an instruction to its .new form. At this time, we have already +// made a call to canPromoteToDotNew and made sure that it can *indeed* be +// promoted. 
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert (DepType == SDep::Data); int NewOpcode; if (RC == &Hexagon::PredRegsRegClass) - NewOpcode = QII->GetDotNewPredOp(MI, MBPI); + NewOpcode = HII->getDotNewPredOp(MI, MBPI); else - NewOpcode = QII->GetDotNewOp(MI); - MI->setDesc(QII->get(NewOpcode)); - + NewOpcode = HII->getDotNewOp(MI); + MI->setDesc(HII->get(NewOpcode)); return true; } -bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - int NewOpcode = QII->GetDotOldOp(MI->getOpcode()); - MI->setDesc(QII->get(NewOpcode)); +bool HexagonPacketizerList::demoteToDotOld(MachineInstr* MI) { + int NewOpcode = HII->getDotOldOp(MI->getOpcode()); + MI->setDesc(HII->get(NewOpcode)); return true; } @@ -455,175 +436,173 @@ enum PredicateKind { /// Returns true if an instruction is predicated on p0 and false if it's /// predicated on !p0. -static PredicateKind getPredicateSense(MachineInstr* MI, - const HexagonInstrInfo *QII) { - if (!QII->isPredicated(MI)) +static PredicateKind getPredicateSense(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + if (!HII->isPredicated(MI)) return PK_Unknown; - - if (QII->isPredicatedTrue(MI)) + if (HII->isPredicatedTrue(MI)) return PK_True; - return PK_False; } -static MachineOperand& GetPostIncrementOperand(MachineInstr *MI, - const HexagonInstrInfo *QII) { - assert(QII->isPostIncrement(MI) && "Not a post increment operation."); +static const MachineOperand &getPostIncrementOperand(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + assert(HII->isPostIncrement(MI) && "Not a post increment operation."); #ifndef NDEBUG // Post Increment means duplicates. Use dense map to find duplicates in the // list. Caution: Densemap initializes with the minimum of 64 buckets, // whereas there are at most 5 operands in the post increment. - DenseMap<unsigned, unsigned> DefRegsSet; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isDef()) { - DefRegsSet[MI->getOperand(opNum).getReg()] = 1; - } - - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isUse()) { - if (DefRegsSet[MI->getOperand(opNum).getReg()]) { - return MI->getOperand(opNum); - } - } + DenseSet<unsigned> DefRegsSet; + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isDef()) + DefRegsSet.insert(MO.getReg()); + + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg())) + return MO; #else - if (MI->getDesc().mayLoad()) { + if (MI->mayLoad()) { + const MachineOperand &Op1 = MI->getOperand(1); // The 2nd operand is always the post increment operand in load. - assert(MI->getOperand(1).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(1)); + assert(Op1.isReg() && "Post increment operand has be to a register."); + return Op1; } if (MI->getDesc().mayStore()) { + const MachineOperand &Op0 = MI->getOperand(0); // The 1st operand is always the post increment operand in store. - assert(MI->getOperand(0).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(0)); + assert(Op0.isReg() && "Post increment operand has be to a register."); + return Op0; } #endif // we should never come here. 
llvm_unreachable("mayLoad or mayStore not set for Post Increment operation"); } -// get the value being stored -static MachineOperand& GetStoreValueOperand(MachineInstr *MI) { +// Get the value being stored. +static const MachineOperand& getStoreValueOperand(const MachineInstr *MI) { // value being stored is always the last operand. - return (MI->getOperand(MI->getNumOperands()-1)); + return MI->getOperand(MI->getNumOperands()-1); +} + +static bool isLoadAbsSet(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::L4_loadrd_ap: + case Hexagon::L4_loadrb_ap: + case Hexagon::L4_loadrh_ap: + case Hexagon::L4_loadrub_ap: + case Hexagon::L4_loadruh_ap: + case Hexagon::L4_loadri_ap: + return true; + } + return false; } -// can be new value store? +static const MachineOperand &getAbsSetOperand(const MachineInstr *MI) { + assert(isLoadAbsSet(MI)); + return MI->getOperand(1); +} + + +// Can be new value store? // Following restrictions are to be respected in convert a store into // a new value store. // 1. If an instruction uses auto-increment, its address register cannot // be a new-value register. Arch Spec 5.4.2.1 -// 2. If an instruction uses absolute-set addressing mode, -// its address register cannot be a new-value register. -// Arch Spec 5.4.2.1.TODO: This is not enabled as -// as absolute-set address mode patters are not implemented. +// 2. If an instruction uses absolute-set addressing mode, its address +// register cannot be a new-value register. Arch Spec 5.4.2.1. // 3. If an instruction produces a 64-bit result, its registers cannot be used // as new-value registers. Arch Spec 5.4.2.2. -// 4. If the instruction that sets a new-value register is conditional, then +// 4. If the instruction that sets the new-value register is conditional, then // the instruction that uses the new-value register must also be conditional, // and both must always have their predicates evaluate identically. // Arch Spec 5.4.2.3. -// 5. There is an implied restriction of a packet can not have another store, -// if there is a new value store in the packet. Corollary, if there is +// 5. There is an implied restriction that a packet cannot have another store, +// if there is a new value store in the packet. Corollary: if there is // already a store in a packet, there can not be a new value store. // Arch Spec: 3.4.4.2 -bool HexagonPacketizerList::CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI, + const MachineInstr *PacketMI, unsigned DepReg) { // Make sure we are looking at the store, that can be promoted. - if (!QII->mayBeNewStore(MI)) + if (!HII->mayBeNewStore(MI)) return false; - // Make sure there is dependency and can be new value'ed - if (GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() != DepReg) + // Make sure there is dependency and can be new value'd. 
+ const MachineOperand &Val = getStoreValueOperand(MI); + if (Val.isReg() && Val.getReg() != DepReg) return false; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); const MCInstrDesc& MCID = PacketMI->getDesc(); - // first operand is always the result - - const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF); - - // if there is already an store in the packet, no can do new value store - // Arch Spec 3.4.4.2. - for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), - VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *PacketSU = MIToSUnit.find(*VI)->second; - if (PacketSU->getInstr()->getDesc().mayStore() || - // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME, - // then we don't need this - PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe || - PacketSU->getInstr()->getOpcode() == Hexagon::L2_deallocframe) - return false; - } - if (PacketRC == &Hexagon::DoubleRegsRegClass) { - // new value store constraint: double regs can not feed into new value store - // arch spec section: 5.4.2.2 + // First operand is always the result. + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF); + // Double regs can not feed into new value store: PRM section: 5.4.2.2. + if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; + + // New-value stores are of class NV (slot 0), dual stores require class ST + // in slot 0 (PRM 5.5). + for (auto I : CurrentPacketMIs) { + SUnit *PacketSU = MIToSUnit.find(I)->second; + if (PacketSU->getInstr()->mayStore()) + return false; } // Make sure it's NOT the post increment register that we are going to // new value. - if (QII->isPostIncrement(MI) && - MI->getDesc().mayStore() && - GetPostIncrementOperand(MI, QII).getReg() == DepReg) { + if (HII->isPostIncrement(MI) && + getPostIncrementOperand(MI, HII).getReg() == DepReg) { return false; } - if (QII->isPostIncrement(PacketMI) && - PacketMI->getDesc().mayLoad() && - GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) { - // if source is post_inc, or absolute-set addressing, - // it can not feed into new value store - // r3 = memw(r2++#4) - // memw(r30 + #-1404) = r2.new -> can not be new value store - // arch spec section: 5.4.2.1 + if (HII->isPostIncrement(PacketMI) && PacketMI->mayLoad() && + getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) { + // If source is post_inc, or absolute-set addressing, it can not feed + // into new value store + // r3 = memw(r2++#4) + // memw(r30 + #-1404) = r2.new -> can not be new value store + // arch spec section: 5.4.2.1. return false; } + if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg) + return false; + // If the source that feeds the store is predicated, new value store must // also be predicated. - if (QII->isPredicated(PacketMI)) { - if (!QII->isPredicated(MI)) + if (HII->isPredicated(PacketMI)) { + if (!HII->isPredicated(MI)) return false; // Check to make sure that they both will have their predicates - // evaluate identically + // evaluate identically. 
unsigned predRegNumSrc = 0; unsigned predRegNumDst = 0; const TargetRegisterClass* predRegClass = nullptr; - // Get predicate register used in the source instruction - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if ( PacketMI->getOperand(opNum).isReg()) - predRegNumSrc = PacketMI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc); - if (predRegClass == &Hexagon::PredRegsRegClass) { + // Get predicate register used in the source instruction. + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg()) + continue; + predRegNumSrc = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated PacketMI instruction")); - - // Get predicate register used in new-value store instruction - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if ( MI->getOperand(opNum).isReg()) - predRegNumDst = MI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst); - if (predRegClass == &Hexagon::PredRegsRegClass) { + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated PacketMI instruction"); + + // Get predicate register used in new-value store instruction. + for (auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + predRegNumDst = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated MI instruction")); + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated MI instruction"); // New-value register producer and user (store) need to satisfy these // constraints: @@ -632,13 +611,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // should also be .new predicated and if producer is not .new predicated // then store should not be .new predicated. // 3) Both new-value register producer and user should have same predicate - // sense, i.e, either both should be negated or both should be none negated. - - if (( predRegNumDst != predRegNumSrc) || - QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI) || - getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) { + // sense, i.e, either both should be negated or both should be non-negated. + if (predRegNumDst != predRegNumSrc || + HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) || + getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII)) return false; - } } // Make sure that other than the new-value register no other store instruction @@ -649,81 +626,77 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // including PacketMI. Howerver, we need to perform the check for the // remaining instructions in the packet. - std::vector<MachineInstr*>::iterator VI; - std::vector<MachineInstr*>::iterator VE; unsigned StartCheck = 0; - for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *TempSU = MIToSUnit.find(*VI)->second; + for (auto I : CurrentPacketMIs) { + SUnit *TempSU = MIToSUnit.find(I)->second; MachineInstr* TempMI = TempSU->getInstr(); // Following condition is true for all the instructions until PacketMI is // reached (StartCheck is set to 0 before the for loop). 
// StartCheck flag is 1 for all the instructions after PacketMI. - if (TempMI != PacketMI && !StartCheck) // start processing only after - continue; // encountering PacketMI + if (TempMI != PacketMI && !StartCheck) // Start processing only after + continue; // encountering PacketMI. StartCheck = 1; - if (TempMI == PacketMI) // We don't want to check PacketMI for dependence + if (TempMI == PacketMI) // We don't want to check PacketMI for dependence. continue; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if (MI->getOperand(opNum).isReg() && - TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(), - QRI)) + for (auto &MO : MI->operands()) + if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI)) return false; - } } // Make sure that for non-POST_INC stores: // 1. The only use of reg is DepReg and no other registers. // This handles V4 base+index registers. // The following store can not be dot new. - // Eg. r0 = add(r0, #3)a + // Eg. r0 = add(r0, #3) // memw(r1+r0<<#2) = r0 - if (!QII->isPostIncrement(MI) && - GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() == DepReg) { - for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).getReg() == DepReg) { - return false; - } - } - // 2. If data definition is because of implicit definition of the register, - // do not newify the store. Eg. - // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> - // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343] - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if (PacketMI->getOperand(opNum).isReg() && - PacketMI->getOperand(opNum).getReg() == DepReg && - PacketMI->getOperand(opNum).isDef() && - PacketMI->getOperand(opNum).isImplicit()) { + if (!HII->isPostIncrement(MI)) { + for (unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.getReg() == DepReg) return false; - } } } + // If data definition is because of implicit definition of the register, + // do not newify the store. Eg. + // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> + // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343] + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isImplicit()) + continue; + unsigned R = MO.getReg(); + if (R == DepReg || HRI->isSuperRegister(DepReg, R)) + return false; + } + + // Handle imp-use of super reg case. There is a target independent side + // change that should prevent this situation but I am handling it for + // just-in-case. For example, we cannot newify R2 in the following case: + // %R3<def> = A2_tfrsi 0; + // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>; + for (auto &MO : MI->operands()) { + if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg) + return false; + } + // Can be dot new store. return true; } -// can this MI to promoted to either -// new value store or new value jump -bool HexagonPacketizerList::CanPromoteToNewValue( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - if (!QII->mayBeNewStore(MI)) +// Can this MI to promoted to either new value store or new value jump. 
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, + MachineBasicBlock::iterator &MII) { + if (!HII->mayBeNewStore(MI)) return false; - MachineInstr *PacketMI = PacketSU->getInstr(); - // Check to see the store can be new value'ed. - if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit)) + MachineInstr *PacketMI = PacketSU->getInstr(); + if (canPromoteToNewValueStore(MI, PacketMI, DepReg)) return true; // Check to see the compare/jump can be new value'ed. @@ -731,93 +704,110 @@ bool HexagonPacketizerList::CanPromoteToNewValue( return false; } +static bool isImplicitDependency(const MachineInstr *I, unsigned DepReg) { + for (auto &MO : I->operands()) + if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + return true; + return false; +} + // Check to see if an instruction can be dot new // There are three kinds. // 1. dot new on predicate - V2/V3/V4 // 2. dot new on stores NV/ST - V4 // 3. dot new on jump NV/J - V4 -- This is generated in a pass. -bool HexagonPacketizerList::CanPromoteToDotNew( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, const TargetRegisterClass *RC) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { // Already a dot new instruction. - if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI)) + if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI)) return false; if (!isNewifiable(MI)) return false; + const MachineInstr *PI = PacketSU->getInstr(); + + // The "new value" cannot come from inline asm. + if (PI->isInlineAsm()) + return false; + + // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no + // sense. + if (PI->isImplicitDef()) + return false; + + // If dependency is trough an implicitly defined register, we should not + // newify the use. + if (isImplicitDependency(PI, DepReg)) + return false; + + const MCInstrDesc& MCID = PI->getDesc(); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF); + if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass) + return false; + // predicate .new - if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI)) - return true; - else if (RC != &Hexagon::PredRegsRegClass && - !QII->mayBeNewStore(MI)) // MI is not a new-value store + // bug 5670: until that is fixed + // TODO: MI->isIndirectBranch() and IsRegisterJump(MI) + if (RC == &Hexagon::PredRegsRegClass) + if (HII->isCondInst(MI) || MI->isReturn()) + return HII->predCanBeUsedAsDotNew(PI, DepReg); + + if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) + return false; + + // Create a dot new machine instruction to see if resources can be + // allocated. If not, bail out now. + int NewOpcode = HII->getDotNewOp(MI); + const MCInstrDesc &D = HII->get(NewOpcode); + MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); + bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); + MF.DeleteMachineInstr(NewMI); + if (!ResourcesAvailable) + return false; + + // New Value Store only. New Value Jump generated as a separate pass. + if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII)) return false; - else { - // Create a dot new machine instruction to see if resources can be - // allocated. If not, bail out now. 
- int NewOpcode = QII->GetDotNewOp(MI); - const MCInstrDesc &desc = QII->get(NewOpcode); - DebugLoc dl; - MachineInstr *NewMI = - MI->getParent()->getParent()->CreateMachineInstr(desc, dl); - bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); - MI->getParent()->getParent()->DeleteMachineInstr(NewMI); - - if (!ResourcesAvailable) - return false; - // new value store only - // new new value jump generated as a passes - if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) { - return false; - } - } return true; } -// Go through the packet instructions and search for anti dependency -// between them and DepReg from MI -// Consider this case: +// Go through the packet instructions and search for an anti dependency between +// them and DepReg from MI. Consider this case: // Trying to add // a) %R1<def> = TFRI_cdNotPt %P3, 2 // to this packet: // { -// b) %P0<def> = OR_pp %P3<kill>, %P0<kill> -// c) %P3<def> = TFR_PdRs %R23 -// d) %R1<def> = TFRI_cdnPt %P3, 4 +// b) %P0<def> = C2_or %P3<kill>, %P0<kill> +// c) %P3<def> = C2_tfrrp %R23 +// d) %R1<def> = C2_cmovenewit %P3, 4 // } // The P3 from a) and d) will be complements after // a)'s P3 is converted to .new form -// Anti Dep between c) and b) is irrelevant for this case -bool HexagonPacketizerList::RestrictingDepExistInPacket( - MachineInstr *MI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +// Anti-dep between c) and b) is irrelevant for this case +bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI, + unsigned DepReg) { SUnit *PacketSUDep = MIToSUnit.find(MI)->second; - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - + for (auto I : CurrentPacketMIs) { // We only care for dependencies to predicated instructions - if(!QII->isPredicated(*VIN)) continue; + if (!HII->isPredicated(I)) + continue; // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + SUnit *PacketSU = MIToSUnit.find(I)->second; - // Look at dependencies between current members of the packet - // and predicate defining instruction MI. - // Make sure that dependency is on the exact register - // we care about. + // Look at dependencies between current members of the packet and + // predicate defining instruction MI. Make sure that dependency is + // on the exact register we care about. if (PacketSU->isSucc(PacketSUDep)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) && - (PacketSU->Succs[i].getKind() == SDep::Anti) && - (PacketSU->Succs[i].getReg() == DepReg)) { + auto &Dep = PacketSU->Succs[i]; + if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti && + Dep.getReg() == DepReg) return true; - } } } } @@ -831,276 +821,362 @@ static unsigned getPredicatedRegister(MachineInstr *MI, const HexagonInstrInfo *QII) { /// We use the following rule: The first predicate register that is a use is /// the predicate register of a predicated instruction. 
- assert(QII->isPredicated(MI) && "Must be predicated instruction"); - for (MachineInstr::mop_iterator OI = MI->operands_begin(), - OE = MI->operands_end(); OI != OE; ++OI) { - MachineOperand &Op = *OI; + for (auto &Op : MI->operands()) { if (Op.isReg() && Op.getReg() && Op.isUse() && Hexagon::PredRegsRegClass.contains(Op.getReg())) return Op.getReg(); } llvm_unreachable("Unknown instruction operand layout"); - return 0; } // Given two predicated instructions, this function detects whether -// the predicates are complements -bool HexagonPacketizerList::ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - +// the predicates are complements. +bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1, + MachineInstr *MI2) { // If we don't know the predicate sense of the instructions bail out early, we // need it later. - if (getPredicateSense(MI1, QII) == PK_Unknown || - getPredicateSense(MI2, QII) == PK_Unknown) + if (getPredicateSense(MI1, HII) == PK_Unknown || + getPredicateSense(MI2, HII) == PK_Unknown) return false; - // Scheduling unit for candidate - SUnit *SU = MIToSUnit.find(MI1)->second; + // Scheduling unit for candidate. + SUnit *SU = MIToSUnit[MI1]; // One corner case deals with the following scenario: // Trying to add - // a) %R24<def> = TFR_cPt %P0, %R25 + // a) %R24<def> = A2_tfrt %P0, %R25 // to this packet: - // // { - // b) %R25<def> = TFR_cNotPt %P0, %R24 - // c) %P0<def> = CMPEQri %R26, 1 + // b) %R25<def> = A2_tfrf %P0, %R24 + // c) %P0<def> = C2_cmpeqi %R26, 1 // } // - // On general check a) and b) are complements, but - // presence of c) will convert a) to .new form, and - // then it is not a complement - // We attempt to detect it by analyzing existing - // dependencies in the packet + // On general check a) and b) are complements, but presence of c) will + // convert a) to .new form, and then it is not a complement. + // We attempt to detect it by analyzing existing dependencies in the packet. // Analyze relationships between all existing members of the packet. - // Look for Anti dependecy on the same predicate reg - // as used in the candidate - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - - // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + // Look for Anti dependecy on the same predicate reg as used in the + // candidate. + for (auto I : CurrentPacketMIs) { + // Scheduling Unit for current insn in the packet. + SUnit *PacketSU = MIToSUnit.find(I)->second; // If this instruction in the packet is succeeded by the candidate... if (PacketSU->isSucc(SU)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - // The corner case exist when there is true data - // dependency between candidate and one of current - // packet members, this dep is on predicate reg, and - // there already exist anti dep on the same pred in + auto Dep = PacketSU->Succs[i]; + // The corner case exist when there is true data dependency between + // candidate and one of current packet members, this dep is on + // predicate reg, and there already exist anti dep on the same pred in // the packet. 
- if (PacketSU->Succs[i].getSUnit() == SU && - PacketSU->Succs[i].getKind() == SDep::Data && - Hexagon::PredRegsRegClass.contains( - PacketSU->Succs[i].getReg()) && - // Here I know that *VIN is predicate setting instruction - // with true data dep to candidate on the register - // we care about - c) in the above example. - // Now I need to see if there is an anti dependency - // from c) to any other instruction in the - // same packet on the pred reg of interest - RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(), - MIToSUnit)) { - return false; + if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data && + Hexagon::PredRegsRegClass.contains(Dep.getReg())) { + // Here I know that I is predicate setting instruction with true + // data dep to candidate on the register we care about - c) in the + // above example. Now I need to see if there is an anti dependency + // from c) to any other instruction in the same packet on the pred + // reg of interest. + if (restrictingDepExistInPacket(I, Dep.getReg())) + return false; } } } } - // If the above case does not apply, check regular - // complement condition. - // Check that the predicate register is the same and - // that the predicate sense is different - // We also need to differentiate .old vs. .new: - // !p0 is not complimentary to p0.new - unsigned PReg1 = getPredicatedRegister(MI1, QII); - unsigned PReg2 = getPredicatedRegister(MI2, QII); - return ((PReg1 == PReg2) && - Hexagon::PredRegsRegClass.contains(PReg1) && - Hexagon::PredRegsRegClass.contains(PReg2) && - (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) && - (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2))); + // If the above case does not apply, check regular complement condition. + // Check that the predicate register is the same and that the predicate + // sense is different We also need to differentiate .old vs. .new: !p0 + // is not complementary to p0.new. + unsigned PReg1 = getPredicatedRegister(MI1, HII); + unsigned PReg2 = getPredicatedRegister(MI2, HII); + return PReg1 == PReg2 && + Hexagon::PredRegsRegClass.contains(PReg1) && + Hexagon::PredRegsRegClass.contains(PReg2) && + getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) && + HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2); } -// initPacketizerState - Initialize packetizer flags +// Initialize packetizer flags. void HexagonPacketizerList::initPacketizerState() { - Dependence = false; PromotedToDotNew = false; GlueToNewValueJump = false; GlueAllocframeStore = false; FoundSequentialDependence = false; - - return; } -// ignorePseudoInstruction - Ignore bundling of pseudo instructions. -bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) { +// Ignore bundling of pseudo instructions. +bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock*) { if (MI->isDebugValue()) return true; if (MI->isCFIInstruction()) return false; - // We must print out inline assembly + // We must print out inline assembly. if (MI->isInlineAsm()) return false; - // We check if MI has any functional units mapped to it. - // If it doesn't, we ignore the instruction. + if (MI->isImplicitDef()) + return false; + + // We check if MI has any functional units mapped to it. If it doesn't, + // we ignore the instruction. 
const MCInstrDesc& TID = MI->getDesc(); - unsigned SchedClass = TID.getSchedClass(); - const InstrStage* IS = - ResourceTracker->getInstrItins()->beginStage(SchedClass); + auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass()); unsigned FuncUnits = IS->getUnits(); return !FuncUnits; } -// isSoloInstruction: - Returns true for instructions that must be -// scheduled in their own packet. -bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) { +bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) { if (MI->isEHLabel() || MI->isCFIInstruction()) return true; - if (MI->isInlineAsm()) + // Consider inline asm to not be a solo instruction by default. + // Inline asm will be put in a packet temporarily, but then it will be + // removed, and placed outside of the packet (before or after, depending + // on dependencies). This is to reduce the impact of inline asm as a + // "packet splitting" instruction. + if (MI->isInlineAsm() && !ScheduleInlineAsm) return true; // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints: // trap, pause, barrier, icinva, isync, and syncht are solo instructions. // They must not be grouped with other instructions in a packet. - if (IsSchedBarrier(MI)) + if (isSchedBarrier(MI)) + return true; + + if (HII->isSolo(MI)) + return true; + + if (MI->getOpcode() == Hexagon::A2_nop) return true; return false; } -// isLegalToPacketizeTogether: -// SUI is the current instruction that is out side of the current packet. -// SUJ is the current instruction inside the current packet against which that -// SUI will be packetized. -bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { - MachineInstr *I = SUI->getInstr(); - MachineInstr *J = SUJ->getInstr(); - assert(I && J && "Unable to packetize null instruction!"); - const MCInstrDesc &MCIDI = I->getDesc(); - const MCInstrDesc &MCIDJ = J->getDesc(); +// Quick check if instructions MI and MJ cannot coexist in the same packet. +// Limit the tests to be "one-way", e.g. "if MI->isBranch and MJ->isInlineAsm", +// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm". +// For full test call this function twice: +// cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI) +// Doing the test only one way saves the amount of code in this function, +// since every test would need to be repeated with the MI and MJ reversed. +static bool cannotCoexistAsymm(const MachineInstr *MI, const MachineInstr *MJ, + const HexagonInstrInfo &HII) { + const MachineFunction *MF = MI->getParent()->getParent(); + if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() && + HII.isHVXMemWithAIndirect(MI, MJ)) + return true; - MachineBasicBlock::iterator II = I; + // An inline asm cannot be together with a branch, because we may not be + // able to remove the asm out after packetizing (i.e. if the asm must be + // moved past the bundle). Similarly, two asms cannot be together to avoid + // complications when determining their relative order outside of a bundle. + if (MI->isInlineAsm()) + return MJ->isInlineAsm() || MJ->isBranch() || MJ->isBarrier() || + MJ->isCall() || MJ->isTerminator(); - const unsigned FrameSize = MF.getFrameInfo()->getStackSize(); - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + // "False" really means that the quick check failed to determine if + // I and J cannot coexist. 
+ return false; +} - // Inline asm cannot go in the packet. - if (I->getOpcode() == Hexagon::INLINEASM) - llvm_unreachable("Should not meet inline asm here!"); - if (isSoloInstruction(I)) - llvm_unreachable("Should not meet solo instr here!"); +// Full, symmetric check. +bool HexagonPacketizerList::cannotCoexist(const MachineInstr *MI, + const MachineInstr *MJ) { + return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII); +} - // A save callee-save register function call can only be in a packet - // with instructions that don't write to the callee-save registers. - if ((QII->isSaveCalleeSavedRegsCall(I) && - DoesModifyCalleeSavedReg(J, QRI)) || - (QII->isSaveCalleeSavedRegsCall(J) && - DoesModifyCalleeSavedReg(I, QRI))) { - Dependence = true; - return false; +void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) { + for (auto &B : MF) { + MachineBasicBlock::iterator BundleIt; + MachineBasicBlock::instr_iterator NextI; + for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) { + NextI = std::next(I); + MachineInstr *MI = &*I; + if (MI->isBundle()) + BundleIt = I; + if (!MI->isInsideBundle()) + continue; + + // Decide on where to insert the instruction that we are pulling out. + // Debug instructions always go before the bundle, but the placement of + // INLINE_ASM depends on potential dependencies. By default, try to + // put it before the bundle, but if the asm writes to a register that + // other instructions in the bundle read, then we need to place it + // after the bundle (to preserve the bundle semantics). + bool InsertBeforeBundle; + if (MI->isInlineAsm()) + InsertBeforeBundle = !hasWriteToReadDep(MI, BundleIt, HRI); + else if (MI->isDebugValue()) + InsertBeforeBundle = true; + else + continue; + + BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle); + } } +} - // Two control flow instructions cannot go in the same packet. - if (IsControlFlow(I) && IsControlFlow(J)) { - Dependence = true; - return false; +// Check if a given instruction is of class "system". +static bool isSystemInstr(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::Y2_barrier: + case Hexagon::Y2_dcfetchbo: + return true; } + return false; +} - // A LoopN instruction cannot appear in the same packet as a jump or call. - if (IsLoopN(I) && - (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) { - Dependence = true; +bool HexagonPacketizerList::hasDeadDependence(const MachineInstr *I, + const MachineInstr *J) { + // The dependence graph may not include edges between dead definitions, + // so without extra checks, we could end up packetizing two instruction + // defining the same (dead) register. 
+ if (I->isCall() || J->isCall()) return false; - } - if (IsLoopN(J) && - (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) { - Dependence = true; + if (HII->isPredicated(I) || HII->isPredicated(J)) return false; + + BitVector DeadDefs(Hexagon::NUM_TARGET_REGS); + for (auto &MO : I->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + DeadDefs[MO.getReg()] = true; } + for (auto &MO : J->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + unsigned R = MO.getReg(); + if (R != Hexagon::USR_OVF && DeadDefs[R]) + return true; + } + return false; +} + +bool HexagonPacketizerList::hasControlDependence(const MachineInstr *I, + const MachineInstr *J) { + // A save callee-save register function call can only be in a packet + // with instructions that don't write to the callee-save registers. + if ((HII->isSaveCalleeSavedRegsCall(I) && + doesModifyCalleeSavedReg(J, HRI)) || + (HII->isSaveCalleeSavedRegsCall(J) && + doesModifyCalleeSavedReg(I, HRI))) + return true; + + // Two control flow instructions cannot go in the same packet. + if (isControlFlow(I) && isControlFlow(J)) + return true; + + // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot + // contain a speculative indirect jump, + // a new-value compare jump or a dealloc_return. + auto isBadForLoopN = [this] (const MachineInstr *MI) -> bool { + if (MI->isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI)) + return true; + if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI)) + return true; + return false; + }; + + if (HII->isLoopN(I) && isBadForLoopN(J)) + return true; + if (HII->isLoopN(J) && isBadForLoopN(I)) + return true; + // dealloc_return cannot appear in the same packet as a conditional or // unconditional jump. - if (QII->isDeallocRet(I) && - (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) { - Dependence = true; - return false; + return HII->isDeallocRet(I) && + (J->isBranch() || J->isCall() || J->isBarrier()); +} + +bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr *I, + const MachineInstr *J) { + bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J); + bool StoreI = I->mayStore(), StoreJ = J->mayStore(); + if ((SysI && StoreJ) || (SysJ && StoreI)) + return true; + + if (StoreI && StoreJ) { + if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I)) + return true; + } else { + // A memop cannot be in the same packet with another memop or a store. + // Two stores can be together, but here I and J cannot both be stores. + bool MopStI = HII->isMemOp(I) || StoreI; + bool MopStJ = HII->isMemOp(J) || StoreJ; + if (MopStI && MopStJ) + return true; } + return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J)); +} - // V4 allows dual store. But does not allow second store, if the - // first store is not in SLOT0. New value store, new value jump, - // dealloc_return and memop always take SLOT0. - // Arch spec 3.4.4.2 - if (MCIDI.mayStore() && MCIDJ.mayStore() && - (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) { - Dependence = true; +// SUI is the current instruction that is out side of the current packet. +// SUJ is the current instruction inside the current packet against which that +// SUI will be packetized. 
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  // Clear IgnoreDepMIs when Packet starts.
+  if (CurrentPacketMIs.size() == 1)
+    IgnoreDepMIs.clear();
+
+  MachineBasicBlock::iterator II = I;
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  // Solo instructions cannot go in the packet.
+  assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+
+  if (cannotCoexist(I, J))
     return false;
-  }

-  if ((QII->isMemOp(J) && MCIDI.mayStore())
-      || (MCIDJ.mayStore() && QII->isMemOp(I))
-      || (QII->isMemOp(J) && QII->isMemOp(I))) {
-    Dependence = true;
+  Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J);
+  if (Dependence)
     return false;
-  }

-  //if dealloc_return
-  if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
-    Dependence = true;
+  // V4 allows dual stores. It does not allow a second store if the first
+  // store is not in SLOT0. New value store, new value jump, dealloc_return
+  // and memop always take SLOT0. Arch spec 3.4.4.2.
+  Dependence = hasV4SpecificDependence(I, J);
+  if (Dependence)
     return false;
-  }

   // If an instruction feeds a new-value jump, glue it.
   MachineBasicBlock::iterator NextMII = I;
   ++NextMII;
-  if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+  if (NextMII != I->getParent()->end() && HII->isNewValueJump(NextMII)) {
     MachineInstr *NextMI = NextMII;

     bool secondRegMatch = false;
-    bool maintainNewValueJump = false;
+    const MachineOperand &NOp0 = NextMI->getOperand(0);
+    const MachineOperand &NOp1 = NextMI->getOperand(1);

-    if (NextMI->getOperand(1).isReg() &&
-        I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+    if (NOp1.isReg() && I->getOperand(0).getReg() == NOp1.getReg())
       secondRegMatch = true;
-      maintainNewValueJump = true;
-    }
-
-    if (!secondRegMatch &&
-          I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
-      maintainNewValueJump = true;
-    }

-    for (std::vector<MachineInstr*>::iterator
-          VI = CurrentPacketMIs.begin(),
-           VE = CurrentPacketMIs.end();
-         (VI != VE && maintainNewValueJump); ++VI) {
-      SUnit *PacketSU = MIToSUnit.find(*VI)->second;
-
-      // NVJ can not be part of the dual jump - Arch Spec: section 7.8
-      if (PacketSU->getInstr()->getDesc().isCall()) {
+    for (auto I : CurrentPacketMIs) {
+      SUnit *PacketSU = MIToSUnit.find(I)->second;
+      MachineInstr *PI = PacketSU->getInstr();
+      // NVJ cannot be part of the dual jump - Arch Spec: section 7.8.
+      if (PI->isCall()) {
         Dependence = true;
         break;
       }
-      // Validate
+      // Validate:
       // 1. Packet does not have a store in it.
       // 2. If the first operand of the nvj is newified, and the second
       //    operand is also a reg, it (second reg) is not defined in
@@ -1108,302 +1184,413 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // 3. If the second operand of the nvj is newified, (which means
       //    first operand is also a reg), first reg is not defined in
       //    the same packet.
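Rules #1-#3 above amount to one linear scan over the current packet. A
compact model of that scan, with hypothetical flags standing in for the
MachineInstr queries (store/allocframe detection and register defs):

    #include <vector>

    struct PacketMI { bool MayStore; bool IsAllocframe; unsigned DefReg; };

    // StillLiveReg is the nvj operand that must not be defined in-packet
    // (rule #2 or #3, depending on which operand was newified).
    bool canGlueNewValueJump(const std::vector<PacketMI> &Packet,
                             unsigned StillLiveReg) {
      for (const PacketMI &P : Packet) {
        if (P.MayStore || P.IsAllocframe)   // rule #1
          return false;
        if (P.DefReg == StillLiveReg)       // rules #2/#3
          return false;
      }
      return true;
    }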
-      if (PacketSU->getInstr()->getDesc().mayStore() ||
-          PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
-          // Check #2.
-          (!secondRegMatch && NextMI->getOperand(1).isReg() &&
-           PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(1).getReg(), QRI)) ||
-          // Check #3.
-          (secondRegMatch &&
-           PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(0).getReg(), QRI))) {
+      if (PI->getOpcode() == Hexagon::S2_allocframe || PI->mayStore() ||
+          HII->isLoopN(PI)) {
+        Dependence = true;
+        break;
+      }
+      // Check #2/#3.
+      const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1;
+      if (OpR.isReg() && PI->modifiesRegister(OpR.getReg(), HRI)) {
         Dependence = true;
         break;
       }
     }
-    if (!Dependence)
-      GlueToNewValueJump = true;
-    else
+
+    if (Dependence)
       return false;
+    GlueToNewValueJump = true;
   }

-  if (SUJ->isSucc(SUI)) {
-    for (unsigned i = 0;
-         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
-         ++i) {
+  // There is no dependency between a prolog instruction and its successor.
+  if (!SUJ->isSucc(SUI))
+    return true;

-      if (SUJ->Succs[i].getSUnit() != SUI) {
-        continue;
-      }
+  for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
+    if (FoundSequentialDependence)
+      break;

-      SDep::Kind DepType = SUJ->Succs[i].getKind();
+    if (SUJ->Succs[i].getSUnit() != SUI)
+      continue;

-      // For direct calls:
-      // Ignore register dependences for call instructions for
-      // packetization purposes except for those due to r31 and
-      // predicate registers.
-      //
-      // For indirect calls:
-      // Same as direct calls + check for true dependences to the register
-      // used in the indirect call.
-      //
-      // We completely ignore Order dependences for call instructions
-      //
-      // For returns:
-      // Ignore register dependences for return instructions like jumpr,
-      // dealloc return unless we have dependencies on the explicit uses
-      // of the registers used by jumpr (like r31) or dealloc return
-      // (like r29 or r30).
-      //
-      // TODO: Currently, jumpr is handling only return of r31. So, the
-      // following logic (specificaly IsCallDependent) is working fine.
-      // We need to enable jumpr for register other than r31 and then,
-      // we need to rework the last part, where it handles indirect call
-      // of that (IsCallDependent) function. Bug 6216 is opened for this.
-      //
-      unsigned DepReg = 0;
-      const TargetRegisterClass* RC = nullptr;
-      if (DepType == SDep::Data) {
-        DepReg = SUJ->Succs[i].getReg();
-        RC = QRI->getMinimalPhysRegClass(DepReg);
-      }

-      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
-          (!IsRegDependence(DepType) ||
-            !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
-        /* do nothing */
-      }
+    SDep::Kind DepType = SUJ->Succs[i].getKind();
+    // For direct calls:
+    // Ignore register dependences for call instructions for packetization
+    // purposes except for those due to r31 and predicate registers.
+    //
+    // For indirect calls:
+    // Same as direct calls + check for true dependences to the register
+    // used in the indirect call.
+    //
+    // We completely ignore Order dependences for call instructions.
+    //
+    // For returns:
+    // Ignore register dependences for return instructions like jumpr,
+    // dealloc return unless we have dependencies on the explicit uses
+    // of the registers used by jumpr (like r31) or dealloc return
+    // (like r29 or r30).
+    //
+    // TODO: Currently, jumpr handles only the return of r31. So, the
+    // following logic (specifically isCallDependent) is working fine.
+    // We need to enable jumpr for registers other than r31 and then,
+    // we need to rework the last part, where it handles the indirect call
+    // of that (isCallDependent) function. Bug 6216 is opened for this.
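As a rough model of the filter just described: for calls and returns only
register dependences matter, and of those only the ones on the link
register or a predicate register. The register ids below are placeholders,
not Hexagon's actual register numbering:

    enum DepKind { Data, Anti, Output, Order };

    bool callDepMatters(DepKind K, unsigned Reg) {
      if (K == Order)                  // order deps on calls are ignored
        return false;
      const unsigned LinkReg = 31;     // r31 (placeholder id)
      const unsigned FirstPred = 60;   // p0..p3 (placeholder ids)
      const unsigned LastPred = 63;
      return Reg == LinkReg || (Reg >= FirstPred && Reg <= LastPred);
    }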
+    unsigned DepReg = 0;
+    const TargetRegisterClass *RC = nullptr;
+    if (DepType == SDep::Data) {
+      DepReg = SUJ->Succs[i].getReg();
+      RC = HRI->getMinimalPhysRegClass(DepReg);
+    }

-      // For instructions that can be promoted to dot-new, try to promote.
-      else if ((DepType == SDep::Data) &&
-               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
-               PromoteToDotNew(I, DepType, II, RC)) {
-        PromotedToDotNew = true;
-        /* do nothing */
-      }
+    if (I->isCall() || I->isReturn()) {
+      if (!isRegDependence(DepType))
+        continue;
+      if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
+        continue;
+    }

-      else if ((DepType == SDep::Data) &&
-               (QII->isNewValueJump(I))) {
-        /* do nothing */
-      }
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotCur(J, SUJ, DepReg, II, RC))
+        if (promoteToDotCur(J, DepType, II, RC))
+          continue;
+    }

-      // For predicated instructions, if the predicates are complements
-      // then there can be no dependence.
-      else if (QII->isPredicated(I) &&
-               QII->isPredicated(J) &&
-               ArePredicatesComplements(I, J, MIToSUnit)) {
-        /* do nothing */
+    // Data dependence is fine if we have a load.cur.
+    if (DepType == SDep::Data && HII->isDotCurInst(J)) {
+      if (HII->isV60VectorInstruction(I))
+        continue;
+    }

+    // For instructions that can be promoted to dot-new, try to promote.
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
+        if (promoteToDotNew(I, DepType, II, RC)) {
+          PromotedToDotNew = true;
+          continue;
+        }
       }
-      else if (IsDirectJump(I) &&
-               !MCIDJ.isBranch() &&
-               !MCIDJ.isCall() &&
-               (DepType == SDep::Order)) {
-        // Ignore Order dependences between unconditional direct branches
-        // and non-control-flow instructions
-        /* do nothing */
-      }
-      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
-               (DepType != SDep::Output)) {
-        // Ignore all dependences for jumps except for true and output
-        // dependences
-        /* do nothing */
-      }
-
-      // Ignore output dependences due to superregs. We can
-      // write to two different subregisters of R1:0 for instance
-      // in the same cycle
-      //
+      if (HII->isNewValueJump(I))
+        continue;
+    }

+    // For predicated instructions, if the predicates are complements then
+    // there can be no dependence.
+    if (HII->isPredicated(I) && HII->isPredicated(J) &&
+        arePredicatesComplements(I, J)) {
+      // Not always safe to do this translation.
+      // DAG Builder attempts to reduce dependence edges using transitive
+      // nature of dependencies. Here is an example:
       //
-      // Let the
-      // If neither I nor J defines DepReg, then this is a
-      // superfluous output dependence. The dependence must be of the
-      // form:
-      //  R0 = ...
-      //  R1 = ...
-      // and there is an output dependence between the two instructions
-      // with
-      // DepReg = D0
-      // We want to ignore these dependences.
-      // Ideally, the dependence constructor should annotate such
-      // dependences. We can then avoid this relatively expensive check.
+      //   r0 = tfr_pt ... (1)
+      //   r0 = tfr_pf ... (2)
+      //   r0 = tfr_pt ... (3)
       //
-      else if (DepType == SDep::Output) {
-        // DepReg is the register that's responsible for the dependence.
-        unsigned DepReg = SUJ->Succs[i].getReg();
+      // There will be an output dependence between (1)->(2) and (2)->(3).
+      // However, there is no dependence edge between (1)->(3). This results
+      // in all 3 instructions going in the same packet. We ignore the
+      // dependence only once to avoid this situation.
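The ignore-once policy is what the IgnoreDepMIs list implements in the code
that follows: the first waiver records I, and finding J already on the list
forces a real dependence. A minimal sketch of just that bookkeeping, with
opaque pointers standing in for MachineInstr:

    #include <algorithm>
    #include <vector>

    struct WaiverList {
      std::vector<const void *> IgnoreDepMIs;

      // Returns true if the complement-predicate waiver may be applied to
      // the pair (I, J); false means the dependence must be kept.
      bool waiveOnce(const void *I, const void *J) {
        if (std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J) !=
            IgnoreDepMIs.end())
          return false;
        IgnoreDepMIs.push_back(I);
        return true;
      }
    };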
+      auto Itr = std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J);
+      if (Itr != IgnoreDepMIs.end()) {
+        Dependence = true;
+        return false;
+      }
+      IgnoreDepMIs.push_back(I);
+      continue;
+    }
+
+    // Ignore Order dependences between unconditional direct branches
+    // and non-control-flow instructions.
+    if (isDirectJump(I) && !J->isBranch() && !J->isCall() &&
+        DepType == SDep::Order)
+      continue;
+
+    // Ignore all dependences for jumps except for true and output
+    // dependences.
+    if (I->isConditionalBranch() && DepType != SDep::Data &&
+        DepType != SDep::Output)
+      continue;
+
+    // Ignore output dependences due to superregs. We can write to two
+    // different subregisters of R1:0 for instance in the same cycle.
+
+    // If neither I nor J defines DepReg, then this is a superfluous output
+    // dependence. The dependence must be of the form:
+    //   R0 = ...
+    //   R1 = ...
+    // and there is an output dependence between the two instructions with
+    //   DepReg = D0.
+    // We want to ignore these dependences. Ideally, the dependence
+    // constructor should annotate such dependences. We can then avoid this
+    // relatively expensive check.
+    //
+    if (DepType == SDep::Output) {
+      // DepReg is the register that's responsible for the dependence.
+      unsigned DepReg = SUJ->Succs[i].getReg();
+
+      // Check if I and J really define DepReg.
+      if (!I->definesRegister(DepReg) && !J->definesRegister(DepReg))
+        continue;
+      FoundSequentialDependence = true;
+      break;
+    }

-      // Check if I and J really defines DepReg.
-      if (I->definesRegister(DepReg) ||
-          J->definesRegister(DepReg)) {
+    // For Order dependences:
+    // 1. On V4 or later, volatile loads/stores can be packetized together,
+    //    unless other rules prevent it.
+    // 2. Store followed by a load is not allowed.
+    // 3. Store followed by a store is only valid on V4 or later.
+    // 4. Load followed by any memory operation is allowed.
+    if (DepType == SDep::Order) {
+      if (!PacketizeVolatiles) {
+        bool OrdRefs = I->hasOrderedMemoryRef() || J->hasOrderedMemoryRef();
+        if (OrdRefs) {
          FoundSequentialDependence = true;
          break;
        }
      }
-
-      // We ignore Order dependences for
-      // 1. Two loads unless they are volatile.
-      // 2. Two stores in V4 unless they are volatile.
-      else if ((DepType == SDep::Order) &&
-               !I->hasOrderedMemoryRef() &&
-               !J->hasOrderedMemoryRef()) {
-        if (MCIDI.mayStore() && MCIDJ.mayStore()) {
-          /* do nothing */
-        }
-        // store followed by store-- not OK on V2
-        // store followed by load -- not OK on all (OK if addresses
-        // are not aliased)
-        // load followed by store -- OK on all
-        // load followed by load  -- OK on all
-        else if ( !MCIDJ.mayStore()) {
-          /* do nothing */
-        }
-        else {
+      // J is first, I is second.
+      bool LoadJ = J->mayLoad(), StoreJ = J->mayStore();
+      bool LoadI = I->mayLoad(), StoreI = I->mayStore();
+      if (StoreJ) {
+        // Two stores are only allowed on V4+. Load following store is never
+        // allowed.
+        if (LoadI) {
          FoundSequentialDependence = true;
          break;
        }
-      }
-
-      // For V4, special case ALLOCFRAME. Even though there is dependency
-      // between ALLOCFRAME and subsequent store, allow it to be
-      // packetized in a same packet. This implies that the store is using
-      // caller's SP. Hence, offset needs to be updated accordingly.
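The load/store ordering rules earlier in this hunk collapse into a small
predicate over the (J, I) pair, J being the instruction already in the
packet. A sketch of that table, assuming volatile/ordered references were
already rejected as shown above:

    // Returns true if the order dependence must be respected.
    bool orderDepBlocks(bool LoadJ, bool StoreJ, bool LoadI, bool StoreI) {
      if (StoreJ)
        return LoadI;              // store -> load is never allowed
      if (!LoadJ)
        return true;               // J neither load nor store: assume dep
      return !LoadI && !StoreI;    // load -> non-memory op: assume dep
    }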
-      else if (DepType == SDep::Data
-               && J->getOpcode() == Hexagon::S2_allocframe
-               && (I->getOpcode() == Hexagon::S2_storerd_io
-                   || I->getOpcode() == Hexagon::S2_storeri_io
-                   || I->getOpcode() == Hexagon::S2_storerb_io)
-               && I->getOperand(0).getReg() == QRI->getStackRegister()
-               && QII->isValidOffset(I->getOpcode(),
-                                     I->getOperand(1).getImm() -
-                                     (FrameSize + HEXAGON_LRFP_SIZE)))
-      {
-        GlueAllocframeStore = true;
-        // Since this store is to be glued with allocframe in the same
-        // packet, it will use SP of the previous stack frame, i.e
-        // caller's SP. Therefore, we need to recalculate offset according
-        // to this change.
-        I->getOperand(1).setImm(I->getOperand(1).getImm() -
-                                (FrameSize + HEXAGON_LRFP_SIZE));
-      }
-
-      //
-      // Skip over anti-dependences. Two instructions that are
-      // anti-dependent can share a packet
-      //
-      else if (DepType != SDep::Anti) {
+      } else if (!LoadJ || (!LoadI && !StoreI)) {
+        // If J is neither load nor store, assume a dependency.
+        // If J is a load, but I is neither, also assume a dependency.
        FoundSequentialDependence = true;
        break;
      }
+      // Store followed by store: not OK on V2.
+      // Store followed by load: not OK on all.
+      // Load followed by store: OK on all.
+      // Load followed by load: OK on all.
+      continue;
    }

-    if (FoundSequentialDependence) {
-      Dependence = true;
-      return false;
+    // For V4, special case ALLOCFRAME. Even though there is dependency
+    // between ALLOCFRAME and subsequent store, allow it to be packetized
+    // in the same packet. This implies that the store is using the caller's
+    // SP. Hence, offset needs to be updated accordingly.
+    if (DepType == SDep::Data && J->getOpcode() == Hexagon::S2_allocframe) {
+      unsigned Opc = I->getOpcode();
+      switch (Opc) {
+      case Hexagon::S2_storerd_io:
+      case Hexagon::S2_storeri_io:
+      case Hexagon::S2_storerh_io:
+      case Hexagon::S2_storerb_io:
+        if (I->getOperand(0).getReg() == HRI->getStackRegister()) {
+          int64_t Imm = I->getOperand(1).getImm();
+          int64_t NewOff = Imm - (FrameSize + HEXAGON_LRFP_SIZE);
+          if (HII->isValidOffset(Opc, NewOff)) {
+            GlueAllocframeStore = true;
+            // Since this store is to be glued with allocframe in the same
+            // packet, it will use SP of the previous stack frame, i.e.
+            // caller's SP. Therefore, we need to recalculate offset
+            // according to this change.
+            I->getOperand(1).setImm(NewOff);
+            continue;
+          }
+        }
+      default:
+        break;
+      }
+    }
+
+    // Skip over anti-dependences. Two instructions that are anti-dependent
+    // can share a packet.
+    if (DepType != SDep::Anti) {
+      FoundSequentialDependence = true;
+      break;
    }
  }

+  if (FoundSequentialDependence) {
+    Dependence = true;
+    return false;
+  }
+
  return true;
}

-// isLegalToPruneDependencies
 bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI,
                                                        SUnit *SUJ) {
   MachineInstr *I = SUI->getInstr();
-  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
-
-  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
-
-  if (Dependence) {
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");

-    // Check if the instruction was promoted to a dot-new. If so, demote it
-    // back into a dot-old.
-    if (PromotedToDotNew) {
-      DemoteToDotOld(I);
-    }
+  if (cannotCoexist(I, J))
+    return false;

-    // Check if the instruction (must be a store) was glued with an Allocframe
-    // instruction. If so, restore its offset to its original value, i.e. use
-    // curent SP instead of caller's SP.
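The gluing rewrite above and the restore path being reworked here are exact
inverses: a store glued with allocframe executes against the caller's SP, so
its offset is rebased by the new frame size plus the saved LR/FP pair, and
rebased back if the gluing is abandoned. A minimal sketch of that
arithmetic (LRFPSize stands in for HEXAGON_LRFP_SIZE, assumed to be 8
here, one 4-byte LR plus one 4-byte FP):

    #include <cstdint>

    constexpr int64_t LRFPSize = 8;

    int64_t glueToAllocframe(int64_t Off, int64_t FrameSize) {
      return Off - (FrameSize + LRFPSize);   // now relative to caller's SP
    }

    int64_t unglueFromAllocframe(int64_t Off, int64_t FrameSize) {
      return Off + (FrameSize + LRFPSize);   // back to callee-frame offset
    }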
- if (GlueAllocframeStore) { - I->getOperand(1).setImm(I->getOperand(1).getImm() + - FrameSize + HEXAGON_LRFP_SIZE); - } + if (!Dependence) + return true; - return false; + // Check if the instruction was promoted to a dot-new. If so, demote it + // back into a dot-old. + if (PromotedToDotNew) + demoteToDotOld(I); + + cleanUpDotCur(); + // Check if the instruction (must be a store) was glued with an allocframe + // instruction. If so, restore its offset to its original value, i.e. use + // current SP instead of caller's SP. + if (GlueAllocframeStore) { + unsigned FrameSize = MF.getFrameInfo()->getStackSize(); + MachineOperand &MOff = I->getOperand(1); + MOff.setImm(MOff.getImm() + FrameSize + HEXAGON_LRFP_SIZE); } - return true; + return false; } + MachineBasicBlock::iterator HexagonPacketizerList::addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator MII = MI; + MachineBasicBlock *MBB = MI->getParent(); + if (MI->isImplicitDef()) { + unsigned R = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(R)) { + MCSuperRegIterator S(R, HRI, false); + MI->addOperand(MachineOperand::CreateReg(*S, true, true)); + } + return MII; + } + assert(ResourceTracker->canReserveResources(MI)); + + bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI); + bool Good = true; + + if (GlueToNewValueJump) { + MachineInstr *NvjMI = ++MII; + // We need to put both instructions in the same packet: MI and NvjMI. + // Either of them can require a constant extender. Try to add both to + // the current packet, and if that fails, end the packet and start a + // new one. + ResourceTracker->reserveResources(MI); + if (ExtMI) + Good = tryAllocateResourcesForConstExt(true); + + bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI); + if (Good) { + if (ResourceTracker->canReserveResources(NvjMI)) + ResourceTracker->reserveResources(NvjMI); + else + Good = false; + } + if (Good && ExtNvjMI) + Good = tryAllocateResourcesForConstExt(true); - MachineBasicBlock::iterator MII = MI; - MachineBasicBlock *MBB = MI->getParent(); - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - - if (GlueToNewValueJump) { - - ++MII; - MachineInstr *nvjMI = MII; + if (!Good) { + endPacket(MBB, MI); assert(ResourceTracker->canReserveResources(MI)); ResourceTracker->reserveResources(MI); - if ((QII->isExtended(MI) || QII->isConstExtended(MI)) && - !tryAllocateResourcesForConstExt(MI)) { - endPacket(MBB, MI); - ResourceTracker->reserveResources(MI); - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(MI); - // Reserve resources for new value jump constant extender. - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(nvjMI); - assert(ResourceTracker->canReserveResources(nvjMI) && - "Ensure that there is a slot"); - - } else if ( // Extended instruction takes two slots in the packet. - // Try reserve and allocate 4-byte in the current packet first. - (QII->isExtended(nvjMI) - && (!tryAllocateResourcesForConstExt(nvjMI) - || !ResourceTracker->canReserveResources(nvjMI))) - || // For non-extended instruction, no need to allocate extra 4 bytes. - (!QII->isExtended(nvjMI) && - !ResourceTracker->canReserveResources(nvjMI))) - { - endPacket(MBB, MI); - // A new and empty packet starts. - // We are sure that the resources requirements can be satisfied. - // Therefore, do not need to call "canReserveResources" anymore. 
- ResourceTracker->reserveResources(MI); - if (QII->isExtended(nvjMI)) - reserveResourcesForConstExt(nvjMI); + if (ExtMI) { + assert(canReserveResourcesForConstExt()); + tryAllocateResourcesForConstExt(true); } - // Here, we are sure that "reserveResources" would succeed. - ResourceTracker->reserveResources(nvjMI); - CurrentPacketMIs.push_back(MI); - CurrentPacketMIs.push_back(nvjMI); - } else { - if ( (QII->isExtended(MI) || QII->isConstExtended(MI)) - && ( !tryAllocateResourcesForConstExt(MI) - || !ResourceTracker->canReserveResources(MI))) - { - endPacket(MBB, MI); - // Check if the instruction was promoted to a dot-new. If so, demote it - // back into a dot-old - if (PromotedToDotNew) { - DemoteToDotOld(MI); - } - reserveResourcesForConstExt(MI); + assert(ResourceTracker->canReserveResources(NvjMI)); + ResourceTracker->reserveResources(NvjMI); + if (ExtNvjMI) { + assert(canReserveResourcesForConstExt()); + reserveResourcesForConstExt(); } - // In case that "MI" is not an extended insn, - // the resource availability has already been checked. - ResourceTracker->reserveResources(MI); - CurrentPacketMIs.push_back(MI); } + CurrentPacketMIs.push_back(MI); + CurrentPacketMIs.push_back(NvjMI); return MII; + } + + ResourceTracker->reserveResources(MI); + if (ExtMI && !tryAllocateResourcesForConstExt(true)) { + endPacket(MBB, MI); + if (PromotedToDotNew) + demoteToDotOld(MI); + ResourceTracker->reserveResources(MI); + reserveResourcesForConstExt(); + } + + CurrentPacketMIs.push_back(MI); + return MII; +} + +void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB, + MachineInstr *MI) { + OldPacketMIs = CurrentPacketMIs; + VLIWPacketizerList::endPacket(MBB, MI); +} + +bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr *MI) { + return !producesStall(MI); +} + + +// Return true when ConsMI uses a register defined by ProdMI. +static bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) { + if (!ProdMI->getOperand(0).isReg()) + return false; + unsigned DstReg = ProdMI->getOperand(0).getReg(); + + for (auto &Op : ConsMI->operands()) + if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg) + // The MIs depend on each other. + return true; + + return false; } +// V60 forward scheduling. +bool HexagonPacketizerList::producesStall(const MachineInstr *I) { + // Check whether the previous packet is in a different loop. If this is the + // case, there is little point in trying to avoid a stall because that would + // favor the rare case (loop entry) over the common case (loop iteration). + // + // TODO: We should really be able to check all the incoming edges if this is + // the first packet in a basic block, so we can avoid stalls from the loop + // backedge. + if (!OldPacketMIs.empty()) { + auto *OldBB = OldPacketMIs.front()->getParent(); + auto *ThisBB = I->getParent(); + if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB)) + return false; + } + + // Check for stall between two vector instructions. + if (HII->isV60VectorInstruction(I)) { + for (auto J : OldPacketMIs) { + if (!HII->isV60VectorInstruction(J)) + continue; + if (isDependent(J, I) && !HII->isVecUsableNextPacket(J, I)) + return true; + } + return false; + } + + // Check for stall between two scalar instructions. First, check that + // there is no definition of a use in the current packet, because it + // may be a candidate for .new. 
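That ordering is the point of the scalar half of producesStall(): a def
inside the current packet suppresses the stall report, since the use may
still become a .new form, and only then is the previous packet consulted.
A compact model of the loop below, with caller-supplied predicates standing
in for the HexagonInstrInfo queries:

    #include <vector>

    template <typename Instr, typename DepFn, typename BundleFn>
    bool scalarStall(const Instr &I, const std::vector<Instr> &CurPacket,
                     const std::vector<Instr> &OldPacket, DepFn isDependent,
                     BundleFn canExecuteInBundle) {
      for (const Instr &J : CurPacket)
        if (isDependent(J, I))
          return false;          // potential .new use: no stall reported
      for (const Instr &J : OldPacket)
        if (isDependent(J, I) && !canExecuteInBundle(J, I))
          return true;           // cross-packet feed that would stall
      return false;
    }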
+  for (auto J : CurrentPacketMIs)
+    if (!HII->isV60VectorInstruction(J) && isDependent(J, I))
+      return false;
+
+  // Check for stall between I and instructions in the previous packet.
+  if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
+    for (auto J : OldPacketMIs) {
+      if (HII->isV60VectorInstruction(J))
+        continue;
+      if (!HII->isLateInstrFeedsEarlyInstr(J, I))
+        continue;
+      if (isDependent(J, I) && !HII->canExecuteInBundle(J, I))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+
 //===----------------------------------------------------------------------===//
 //                         Public Constructor Functions
 //===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
new file mode 100644
index 0000000..960cf6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -0,0 +1,114 @@
+#ifndef HEXAGONVLIWPACKETIZER_H
+#define HEXAGONVLIWPACKETIZER_H
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+namespace llvm {
+class HexagonPacketizerList : public VLIWPacketizerList {
+  // Vector of instructions assigned to the packet that has just been created.
+  std::vector<MachineInstr*> OldPacketMIs;
+
+  // Has the instruction been promoted to a dot-new instruction.
+  bool PromotedToDotNew;
+
+  // Has the instruction been glued to allocframe.
+  bool GlueAllocframeStore;
+
+  // Has the feeder instruction been glued to new value jump.
+  bool GlueToNewValueJump;
+
+  // Check if there is a dependence between some instruction already in this
+  // packet and this instruction.
+  bool Dependence;
+
+  // Only check for dependence if there are resources available to
+  // schedule this instruction.
+  bool FoundSequentialDependence;
+
+  // Track MIs with ignored dependence.
+  std::vector<MachineInstr*> IgnoreDepMIs;
+
+protected:
+  /// \brief A handle to the branch probability pass.
+  const MachineBranchProbabilityInfo *MBPI;
+  const MachineLoopInfo *MLI;
+
+private:
+  const HexagonInstrInfo *HII;
+  const HexagonRegisterInfo *HRI;
+
+public:
+  // Ctor.
+  HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                        AliasAnalysis *AA,
+                        const MachineBranchProbabilityInfo *MBPI);
+
+  // initPacketizerState - initialize some internal flags.
+  void initPacketizerState() override;
+
+  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+  bool ignorePseudoInstruction(const MachineInstr *MI,
+                               const MachineBasicBlock *MBB) override;
+
+  // isSoloInstruction - return true if instruction MI cannot be packetized
+  // with any other instruction, which means that MI itself is a packet.
+  bool isSoloInstruction(const MachineInstr *MI) override;
+
+  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+  // together.
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+  // isLegalToPruneDependencies - Is it legal to prune dependence between SUI
+  // and SUJ.
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; + void endPacket(MachineBasicBlock *MBB, MachineInstr *MI) override; + bool shouldAddToPacket(const MachineInstr *MI) override; + + void unpacketizeSoloInstrs(MachineFunction &MF); + +protected: + bool isCallDependent(const MachineInstr* MI, SDep::Kind DepType, + unsigned DepReg); + bool promoteToDotCur(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotCur(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + void cleanUpDotCur(); + + bool promoteToDotNew(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotNew(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToNewValue(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII); + bool canPromoteToNewValueStore(const MachineInstr* MI, + const MachineInstr* PacketMI, unsigned DepReg); + bool demoteToDotOld(MachineInstr* MI); + bool arePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2); + bool restrictingDepExistInPacket(MachineInstr*, unsigned); + bool isNewifiable(const MachineInstr *MI); + bool isCurifiable(MachineInstr* MI); + bool cannotCoexist(const MachineInstr *MI, const MachineInstr *MJ); + inline bool isPromotedToDotNew() const { + return PromotedToDotNew; + } + bool tryAllocateResourcesForConstExt(bool Reserve); + bool canReserveResourcesForConstExt(); + void reserveResourcesForConstExt(); + bool hasDeadDependence(const MachineInstr *I, const MachineInstr *J); + bool hasControlDependence(const MachineInstr *I, const MachineInstr *J); + bool hasV4SpecificDependence(const MachineInstr *I, const MachineInstr *J); + bool producesStall(const MachineInstr *MI); +}; +} // namespace llvm +#endif // HEXAGONVLIWPACKETIZER_H + diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 99ea2fa..b73af82 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,7 +13,9 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" @@ -33,14 +35,28 @@ class HexagonAsmBackend : public MCAsmBackend { mutable uint64_t relaxedCnt; std::unique_ptr <MCInstrInfo> MCII; std::unique_ptr <MCInst *> RelaxTarget; + MCInst * Extender; public: HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) : - OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *){} + OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *), + Extender(nullptr) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createHexagonELFObjectWriter(OS, OSABI, CPU); } + void setExtender(MCContext &Context) const { + if (Extender == nullptr) + const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst; + } + + MCInst *takeExtender() 
const {
+    assert(Extender != nullptr);
+    MCInst * Result = Extender;
+    const_cast<HexagonAsmBackend *>(this)->Extender = nullptr;
+    return Result;
+  }
+
   unsigned getNumFixupKinds() const override {
     return Hexagon::NumTargetFixupKinds;
   }
@@ -222,6 +238,7 @@ public:
         if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
           ++relaxedCnt;
           *RelaxTarget = &MCI;
+          setExtender(Layout.getAssembler().getContext());
           return true;
         } else {
           return false;
@@ -262,6 +279,7 @@ public:
           if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
             ++relaxedCnt;
             *RelaxTarget = &MCI;
+            setExtender(Layout.getAssembler().getContext());
             return true;
           }
         }
@@ -276,9 +294,35 @@ public:
     llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
   }

-  void relaxInstruction(MCInst const & /*Inst*/,
-                        MCInst & /*Res*/) const override {
-    llvm_unreachable("relaxInstruction() unimplemented");
+  void relaxInstruction(MCInst const & Inst,
+                        MCInst & Res) const override {
+    assert(HexagonMCInstrInfo::isBundle(Inst) &&
+           "Hexagon relaxInstruction only works on bundles");
+
+    Res = HexagonMCInstrInfo::createBundle();
+    // Copy the results into the bundle.
+    bool Update = false;
+    for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+      MCInst &CrntHMI = const_cast<MCInst &>(*I.getInst());
+
+      // If an immediate extender is needed, add it in.
+      if (*RelaxTarget == &CrntHMI) {
+        Update = true;
+        assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) &&
+               "No room to insert extender for relaxation");
+
+        MCInst *HMIx = takeExtender();
+        *HMIx = HexagonMCInstrInfo::deriveExtender(
+            *MCII, CrntHMI,
+            HexagonMCInstrInfo::getExtendableOperand(*MCII, CrntHMI));
+        Res.addOperand(MCOperand::createInst(HMIx));
+        *RelaxTarget = nullptr;
+      }
+      // Now copy over the original instruction (the one we may have
+      // extended).
+      Res.addOperand(MCOperand::createInst(I.getInst()));
+    }
+    (void)Update;
+    assert(Update && "Didn't find relaxation target");
   }

   bool writeNopData(uint64_t Count,
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f4d162c..47a6f86 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -44,6 +44,25 @@ namespace HexagonII {
     TypeMEMOP = 9,
     TypeNV = 10,
     TypeDUPLEX = 11,
+    TypeCOMPOUND = 12,
+    TypeCVI_FIRST = 13,
+    TypeCVI_VA = TypeCVI_FIRST,
+    TypeCVI_VA_DV = 14,
+    TypeCVI_VX = 15,
+    TypeCVI_VX_DV = 16,
+    TypeCVI_VP = 17,
+    TypeCVI_VP_VS = 18,
+    TypeCVI_VS = 19,
+    TypeCVI_VINLANESAT= 20,
+    TypeCVI_VM_LD = 21,
+    TypeCVI_VM_TMP_LD = 22,
+    TypeCVI_VM_CUR_LD = 23,
+    TypeCVI_VM_VP_LDU = 24,
+    TypeCVI_VM_ST = 25,
+    TypeCVI_VM_NEW_ST = 26,
+    TypeCVI_VM_STU = 27,
+    TypeCVI_HIST = 28,
+    TypeCVI_LAST = TypeCVI_HIST,
     TypePREFIX = 30, // Such as extenders.
     TypeENDLOOP = 31 // Such as end of a HW loop.
   };
@@ -71,12 +90,16 @@ namespace HexagonII {
     PostInc = 6 // Post increment addressing mode
   };

+  // MemAccessSize is represented as 1+log2(N) where N is the size in bytes.
   enum class MemAccessSize {
     NoMemAccess = 0, // Not a memory access instruction.
     ByteAccess = 1, // Byte access instruction (memb).
     HalfWordAccess = 2, // Half word access instruction (memh).
     WordAccess = 3, // Word access instruction (memw).
-    DoubleWordAccess = 4 // Double word access instruction (memd)
+    DoubleWordAccess = 4, // Double word access instruction (memd)
+    // 5, // We do not have a 16 byte vector access.
+    Vector64Access = 7, // 64 Byte vector access instruction (vmem).
+    Vector128Access = 8 // 128 Byte vector access instruction (vmem).
   };

   // MCInstrDesc TSFlags
@@ -156,7 +179,7 @@ namespace HexagonII {
     AddrModeMask = 0x7,
     // Access size for load/store instructions.
     MemAccessSizePos = 43,
-    MemAccesSizeMask = 0x7,
+    MemAccesSizeMask = 0xf,

     // Branch predicted taken.
     TakenPos = 47,
@@ -164,7 +187,23 @@ namespace HexagonII {

     // Floating-point instructions.
     FPPos = 48,
-    FPMask = 0x1
+    FPMask = 0x1,
+
+    // New-Value producer-2 instructions.
+    hasNewValuePos2 = 50,
+    hasNewValueMask2 = 0x1,
+
+    // Which operand consumes or produces a new value.
+    NewValueOpPos2 = 51,
+    NewValueOpMask2 = 0x7,
+
+    // Accumulator instructions.
+    AccumulatorPos = 54,
+    AccumulatorMask = 0x1,
+
+    // Complex XU, prevent xu competition by preferring slot3
+    PrefersSlot3Pos = 55,
+    PrefersSlot3Mask = 0x1,
   };

   // *** The code above must match HexagonInstrFormat*.td *** //
@@ -219,6 +258,26 @@ namespace HexagonII {
     INST_PARSE_EXTENDER = 0x00000000
   };

+  enum InstIClassBits : unsigned {
+    INST_ICLASS_MASK     = 0xf0000000,
+    INST_ICLASS_EXTENDER = 0x00000000,
+    INST_ICLASS_J_1      = 0x10000000,
+    INST_ICLASS_J_2      = 0x20000000,
+    INST_ICLASS_LD_ST_1  = 0x30000000,
+    INST_ICLASS_LD_ST_2  = 0x40000000,
+    INST_ICLASS_J_3      = 0x50000000,
+    INST_ICLASS_CR       = 0x60000000,
+    INST_ICLASS_ALU32_1  = 0x70000000,
+    INST_ICLASS_XTYPE_1  = 0x80000000,
+    INST_ICLASS_LD       = 0x90000000,
+    INST_ICLASS_ST       = 0xa0000000,
+    INST_ICLASS_ALU32_2  = 0xb0000000,
+    INST_ICLASS_XTYPE_2  = 0xc0000000,
+    INST_ICLASS_XTYPE_3  = 0xd0000000,
+    INST_ICLASS_XTYPE_4  = 0xe0000000,
+    INST_ICLASS_ALU32_3  = 0xf0000000
+  };
+
 } // End namespace HexagonII.

 } // End namespace llvm.
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 36f8146..06ccec5 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -12,13 +12,13 @@
 //===----------------------------------------------------------------------===//

 #include "HexagonAsmPrinter.h"
-#include "Hexagon.h"
 #include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"

 using namespace llvm;

@@ -28,104 +28,33 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"

-HexagonAsmInstPrinter::HexagonAsmInstPrinter(MCInstPrinter *RawPrinter)
-    : MCInstPrinter(*RawPrinter), RawPrinter(RawPrinter) {}
-
-void HexagonAsmInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
-                                      StringRef Annot,
-                                      MCSubtargetInfo const &STI) {
-  assert(HexagonMCInstrInfo::isBundle(*MI));
-  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
-  std::string Buffer;
-  {
-    raw_string_ostream TempStream(Buffer);
-    RawPrinter->printInst(MI, TempStream, "", STI);
-  }
-  StringRef Contents(Buffer);
-  auto PacketBundle = Contents.rsplit('\n');
-  auto HeadTail = PacketBundle.first.split('\n');
-  auto Preamble = "\t{\n\t\t";
-  auto Separator = "";
-  while(!HeadTail.first.empty()) {
-    O << Separator;
-    StringRef Inst;
-    auto Duplex = HeadTail.first.split('\v');
-    if(!Duplex.second.empty()){
-      O << Duplex.first << "\n";
-      Inst = Duplex.second;
-    }
-    else
-      Inst = Duplex.first;
-    O << 
Preamble; - O << Inst; - HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; - } - O << "\n\t}" << PacketBundle.second; -} - -void HexagonAsmInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { - RawPrinter->printRegName(O, RegNo); -} - -// Return the minimum value that a constant extendable operand can have -// without being extended. -static int getMinValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return -1U << (bits - 1); - - return 0; -} - -// Return the maximum value that a constant extendable operand can have -// without being extended. -static int getMaxValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return ~(-1U << (bits - 1)); - - return ~(-1U << bits); -} - -// Return true if the instruction must be extended. -static bool isExtended(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; -} - -// Currently just used in an assert statement -static bool isExtendable(uint64_t TSFlags) LLVM_ATTRIBUTE_UNUSED; -// Return true if the instruction may be extended based on the operand value. -static bool isExtendable(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; +HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI, + MCInstrInfo const &MII, + MCRegisterInfo const &MRI) + : MCInstPrinter(MAI, MII, MRI), MII(MII), HasExtender(false) { } StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const { return MII.getName(Opcode); } -void HexagonInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << getRegisterName(RegNo); +void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { + O << getRegName(RegNo); +} + +StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); } void HexagonInstPrinter::setExtender(MCInst const &MCI) { HasExtender = HexagonMCInstrInfo::isImmext(MCI); } -void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, - StringRef Annot, - MCSubtargetInfo const &STI) { +void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { assert(HexagonMCInstrInfo::isBundle(*MI)); assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); + assert(HexagonMCInstrInfo::bundleSize(*MI) > 0); HasExtender = false; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) { MCInst const &MCI = *I.getInst(); @@ -157,145 +86,148 @@ void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, } } -void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo && + (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))) + O << "#"; + MCOperand const &MO = MI->getOperand(OpNo); if (MO.isReg()) { - printRegName(O, MO.getReg()); - } else if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - printImmOperand(MI, OpNo, O); - } else { - llvm_unreachable("Unknown operand"); - } -} - -void 
HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - - if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - O << MI->getOperand(OpNo).getImm(); + O << getRegisterName(MO.getReg()); + } else if (MO.isExpr()) { + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) + O << formatImm(Value); + else + O << *MO.getExpr(); } else { llvm_unreachable("Unknown operand"); } } -void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "#"; - } else if (MO.isImm()) { - int ImmValue = MO.getImm(); - if (ImmValue < getMinValue(MII.TSFlags) || - ImmValue > getMaxValue(MII.TSFlags)) - O << "#"; - } printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, - unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI, + unsigned OpNo, + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -1; } -void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<9>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << " + #" << MO1.getImm(); +void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << ", #" << MO1.getImm(); +void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = 
MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<11>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); - printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { // Branches can take an immediate operand. This is used by the branch // selection pass to print $+8, an eight byte displacement from the PC. llvm_unreachable("Unknown branch operand."); } -void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, bool hi) const { - assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand"); + MCOperand const &MO = MI->getOperand(OpNo); - O << '#' << (hi ? "HI" : "LO") << "(#"; - printOperand(MI, OpNo, O); + O << '#' << (hi ? 
"HI" : "LO") << '('; + if (MO.isImm()) { + O << '#'; + printOperand(MI, OpNo, O); + } else { + printOperand(MI, OpNo, O); + assert("Unknown symbol operand"); + } O << ')'; } -void HexagonInstPrinter::printExtBrtarget(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "##"; +void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + MCOperand const &MO = MI->getOperand(OpNo); + assert (MO.isExpr()); + MCExpr const &Expr = *MO.getExpr(); + int64_t Value; + if (Expr.evaluateAsAbsolute(Value)) + O << format("0x%" PRIx64, Value); + else { + if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)) + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo) + O << "##"; + O << Expr; } - printOperand(MI, OpNo, O); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h index 534ac23..5f42118 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// // -// This class prints an Hexagon MCInst to a .s file. // //===----------------------------------------------------------------------===// @@ -15,17 +14,8 @@ #define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrInfo.h" namespace llvm { -class HexagonAsmInstPrinter : public MCInstPrinter { -public: - HexagonAsmInstPrinter(MCInstPrinter *RawPrinter); - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - MCSubtargetInfo const &STI) override; - void printRegName(raw_ostream &O, unsigned RegNo) const override; - std::unique_ptr<MCInstPrinter> RawPrinter; -}; /// Prints bundles as a newline separated list of individual instructions /// Duplexes are separated by a vertical tab \v character /// A trailing line includes bundle properties such as endloop0/1 @@ -33,68 +23,69 @@ public: /// r0 = add(r1, r2) /// r0 = #0 \v jump 0x0 /// :endloop0 :endloop1 - class HexagonInstPrinter : public MCInstPrinter { - public: - explicit HexagonInstPrinter(MCAsmInfo const &MAI, - MCInstrInfo const &MII, - MCRegisterInfo const &MRI) - : MCInstPrinter(MAI, MII, MRI), MII(MII) {} - - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - virtual StringRef getOpcodeName(unsigned Opcode) const; - void printInstruction(const MCInst *MI, raw_ostream &O); - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - static const char *getRegisterName(unsigned RegNo); +class HexagonInstPrinter : public MCInstPrinter { +public: + explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII, + MCRegisterInfo const &MRI); + void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + virtual StringRef getOpcodeName(unsigned Opcode) const; + void printInstruction(MCInst const *MI, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtOperand(const 
MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtBrtarget(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + StringRef getRegName(unsigned RegNo) const; + static char const *getRegisterName(unsigned RegNo); + void printRegName(raw_ostream &O, unsigned RegNo) const override; - void printConstantPool(const MCInst *MI, unsigned OpNo, + void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNegImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printBranchOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printGlobalOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + + void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; - void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, true); } - void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, false); } + void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, true); + } + void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, false); + } - const MCInstrInfo &getMII() const { - return MII; - } + MCAsmInfo const &getMAI() const { return MAI; } + MCInstrInfo const &getMII() const { return MII; } - protected: - void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi) - const; +protected: + void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, + bool hi) const; - private: - const MCInstrInfo &MII; 
+private:
+  MCInstrInfo const &MII;

-    bool HasExtender;
-    void setExtender(MCInst const &MCI);
-  };
+  bool HasExtender;
+  void setExtender(MCInst const &MCI);
+};

 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index dc07069..a8456b4 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -18,13 +18,14 @@
 #include "llvm/MC/MCAsmInfoELF.h"

 namespace llvm {
-  class Triple;
+class Triple;

-  class HexagonMCAsmInfo : public MCAsmInfoELF {
-    void anchor() override;
-  public:
-    explicit HexagonMCAsmInfo(const Triple &TT);
-  };
+class HexagonMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit HexagonMCAsmInfo(const Triple &TT);
+};

 } // namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
new file mode 100644
index 0000000..46b7b41
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -0,0 +1,581 @@
+//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCChecker.h"
+
+#include "HexagonBaseInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
+  cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+
+const HexagonMCChecker::PredSense
+  HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+
+void HexagonMCChecker::init() {
+  // Initialize read-only registers set.
+  ReadOnly.insert(Hexagon::PC);
+
+  // Figure out the loop-registers definitions.
+  if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
+    Defs[Hexagon::SA0].insert(Unconditional); // FIXME: define or change SA0?
+    Defs[Hexagon::LC0].insert(Unconditional);
+  }
+  if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
+    Defs[Hexagon::SA1].insert(Unconditional); // FIXME: define or change SA1?
+    Defs[Hexagon::LC1].insert(Unconditional);
+  }
+
+  if (HexagonMCInstrInfo::isBundle(MCB))
+    // Unfurl a bundle.
+    for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      init(*I.getInst());
+    }
+  else
+    init(MCB);
+}
+
+void HexagonMCChecker::init(MCInst const& MCI) {
+  const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+  unsigned PredReg = Hexagon::NoRegister;
+  bool isTrue = false;
+
+  // Get used registers.
+  for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+    if (MCI.getOperand(i).isReg()) {
+      unsigned R = MCI.getOperand(i).getReg();
+
+      if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+        // Note a used predicate register.
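The Defs map populated throughout init() keys every definition by a
(predicate register, sense) pair, so "def under p0" and "def under !p0"
stay distinguishable from an unconditional def. A small stand-in for that
bookkeeping, with plain unsigned register ids replacing Hexagon's:

    #include <map>
    #include <set>
    #include <utility>

    using PredSense = std::pair<unsigned, bool>; // (pred reg, true sense)
    const PredSense Unconditional{0, false};     // NoRegister placeholder

    std::map<unsigned, std::set<PredSense>> Defs;

    void noteDef(unsigned Reg, unsigned PredReg, bool IfTrue) {
      Defs[Reg].insert(PredSense(PredReg, IfTrue));
    }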
+        PredReg = R;
+        isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+        // Note use of new predicate register.
+        if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          NewPreds.insert(PredReg);
+      }
+      else
+        // Note register use. Super-registers are not tracked directly;
+        // only their components are.
+        for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+             SRI.isValid();
+             ++SRI)
+          if (!MCSubRegIterator(*SRI, &RI).isValid())
+            // Skip super-registers used indirectly.
+            Uses.insert(*SRI);
+    }
+
+  // Get implicit register definitions.
+  if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
+    for (; *ImpDef; ++ImpDef) {
+      unsigned R = *ImpDef;
+
+      if (Hexagon::R31 != R && MCID.isCall())
+        // Any register other than the LR and the PC is actually a volatile
+        // one as defined by the ABI; it is not modified implicitly by the
+        // call insn.
+        continue;
+      if (Hexagon::PC == R)
+        // Branches are the only insns that can change the PC,
+        // otherwise a read-only register.
+        continue;
+
+      if (Hexagon::USR_OVF == R)
+        // Many insns change the USR implicitly, but only one or another flag.
+        // The instruction table models the USR.OVF flag, which can be
+        // implicitly modified more than once, but cannot be modified in the
+        // same packet with an instruction that modifies it explicitly. Deal
+        // with such situations individually.
+        SoftDefs.insert(R);
+      else if (isPredicateRegister(R) &&
+               HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
+        // Include implicit late predicates.
+        LatePreds.insert(R);
+      else
+        Defs[R].insert(PredSense(PredReg, isTrue));
+    }
+
+  // Figure out explicit register definitions.
+  for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
+    unsigned R = MCI.getOperand(i).getReg(),
+             S = Hexagon::NoRegister;
+
+    // Note register definitions, direct ones as well as indirect side-effects.
+    // Super-registers are not tracked directly, but their components.
+    for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+         SRI.isValid();
+         ++SRI) {
+      if (MCSubRegIterator(*SRI, &RI).isValid())
+        // Skip super-registers defined indirectly.
+        continue;
+
+      if (R == *SRI) {
+        if (S == R)
+          // Avoid scoring the defined register multiple times.
+          continue;
+        else
+          // Note that the defined register has already been scored.
+          S = R;
+      }
+
+      if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI)
+        // P3:0 is a special case, since multiple predicate register
+        // definitions in a packet are allowed as the equivalent of their
+        // logical "and".
+        // Only an explicit definition of P3:0 is noted as such; if a
+        // side-effect, then note as a soft definition.
+        SoftDefs.insert(*SRI);
+      else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) &&
+               isPredicateRegister(*SRI))
+        // Some insns produce predicates too late to be used in the same packet.
+        LatePreds.insert(*SRI);
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) ==
+                             HexagonII::TypeCVI_VM_CUR_LD)
+        // Current loads should be used in the same packet.
+        // TODO: relies on the impossibility of current and temporary loads
+        // in the same packet.
+        CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) ==
+                             HexagonII::TypeCVI_VM_TMP_LD)
+        // Temporary loads should be used in the same packet, but don't commit
+        // results, so the definition should be disregarded if another insn
+        // changes the same register.
+        // TODO: relies on the impossibility of current and temporary loads
+        // in the same packet.
+        TmpDefs.insert(*SRI);
+      else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI))
+        // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
+        // destination registers with this instruction. Same for
+        // vdeal(Vx, Vy, Rx).
+        Uses.insert(*SRI);
+      else
+        Defs[*SRI].insert(PredSense(PredReg, isTrue));
+    }
+  }
+
+  // Figure out register definitions that produce new values.
+  if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) {
+    unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (HexagonMCInstrInfo::isCompound(MCII, MCI))
+      compoundRegisterMap(R); // Compound insns have a limited register range.
+
+    for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+         SRI.isValid();
+         ++SRI)
+      if (!MCSubRegIterator(*SRI, &RI).isValid())
+        // No super-registers defined indirectly.
+        NewDefs[*SRI].push_back(NewSense::Def(
+            PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+            HexagonMCInstrInfo::isFloat(MCII, MCI)));
+
+    // For fairly unique 2-dot-new producers, for example
+    // vdeal(V1, V9, R0), both V1.new and V9.new can be used by consumers.
+    if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
+      unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
+
+      for (MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
+           SRI.isValid();
+           ++SRI)
+        if (!MCSubRegIterator(*SRI, &RI).isValid())
+          NewDefs[*SRI].push_back(NewSense::Def(
+              PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+              HexagonMCInstrInfo::isFloat(MCII, MCI)));
+    }
+  }
+
+  // Figure out definitions of new predicate registers.
+  if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+    for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+      if (MCI.getOperand(i).isReg()) {
+        unsigned P = MCI.getOperand(i).getReg();
+
+        if (isPredicateRegister(P))
+          NewPreds.insert(P);
+      }
+
+  // Figure out uses of new values.
+  if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) {
+    unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (!MCSubRegIterator(N, &RI).isValid()) {
+      // Super-registers cannot use new values.
+      if (MCID.isBranch())
+        NewUses[N] = NewSense::Jmp(
+            llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+      else
+        NewUses[N] = NewSense::Use(
+            PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+    }
+  }
+}
+
+HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII,
+                                   MCSubtargetInfo const &STI, MCInst &mcb,
+                                   MCInst &mcbdx, MCRegisterInfo const &ri)
+    : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
+      bLoadErrInfo(false) {
+  init();
+}
+
+bool HexagonMCChecker::check() {
+  bool chkB = checkBranches();
+  bool chkP = checkPredicates();
+  bool chkNV = checkNewValues();
+  bool chkR = checkRegisters();
+  bool chkS = checkSolo();
+  bool chkSh = checkShuffle();
+  bool chkSl = checkSlots();
+  bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
+
+  return chk;
+}
+
+bool HexagonMCChecker::checkSlots() {
+  unsigned slotsUsed = 0;
+  for (auto HMI : HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
+    MCInst const& MCI = *HMI.getInst();
+    if (HexagonMCInstrInfo::isImmext(MCI))
+      continue;
+    if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
+      slotsUsed += 2;
+    else
+      ++slotsUsed;
+  }
+
+  if (slotsUsed > HEXAGON_PACKET_SIZE) {
+    HexagonMCErrInfo errInfo;
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
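// A dependency-free sketch, not part of the patch, of the slot-accounting
// rule that checkSlots() above enforces; the enum and container here are
// illustrative stand-ins for the MCInst bundle the real checker walks.
#include <cstddef>
#include <vector>

enum class InsnKind { Normal, Duplex, Immext };

// True when a bundle fits the four slots of a Hexagon packet: an immediate
// extender rides along with its consumer, a duplex fills two slots, and
// every other insn fills one. (Illustrative only; mirrors the loop above.)
static bool fitsInPacket(const std::vector<InsnKind> &Bundle) {
  const std::size_t PacketSize = 4; // HEXAGON_PACKET_SIZE
  std::size_t SlotsUsed = 0;
  for (InsnKind K : Bundle) {
    if (K == InsnKind::Immext)
      continue;
    SlotsUsed += (K == InsnKind::Duplex) ? 2 : 1;
  }
  return SlotsUsed <= PacketSize;
}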
+// Check legal use of branches.
+bool HexagonMCChecker::checkBranches() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    bool hasConditional = false;
+    unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
+             NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+             Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+
+    for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
+         i < MCB.size(); ++i) {
+      MCInst const &MCI = *MCB.begin()[i].getInst();
+
+      if (HexagonMCInstrInfo::isImmext(MCI))
+        continue;
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
+          HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
+        ++Branches;
+        if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          ++NewIndirectBranches;
+        if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
+          ++NewValueBranches;
+
+        if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
+          hasConditional = true;
+          Conditional = i; // Record the position of the conditional branch.
+        } else {
+          Unconditional = i; // Record the position of the unconditional branch.
+        }
+      }
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
+          HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
+        ++Returns;
+    }
+
+    if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
+      if (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+          HexagonMCInstrInfo::isOuterLoop(MCB)) {
+        // Error out if there's any branch in a loop-end packet.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
+        addErrInfo(errInfo);
+        return false;
+      }
+    if (Branches > 1)
+      if (!hasConditional || Conditional > Unconditional) {
+        // Error out if there is more than one unconditional branch or
+        // the conditional branch appears after the unconditional one.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
+        addErrInfo(errInfo);
+        return false;
+      }
+  }
+
+  return true;
+}
+
+// Check legal use of predicate registers.
+bool HexagonMCChecker::checkPredicates() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper use of new predicate registers.
+  for (const auto& I : NewPreds) {
+    unsigned P = I;
+
+    if (!Defs.count(P) || LatePreds.count(P)) {
+      // Error out if the new predicate register is not defined,
+      // or defined "late"
+      // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  // Check for proper use of auto-anded predicate registers.
+  for (const auto& I : LatePreds) {
+    unsigned P = I;
+
+    if (LatePreds.count(P) > 1 || Defs.count(P)) {
+      // Error out if a predicate register is defined "late" multiple times,
+      // or defined late and regularly defined
+      // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }").
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check legal use of new values.
+bool HexagonMCChecker::checkNewValues() {
+  HexagonMCErrInfo errInfo;
+  memset(&errInfo, 0, sizeof(errInfo));
+  for (auto& I : NewUses) {
+    unsigned R = I.first;
+    NewSense &US = I.second;
+
+    if (!hasValidNewValueDef(US, NewDefs[R])) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check for legal register uses and definitions.
+bool HexagonMCChecker::checkRegisters() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper register definitions.
+  for (const auto& I : Defs) {
+    unsigned R = I.first;
+
+    if (ReadOnly.count(R)) {
+      // Error out for definitions of read-only registers.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (isLoopRegister(R) && Defs.count(R) > 1 &&
+        (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+         HexagonMCInstrInfo::isOuterLoop(MCB))) {
+      // Error out for definitions of loop registers at the end of a loop.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (SoftDefs.count(R)) {
+      // Error out for explicit changes to registers also weakly defined
+      // (e.g., "{ usr = r0; r0 = sfadd(...) }").
+      unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+      unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+      // Check for multiple register definitions.
+      PredSet &PM = Defs[R];
+
+      // Check for multiple unconditional register definitions.
+      if (PM.count(Unconditional)) {
+        // Error out on an unconditional change when there are any other
+        // changes, conditional or not.
+        unsigned UsrR = Hexagon::USR;
+        unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+        addErrInfo(errInfo);
+        return false;
+      }
+      // Check for multiple conditional register definitions.
+      for (const auto& J : PM) {
+        PredSense P = J;
+
+        // Check for multiple uses of the same condition.
+        if (PM.count(P) > 1) {
+          // Error out on conditional changes based on the same predicate
+          // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+        // Check for the use of the complementary condition.
+        P.second = !P.second;
+        if (PM.count(P) && PM.size() > 2) {
+          // Error out on conditional changes based on the same predicate
+          // multiple times
+          // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =...; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+      }
+    }
+  }
+
+  // Check for use of current definitions.
+  for (const auto& I : CurDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // Warn on an unused current definition.
+      errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
+      addErrInfo(errInfo);
+      return true;
+    }
+  }
+
+  // Check for use of temporary definitions.
+  for (const auto& I : TmpDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // Special case for vhist.
+      bool vHistFound = false;
+      for (auto const &HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+        if (llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) ==
+            HexagonII::TypeCVI_HIST) {
+          vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
+          break;
+        }
+      }
+      // Warn on an unused temporary definition.
+      if (!vHistFound) {
+        errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
+        addErrInfo(errInfo);
+        return true;
+      }
+    }
+  }
+
+  return true;
+}
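// A standalone sketch of the decision procedure that checkRegisters() above
// applies to the (predicate register, sense) pairs recorded per defined
// register. The types below stand in for the checker's PredSet and are not
// LLVM API; they only illustrate the packet rule.
#include <set>
#include <utility>

using PredSensePair = std::pair<unsigned, bool>; // (pred register, sense)
static const PredSensePair Uncond(0, false);     // stand-in for "no predicate"

// True when the multiset of definitions of one register is legal in a packet.
static bool defsAreLegal(const std::multiset<PredSensePair> &PM) {
  if (PM.size() <= 1)
    return true;               // A single definition is always fine.
  if (PM.count(Uncond))
    return false;              // An unconditional def plus any other def.
  for (PredSensePair P : PM) {
    if (PM.count(P) > 1)
      return false;            // Same predicate and sense used twice.
    P.second = !P.second;      // The complementary sense is legal only as
    if (PM.count(P) && PM.size() > 2)
      return false;            // an exact if(p)/if(!p) pair of defs.
  }
  return true;
}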
+// Check for legal use of solo insns.
+bool HexagonMCChecker::checkSolo() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB) &&
+      HexagonMCInstrInfo::bundleSize(MCB) > 1) {
+    for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) {
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
+        addErrInfo(errInfo);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool HexagonMCChecker::checkShuffle() {
+  HexagonMCErrInfo errInfo;
+  // Branch info is lost when duplexing. The unduplexed insns must be
+  // checked, and only branch errors matter for this case.
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+  if (!MCS.check()) {
+    if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+      errInfo.setShuffleError(MCS.getError());
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+  HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+  if (!MCSDX.check()) {
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+    errInfo.setShuffleError(MCSDX.getError());
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
+
+void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+  switch (Register) {
+  default:
+    break;
+  case Hexagon::R15:
+    Register = Hexagon::R23;
+    break;
+  case Hexagon::R14:
+    Register = Hexagon::R22;
+    break;
+  case Hexagon::R13:
+    Register = Hexagon::R21;
+    break;
+  case Hexagon::R12:
+    Register = Hexagon::R20;
+    break;
+  case Hexagon::R11:
+    Register = Hexagon::R19;
+    break;
+  case Hexagon::R10:
+    Register = Hexagon::R18;
+    break;
+  case Hexagon::R9:
+    Register = Hexagon::R17;
+    break;
+  case Hexagon::R8:
+    Register = Hexagon::R16;
+    break;
+  }
+}
+
+bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
+                                           const NewSenseList &Defs) const {
+  bool Strict = !RelaxNVChecks;
+
+  for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
+    const NewSense &Def = Defs[i];
+    // NVJ cannot use a new FP value [7.6.1].
+    if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0))
+      continue;
+    // If the definition was not predicated, then it does not matter if
+    // the use is.
+    if (Def.PredReg == 0)
+      return true;
+    // With the strict checks, both the definition and the use must be
+    // predicated on the same register and condition.
+    if (Strict) {
+      if (Def.PredReg == Use.PredReg && Def.Cond == Use.Cond)
+        return true;
+    } else {
+      // With the relaxed checks, if the definition was predicated, the only
+      // detectable violation is if the use is predicated on the opposing
+      // condition; otherwise, it's OK.
+      if (Def.PredReg != Use.PredReg || Def.Cond == Use.Cond)
+        return true;
+    }
+  }
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
new file mode 100644
index 0000000..5fc0bde
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -0,0 +1,218 @@
+//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCCHECKER_H +#define HEXAGONMCCHECKER_H + +#include <map> +#include <set> +#include <queue> +#include "MCTargetDesc/HexagonMCShuffler.h" + +using namespace llvm; + +namespace llvm { +class MCOperandInfo; + +typedef struct { + unsigned Error, Warning, ShuffleError; + unsigned Register; +} ErrInfo_T; + +class HexagonMCErrInfo { +public: + enum { + CHECK_SUCCESS = 0, + // Errors. + CHECK_ERROR_BRANCHES = 0x00001, + CHECK_ERROR_NEWP = 0x00002, + CHECK_ERROR_NEWV = 0x00004, + CHECK_ERROR_REGISTERS = 0x00008, + CHECK_ERROR_READONLY = 0x00010, + CHECK_ERROR_LOOP = 0x00020, + CHECK_ERROR_ENDLOOP = 0x00040, + CHECK_ERROR_SOLO = 0x00080, + CHECK_ERROR_SHUFFLE = 0x00100, + CHECK_ERROR_NOSLOTS = 0x00200, + CHECK_ERROR_UNKNOWN = 0x00400, + // Warnings. + CHECK_WARN_CURRENT = 0x10000, + CHECK_WARN_TEMPORARY = 0x20000 + }; + ErrInfo_T s; + + void reset() { + s.Error = CHECK_SUCCESS; + s.Warning = CHECK_SUCCESS; + s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS; + s.Register = Hexagon::NoRegister; + }; + HexagonMCErrInfo() { + reset(); + }; + + void setError(unsigned e, unsigned r = Hexagon::NoRegister) + { s.Error = e; s.Register = r; }; + void setWarning(unsigned w, unsigned r = Hexagon::NoRegister) + { s.Warning = w; s.Register = r; }; + void setShuffleError(unsigned e) { s.ShuffleError = e; }; +}; + +/// Check for a valid bundle. +class HexagonMCChecker { + /// Insn bundle. + MCInst& MCB; + MCInst& MCBDX; + const MCRegisterInfo& RI; + MCInstrInfo const &MCII; + MCSubtargetInfo const &STI; + bool bLoadErrInfo; + + /// Set of definitions: register #, if predicated, if predicated true. + typedef std::pair<unsigned, bool> PredSense; + static const PredSense Unconditional; + typedef std::multiset<PredSense> PredSet; + typedef std::multiset<PredSense>::iterator PredSetIterator; + + typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator; + llvm::DenseMap<unsigned, PredSet> Defs; + + /// Information about how a new-value register is defined or used: + /// PredReg = predicate register, 0 if use/def not predicated, + /// Cond = true/false for if(PredReg)/if(!PredReg) respectively, + /// IsFloat = true if definition produces a floating point value + /// (not valid for uses), + /// IsNVJ = true if the use is a new-value branch (not valid for + /// definitions). + struct NewSense { + unsigned PredReg; + bool IsFloat, IsNVJ, Cond; + // The special-case "constructors": + static NewSense Jmp(bool isNVJ) { + NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ, + /*Cond=*/ false }; + return NS; + } + static NewSense Use(unsigned PR, bool True) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + static NewSense Def(unsigned PR, bool True, bool Float) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + }; + /// Set of definitions that produce new register: + typedef llvm::SmallVector<NewSense,2> NewSenseList; + typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator; + llvm::DenseMap<unsigned, NewSenseList> NewDefs; + + /// Set of weak definitions whose clashes should be enforced selectively. + typedef std::set<unsigned>::iterator SoftDefsIterator; + std::set<unsigned> SoftDefs; + + /// Set of current definitions committed to the register file. 
+ typedef std::set<unsigned>::iterator CurDefsIterator; + std::set<unsigned> CurDefs; + + /// Set of temporary definitions not committed to the register file. + typedef std::set<unsigned>::iterator TmpDefsIterator; + std::set<unsigned> TmpDefs; + + /// Set of new predicates used. + typedef std::set<unsigned>::iterator NewPredsIterator; + std::set<unsigned> NewPreds; + + /// Set of predicates defined late. + typedef std::multiset<unsigned>::iterator LatePredsIterator; + std::multiset<unsigned> LatePreds; + + /// Set of uses. + typedef std::set<unsigned>::iterator UsesIterator; + std::set<unsigned> Uses; + + /// Set of new values used: new register, if new-value jump. + typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator; + llvm::DenseMap<unsigned, NewSense> NewUses; + + /// Pre-defined set of read-only registers. + typedef std::set<unsigned>::iterator ReadOnlyIterator; + std::set<unsigned> ReadOnly; + + std::queue<ErrInfo_T> ErrInfoQ; + HexagonMCErrInfo CrntErrInfo; + + void getErrInfo() { + if (bLoadErrInfo == true) { + if (ErrInfoQ.empty()) { + CrntErrInfo.reset(); + } else { + CrntErrInfo.s = ErrInfoQ.front(); + ErrInfoQ.pop(); + } + } + bLoadErrInfo = false; + } + + void init(); + void init(MCInst const&); + + // Checks performed. + bool checkBranches(); + bool checkPredicates(); + bool checkNewValues(); + bool checkRegisters(); + bool checkSolo(); + bool checkShuffle(); + bool checkSlots(); + + static void compoundRegisterMap(unsigned&); + + bool isPredicateRegister(unsigned R) const { + return (Hexagon::P0 == R || Hexagon::P1 == R || + Hexagon::P2 == R || Hexagon::P3 == R); + }; + bool isLoopRegister(unsigned R) const { + return (Hexagon::SA0 == R || Hexagon::LC0 == R || + Hexagon::SA1 == R || Hexagon::LC1 == R); + }; + + bool hasValidNewValueDef(const NewSense &Use, + const NewSenseList &Defs) const; + + public: + explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx, + const MCRegisterInfo& ri); + + bool check(); + + /// add a new error/warning + void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); }; + + /// Return the error code for the last operation in the insn bundle. + unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; }; + unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; }; + unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; }; + unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; }; + bool getNextErrInfo() { + bLoadErrInfo = true; + return (ErrInfoQ.empty()) ? false : (getErrInfo(), true); + } +}; + +} + +#endif // HEXAGONMCCHECKER_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 9fc4e2a..4b07ca7 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -96,6 +96,12 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( assert(!HexagonMCInstrInfo::isBundle(HMB)); uint64_t Binary; + // Compound instructions are limited to using registers 0-7 and 16-23 + // and here we make a map 16-23 to 8-15 so they can be correctly encoded. + static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10, + Hexagon::R11, Hexagon::R12, Hexagon::R13, + Hexagon::R14, Hexagon::R15}; + // Pseudo instructions don't get encoded and shouldn't be here // in the first place! 
assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() && @@ -104,6 +110,16 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'" "\n"); + if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) { + for (unsigned i = 0; i < HMB.getNumOperands(); ++i) + if (HMB.getOperand(i).isReg()) { + unsigned Reg = + MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg()); + if ((Reg <= 23) && (Reg >= 16)) + HMB.getOperand(i).setReg(RegMap[Reg - 16]); + } + } + if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) { // Calculate the new value distance to the associated producer MCOperand &MCO = @@ -318,21 +334,21 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; - ++ImpUses) { - if (*ImpUses == Hexagon::GP) { - switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { - case HexagonII::MemAccessSize::ByteAccess: - return fixup_Hexagon_GPREL16_0; - case HexagonII::MemAccessSize::HalfWordAccess: - return fixup_Hexagon_GPREL16_1; - case HexagonII::MemAccessSize::WordAccess: - return fixup_Hexagon_GPREL16_2; - case HexagonII::MemAccessSize::DoubleWordAccess: - return fixup_Hexagon_GPREL16_3; - default: - llvm_unreachable("unhandled fixup"); - } + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); + ImpUses && *ImpUses; ++ImpUses) { + if (*ImpUses != Hexagon::GP) + continue; + switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { + case HexagonII::MemAccessSize::ByteAccess: + return fixup_Hexagon_GPREL16_0; + case HexagonII::MemAccessSize::HalfWordAccess: + return fixup_Hexagon_GPREL16_1; + case HexagonII::MemAccessSize::WordAccess: + return fixup_Hexagon_GPREL16_2; + case HexagonII::MemAccessSize::DoubleWordAccess: + return fixup_Hexagon_GPREL16_3; + default: + llvm_unreachable("unhandled fixup"); } } } else @@ -389,10 +405,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, return cast<MCConstantExpr>(ME)->getValue(); } if (MK == MCExpr::Binary) { - unsigned Res; - Res = getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); - Res += - getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); return 0; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 886f8db..d194bea 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -115,8 +115,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { SrcReg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MI.getOperand(2).isImm() && ((isUInt<5>(MI.getOperand(2).getImm())) || - (MI.getOperand(2).getImm() == -1))) + (HexagonMCInstrInfo::inRange<5>(MI, 2) || + HexagonMCInstrInfo::minConstant(MI, 2) == -1)) return HexagonII::HCG_A; break; case Hexagon::A2_tfr: @@ -134,8 +134,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { return false; // Rd = #u6 DstReg = MI.getOperand(0).getReg(); - if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() <= 63 && - 
MI.getOperand(1).getImm() >= 0 && + if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 && + HexagonMCInstrInfo::minConstant(MI, 1) >= 0 && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) return HexagonII::HCG_A; break; @@ -145,9 +145,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { DstReg = MI.getOperand(0).getReg(); Src1Reg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && - MI.getOperand(2).isImm() && HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - (MI.getOperand(2).getImm() == 0)) + HexagonMCInstrInfo::minConstant(MI, 2) == 0) return HexagonII::HCG_A; break; // The fact that .new form is used pretty much guarantees @@ -206,6 +205,8 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { MCInst *CompoundInsn = 0; unsigned compoundOpcode; MCOperand Rs, Rt; + int64_t Value; + bool Success; switch (L.getOpcode()) { default: @@ -277,7 +278,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { case Hexagon::C2_cmpeqi: DEBUG(dbgs() << "CX: C2_cmpeqi\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)]; @@ -286,14 +290,17 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; case Hexagon::C2_cmpgti: DEBUG(dbgs() << "CX: C2_cmpgti\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)]; @@ -302,7 +309,7 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; @@ -404,7 +411,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { /// additional slot. void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { - assert(MCI.getOpcode() == Hexagon::BUNDLE && + assert(HexagonMCInstrInfo::isBundle(MCI) && "Non-Bundle where Bundle expected"); // By definition a compound must have 2 insn. 
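// The hunks above and below all follow one pattern introduced by this change:
// immediate operands may now arrive as MCExprs rather than plain immediates,
// so consumers fold them back to integers with evaluateAsAbsolute() instead
// of calling getImm() directly. A sketch of that idiom against the MC API in
// this tree; the helper name is illustrative and not part of the patch.
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include <cstdint>

using namespace llvm;

// Fold operand OpNo of MI to a constant. Returns false for symbolic
// expressions (globals, labels) that only relocation processing can resolve.
// (Sketch only; the patch uses minConstant()/inRange() wrappers instead.)
static bool foldOperand(MCInst const &MI, unsigned OpNo, int64_t &Value) {
  MCOperand const &MO = MI.getOperand(OpNo);
  if (MO.isImm()) { // Plain immediates still occur on some paths.
    Value = MO.getImm();
    return true;
  }
  if (MO.isExpr())
    return MO.getExpr()->evaluateAsAbsolute(Value);
  return false;
}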
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 7e9247c..e6194f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -26,7 +26,7 @@ using namespace Hexagon; #define DEBUG_TYPE "hexagon-mcduplex-info" // pair table of subInstructions with opcodes -static std::pair<unsigned, unsigned> opcodeData[] = { +static const std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SA1_addi, 0), std::make_pair((unsigned)V4_SA1_addrx, 6144), std::make_pair((unsigned)V4_SA1_addsp, 3072), @@ -81,8 +81,7 @@ static std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map<unsigned, unsigned> - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { @@ -195,15 +194,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Special case this one from Group L2. // Rd = memw(r29+#u5:2) if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(2).getImm())) { + if (HexagonMCInstrInfo::isIntReg(SrcReg) && + Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) { return HexagonII::HSIG_L2; } // Rd = memw(Rs+#u4:2) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(2).getImm()))) { + inRange<4, 2>(MCI, 2)) { return HexagonII::HSIG_L1; } } @@ -214,7 +211,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<4>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 2)) { return HexagonII::HSIG_L1; } break; @@ -235,8 +232,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(2).getImm())) { + inRange<3, 1>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -246,7 +242,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<3>(MCI.getOperand(2).getImm())) { + inRange<3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -256,8 +252,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(2).getImm())) { + inRange<5, 3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -326,15 +321,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntReg(Src1Reg) && 
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - Hexagon::R29 == Src1Reg && MCI.getOperand(1).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(1).getImm())) { + Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) { return HexagonII::HSIG_S2; } // memw(Rs+#u4:2) = Rt if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm())) { + inRange<4, 2>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -344,7 +337,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm())) { + inRange<4>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -363,8 +356,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(1).getImm())) { + inRange<3, 1>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ -374,8 +366,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) && HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg && - MCI.getOperand(1).isImm() && - isShiftedInt<6, 3>(MCI.getOperand(1).getImm())) { + inSRange<6, 3>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ -383,9 +374,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memw(Rs+#u4:2) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; @@ -393,16 +382,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memb(Rs+#u4) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; case Hexagon::S2_allocframe: - if (MCI.getOperand(0).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(0).getImm())) { + if (inRange<5, 3>(MCI, 0)) return HexagonII::HSIG_S2; - } break; // // Group A: @@ -428,8 +414,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { // Rd = add(r29,#u6:2) if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<6, 2>(MCI.getOperand(2).getImm())) { + inRange<6, 2>(MCI, 2)) { return HexagonII::HSIG_A; } // Rx = add(Rx,#s7) @@ -439,8 +424,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rd = add(Rs,#1) // Rd = add(Rs,#-1) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == -1))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) { return HexagonII::HSIG_A; } } 
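// The numeric rule behind the inRange<Bits, Shift> predicates used in these
// hunks (after the operand has been folded to a constant as sketched above):
// a #uB:S immediate is an unsigned Bits-bit value scaled by 2^Shift, i.e. a
// multiple of 2^Shift in [0, (2^Bits - 1) << Shift]. A dependency-free sketch
// with the same semantics as llvm::isShiftedUInt; the template name here is
// illustrative, not the patch's API.
#include <cstdint>

template <unsigned Bits, unsigned Shift = 0>
static bool isShiftedUImm(int64_t V) {
  static_assert(Bits + Shift <= 63, "field must fit in a signed 64-bit value");
  if (V < 0 || (V & ((INT64_C(1) << Shift) - 1)) != 0)
    return false;                 // Negative, or not a multiple of 2^Shift.
  return (V >> Shift) < (INT64_C(1) << Bits); // Fits in Bits after scaling.
}

// For example, the memw(Rs + #u4:2) offsets accepted above are the multiples
// of 4 from 0 to 60: isShiftedUImm<4, 2>(60) is true, while
// isShiftedUImm<4, 2>(62) and isShiftedUImm<4, 2>(64) are false.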
@@ -460,8 +444,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == 255))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) { return HexagonII::HSIG_A; } break; @@ -491,8 +474,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { DstReg = MCI.getOperand(0).getReg(); // Rd PredReg = MCI.getOperand(1).getReg(); // P0 if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && - Hexagon::P0 == PredReg && MCI.getOperand(2).isImm() && - MCI.getOperand(2).getImm() == 0) { + Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -502,7 +484,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (Hexagon::P0 == DstReg && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<2>(MCI.getOperand(2).getImm())) { + inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -511,10 +493,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rdd = combine(#u2,#U2) DstReg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && - // TODO: Handle Globals/Symbols - (MCI.getOperand(1).isImm() && isUInt<2>(MCI.getOperand(1).getImm())) && - ((MCI.getOperand(2).isImm() && - isUInt<2>(MCI.getOperand(2).getImm())))) { + inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -524,7 +503,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && MCI.getOperand(2).getImm() == 0)) { + minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -534,7 +513,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(1).isImm() && MCI.getOperand(1).getImm() == 0)) { + minConstant(MCI, 1) == 0) { return HexagonII::HSIG_A; } break; @@ -556,19 +535,17 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { } bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { - unsigned DstReg, SrcReg; - switch (potentialDuplex.getOpcode()) { case Hexagon::A2_addi: // testing for case of: Rx = add(Rx,#s7) DstReg = potentialDuplex.getOperand(0).getReg(); SrcReg = potentialDuplex.getOperand(1).getReg(); if (DstReg == SrcReg && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(2).isExpr()) + int64_t Value; + if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Value)) return true; - if (potentialDuplex.getOperand(2).isImm() && - !(isShiftedInt<7, 0>(potentialDuplex.getOperand(2).getImm()))) + if (!isShiftedInt<7, 0>(Value)) return true; } break; @@ -576,15 +553,14 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { DstReg = potentialDuplex.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(1).isExpr()) + int64_t Value; + if 
(!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Value)) return true; // Check for case of Rd = #-1. - if (potentialDuplex.getOperand(1).isImm() && - (potentialDuplex.getOperand(1).getImm() == -1)) + if (Value == -1) return false; // Check for case of Rd = #u6. - if (potentialDuplex.getOperand(1).isImm() && - !isShiftedUInt<6, 0>(potentialDuplex.getOperand(1).getImm())) + if (!isShiftedUInt<6, 0>(Value)) return true; } break; @@ -712,19 +688,23 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst, MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { MCInst Result; + bool Absolute; + int64_t Value; switch (Inst.getOpcode()) { default: // dbgs() << "opcode: "<< Inst->getOpcode() << "\n"; llvm_unreachable("Unimplemented subinstruction \n"); break; case Hexagon::A2_addi: - if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 1) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_inc); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == -1) { + else if (Value == -1) { Result.setOpcode(Hexagon::V4_SA1_dec); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -754,7 +734,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 1 SUBInst allocframe(#$u5_3) case Hexagon::A2_andir: - if (Inst.getOperand(2).getImm() == 255) { + if (minConstant(Inst, 2) == 255) { Result.setOpcode(Hexagon::V4_SA1_zxtb); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -772,26 +752,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 2,3 SUBInst p0 = cmp.eq($Rs, #$u2) case Hexagon::A4_combineii: case Hexagon::A2_combineii: - if (Inst.getOperand(1).getImm() == 1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_combine1i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#1, #$u2) } - - if (Inst.getOperand(1).getImm() == 3) { + if (Value == 3) { Result.setOpcode(Hexagon::V4_SA1_combine3i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#3, #$u2) } - if (Inst.getOperand(1).getImm() == 0) { + if (Value == 0) { Result.setOpcode(Hexagon::V4_SA1_combine0i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#0, #$u2) } - if (Inst.getOperand(1).getImm() == 2) { + if (Value == 2) { Result.setOpcode(Hexagon::V4_SA1_combine2i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); @@ -894,12 +875,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 1,2,3 SUBInst $Rd = memw($Rs + #$u4_2) } case Hexagon::S4_storeirb_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storebi0); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; // 1,2 SUBInst memb($Rs + #$u4_0)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storebi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -923,12 +906,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 2); break; // 1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt case 
Hexagon::S4_storeiri_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storewi0); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storewi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -983,7 +968,8 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 2 SUBInst if (p0) $Rd = #0 case Hexagon::A2_tfrsi: - if (Inst.getOperand(1).isImm() && Inst.getOperand(1).getImm() == -1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + if (Absolute && Value == -1) { Result.setOpcode(Hexagon::V4_SA1_setin1); addOps(Result, Inst, 0); break; // 2 1 SUBInst $Rd = #-1 @@ -1044,6 +1030,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII, << "\n"); bisReversable = false; } + if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf + bisReversable = false; // Try in order. if (isOrderedDuplexPair( diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index bf51c35..eaa3550 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -37,9 +37,7 @@ static cl::opt<unsigned> void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, const MCSubtargetInfo &STI) { - MCInst HMI; - HMI.setOpcode(Hexagon::BUNDLE); - HMI.addOperand(MCOperand::createImm(0)); + MCInst HMI = HexagonMCInstrInfo::createBundle(); MCInst *MCB; if (MCK.getOpcode() != Hexagon::BUNDLE) { @@ -50,7 +48,7 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, // Examines packet and pad the packet, if needed, when an // end-loop is in the bundle. 
-  HexagonMCInstrInfo::padEndloop(*MCB);
+  HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
   HexagonMCShuffle(*MCII, STI, *MCB);
 
   assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
@@ -60,9 +58,9 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
     if (Extended) {
       if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
         MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
-        HexagonMCInstrInfo::clampExtended(*MCII, *SubInst);
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
       } else {
-        HexagonMCInstrInfo::clampExtended(*MCII, *MCI);
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
       }
       Extended = false;
     } else {
@@ -114,7 +112,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
   MCSection *Section = getAssembler().getContext().getELFSection(
       SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
   SwitchSection(Section);
-  AssignSection(Symbol, Section);
+  AssignFragment(Symbol, getCurrentFragment());
 
   MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment);
   SwitchSection(CrntSection);
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
new file mode 100644
index 0000000..fc62626
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -0,0 +1,49 @@
+//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-mcexpr"
+
+HexagonNoExtendOperand *HexagonNoExtendOperand::Create(MCExpr const *Expr,
+                                                       MCContext &Ctx) {
+  return new (Ctx) HexagonNoExtendOperand(Expr);
+}
+
+bool HexagonNoExtendOperand::evaluateAsRelocatableImpl(
+    MCValue &Res, MCAsmLayout const *Layout, MCFixup const *Fixup) const {
+  return Expr->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+void HexagonNoExtendOperand::visitUsedExpr(MCStreamer &Streamer) const {}
+
+MCFragment *llvm::HexagonNoExtendOperand::findAssociatedFragment() const {
+  return Expr->findAssociatedFragment();
+}
+
+void HexagonNoExtendOperand::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+MCExpr const *HexagonNoExtendOperand::getExpr() const { return Expr; }
+
+bool HexagonNoExtendOperand::classof(MCExpr const *E) {
+  return E->getKind() == MCExpr::Target;
+}
+
+HexagonNoExtendOperand::HexagonNoExtendOperand(MCExpr const *Expr)
+    : Expr(Expr) {}
+
+void HexagonNoExtendOperand::printImpl(raw_ostream &OS,
+                                       const MCAsmInfo *MAI) const {
+  Expr->print(OS, MAI);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
new file mode 100644
index 0000000..60f180f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -0,0 +1,35 @@
+//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { +class MCInst; +class HexagonNoExtendOperand : public MCTargetExpr { +public: + static HexagonNoExtendOperand *Create(MCExpr const *Expr, MCContext &Ctx); + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; + MCFragment *findAssociatedFragment() const override; + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + static bool classof(MCExpr const *E); + MCExpr const *getExpr() const; + +private: + HexagonNoExtendOperand(MCExpr const *Expr); + MCExpr const *Expr; +}; +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 48b15f8..e684207 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -15,17 +15,37 @@ #include "Hexagon.h" #include "HexagonBaseInfo.h" +#include "HexagonMCChecker.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" namespace llvm { +void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value, + MCContext &Context) { + MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context))); +} + +void HexagonMCInstrInfo::addConstExtender(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI) { + assert(HexagonMCInstrInfo::isBundle(MCB)); + MCOperand const &exOp = + MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI)); + + // Create the extender. + MCInst *XMCI = + new (Context) MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, exOp)); + + MCB.addOperand(MCOperand::createInst(XMCI)); +} + iterator_range<MCInst::const_iterator> HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) { assert(isBundle(MCI)); - return iterator_range<MCInst::const_iterator>( - MCI.begin() + bundleInstructionsOffset, MCI.end()); + return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end()); } size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { @@ -35,7 +55,40 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { return (1); } -void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { +bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Check) { + // Examine the packet and convert pairs of instructions to compound + // instructions when possible. + if (!HexagonDisableCompound) + HexagonMCInstrInfo::tryCompound(MCII, Context, MCB); + // Check the bundle for errors. + bool CheckOk = Check ? Check->check() : true; + if (!CheckOk) + return false; + HexagonMCShuffle(MCII, STI, MCB); + // Examine the packet and convert pairs of instructions to duplex + // instructions when possible. 
+ MCInst InstBundlePreDuplex = MCInst(MCB); + if (!HexagonDisableDuplex) { + SmallVector<DuplexCandidate, 8> possibleDuplexes; + possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); + HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes); + } + // Examines packet and pad the packet, if needed, when an + // end-loop is in the bundle. + HexagonMCInstrInfo::padEndloop(Context, MCB); + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) + return false; + HexagonMCShuffle(MCII, STI, MCB); + return true; +} + +void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, + MCContext &Context, MCInst &MCI) { assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) || HexagonMCInstrInfo::isExtended(MCII, MCI)); MCOperand &exOp = @@ -43,13 +96,20 @@ void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { // If the extended value is a constant, then use it for the extended and // for the extender instructions, masking off the lower 6 bits and // including the assumed bits. - if (exOp.isImm()) { + int64_t Value; + if (exOp.getExpr()->evaluateAsAbsolute(Value)) { unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI); - int64_t Bits = exOp.getImm(); - exOp.setImm((Bits & 0x3f) << Shift); + exOp.setExpr(MCConstantExpr::create((Value & 0x3f) << Shift, Context)); } } +MCInst HexagonMCInstrInfo::createBundle() { + MCInst Result; + Result.setOpcode(Hexagon::BUNDLE); + Result.addOperand(MCOperand::createImm(0)); + return Result; +} + MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1) { @@ -64,6 +124,27 @@ MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, return duplexInst; } +MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII, + MCInst const &Inst, + MCOperand const &MO) { + assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) || + HexagonMCInstrInfo::isExtended(MCII, Inst)); + + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst); + MCInst XMI; + XMI.setOpcode((Desc.isBranch() || Desc.isCall() || + HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR) + ? Hexagon::A4_ext_b + : Hexagon::A4_ext); + if (MO.isImm()) + XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f))); + else if (MO.isExpr()) + XMI.addOperand(MCOperand::createExpr(MO.getExpr())); + else + llvm_unreachable("invalid extendable operand"); + return XMI; +} + MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, size_t Index) { assert(Index <= bundleSize(MCB)); @@ -76,6 +157,13 @@ MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, return nullptr; } +void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend) { + if (isConstExtended(MCII, MCI) || MustExtend) + addConstExtender(Context, MCII, MCB, MCI); +} + HexagonII::MemAccessSize HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -186,6 +274,25 @@ MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII, return (MCO); } +/// Return the new value or the newly produced value. 
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2); +} + +MCOperand const & +HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI) { + unsigned O = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI); + MCOperand const &MCO = MCI.getOperand(O); + + assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) || + HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) && + MCO.isReg()); + return (MCO); +} + int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -242,6 +349,13 @@ bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII, return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask); } +/// Return whether the insn produces a second value. +bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::hasNewValuePos2) & HexagonII::hasNewValueMask2); +} + MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) { assert(isBundle(MCB)); assert(Index < HEXAGON_PACKET_SIZE); @@ -261,6 +375,11 @@ bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) { HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP); } +bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII, + MCInst const &MCI) { + return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND); +} + bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) { return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) || (Reg >= Hexagon::D8 && Reg <= Hexagon::D11)); @@ -282,14 +401,21 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI) { if (HexagonMCInstrInfo::isExtended(MCII, MCI)) return true; - - if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) + // Branch insns are handled as necessary by relaxation. + if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch())) + return false; + // Otherwise loop instructions and other CR insts are handled by relaxation + else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) && + (MCI.getOpcode() != Hexagon::C4_addipc)) + return false; + else if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) return false; - short ExtOpNum = HexagonMCInstrInfo::getCExtOpNum(MCII, MCI); - int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); - int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); - MCOperand const &MO = MCI.getOperand(ExtOpNum); + MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI); // We could be using an instruction with an extendable immediate and shoehorn // a global address into it. If it is a global address it will be constant @@ -297,15 +423,13 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, // We currently only handle isGlobal() because it is the only kind of // object we are going to end up with here for now. // In the future we probably should add isSymbol(), etc. 
- if (MO.isExpr()) + assert(!MO.isImm()); + int64_t Value; + if (!MO.getExpr()->evaluateAsAbsolute(Value)) return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); - - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); + int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); + int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); + return (MinValue > Value || Value > MaxValue); } bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII, @@ -374,6 +498,19 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII, return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); } +bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return (F >> HexagonII::PredicateLatePos & HexagonII::PredicateLateMask); +} + +/// Return whether the insn is newly predicated. +bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); +} + bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -394,6 +531,18 @@ bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) { return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask); } +bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memReorderDisabledMask) != 0; +} + +bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memStoreReorderEnabledMask) != 0; +} + bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask); @@ -405,7 +554,28 @@ bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII, return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask); } -void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { +bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) { + if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) && + (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST)) + return true; + return false; +} + +int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) { + auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) + << 8; + if (MCI.size() <= Index) + return Sentinal; + MCOperand const &MCO = MCI.getOperand(Index); + if (!MCO.isExpr()) + return Sentinal; + int64_t Value; + if (!MCO.getExpr()->evaluateAsAbsolute(Value)) + return Sentinal; + return Value; +} + +void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) { MCInst Nop; Nop.setOpcode(Hexagon::A2_nop); assert(isBundle(MCB)); @@ -413,7 +583,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) || ((HexagonMCInstrInfo::isOuterLoop(MCB) && (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE)))) - MCB.addOperand(MCOperand::createInst(new MCInst(Nop))); + MCB.addOperand(MCOperand::createInst(new (Context) 
MCInst(Nop))); } bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII, @@ -456,6 +626,20 @@ void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) { Operand.setImm(Operand.getImm() | innerLoopMask); } +void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memReorderDisabledMask); + assert(isMemReorderDisabled(MCI)); +} + +void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask); + assert(isMemStoreReorderEnabled(MCI)); +} + void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) { assert(isBundle(MCI)); MCOperand &Operand = MCI.getOperand(0); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 32d61a4..0237b28 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -14,9 +14,11 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H +#include "HexagonMCExpr.h" #include "llvm/MC/MCInst.h" namespace llvm { +class HexagonMCChecker; class MCContext; class MCInstrDesc; class MCInstrInfo; @@ -39,20 +41,47 @@ int64_t const innerLoopMask = 1 << innerLoopOffset; size_t const outerLoopOffset = 1; int64_t const outerLoopMask = 1 << outerLoopOffset; +// do not reorder memory load/stores by default load/stores are re-ordered +// and by default loads can be re-ordered +size_t const memReorderDisabledOffset = 2; +int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset; + +// allow re-ordering of memory stores by default stores cannot be re-ordered +size_t const memStoreReorderEnabledOffset = 3; +int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset; + size_t const bundleInstructionsOffset = 1; +void addConstant(MCInst &MI, uint64_t Value, MCContext &Context); +void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI); + // Returns a iterator range of instructions in this bundle iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI); // Returns the number of instructions in the bundle size_t bundleSize(MCInst const &MCI); +// Put the packet in to canonical form, compound, duplex, pad, and shuffle +bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Checker); + // Clamp off upper 26 bits of extendable operand for emission -void clampExtended(MCInstrInfo const &MCII, MCInst &MCI); +void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI); + +MCInst createBundle(); + +// Return the extender for instruction at Index or nullptr if none +MCInst const *extenderForIndex(MCInst const &MCB, size_t Index); +void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1); +MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst, + MCOperand const &MO); // Convert this instruction in to a duplex subinst MCInst deriveSubInst(MCInst const &Inst); @@ -108,6 +137,9 @@ unsigned short 
getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI); // Return the operand that consumes or produces a new value. MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI); +unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI); +MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI); int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI); @@ -125,6 +157,7 @@ bool hasImmExt(MCInst const &MCI); // Return whether the instruction is a legal new-value producer. bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI); +bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI); // Return the instruction at Index MCInst const &instruction(MCInst const &MCB, size_t Index); @@ -134,10 +167,24 @@ bool isBundle(MCInst const &MCI); // Return whether the insn is an actual insn. bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI); +bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI); // Return the duplex iclass given the two duplex classes unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb); +int64_t minConstant(MCInst const &MCI, size_t Index); +template <unsigned N, unsigned S> +bool inRange(MCInst const &MCI, size_t Index) { + return isShiftedUInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N, unsigned S> +bool inSRange(MCInst const &MCI, size_t Index) { + return isShiftedInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) { + return isUInt<N>(minConstant(MCI, Index)); +} + // Return whether the instruction needs to be constant extended. bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI); @@ -173,6 +220,8 @@ bool isIntReg(unsigned Reg); // Is this register suitable for use in a duplex subinst bool isIntRegForSubInst(unsigned Reg); +bool isMemReorderDisabled(MCInst const &MCI); +bool isMemStoreReorderEnabled(MCInst const &MCI); // Return whether the insn is a new-value consumer. bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -191,6 +240,8 @@ bool isOuterLoop(MCInst const &MCI); // Return whether this instruction is predicated bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI); // Return whether the predicate sense is true bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -209,9 +260,10 @@ bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI); /// Return whether the insn can be packaged only with an A-type insn in slot #1. 
bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI); +bool isVector(MCInstrInfo const &MCII, MCInst const &MCI); // Pad the bundle with nops to satisfy endloop requirements -void padEndloop(MCInst &MCI); +void padEndloop(MCContext &Context, MCInst &MCI); bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI); @@ -220,6 +272,8 @@ void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate); // Marks a bundle as endloop0 void setInnerLoop(MCInst &MCI); +void setMemReorderDisabled(MCInst &MCI); +void setMemStoreReorderEnabled(MCInst &MCI); // Marks a bundle as endloop1 void setOuterLoop(MCInst &MCI); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 53305d8..9a29257 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -40,6 +40,20 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "HexagonGenRegisterInfo.inc" +cl::opt<bool> llvm::HexagonDisableCompound + ("mno-compound", + cl::desc("Disable looking for compound instructions for Hexagon")); + +cl::opt<bool> llvm::HexagonDisableDuplex + ("mno-pairing", + cl::desc("Disable looking for duplex instructions for Hexagon")); + +StringRef HEXAGON_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) { + if (CPU.empty()) + CPU = "hexagonv60"; + return CPU; +} + MCInstrInfo *llvm::createHexagonMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitHexagonMCInstrInfo(X); @@ -54,6 +68,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) { static MCSubtargetInfo * createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + CPU = HEXAGON_MC::selectHexagonCPU(TT, CPU); return createHexagonMCSubtargetInfoImpl(TT, CPU, FS); } @@ -76,28 +91,23 @@ public: StringRef Contents(Buffer); auto PacketBundle = Contents.rsplit('\n'); auto HeadTail = PacketBundle.first.split('\n'); - auto Preamble = "\t{\n\t\t"; - auto Separator = ""; - while(!HeadTail.first.empty()) { - OS << Separator; - StringRef Inst; + StringRef Separator = "\n"; + StringRef Indent = "\t\t"; + OS << "\t{\n"; + while (!HeadTail.first.empty()) { + StringRef InstTxt; auto Duplex = HeadTail.first.split('\v'); - if(!Duplex.second.empty()){ - OS << Duplex.first << "\n"; - Inst = Duplex.second; - } - else { - if(!HeadTail.first.startswith("immext")) - Inst = Duplex.first; + if (!Duplex.second.empty()) { + OS << Indent << Duplex.first << Separator; + InstTxt = Duplex.second; + } else if (!HeadTail.first.trim().startswith("immext")) { + InstTxt = Duplex.first; } - OS << Preamble; - OS << Inst; + if (!InstTxt.empty()) + OS << Indent << InstTxt << Separator; HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; } - if(HexagonMCInstrInfo::bundleSize(Inst) != 0) - OS << "\n\t}" << PacketBundle.second; + OS << "\t}" << PacketBundle.second; } }; } @@ -154,9 +164,9 @@ static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - // For the time being, use static relocations, since there's really no - // support for PIC yet. 
- X->initMCCodeGenInfo(Reloc::Static, CM, OL); + if (RM == Reloc::Default) + RM = Reloc::Static; + X->initMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index cb62650..a005a01 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -16,6 +16,8 @@ #include <cstdint> +#include "llvm/Support/CommandLine.h" + namespace llvm { struct InstrItinerary; struct InstrStage; @@ -33,22 +35,27 @@ class raw_ostream; class raw_pwrite_stream; extern Target TheHexagonTarget; - +extern cl::opt<bool> HexagonDisableCompound; +extern cl::opt<bool> HexagonDisableDuplex; extern const InstrStage HexagonStages[]; MCInstrInfo *createHexagonMCInstrInfo(); -MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII, - MCRegisterInfo const &MRI, +MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, MCContext &MCT); -MCAsmBackend *createHexagonAsmBackend(Target const &T, - MCRegisterInfo const &MRI, +MCAsmBackend *createHexagonAsmBackend(const Target &T, + const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU); MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, StringRef CPU); +namespace HEXAGON_MC { + StringRef selectHexagonCPU(const Triple &TT, StringRef CPU); +} + } // End llvm namespace // Define symbolic names for Hexagon registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 41112ac..4e1cce3 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -27,6 +27,7 @@ using namespace llvm; +namespace { // Insn shuffling priority. 
class HexagonBid { // The priority is directly proportional to how restricted the insn is based @@ -75,6 +76,7 @@ public: return false; }; }; +} // end anonymous namespace unsigned HexagonResource::setWeight(unsigned s) { const unsigned SlotWeight = 8; @@ -93,10 +95,57 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } +void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { + (*TUL)[HexagonII::TypeCVI_VA] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2); + (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VM_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VM_ST] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); +} + +HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL, + MCInstrInfo const &MCII, unsigned s, + MCInst const *id) + : HexagonResource(s), TUL(TUL) { + unsigned T = HexagonMCInstrInfo::getType(MCII, *id); + + if (TUL->count(T)) { + // For an HVX insn. + Valid = true; + setUnits((*TUL)[T].first); + setLanes((*TUL)[T].second); + setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad()); + setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore()); + } else { + // For core insns. + Valid = false; + setUnits(0); + setLanes(0); + setLoad(false); + setStore(false); + } +} + HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { reset(); + HexagonCVIResource::SetupTUL(&TUL, STI.getCPU()); } void HexagonShuffler::reset() { @@ -107,7 +156,7 @@ void HexagonShuffler::reset() { void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(ID, Extender, S, X); + HexagonInstr PI(&TUL, MCII, ID, Extender, S, X); Packet.push_back(PI); } @@ -126,6 +175,8 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + // Number of HVX loads, HVX stores. + unsigned CVIloads = 0, CVIstores = 0; // Number of duplex insns, solo insns. 
unsigned duplex = 0, solo = 0; // Number of insns restricting other insns in the packet to A and X types, @@ -168,6 +219,12 @@ bool HexagonShuffler::check() { case HexagonII::TypeJ: ++jumps; break; + case HexagonII::TypeCVI_VM_VP_LDU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_LD: + case HexagonII::TypeCVI_VM_TMP_LD: + case HexagonII::TypeCVI_VM_CUR_LD: + ++CVIloads; case HexagonII::TypeLD: ++loads; ++memory; @@ -176,6 +233,11 @@ bool HexagonShuffler::check() { if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn()) ++jumps, ++jump1; // DEALLOC_RETURN is of type LD. break; + case HexagonII::TypeCVI_VM_STU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_ST: + case HexagonII::TypeCVI_VM_NEW_ST: + ++CVIstores; case HexagonII::TypeST: ++stores; ++memory; @@ -203,9 +265,9 @@ bool HexagonShuffler::check() { } // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) || - (solo && size() > 1) || (onlyAX && neitherAnorX > 1) || - (onlyAX && xtypeFloat)) { + if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) || + (duplex > 1 || (duplex && memory)) || (solo && size() > 1) || + (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) { Error = SHUFFLE_ERROR_INVALID; return false; } @@ -336,6 +398,19 @@ bool HexagonShuffler::check() { return false; } } + // Verify the CVI slot subscriptions. + { + HexagonUnitAuction AuctionCVI; + + std::sort(begin(), end(), HexagonInstr::lessCVI); + + for (iterator I = begin(); I != end(); ++I) + for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid? + if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { + Error = SHUFFLE_ERROR_SLOTS; + return false; + } + } Error = SHUFFLE_SUCCESS; return true; diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 8b6c72e..a093f85 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; @@ -51,6 +52,46 @@ public: }; }; +// HVX insn resources. +class HexagonCVIResource : public HexagonResource { +public: + typedef std::pair<unsigned, unsigned> UnitsAndLanes; + typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; + +private: + // Available HVX slots. + enum { + CVI_NONE = 0, + CVI_XLANE = 1 << 0, + CVI_SHIFT = 1 << 1, + CVI_MPY0 = 1 << 2, + CVI_MPY1 = 1 << 3 + }; + + TypeUnitsAndLanes *TUL; + + // Count of adjacent slots that the insn requires to be executed. + unsigned Lanes; + // Flag whether the insn is a load or a store. + bool Load, Store; + // Flag whether the HVX resources are valid. + bool Valid; + + void setLanes(unsigned l) { Lanes = l; }; + void setLoad(bool f = true) { Load = f; }; + void setStore(bool f = true) { Store = f; }; + +public: + HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII, + unsigned s, MCInst const *id); + static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU); + + bool isValid() const { return (Valid); }; + unsigned getLanes() const { return (Lanes); }; + bool mayLoad() const { return (Load); }; + bool mayStore() const { return (Store); }; +}; + // Handle to an insn used by the shuffling algorithm. 
class HexagonInstr { friend class HexagonShuffler; @@ -58,12 +99,15 @@ class HexagonInstr { MCInst const *ID; MCInst const *Extender; HexagonResource Core; + HexagonCVIResource CVI; bool SoloException; public: - HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s, - bool x = false) - : ID(id), Extender(Extender), Core(s), SoloException(x){}; + HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T, + MCInstrInfo const &MCII, MCInst const *id, + MCInst const *Extender, unsigned s, bool x = false) + : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id), + SoloException(x) {}; MCInst const *getDesc() const { return (ID); }; @@ -79,6 +123,10 @@ public: static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) { return (HexagonResource::lessUnits(A.Core, B.Core)); }; + // Check if the handles are in ascending order by HVX slots. + static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) { + return (HexagonResource::lessUnits(A.CVI, B.CVI)); + }; }; // Bundle shuffler. @@ -92,6 +140,8 @@ class HexagonShuffler { // Shuffling error code. unsigned Error; + HexagonCVIResource::TypeUnitsAndLanes TUL; + protected: int64_t BundleFlags; MCInstrInfo const &MCII; @@ -108,6 +158,8 @@ public: SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns. SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns. SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots. + SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60). + SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict SHUFFLE_ERROR_UNKNOWN ///< Unknown error. }; diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp new file mode 100644 index 0000000..c547c71 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -0,0 +1,180 @@ +//===--- RDFCopy.cpp ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simplistic RDF-based copy propagation. + +#include "RDFCopy.h" +#include "RDFGraph.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +#include <atomic> + +#ifndef NDEBUG +static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden); +static unsigned CpCount = 0; +#endif + +using namespace llvm; +using namespace rdf; + +void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::COPY); + const MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + RegisterRef DstR = { Op0.getReg(), Op0.getSubReg() }; + RegisterRef SrcR = { Op1.getReg(), Op1.getSubReg() }; + auto FS = DefM.find(SrcR); + if (FS == DefM.end() || FS->second.empty()) + return; + Copies.push_back(SA.Id); + RDefMap[SrcR][SA.Id] = FS->second.top()->Id; + // Insert DstR into the map. 
+ RDefMap[DstR]; +} + + +void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { + RegisterSet RRs; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + RRs.insert(RA.Addr->getRegRef()); + bool Common = false; + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + Common = true; + break; + } + if (!Common) + return; + + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + auto F = DefM.find(R.first); + if (F == DefM.end() || F->second.empty()) + continue; + R.second[IA.Id] = F->second.top()->Id; + } +} + + +bool CopyPropagation::scanBlock(MachineBasicBlock *B) { + bool Changed = false; + auto BA = DFG.getFunc().Addr->findBlock(B, DFG); + DFG.markBlock(BA.Id, DefM); + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) { + NodeAddr<StmtNode*> SA = IA; + MachineInstr *MI = SA.Addr->getCode(); + if (MI->isCopy()) + recordCopy(SA, MI); + } + + updateMap(IA); + DFG.pushDefs(IA, DefM); + } + + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) + Changed |= scanBlock(I->getBlock()); + + DFG.releaseBlock(BA.Id, DefM); + return Changed; +} + + +bool CopyPropagation::run() { + scanBlock(&DFG.getMF().front()); + + if (trace()) { + dbgs() << "Copies:\n"; + for (auto I : Copies) + dbgs() << *DFG.addr<StmtNode*>(I).Addr->getCode(); + dbgs() << "\nRDef map:\n"; + for (auto R : RDefMap) { + dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {"; + for (auto &M : R.second) + dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':' + << Print<NodeId>(M.second, DFG); + dbgs() << " }\n"; + } + } + + bool Changed = false; + NodeSet Deleted; +#ifndef NDEBUG + bool HasLimit = CpLimit.getNumOccurrences() > 0; +#endif + + for (auto I : Copies) { +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; +#endif + if (Deleted.count(I)) + continue; + auto SA = DFG.addr<InstrNode*>(I); + NodeList Ds = SA.Addr->members_if(DFG.IsDef, DFG); + if (Ds.size() != 1) + continue; + NodeAddr<DefNode*> DA = Ds[0]; + RegisterRef DR0 = DA.Addr->getRegRef(); + NodeList Us = SA.Addr->members_if(DFG.IsUse, DFG); + if (Us.size() != 1) + continue; + NodeAddr<UseNode*> UA0 = Us[0]; + RegisterRef UR0 = UA0.Addr->getRegRef(); + NodeId RD0 = UA0.Addr->getReachingDef(); + + for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) { + auto UA = DFG.addr<UseNode*>(N); + NextN = UA.Addr->getSibling(); + uint16_t F = UA.Addr->getFlags(); + if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed)) + continue; + if (UA.Addr->getRegRef() != DR0) + continue; + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG); + assert(DFG.IsCode<NodeAttrs::Stmt>(IA)); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (RDefMap[UR0][IA.Id] != RD0) + continue; + MachineOperand &Op = UA.Addr->getOp(); + if (Op.isTied()) + continue; + if (trace()) { + dbgs() << "can replace " << Print<RegisterRef>(DR0, DFG) + << " with " << Print<RegisterRef>(UR0, DFG) << " in " + << *NodeAddr<StmtNode*>(IA).Addr->getCode(); + } + + Op.setReg(UR0.Reg); + Op.setSubReg(UR0.Sub); + Changed = true; +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; + CpCount++; +#endif + + if (MI->isCopy()) { + MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + if (Op0.getReg() == Op1.getReg() && Op0.getSubReg() == Op1.getSubReg()) + MI->eraseFromParent(); + Deleted.insert(IA.Id); + } + } + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h new file mode 100644 index 0000000..02531b9 --- /dev/null 
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h @@ -0,0 +1,48 @@ +//===--- RDFCopy.h --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef RDF_COPY_H +#define RDF_COPY_H + +#include "RDFGraph.h" +#include <map> +#include <vector> + +namespace llvm { + class MachineBasicBlock; + class MachineDominatorTree; + class MachineInstr; +} + +namespace rdf { + struct CopyPropagation { + CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), + Trace(false) {} + + bool run(); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + private: + const MachineDominatorTree &MDT; + DataFlowGraph &DFG; + DataFlowGraph::DefStackMap DefM; + bool Trace; + + // map: register -> (map: stmt -> reaching def) + std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap; + std::vector<NodeId> Copies; + + void recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI); + void updateMap(NodeAddr<InstrNode*> IA); + bool scanBlock(MachineBasicBlock *B); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp new file mode 100644 index 0000000..9566857 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -0,0 +1,204 @@ +//===--- RDFDeadCode.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "RDFDeadCode.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Check if the given instruction has observable side-effects, i.e. if +// it should be considered "live". It is safe for this function to be +// overly conservative (i.e. return "true" for all instructions), but it +// is not safe to return "false" for an instruction that should not be +// considered removable. 
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const { + if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn()) + return true; + if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects()) + return true; + if (MI->isPHI()) + return false; + for (auto &Op : MI->operands()) + if (Op.isReg() && MRI.isReserved(Op.getReg())) + return true; + return false; +} + +void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA, + SetVector<NodeId> &WorkQ) { + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + return; + if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + return; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) { + if (!LiveNodes.count(RA.Id)) + WorkQ.insert(RA.Id); + } +} + +void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA, + SetVector<NodeId> &WorkQ) { + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + if (!LiveNodes.count(UA.Id)) + WorkQ.insert(UA.Id); + } + for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA)) + LiveNodes.insert(TA.Id); +} + +void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA, + SetVector<NodeId> &WorkQ) { + for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) { + if (!LiveNodes.count(DA.Id)) + WorkQ.insert(DA.Id); + } +} + +// Traverse the DFG and collect the set dead RefNodes and the set of +// dead instructions. Return "true" if any of these sets is non-empty, +// "false" otherwise. +bool DeadCodeElimination::collect() { + // This function works by first finding all live nodes. The dead nodes + // are then the complement of the set of live nodes. + // + // Assume that all nodes are dead. Identify instructions which must be + // considered live, i.e. instructions with observable side-effects, such + // as calls and stores. All arguments of such instructions are considered + // live. For each live def, all operands used in the corresponding + // instruction are considered live. For each live use, all its reaching + // defs are considered live. + LiveNodes.clear(); + SetVector<NodeId> WorkQ; + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) + scanInstr(IA, WorkQ); + + while (!WorkQ.empty()) { + NodeId N = *WorkQ.begin(); + WorkQ.remove(N); + LiveNodes.insert(N); + auto RA = DFG.addr<RefNode*>(N); + if (DFG.IsDef(RA)) + processDef(RA, WorkQ); + else + processUse(RA, WorkQ); + } + + if (trace()) { + dbgs() << "Live nodes:\n"; + for (NodeId N : LiveNodes) { + auto RA = DFG.addr<RefNode*>(N); + dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n"; + } + } + + auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool { + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) + if (LiveNodes.count(DA.Id)) + return false; + return true; + }; + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + if (!LiveNodes.count(RA.Id)) + DeadNodes.insert(RA.Id); + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) + if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + continue; + if (IsDead(IA)) { + DeadInstrs.insert(IA.Id); + if (trace()) + dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n"; + } + } + } + + return !DeadNodes.empty(); +} + +// Erase the nodes given in the Nodes set from DFG. 
In addition to removing +// them from the DFG, if a node corresponds to a statement, the corresponding +// machine instruction is erased from the function. +bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) { + if (Nodes.empty()) + return false; + + // Prepare the actual set of ref nodes to remove: ref nodes from Nodes + // are included directly, for each InstrNode in Nodes, include the set + // of all RefNodes from it. + NodeList DRNs, DINs; + for (auto I : Nodes) { + auto BA = DFG.addr<NodeBase*>(I); + uint16_t Type = BA.Addr->getType(); + if (Type == NodeAttrs::Ref) { + DRNs.push_back(DFG.addr<RefNode*>(I)); + continue; + } + + // If it's a code node, add all ref nodes from it. + uint16_t Kind = BA.Addr->getKind(); + if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) { + for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG)) + DRNs.push_back(N); + DINs.push_back(DFG.addr<InstrNode*>(I)); + } else { + llvm_unreachable("Unexpected code node"); + return false; + } + } + + // Sort the list so that use nodes are removed first. This makes the + // "unlink" functions a bit faster. + auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool { + uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind(); + if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def) + return true; + if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use) + return false; + return A.Id < B.Id; + }; + std::sort(DRNs.begin(), DRNs.end(), UsesFirst); + + if (trace()) + dbgs() << "Removing dead ref nodes:\n"; + for (NodeAddr<RefNode*> RA : DRNs) { + if (trace()) + dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n'; + if (DFG.IsUse(RA)) + DFG.unlinkUse(RA); + else if (DFG.IsDef(RA)) + DFG.unlinkDef(RA); + } + + // Now, remove all dead instruction nodes. + for (NodeAddr<InstrNode*> IA : DINs) { + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + BA.Addr->removeMember(IA, DFG); + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + continue; + + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (trace()) + dbgs() << "erasing: " << *MI; + MI->eraseFromParent(); + } + return true; +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h new file mode 100644 index 0000000..f4373fb --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -0,0 +1,65 @@ +//===--- RDFDeadCode.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. +// +// The main interface of this class are functions "collect" and "erase". +// This allows custom processing of the function being optimized by a +// particular consumer. The simplest way to use this class would be to +// instantiate an object, and then simply call "collect" and "erase", +// passing the result of "getDeadInstrs()" to it. +// A more complex scenario would be to call "collect" first, then visit +// all post-increment instructions to see if the address update is dead +// or not, and if it is, convert the instruction to a non-updating form. +// After that "erase" can be called with the set of nodes including both, +// dead defs from the updating instructions and the nodes corresponding +// to the dead instructions. 
+ +#ifndef RDF_DEADCODE_H +#define RDF_DEADCODE_H + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" + +namespace llvm { + class MachineRegisterInfo; +} + +namespace rdf { + struct DeadCodeElimination { + DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri) + : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {} + + bool collect(); + bool erase(const SetVector<NodeId> &Nodes); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + SetVector<NodeId> getDeadNodes() { return DeadNodes; } + SetVector<NodeId> getDeadInstrs() { return DeadInstrs; } + DataFlowGraph &getDFG() { return DFG; } + + private: + bool Trace; + SetVector<NodeId> LiveNodes; + SetVector<NodeId> DeadNodes; + SetVector<NodeId> DeadInstrs; + DataFlowGraph &DFG; + MachineRegisterInfo &MRI; + Liveness LV; + + bool isLiveInstr(const MachineInstr *MI) const; + void scanInstr(NodeAddr<InstrNode*> IA, SetVector<NodeId> &WorkQ); + void processDef(NodeAddr<DefNode*> DA, SetVector<NodeId> &WorkQ); + void processUse(NodeAddr<UseNode*> UA, SetVector<NodeId> &WorkQ); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp new file mode 100644 index 0000000..9b47422 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp @@ -0,0 +1,1716 @@ +//===--- RDFGraph.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF). +// +#include "RDFGraph.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Printing functions. Have them here first, so that the rest of the code +// can use them. 
+namespace rdf { + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) { + auto &TRI = P.G.getTRI(); + if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) + OS << TRI.getName(P.Obj.Reg); + else + OS << '#' << P.Obj.Reg; + if (P.Obj.Sub > 0) { + OS << ':'; + if (P.Obj.Sub < TRI.getNumSubRegIndices()) + OS << TRI.getSubRegIndexName(P.Obj.Sub); + else + OS << '#' << P.Obj.Sub; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { + auto NA = P.G.addr<NodeBase*>(P.Obj); + uint16_t Attrs = NA.Addr->getAttrs(); + uint16_t Kind = NodeAttrs::kind(Attrs); + uint16_t Flags = NodeAttrs::flags(Attrs); + switch (NodeAttrs::type(Attrs)) { + case NodeAttrs::Code: + switch (Kind) { + case NodeAttrs::Func: OS << 'f'; break; + case NodeAttrs::Block: OS << 'b'; break; + case NodeAttrs::Stmt: OS << 's'; break; + case NodeAttrs::Phi: OS << 'p'; break; + default: OS << "c?"; break; + } + break; + case NodeAttrs::Ref: + if (Flags & NodeAttrs::Preserving) + OS << '+'; + if (Flags & NodeAttrs::Clobbering) + OS << '~'; + switch (Kind) { + case NodeAttrs::Use: OS << 'u'; break; + case NodeAttrs::Def: OS << 'd'; break; + case NodeAttrs::Block: OS << 'b'; break; + default: OS << "r?"; break; + } + break; + default: + OS << '?'; + break; + } + OS << P.Obj; + if (Flags & NodeAttrs::Shadow) + OS << '"'; + return OS; +} + +namespace { + void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA, + const DataFlowGraph &G) { + OS << Print<NodeId>(RA.Id, G) << '<' + << Print<RegisterRef>(RA.Addr->getRegRef(), G) << '>'; + if (RA.Addr->getFlags() & NodeAttrs::Fixed) + OS << '!'; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedUse()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<PhiUseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getPredecessor()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Def: + OS << PrintNode<DefNode*>(P.Obj, P.G); + break; + case NodeAttrs::Use: + if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) + OS << PrintNode<PhiUseNode*>(P.Obj, P.G); + else + OS << PrintNode<UseNode*>(P.Obj, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I.Id, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +template<> +raw_ostream 
&operator<< (raw_ostream &OS, const Print<NodeSet> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +namespace { + template <typename T> + struct PrintListV { + PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} + typedef T Type; + const NodeList &List; + const DataFlowGraph &G; + }; + + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) { + unsigned N = P.List.size(); + for (NodeAddr<T> A : P.List) { + OS << PrintNode<T>(A, P.G); + if (--N) + OS << ", "; + } + return OS; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) { + OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi [" + << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<StmtNode*>> &P) { + unsigned Opc = P.Obj.Addr->getCode()->getOpcode(); + OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc) + << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<InstrNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Phi: + OS << PrintNode<PhiNode*>(P.Obj, P.G); + break; + case NodeAttrs::Stmt: + OS << PrintNode<StmtNode*>(P.Obj, P.G); + break; + default: + OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<BlockNode*>> &P) { + auto *BB = P.Obj.Addr->getCode(); + unsigned NP = BB->pred_size(); + std::vector<int> Ns; + auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void { + unsigned N = Ns.size(); + for (auto I : Ns) { + OS << "BB#" << I; + if (--N) + OS << ", "; + } + }; + + OS << Print<NodeId>(P.Obj.Id, P.G) << ": === BB#" << BB->getNumber() + << " === preds(" << NP << "): "; + for (auto I : BB->predecessors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + + unsigned NS = BB->succ_size(); + OS << " succs(" << NS << "): "; + Ns.clear(); + for (auto I : BB->successors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + OS << '\n'; + + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<InstrNode*>(I, P.G) << '\n'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<FuncNode*>> &P) { + OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: " + << P.Obj.Addr->getCode()->getName() << '\n'; + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<BlockNode*>(I, P.G) << '\n'; + OS << "]\n"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { + OS << '{'; + for (auto I : P.Obj) + OS << ' ' << Print<RegisterRef>(I, P.G); + OS << " }"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<DataFlowGraph::DefStack> &P) { + for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { + OS << Print<NodeId>(I->Id, P.G) + << '<' << Print<RegisterRef>(I->Addr->getRegRef(), P.G) << '>'; + I.down(); + if (I != E) + OS << ' '; + } + return OS; +} + +} // namespace rdf + +// Node allocation functions. +// +// Node allocator is like a slab memory allocator: it allocates blocks of +// memory in sizes that are multiples of the size of a node. Each block has +// the same size. 
Nodes are allocated from the currently active block, and +// when it becomes full, a new one is created. +// There is a mapping scheme between node id and its location in a block, +// and within that block is described in the header file. +// +void NodeAllocator::startNewBlock() { + void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize); + char *P = static_cast<char*>(T); + Blocks.push_back(P); + // Check if the block index is still within the allowed range, i.e. less + // than 2^N, where N is the number of bits in NodeId for the block index. + // BitsPerIndex is the number of bits per node index. + assert((Blocks.size() < (1U << (8*sizeof(NodeId)-BitsPerIndex))) && + "Out of bits for block index"); + ActiveEnd = P; +} + +bool NodeAllocator::needNewBlock() { + if (Blocks.empty()) + return true; + + char *ActiveBegin = Blocks.back(); + uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize; + return Index >= NodesPerBlock; +} + +NodeAddr<NodeBase*> NodeAllocator::New() { + if (needNewBlock()) + startNewBlock(); + + uint32_t ActiveB = Blocks.size()-1; + uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize; + NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd), + makeId(ActiveB, Index) }; + ActiveEnd += NodeMemSize; + return NA; +} + +NodeId NodeAllocator::id(const NodeBase *P) const { + uintptr_t A = reinterpret_cast<uintptr_t>(P); + for (unsigned i = 0, n = Blocks.size(); i != n; ++i) { + uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]); + if (A < B || A >= B + NodesPerBlock*NodeMemSize) + continue; + uint32_t Idx = (A-B)/NodeMemSize; + return makeId(i, Idx); + } + llvm_unreachable("Invalid node address"); +} + +void NodeAllocator::clear() { + MemPool.Reset(); + Blocks.clear(); + ActiveEnd = nullptr; +} + + +// Insert node NA after "this" in the circular chain. +void NodeBase::append(NodeAddr<NodeBase*> NA) { + NodeId Nx = Next; + // If NA is already "next", do nothing. + if (Next != NA.Id) { + Next = NA.Id; + NA.Addr->Next = Nx; + } +} + + +// Fundamental node manipulator functions. + +// Obtain the register reference from a reference node. +RegisterRef RefNode::getRegRef() const { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef) + return Ref.RR; + assert(Ref.Op != nullptr); + return { Ref.Op->getReg(), Ref.Op->getSubReg() }; +} + +// Set the register reference in the reference node directly (for references +// in phi nodes). +void RefNode::setRegRef(RegisterRef RR) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef); + Ref.RR = RR; +} + +// Set the register reference in the reference node based on a machine +// operand (for references in statement nodes). +void RefNode::setRegRef(MachineOperand *Op) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)); + Ref.Op = Op; +} + +// Get the owner of a given reference node. +NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Code) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Connect the def node to the reaching def node. +void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedDef(); + DA.Addr->setReachedDef(Self); +} + +// Connect the use node to the reaching def node. 
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedUse(); + DA.Addr->setReachedUse(Self); +} + +// Get the first member of the code node. +NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const { + if (Code.FirstM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.FirstM); +} + +// Get the last member of the code node. +NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const { + if (Code.LastM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.LastM); +} + +// Add node NA at the end of the member list of the given code node. +void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto ML = getLastMember(G); + if (ML.Id != 0) { + ML.Addr->append(NA); + } else { + Code.FirstM = NA.Id; + NodeId Self = G.id(this); + NA.Addr->setNext(Self); + } + Code.LastM = NA.Id; +} + +// Add node NA after member node MA in the given code node. +void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G) { + MA.Addr->append(NA); + if (Code.LastM == MA.Id) + Code.LastM = NA.Id; +} + +// Remove member node NA from the given code node. +void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto MA = getFirstMember(G); + assert(MA.Id != 0); + + // Special handling if the member to remove is the first member. + if (MA.Id == NA.Id) { + if (Code.LastM == MA.Id) { + // If it is the only member, set both first and last to 0. + Code.FirstM = Code.LastM = 0; + } else { + // Otherwise, advance the first member. + Code.FirstM = MA.Addr->getNext(); + } + return; + } + + while (MA.Addr != this) { + NodeId MX = MA.Addr->getNext(); + if (MX == NA.Id) { + MA.Addr->setNext(NA.Addr->getNext()); + // If the member to remove happens to be the last one, update the + // LastM indicator. + if (Code.LastM == NA.Id) + Code.LastM = MA.Id; + return; + } + MA = G.addr<NodeBase*>(MX); + } + llvm_unreachable("No such member"); +} + +// Return the list of all members of the code node. +NodeList CodeNode::members(const DataFlowGraph &G) const { + static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; }; + return members_if(True, G); +} + +// Return the owner of the given instr node. +NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + assert(NA.Addr->getType() == NodeAttrs::Code); + if (NA.Addr->getKind() == NodeAttrs::Block) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Add the phi node PA to the given block node. +void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { + auto M = getFirstMember(G); + if (M.Id == 0) { + addMember(PA, G); + return; + } + + assert(M.Addr->getType() == NodeAttrs::Code); + if (M.Addr->getKind() == NodeAttrs::Stmt) { + // If the first member of the block is a statement, insert the phi as + // the first member. + Code.FirstM = PA.Id; + PA.Addr->setNext(M.Id); + } else { + // If the first member is a phi, find the last phi, and append PA to it. + assert(M.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<NodeBase*> MN = M; + do { + M = MN; + MN = G.addr<NodeBase*>(M.Addr->getNext()); + assert(MN.Addr->getType() == NodeAttrs::Code); + } while (MN.Addr->getKind() == NodeAttrs::Phi); + + // M is the last phi. 
+ addMemberAfter(M, PA, G); + } +} + +// Find the block node corresponding to the machine basic block BB in the +// given func node. +NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const { + auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool { + return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB; + }; + NodeList Ms = members_if(EqBB, G); + if (!Ms.empty()) + return Ms[0]; + return NodeAddr<BlockNode*>(); +} + +// Get the block node for the entry block in the given function. +NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { + MachineBasicBlock *EntryB = &getCode()->front(); + return findBlock(EntryB, G); +} + + +// Register aliasing information. +// +// In theory, the lane information could be used to determine register +// covering (and aliasing), but depending on the sub-register structure, +// the lane mask information may be missing. The covering information +// must be available for this framework to work, so relying solely on +// the lane data is not sufficient. + +// Determine whether RA covers RB. +bool RegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + if (TargetRegisterInfo::isVirtualRegister(RA.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RB.Reg)); + if (RA.Reg != RB.Reg) + return false; + if (RA.Sub == 0) + return true; + return TRI.composeSubRegIndices(RA.Sub, RB.Sub) == RA.Sub; + } + + assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg) && + TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + unsigned A = RA.Sub != 0 ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub != 0 ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + return TRI.isSubRegister(A, B); +} + +// Determine whether RR is covered by the set of references RRs. +bool RegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) const { + if (RRs.count(RR)) + return true; + + // For virtual registers, we cannot accurately determine covering based + // on subregisters. If RR itself is not present in RRs, but it has a sub- + // register reference, check for the super-register alone. Otherwise, + // assume non-covering. + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (RR.Sub != 0) + return RRs.count({RR.Reg, 0}); + return false; + } + + // If any super-register of RR is present, then RR is covered. + unsigned Reg = RR.Sub == 0 ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + for (MCSuperRegIterator SR(Reg, &TRI); SR.isValid(); ++SR) + if (RRs.count({*SR, 0})) + return true; + + return false; +} + +// Get the list of references aliased to RR. +std::vector<RegisterRef> RegisterAliasInfo::getAliasSet(RegisterRef RR) const { + // Do not include RR in the alias set. For virtual registers return an + // empty set. + std::vector<RegisterRef> AS; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return AS; + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + unsigned R = RR.Reg; + if (RR.Sub) + R = TRI.getSubReg(RR.Reg, RR.Sub); + + for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI) + AS.push_back(RegisterRef({*AI, 0})); + return AS; +} + +// Check whether RA and RB are aliased. 
+bool RegisterAliasInfo::alias(RegisterRef RA, RegisterRef RB) const { + bool VirtA = TargetRegisterInfo::isVirtualRegister(RA.Reg); + bool VirtB = TargetRegisterInfo::isVirtualRegister(RB.Reg); + bool PhysA = TargetRegisterInfo::isPhysicalRegister(RA.Reg); + bool PhysB = TargetRegisterInfo::isPhysicalRegister(RB.Reg); + + if (VirtA != VirtB) + return false; + + if (VirtA) { + if (RA.Reg != RB.Reg) + return false; + // RA and RB refer to the same register. If any of them refer to the + // whole register, they must be aliased. + if (RA.Sub == 0 || RB.Sub == 0) + return true; + unsigned SA = TRI.getSubRegIdxSize(RA.Sub); + unsigned OA = TRI.getSubRegIdxOffset(RA.Sub); + unsigned SB = TRI.getSubRegIdxSize(RB.Sub); + unsigned OB = TRI.getSubRegIdxOffset(RB.Sub); + if (OA <= OB && OA+SA > OB) + return true; + if (OB <= OA && OB+SB > OA) + return true; + return false; + } + + assert(PhysA && PhysB); + (void)PhysA, (void)PhysB; + unsigned A = RA.Sub ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + for (MCRegAliasIterator I(A, &TRI, true); I.isValid(); ++I) + if (B == *I) + return true; + return false; +} + + +// Target operand information. +// + +// For a given instruction, check if there are any bits of RR that can remain +// unchanged across this def. +bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum) + const { + return TII.isPredicated(&In); +} + +// Check if the definition of RR produces an unspecified value. +bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall()) + if (In.getOperand(OpNum).isImplicit()) + return true; + return false; +} + +// Check if the given instruction specifically requires +bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall() || In.isReturn()) + return true; + const MCInstrDesc &D = In.getDesc(); + if (!D.getImplicitDefs() && !D.getImplicitUses()) + return false; + const MachineOperand &Op = In.getOperand(OpNum); + // If there is a sub-register, treat the operand as non-fixed. Currently, + // fixed registers are those that are listed in the descriptor as implicit + // uses or defs, and those lists do not allow sub-registers. + if (Op.getSubReg() != 0) + return false; + unsigned Reg = Op.getReg(); + const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs() + : D.getImplicitUses(); + if (!ImpR) + return false; + while (*ImpR) + if (*ImpR++ == Reg) + return true; + return false; +} + + +// +// The data flow graph construction. +// + +DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi) + : TimeG("rdf"), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), RAI(rai), + TOI(toi) { +} + + +// The implementation of the definition stack. +// Each register reference has its own definition stack. In particular, +// for a register references "Reg" and "Reg:subreg" will each have their +// own definition stacks. + +// Construct a stack iterator. +DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, + bool Top) : DS(S) { + if (!Top) { + // Initialize to bottom. + Pos = 0; + return; + } + // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty). 
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// the references "Reg" and "Reg:subreg" will each have their own
+// definition stacks.
+
+// Construct a stack iterator.
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+      bool Top) : DS(S) {
+  if (!Top) {
+    // Initialize to bottom.
+    Pos = 0;
+    return;
+  }
+  // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+  Pos = DS.Stack.size();
+  while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
+    Pos--;
+}
+
+// Return the size of the stack, not counting block delimiters (the
+// iteration from top() to bottom() skips them).
+unsigned DataFlowGraph::DefStack::size() const {
+  unsigned S = 0;
+  for (auto I = top(), E = bottom(); I != E; I.down())
+    S++;
+  return S;
+}
+
+// Remove the top entry from the stack. Remove all intervening delimiters
+// so that after this, the stack is either empty, or the top of the stack
+// is a non-delimiter.
+void DataFlowGraph::DefStack::pop() {
+  assert(!empty());
+  unsigned P = nextDown(Stack.size());
+  Stack.resize(P);
+}
+
+// Push a delimiter for block node N on the stack.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+  assert(N != 0);
+  Stack.push_back(NodeAddr<DefNode*>(nullptr, N));
+}
+
+// Remove all nodes from the top of the stack, until the delimiter for
+// block node N is encountered. Remove the delimiter as well. In effect,
+// this will remove from the stack all definitions from block N.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+  assert(N != 0);
+  unsigned P = Stack.size();
+  while (P > 0) {
+    bool Found = isDelimiter(Stack[P-1], N);
+    P--;
+    if (Found)
+      break;
+  }
+  // This will also remove the delimiter, if found.
+  Stack.resize(P);
+}
+
+// Move the stack iterator up by one.
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+  // Get the next valid position after P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  unsigned SS = Stack.size();
+  bool IsDelim;
+  assert(P < SS);
+  do {
+    P++;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P < SS && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+// Move the stack iterator down by one.
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+  // Get the preceding valid position before P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  assert(P > 0 && P <= Stack.size());
+  bool IsDelim = isDelimiter(Stack[P-1]);
+  do {
+    if (--P == 0)
+      break;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P > 0 && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+// Node management functions.
+
+// Get the pointer to the node with the id N.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+  if (N == 0)
+    return nullptr;
+  return Memory.ptr(N);
+}
+
+// Get the id of the node at the address P.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+  if (P == nullptr)
+    return 0;
+  return Memory.id(P);
+}
+
+// Allocate a new node and set the attributes to Attrs.
+NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
+  NodeAddr<NodeBase*> P = Memory.New();
+  P.Addr->init();
+  P.Addr->setAttrs(Attrs);
+  return P;
+}
+
+// Make a copy of the given node B, except for the data-flow links, which
+// are set to 0.
+NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
+  NodeAddr<NodeBase*> NA = newNode(0);
+  memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
+  // Ref nodes need to have the data-flow links reset.
+  if (NA.Addr->getType() == NodeAttrs::Ref) {
+    NodeAddr<RefNode*> RA = NA;
+    RA.Addr->setReachingDef(0);
+    RA.Addr->setSibling(0);
+    if (NA.Addr->getKind() == NodeAttrs::Def) {
+      NodeAddr<DefNode*> DA = NA;
+      DA.Addr->setReachedDef(0);
+      DA.Addr->setReachedUse(0);
+    }
+  }
+  return NA;
+}
+
+
+// Allocation routines for specific node types/kinds.
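+// The code variants below (newPhi, newStmt, newBlock) also insert the new
+// node into its owner's member list; for the reference variants (newUse,
+// newPhiUse, newDef) the caller is responsible for calling addMember, as
+// buildStmt and buildPhis do.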
+ +NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + UA.Addr->setRegRef(&Op); + return UA; +} + +NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) { + NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + assert(Flags & NodeAttrs::PhiRef); + PUA.Addr->setRegRef(RR); + PUA.Addr->setPredecessor(PredB.Id); + return PUA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + DA.Addr->setRegRef(&Op); + return DA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + assert(Flags & NodeAttrs::PhiRef); + DA.Addr->setRegRef(RR); + return DA; +} + +NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) { + NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); + Owner.Addr->addPhi(PA, *this); + return PA; +} + +NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI) { + NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); + SA.Addr->setCode(MI); + Owner.Addr->addMember(SA, *this); + return SA; +} + +NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB) { + NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block); + BA.Addr->setCode(BB); + Owner.Addr->addMember(BA, *this); + return BA; +} + +NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) { + NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func); + FA.Addr->setCode(MF); + return FA; +} + +// Build the data flow graph. +void DataFlowGraph::build() { + reset(); + Func = newFunc(&MF); + + if (MF.empty()) + return; + + for (auto &B : MF) { + auto BA = newBlock(Func, &B); + for (auto &I : B) { + if (I.isDebugValue()) + continue; + buildStmt(BA, I); + } + } + + // Collect information about block references. + NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this); + BlockRefsMap RefM; + buildBlockRefs(EA, RefM); + + // Add function-entry phi nodes. + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + NodeAddr<PhiNode*> PA = newPhi(EA); + RegisterRef RR = { I->first, 0 }; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + + // Build a map "PhiM" which will contain, for each block, the set + // of references that will require phi definitions in that block. + BlockRefsMap PhiM; + auto Blocks = Func.Addr->members(*this); + for (NodeAddr<BlockNode*> BA : Blocks) + recordDefsForDF(PhiM, RefM, BA); + for (NodeAddr<BlockNode*> BA : Blocks) + buildPhis(PhiM, RefM, BA); + + // Link all the refs. This will recursively traverse the dominator tree. + DefStackMap DM; + linkBlockRefs(DM, EA); + + // Finally, remove all unused phi nodes. + removeUnusedPhis(); +} + +// For each stack in the map DefM, push the delimiter for block B on it. +void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) { + // Push block delimiters. 
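+  // For example, if DefM currently holds stacks for, say, {R0,0} and {R1,0},
+  // entering block B pushes the delimiter (nullptr, B) onto both stacks, so
+  // that releaseBlock(B, DefM) can later pop everything that B contributed.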
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.start_block(B); +} + +// Remove all definitions coming from block B from each stack in DefM. +void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) { + // Pop all defs from this block from the definition stack. Defs that were + // added to the map during the traversal of instructions will not have a + // delimiter, but for those, the whole stack will be emptied. + for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.clear_block(B); + + // Finally, remove empty stacks from the map. + for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + // This preserves the validity of iterators other than I. + if (I->second.empty()) + DefM.erase(I); + } +} + +// Push all definitions from the instruction node IA to an appropriate +// stack in DefM. +void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { + NodeList Defs = IA.Addr->members_if(IsDef, *this); + NodeSet Visited; +#ifndef NDEBUG + RegisterSet Defined; +#endif + + // The important objectives of this function are: + // - to be able to handle instructions both while the graph is being + // constructed, and after the graph has been constructed, and + // - maintain proper ordering of definitions on the stack for each + // register reference: + // - if there are two or more related defs in IA (i.e. coming from + // the same machine operand), then only push one def on the stack, + // - if there are multiple unrelated defs of non-overlapping + // subregisters of S, then the stack for S will have both (in an + // unspecified order), but the order does not matter from the data- + // -flow perspective. + + for (NodeAddr<DefNode*> DA : Defs) { + if (Visited.count(DA.Id)) + continue; + NodeList Rel = getRelatedRefs(IA, DA); + NodeAddr<DefNode*> PDA = Rel.front(); + // Push the definition on the stack for the register and all aliases. + RegisterRef RR = PDA.Addr->getRegRef(); +#ifndef NDEBUG + // Assert if the register is defined in two or more unrelated defs. + // This could happen if there are two or more def operands defining it. + if (!Defined.insert(RR).second) { + auto *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + dbgs() << "Multiple definitions of register: " + << Print<RegisterRef>(RR, *this) << " in\n " << *MI + << "in BB#" << MI->getParent()->getNumber() << '\n'; + llvm_unreachable(nullptr); + } +#endif + DefM[RR].push(DA); + for (auto A : RAI.getAliasSet(RR)) { + assert(A != RR); + DefM[A].push(DA); + } + // Mark all the related defs as visited. + for (auto T : Rel) + Visited.insert(T.Id); + } +} + +// Return the list of all reference nodes related to RA, including RA itself. +// See "getNextRelated" for the meaning of a "related reference". +NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeList Refs; + NodeId Start = RA.Id; + do { + Refs.push_back(RA); + RA = getNextRelated(IA, RA); + } while (RA.Id != 0 && RA.Id != Start); + return Refs; +} + + +// Clear all information in the graph. +void DataFlowGraph::reset() { + Memory.clear(); + Func = NodeAddr<FuncNode*>(); +} + + +// Return the next reference node in the instruction node IA that is related +// to RA. Conceptually, two reference nodes are related if they refer to the +// same instance of a register access, but differ in flags or other minor +// characteristics. Specific examples of related nodes are shadow reference +// nodes. 
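+// For example, a def of a register and its shadow defs within the same
+// statement are all related: they have the same kind and register reference,
+// and (for statements) originate from the same machine operand.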
+// Return the equivalent of nullptr if there are no more related references.
+NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  auto Related = [RA](NodeAddr<RefNode*> TA) -> bool {
+    if (TA.Addr->getKind() != RA.Addr->getKind())
+      return false;
+    if (TA.Addr->getRegRef() != RA.Addr->getRegRef())
+      return false;
+    return true;
+  };
+  auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    return Related(TA) &&
+           &RA.Addr->getOp() == &TA.Addr->getOp();
+  };
+  auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (!Related(TA))
+      return false;
+    if (TA.Addr->getKind() != NodeAttrs::Use)
+      return true;
+    // For phi uses, compare predecessor blocks.
+    const NodeAddr<const PhiUseNode*> TUA = TA;
+    const NodeAddr<const PhiUseNode*> RUA = RA;
+    return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor();
+  };
+
+  RegisterRef RR = RA.Addr->getRegRef();
+  if (IA.Addr->getKind() == NodeAttrs::Stmt)
+    return RA.Addr->getNextRef(RR, RelatedStmt, true, *this);
+  return RA.Addr->getNextRef(RR, RelatedPhi, true, *this);
+}
+
+// Find the next node related to RA in IA that satisfies condition P.
+// If such a node was found, return a pair where the second element is the
+// located node. If such a node does not exist, return a pair where the
+// first element is the element after which such a node should be inserted,
+// and the second element is a null-address.
+template <typename Predicate>
+std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+      Predicate P) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  NodeAddr<RefNode*> NA;
+  NodeId Start = RA.Id;
+  while (true) {
+    NA = getNextRelated(IA, RA);
+    if (NA.Id == 0 || NA.Id == Start)
+      break;
+    if (P(NA))
+      break;
+    RA = NA;
+  }
+
+  if (NA.Id != 0 && NA.Id != Start)
+    return std::make_pair(RA, NA);
+  return std::make_pair(RA, NodeAddr<RefNode*>());
+}
+
+// Get the next shadow node in IA corresponding to RA, and optionally create
+// such a node if it does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA, bool Create) {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  auto Loc = locateNextRef(IA, RA, IsShadow);
+  if (Loc.second.Id != 0 || !Create)
+    return Loc.second;
+
+  // Create a copy of RA and mark it as shadow.
+  NodeAddr<RefNode*> NA = cloneNode(RA);
+  NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
+  IA.Addr->addMemberAfter(Loc.first, NA, *this);
+  return NA;
+}
+
+// Get the next shadow node in IA corresponding to RA. Return null-address
+// if such a node does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  return locateNextRef(IA, RA, IsShadow).second;
+}
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction MI.
+void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
+  auto SA = newStmt(BA, &In);
+
+  // Collect a set of registers that this instruction implicitly uses
+  // or defines.
Implicit operands from an instruction will be ignored + // unless they are listed here. + RegisterSet ImpUses, ImpDefs; + if (const uint16_t *ImpD = In.getDesc().getImplicitDefs()) + while (uint16_t R = *ImpD++) + ImpDefs.insert({R, 0}); + if (const uint16_t *ImpU = In.getDesc().getImplicitUses()) + while (uint16_t R = *ImpU++) + ImpUses.insert({R, 0}); + + bool IsCall = In.isCall(), IsReturn = In.isReturn(); + bool IsPredicated = TII.isPredicated(&In); + unsigned NumOps = In.getNumOperands(); + + // Avoid duplicate implicit defs. This will not detect cases of implicit + // defs that define registers that overlap, but it is not clear how to + // interpret that in the absence of explicit defs. Overlapping explicit + // defs are likely illegal already. + RegisterSet DoneDefs; + // Process explicit defs first. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + // Process implicit defs, skipping those that have already been added + // as explicit. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + if (!IsCall && !ImpDefs.count(RR)) + continue; + if (DoneDefs.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isUse()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + // Add implicit uses on return and call instructions, and on predicated + // instructions regardless of whether or not they appear in the instruction + // descriptor's list. + bool Implicit = Op.isImplicit(); + bool TakeImplicit = IsReturn || IsCall || IsPredicated; + if (Implicit && !TakeImplicit && !ImpUses.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<UseNode*> UA = newUse(SA, Op, Flags); + SA.Addr->addMember(UA, *this); + } +} + +// Build a map that for each block will have the set of all references from +// that block, and from all blocks dominated by it. 
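+// For example, with a dominator tree where B0's children are B1 and B2,
+// RefM[B0] ends up containing every register reference that appears in B0,
+// B1, or B2 (and in anything those blocks dominate).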
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA,
+      BlockRefsMap &RefM) {
+  auto &Refs = RefM[BA.Id];
+  MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+  assert(N);
+  for (auto I : *N) {
+    MachineBasicBlock *SB = I->getBlock();
+    auto SBA = Func.Addr->findBlock(SB, *this);
+    buildBlockRefs(SBA, RefM);
+    const auto &SRs = RefM[SBA.Id];
+    Refs.insert(SRs.begin(), SRs.end());
+  }
+
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+    for (NodeAddr<RefNode*> RA : IA.Addr->members(*this))
+      Refs.insert(RA.Addr->getRegRef());
+}
+
+// Scan all defs in the block node BA and record in PhiM the locations of
+// phi nodes corresponding to these defs.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check all defs from block BA and record them in each block in BA's
+  // iterated dominance frontier. This information will later be used to
+  // create phi nodes.
+  MachineBasicBlock *BB = BA.Addr->getCode();
+  assert(BB);
+  auto DFLoc = MDF.find(BB);
+  if (DFLoc == MDF.end() || DFLoc->second.empty())
+    return;
+
+  // Traverse all instructions in the block and collect the set of all
+  // defined references. For each reference there will be a phi created
+  // in the block's iterated dominance frontier.
+  // This is done to make sure that each defined reference gets only one
+  // phi node, even if it is defined multiple times.
+  RegisterSet Defs;
+  for (auto I : BA.Addr->members(*this)) {
+    assert(I.Addr->getType() == NodeAttrs::Code);
+    assert(I.Addr->getKind() == NodeAttrs::Phi ||
+           I.Addr->getKind() == NodeAttrs::Stmt);
+    NodeAddr<InstrNode*> IA = I;
+    for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this))
+      Defs.insert(RA.Addr->getRegRef());
+  }
+
+  // Finally, add the set of defs to each block in the iterated dominance
+  // frontier.
+  const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
+  SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
+  for (unsigned i = 0; i < IDF.size(); ++i) {
+    auto F = MDF.find(IDF[i]);
+    if (F != MDF.end())
+      IDF.insert(F->second.begin(), F->second.end());
+  }
+
+  // Get the register references that are reachable from this block.
+  RegisterSet &Refs = RefM[BA.Id];
+  for (auto DB : IDF) {
+    auto DBA = Func.Addr->findBlock(DB, *this);
+    const auto &Rs = RefM[DBA.Id];
+    Refs.insert(Rs.begin(), Rs.end());
+  }
+
+  for (auto DB : IDF) {
+    auto DBA = Func.Addr->findBlock(DB, *this);
+    PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
+  }
+}
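+
+// For example, in a diamond-shaped CFG where B0 branches to B1 and B2, and
+// both fall through to B3, the dominance frontier of B1 and of B2 is {B3}.
+// A def of R0 in B1 therefore inserts R0 into PhiM[B3], and buildPhis (below)
+// will create in B3 a phi with a def of R0 and one phi use per predecessor.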
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check if this block has any DF defs, i.e. if there are any defs
+  // that this block is in the iterated dominance frontier of.
+  auto HasDF = PhiM.find(BA.Id);
+  if (HasDF == PhiM.end() || HasDF->second.empty())
+    return;
+
+  // First, remove all R in Refs such that there exists T in Refs
+  // such that T covers R. In other words, only leave those refs that
+  // are not covered by another ref (i.e. maximal with respect to covering).
+
+  auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
+    for (auto I : RRs)
+      if (I != RR && RAI.covers(I, RR))
+        RR = I;
+    return RR;
+  };
+
+  RegisterSet MaxDF;
+  for (auto I : HasDF->second)
+    MaxDF.insert(MaxCoverIn(I, HasDF->second));
+
+  std::vector<RegisterRef> MaxRefs;
+  auto &RefB = RefM[BA.Id];
+  for (auto I : MaxDF)
+    MaxRefs.push_back(MaxCoverIn(I, RefB));
+
+  // Now, for each R in MaxRefs, get the alias closure of R. If the closure
+  // only has R in it, create a phi with a single def for R. Otherwise,
+  // create a phi and add a def for each S in the closure.
+
+  // Sort the refs so that the phis will be created in a deterministic order.
+  std::sort(MaxRefs.begin(), MaxRefs.end());
+  // Remove duplicates.
+  auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
+  MaxRefs.erase(NewEnd, MaxRefs.end());
+
+  auto Aliased = [this,&MaxRefs](RegisterRef RR,
+                                 std::vector<unsigned> &Closure) -> bool {
+    for (auto I : Closure)
+      if (RAI.alias(RR, MaxRefs[I]))
+        return true;
+    return false;
+  };
+
+  // Prepare a list of NodeIds of the block's predecessors.
+  std::vector<NodeId> PredList;
+  const MachineBasicBlock *MBB = BA.Addr->getCode();
+  for (auto PB : MBB->predecessors()) {
+    auto B = Func.Addr->findBlock(PB, *this);
+    PredList.push_back(B.Id);
+  }
+
+  while (!MaxRefs.empty()) {
+    // Put the first element in the closure, and then add all subsequent
+    // elements from MaxRefs to it, if they alias at least one element
+    // already in the closure.
+    // ClosureIdx: vector of indices in MaxRefs of members of the closure.
+    std::vector<unsigned> ClosureIdx = { 0 };
+    for (unsigned i = 1; i != MaxRefs.size(); ++i)
+      if (Aliased(MaxRefs[i], ClosureIdx))
+        ClosureIdx.push_back(i);
+
+    // Build a phi for the closure.
+    unsigned CS = ClosureIdx.size();
+    NodeAddr<PhiNode*> PA = newPhi(BA);
+
+    // Add defs.
+    for (unsigned X = 0; X != CS; ++X) {
+      RegisterRef RR = MaxRefs[ClosureIdx[X]];
+      uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+      NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+      PA.Addr->addMember(DA, *this);
+    }
+    // Add phi uses.
+    for (auto P : PredList) {
+      auto PBA = addr<BlockNode*>(P);
+      for (unsigned X = 0; X != CS; ++X) {
+        RegisterRef RR = MaxRefs[ClosureIdx[X]];
+        NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+        PA.Addr->addMember(PUA, *this);
+      }
+    }
+
+    // Erase from MaxRefs all elements in the closure.
+    auto Begin = MaxRefs.begin();
+    for (unsigned i = ClosureIdx.size(); i != 0; --i)
+      MaxRefs.erase(Begin + ClosureIdx[i-1]);
+  }
+}
+
+// Remove any unneeded phi nodes that were created during the build process.
+void DataFlowGraph::removeUnusedPhis() {
+  // This will remove unused phis, i.e. phis where each def does not reach
+  // any uses or other defs. This will not detect or remove circular phi
+  // chains that are otherwise dead. Unused/dead phis are created during
+  // the build process and this function is intended to remove those cases
+  // that are easily determined to be unnecessary.
+
+  SetVector<NodeId> PhiQ;
+  for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
+    for (auto P : BA.Addr->members_if(IsPhi, *this))
+      PhiQ.insert(P.Id);
+  }
+
+  static auto HasUsedDef = [](NodeList &Ms) -> bool {
+    for (auto M : Ms) {
+      if (M.Addr->getKind() != NodeAttrs::Def)
+        continue;
+      NodeAddr<DefNode*> DA = M;
+      if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+        return true;
+    }
+    return false;
+  };
+
+  // Any phi, if it is removed, may affect other phis (make them dead).
+ // For each removed phi, collect the potentially affected phis and add + // them back to the queue. + while (!PhiQ.empty()) { + auto PA = addr<PhiNode*>(PhiQ[0]); + PhiQ.remove(PA.Id); + NodeList Refs = PA.Addr->members(*this); + if (HasUsedDef(Refs)) + continue; + for (NodeAddr<RefNode*> RA : Refs) { + if (NodeId RD = RA.Addr->getReachingDef()) { + auto RDA = addr<DefNode*>(RD); + NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this); + if (IsPhi(OA)) + PhiQ.insert(OA.Id); + } + if (RA.Addr->isDef()) + unlinkDef(RA); + else + unlinkUse(RA); + } + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this); + BA.Addr->removeMember(PA, *this); + } +} + +// For a given reference node TA in an instruction node IA, connect the +// reaching def of TA to the appropriate def node. Create any shadow nodes +// as appropriate. +template <typename T> +void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, + DefStack &DS) { + if (DS.empty()) + return; + RegisterRef RR = TA.Addr->getRegRef(); + NodeAddr<T> TAP; + + // References from the def stack that have been examined so far. + RegisterSet Defs; + + for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) { + RegisterRef QR = I->Addr->getRegRef(); + auto AliasQR = [QR,this] (RegisterRef RR) -> bool { + return RAI.alias(QR, RR); + }; + bool PrecUp = RAI.covers(QR, RR); + // Skip all defs that are aliased to any of the defs that we have already + // seen. If we encounter a covering def, stop the stack traversal early. + if (std::any_of(Defs.begin(), Defs.end(), AliasQR)) { + if (PrecUp) + break; + continue; + } + // The reaching def. + NodeAddr<DefNode*> RDA = *I; + + // Pick the reached node. + if (TAP.Id == 0) { + TAP = TA; + } else { + // Mark the existing ref as "shadow" and create a new shadow. + TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow); + TAP = getNextShadow(IA, TAP, true); + } + + // Create the link. + TAP.Addr->linkToDef(TAP.Id, RDA); + + if (PrecUp) + break; + Defs.insert(QR); + } +} + +// Create data-flow links for all reference nodes in the statement node SA. +void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) { + RegisterSet Defs; + + // Link all nodes (upwards in the data-flow) with their reaching defs. + for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) { + uint16_t Kind = RA.Addr->getKind(); + assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use); + RegisterRef RR = RA.Addr->getRegRef(); + // Do not process multiple defs of the same reference. + if (Kind == NodeAttrs::Def && Defs.count(RR)) + continue; + Defs.insert(RR); + + auto F = DefM.find(RR); + if (F == DefM.end()) + continue; + DefStack &DS = F->second; + if (Kind == NodeAttrs::Use) + linkRefUp<UseNode*>(SA, RA, DS); + else if (Kind == NodeAttrs::Def) + linkRefUp<DefNode*>(SA, RA, DS); + else + llvm_unreachable("Unexpected node in instruction"); + } +} + +// Create data-flow links for all instructions in the block node BA. This +// will include updating any phi nodes in BA. +void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { + // Push block delimiters. + markBlock(BA.Id, DefM); + + // For each non-phi instruction in the block, link all the defs and uses + // to their reaching defs. For any member of the block (including phis), + // push the defs on the corresponding stacks. + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) { + // Ignore phi nodes here. They will be linked part by part from the + // predecessors. 
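+    // (A phi use for predecessor P is linked while the traversal is still
+    // inside P, i.e. in the successor loop below, when P's definitions are
+    // still on the stacks in DefM.)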
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) + linkStmtRefs(DefM, IA); + + // Push the definitions on the stack. + pushDefs(IA, DefM); + } + + // Recursively process all children in the dominator tree. + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + linkBlockRefs(DefM, SBA); + } + + // Link the phi uses from the successor blocks. + auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool { + if (NA.Addr->getKind() != NodeAttrs::Use) + return false; + assert(NA.Addr->getFlags() & NodeAttrs::PhiRef); + NodeAddr<PhiUseNode*> PUA = NA; + return PUA.Addr->getPredecessor() == BA.Id; + }; + MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto SB : MBB->successors()) { + auto SBA = Func.Addr->findBlock(SB, *this); + for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) { + // Go over each phi use associated with MBB, and link it. + for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { + NodeAddr<PhiUseNode*> PUA = U; + RegisterRef RR = PUA.Addr->getRegRef(); + linkRefUp<UseNode*>(IA, PUA, DefM[RR]); + } + } + } + + // Pop all defs from this block from the definition stacks. + releaseBlock(BA.Id, DefM); +} + +// Remove the use node UA from any data-flow and structural links. +void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) { + NodeId RD = UA.Addr->getReachingDef(); + NodeId Sib = UA.Addr->getSibling(); + + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(*this); + IA.Addr->removeMember(UA, *this); + + if (RD == 0) { + assert(Sib == 0); + return; + } + + auto RDA = addr<DefNode*>(RD); + auto TA = addr<UseNode*>(RDA.Addr->getReachedUse()); + if (TA.Id == UA.Id) { + RDA.Addr->setReachedUse(Sib); + return; + } + + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == UA.Id) { + TA.Addr->setSibling(UA.Addr->getSibling()); + return; + } + TA = addr<UseNode*>(S); + } +} + +// Remove the def node DA from any data-flow and structural links. +void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) { + // + // RD + // | reached + // | def + // : + // . + // +----+ + // ... -- | DA | -- ... -- 0 : sibling chain of DA + // +----+ + // | | reached + // | : def + // | . + // | ... : Siblings (defs) + // | + // : reached + // . use + // ... : sibling chain of reached uses + + NodeId RD = DA.Addr->getReachingDef(); + + // Visit all siblings of the reached def and reset their reaching defs. + // Also, defs reached by DA are now "promoted" to being reached by RD, + // so all of them will need to be spliced into the sibling chain where + // DA belongs. + auto getAllNodes = [this] (NodeId N) -> NodeList { + NodeList Res; + while (N) { + auto RA = addr<RefNode*>(N); + // Keep the nodes in the exact sibling order. + Res.push_back(RA); + N = RA.Addr->getSibling(); + } + return Res; + }; + NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef()); + NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse()); + + if (RD == 0) { + for (NodeAddr<RefNode*> I : ReachedDefs) + I.Addr->setSibling(0); + for (NodeAddr<RefNode*> I : ReachedUses) + I.Addr->setSibling(0); + } + for (NodeAddr<DefNode*> I : ReachedDefs) + I.Addr->setReachingDef(RD); + for (NodeAddr<UseNode*> I : ReachedUses) + I.Addr->setReachingDef(RD); + + NodeId Sib = DA.Addr->getSibling(); + if (RD == 0) { + assert(Sib == 0); + return; + } + + // Update the reaching def node and remove DA from the sibling list. 
+ auto RDA = addr<DefNode*>(RD); + auto TA = addr<DefNode*>(RDA.Addr->getReachedDef()); + if (TA.Id == DA.Id) { + // If DA is the first reached def, just update the RD's reached def + // to the DA's sibling. + RDA.Addr->setReachedDef(Sib); + } else { + // Otherwise, traverse the sibling list of the reached defs and remove + // DA from it. + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == DA.Id) { + TA.Addr->setSibling(Sib); + break; + } + TA = addr<DefNode*>(S); + } + } + + // Splice the DA's reached defs into the RDA's reached def chain. + if (!ReachedDefs.empty()) { + auto Last = NodeAddr<DefNode*>(ReachedDefs.back()); + Last.Addr->setSibling(RDA.Addr->getReachedDef()); + RDA.Addr->setReachedDef(ReachedDefs.front().Id); + } + // Splice the DA's reached uses into the RDA's reached use chain. + if (!ReachedUses.empty()) { + auto Last = NodeAddr<UseNode*>(ReachedUses.back()); + Last.Addr->setSibling(RDA.Addr->getReachedUse()); + RDA.Addr->setReachedUse(ReachedUses.front().Id); + } + + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(*this); + IA.Addr->removeMember(DA, *this); +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h new file mode 100644 index 0000000..7da7bb5 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h @@ -0,0 +1,841 @@ +//===--- RDFGraph.h -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF) +// for a non-SSA program representation (e.g. post-RA machine code). +// +// +// *** Introduction +// +// The RDF graph is a collection of nodes, each of which denotes some element +// of the program. There are two main types of such elements: code and refe- +// rences. Conceptually, "code" is something that represents the structure +// of the program, e.g. basic block or a statement, while "reference" is an +// instance of accessing a register, e.g. a definition or a use. Nodes are +// connected with each other based on the structure of the program (such as +// blocks, instructions, etc.), and based on the data flow (e.g. reaching +// definitions, reached uses, etc.). The single-reaching-definition principle +// of SSA is generally observed, although, due to the non-SSA representation +// of the program, there are some differences between the graph and a "pure" +// SSA representation. +// +// +// *** Implementation remarks +// +// Since the graph can contain a large number of nodes, memory consumption +// was one of the major design considerations. As a result, there is a single +// base class NodeBase which defines all members used by all possible derived +// classes. The members are arranged in a union, and a derived class cannot +// add any data members of its own. Each derived class only defines the +// functional interface, i.e. member functions. NodeBase must be a POD, +// which implies that all of its members must also be PODs. +// Since nodes need to be connected with other nodes, pointers have been +// replaced with 32-bit identifiers: each node has an id of type NodeId. +// There are mapping functions in the graph that translate between actual +// memory addresses and the corresponding identifiers. +// A node id of 0 is equivalent to nullptr. 
+// +// +// *** Structure of the graph +// +// A code node is always a collection of other nodes. For example, a code +// node corresponding to a basic block will contain code nodes corresponding +// to instructions. In turn, a code node corresponding to an instruction will +// contain a list of reference nodes that correspond to the definitions and +// uses of registers in that instruction. The members are arranged into a +// circular list, which is yet another consequence of the effort to save +// memory: for each member node it should be possible to obtain its owner, +// and it should be possible to access all other members. There are other +// ways to accomplish that, but the circular list seemed the most natural. +// +// +- CodeNode -+ +// | | <---------------------------------------------------+ +// +-+--------+-+ | +// |FirstM |LastM | +// | +-------------------------------------+ | +// | | | +// V V | +// +----------+ Next +----------+ Next Next +----------+ Next | +// | |----->| |-----> ... ----->| |----->-+ +// +- Member -+ +- Member -+ +- Member -+ +// +// The order of members is such that related reference nodes (see below) +// should be contiguous on the member list. +// +// A reference node is a node that encapsulates an access to a register, +// in other words, data flowing into or out of a register. There are two +// major kinds of reference nodes: defs and uses. A def node will contain +// the id of the first reached use, and the id of the first reached def. +// Each def and use will contain the id of the reaching def, and also the +// id of the next reached def (for def nodes) or use (for use nodes). +// The "next node sharing the same reaching def" is denoted as "sibling". +// In summary: +// - Def node contains: reaching def, sibling, first reached def, and first +// reached use. +// - Use node contains: reaching def and sibling. +// +// +-- DefNode --+ +// | R2 = ... | <---+--------------------+ +// ++---------+--+ | | +// |Reached |Reached | | +// |Def |Use | | +// | | |Reaching |Reaching +// | V |Def |Def +// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib +// | | ... = R2 |----->| ... = R2 |----> ... ----> 0 +// | +-------------+ +-------------+ +// V +// +-- DefNode --+ Sib +// | R2 = ... |----> ... +// ++---------+--+ +// | | +// | | +// ... ... +// +// To get a full picture, the circular lists connecting blocks within a +// function, instructions within a block, etc. should be superimposed with +// the def-def, def-use links shown above. +// To illustrate this, consider a small example in a pseudo-assembly: +// foo: +// add r2, r0, r1 ; r2 = r0+r1 +// addi r0, r2, 1 ; r0 = r2+1 +// ret r0 ; return value in r0 +// +// The graph (in a format used by the debugging functions) would look like: +// +// DFG dump:[ +// f1: Function foo +// b2: === BB#0 === preds(0), succs(0): +// p3: phi [d4<r0>(,d12,u9):] +// p5: phi [d6<r1>(,,u10):] +// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):] +// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):] +// s14: ret [u15<r0>(d12):] +// ] +// +// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the +// kind of the node (i.e. f - function, b - basic block, p - phi, s - state- +// ment, d - def, u - use). +// The format of a def node is: +// dN<R>(rd,d,u):sib, +// where +// N - numeric node id, +// R - register being defined +// rd - reaching def, +// d - reached def, +// u - reached use, +// sib - sibling. 
+// The format of a use node is:
+//   uN<R>[!](rd):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being used,
+//   rd  - reaching def,
+//   sib - sibling.
+// Possible annotations (usually preceding the node id):
+//   +   - preserving def,
+//   ~   - clobbering def,
+//   "   - shadow ref (follows the node id),
+//   !   - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi:   Phi node, members are reference nodes.
+// - Stmt:  Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func:  The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+//   preserve some of the original bits among those that are included in
+//   the register associated with that def. For example, if R0 is a 32-bit
+//   register, but a def can only change the lower 16 bits, then it will
+//   be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+//   defs (see more below).
+// - Clobbering: applies only to defs, indicates that the value generated
+//   by this def is unspecified. A typical example would be volatile
+//   registers after function calls.
+//
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, and that r0 and r1 do not
+// overlap:
+//   set r0, 1        ; r0 = 1
+//   set r1, 1        ; r1 = 1
+//   addi t1, t0, 1   ; t1 = t0+1
+//
+// The DFG:
+//   s1: set [d2<r0>(,,u9):]
+//   s3: set [d4<r1>(,,u10):]
+//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
+// +#ifndef RDF_GRAPH_H +#define RDF_GRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" + +#include <functional> +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineInstr; + class MachineOperand; + class MachineDominanceFrontier; + class MachineDominatorTree; + class TargetInstrInfo; + class TargetRegisterInfo; +} + +namespace rdf { + typedef uint32_t NodeId; + + struct NodeAttrs { + enum : uint16_t { + None = 0x0000, // Nothing + + // Types: 2 bits + TypeMask = 0x0003, + Code = 0x0001, // 01, Container + Ref = 0x0002, // 10, Reference + + // Kind: 3 bits + KindMask = 0x0007 << 2, + Def = 0x0001 << 2, // 001 + Use = 0x0002 << 2, // 010 + Phi = 0x0003 << 2, // 011 + Stmt = 0x0004 << 2, // 100 + Block = 0x0005 << 2, // 101 + Func = 0x0006 << 2, // 110 + + // Flags: 5 bits for now + FlagMask = 0x001F << 5, + Shadow = 0x0001 << 5, // 00001, Has extra reaching defs. + Clobbering = 0x0002 << 5, // 00010, Produces unspecified values. + PhiRef = 0x0004 << 5, // 00100, Member of PhiNode. + Preserving = 0x0008 << 5, // 01000, Def can keep original bits. + Fixed = 0x0010 << 5, // 10000, Fixed register. + }; + + static uint16_t type(uint16_t T) { return T & TypeMask; } + static uint16_t kind(uint16_t T) { return T & KindMask; } + static uint16_t flags(uint16_t T) { return T & FlagMask; } + + static uint16_t set_type(uint16_t A, uint16_t T) { + return (A & ~TypeMask) | T; + } + static uint16_t set_kind(uint16_t A, uint16_t K) { + return (A & ~KindMask) | K; + } + static uint16_t set_flags(uint16_t A, uint16_t F) { + return (A & ~FlagMask) | F; + } + + // Test if A contains B. + static bool contains(uint16_t A, uint16_t B) { + if (type(A) != Code) + return false; + uint16_t KB = kind(B); + switch (kind(A)) { + case Func: + return KB == Block; + case Block: + return KB == Phi || KB == Stmt; + case Phi: + case Stmt: + return type(B) == Ref; + } + return false; + } + }; + + template <typename T> struct NodeAddr { + NodeAddr() : Addr(nullptr), Id(0) {} + NodeAddr(T A, NodeId I) : Addr(A), Id(I) {} + NodeAddr(const NodeAddr&) = default; + NodeAddr &operator= (const NodeAddr&) = default; + + bool operator== (const NodeAddr<T> &NA) const { + assert((Addr == NA.Addr) == (Id == NA.Id)); + return Addr == NA.Addr; + } + bool operator!= (const NodeAddr<T> &NA) const { + return !operator==(NA); + } + // Type cast (casting constructor). The reason for having this class + // instead of std::pair. + template <typename S> NodeAddr(const NodeAddr<S> &NA) + : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {} + + T Addr; + NodeId Id; + }; + + struct NodeBase; + + // Fast memory allocation and translation between node id and node address. + // This is really the same idea as the one underlying the "bump pointer + // allocator", the difference being in the translation. A node id is + // composed of two components: the index of the block in which it was + // allocated, and the index within the block. With the default settings, + // where the number of nodes per block is 4096, the node id (minus 1) is: + // + // bit position: 11 0 + // +----------------------------+--------------+ + // | Index of the block |Index in block| + // +----------------------------+--------------+ + // + // The actual node id is the above plus 1, to avoid creating a node id of 0. 
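+  // For example, with the default 4096 nodes per block, BitsPerIndex is 12,
+  // so the node at index 1 of block 1 gets id ((1 << 12) | 1) + 1 = 4098,
+  // and ptr(4098) reverses this: 4097 >> 12 = 1 (block), 4097 & 0xFFF = 1
+  // (index).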
+ // + // This method significantly improved the build time, compared to using maps + // (std::unordered_map or DenseMap) to translate between pointers and ids. + struct NodeAllocator { + // Amount of storage for a single node. + enum { NodeMemSize = 32 }; + NodeAllocator(uint32_t NPB = 4096) + : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)), + IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) { + assert(isPowerOf2_32(NPB)); + } + NodeBase *ptr(NodeId N) const { + uint32_t N1 = N-1; + uint32_t BlockN = N1 >> BitsPerIndex; + uint32_t Offset = (N1 & IndexMask) * NodeMemSize; + return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset); + } + NodeId id(const NodeBase *P) const; + NodeAddr<NodeBase*> New(); + void clear(); + + private: + void startNewBlock(); + bool needNewBlock(); + uint32_t makeId(uint32_t Block, uint32_t Index) const { + // Add 1 to the id, to avoid the id of 0, which is treated as "null". + return ((Block << BitsPerIndex) | Index) + 1; + } + + const uint32_t NodesPerBlock; + const uint32_t BitsPerIndex; + const uint32_t IndexMask; + char *ActiveEnd; + std::vector<char*> Blocks; + typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy; + AllocatorTy MemPool; + }; + + struct RegisterRef { + unsigned Reg, Sub; + + // No non-trivial constructors, since this will be a member of a union. + RegisterRef() = default; + RegisterRef(const RegisterRef &RR) = default; + RegisterRef &operator= (const RegisterRef &RR) = default; + bool operator== (const RegisterRef &RR) const { + return Reg == RR.Reg && Sub == RR.Sub; + } + bool operator!= (const RegisterRef &RR) const { + return !operator==(RR); + } + bool operator< (const RegisterRef &RR) const { + return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub); + } + }; + typedef std::set<RegisterRef> RegisterSet; + + struct RegisterAliasInfo { + RegisterAliasInfo(const TargetRegisterInfo &tri) : TRI(tri) {} + virtual ~RegisterAliasInfo() {} + + virtual std::vector<RegisterRef> getAliasSet(RegisterRef RR) const; + virtual bool alias(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(const RegisterSet &RRs, RegisterRef RR) const; + + const TargetRegisterInfo &TRI; + }; + + struct TargetOperandInfo { + TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} + virtual ~TargetOperandInfo() {} + virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const; + virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const; + virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const; + + const TargetInstrInfo &TII; + }; + + + struct DataFlowGraph; + + struct NodeBase { + public: + // Make sure this is a POD. + NodeBase() = default; + uint16_t getType() const { return NodeAttrs::type(Attrs); } + uint16_t getKind() const { return NodeAttrs::kind(Attrs); } + uint16_t getFlags() const { return NodeAttrs::flags(Attrs); } + NodeId getNext() const { return Next; } + + uint16_t getAttrs() const { return Attrs; } + void setAttrs(uint16_t A) { Attrs = A; } + void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); } + + // Insert node NA after "this" in the circular chain. + void append(NodeAddr<NodeBase*> NA); + // Initialize all members to 0. + void init() { memset(this, 0, sizeof *this); } + void setNext(NodeId N) { Next = N; } + + protected: + uint16_t Attrs; + uint16_t Reserved; + NodeId Next; // Id of the next node in the circular chain. + // Definitions of nested types. 
Using anonymous nested structs would make + // this class definition clearer, but unnamed structs are not a part of + // the standard. + struct Def_struct { + NodeId DD, DU; // Ids of the first reached def and use. + }; + struct PhiU_struct { + NodeId PredB; // Id of the predecessor block for a phi use. + }; + struct Code_struct { + void *CP; // Pointer to the actual code. + NodeId FirstM, LastM; // Id of the first member and last. + }; + struct Ref_struct { + NodeId RD, Sib; // Ids of the reaching def and the sibling. + union { + Def_struct Def; + PhiU_struct PhiU; + }; + union { + MachineOperand *Op; // Non-phi refs point to a machine operand. + RegisterRef RR; // Phi refs store register info directly. + }; + }; + + // The actual payload. + union { + Ref_struct Ref; + Code_struct Code; + }; + }; + // The allocator allocates chunks of 32 bytes for each node. The fact that + // each node takes 32 bytes in memory is used for fast translation between + // the node id and the node address. + static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, + "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); + + typedef std::vector<NodeAddr<NodeBase*>> NodeList; + typedef std::set<NodeId> NodeSet; + + struct RefNode : public NodeBase { + RefNode() = default; + RegisterRef getRegRef() const; + MachineOperand &getOp() { + assert(!(getFlags() & NodeAttrs::PhiRef)); + return *Ref.Op; + } + void setRegRef(RegisterRef RR); + void setRegRef(MachineOperand *Op); + NodeId getReachingDef() const { + return Ref.RD; + } + void setReachingDef(NodeId RD) { + Ref.RD = RD; + } + NodeId getSibling() const { + return Ref.Sib; + } + void setSibling(NodeId Sib) { + Ref.Sib = Sib; + } + bool isUse() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Use; + } + bool isDef() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Def; + } + + template <typename Predicate> + NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly, + const DataFlowGraph &G); + NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); + }; + + struct DefNode : public RefNode { + NodeId getReachedDef() const { + return Ref.Def.DD; + } + void setReachedDef(NodeId D) { + Ref.Def.DD = D; + } + NodeId getReachedUse() const { + return Ref.Def.DU; + } + void setReachedUse(NodeId U) { + Ref.Def.DU = U; + } + + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct UseNode : public RefNode { + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct PhiUseNode : public UseNode { + NodeId getPredecessor() const { + assert(getFlags() & NodeAttrs::PhiRef); + return Ref.PhiU.PredB; + } + void setPredecessor(NodeId B) { + assert(getFlags() & NodeAttrs::PhiRef); + Ref.PhiU.PredB = B; + } + }; + + struct CodeNode : public NodeBase { + template <typename T> T getCode() const { + return static_cast<T>(Code.CP); + } + void setCode(void *C) { + Code.CP = C; + } + + NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const; + NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const; + void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G); + void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + + NodeList members(const DataFlowGraph &G) const; + template <typename Predicate> + NodeList members_if(Predicate P, const DataFlowGraph &G) const; + }; + + struct InstrNode : public CodeNode { + NodeAddr<NodeBase*> getOwner(const 
DataFlowGraph &G); + }; + + struct PhiNode : public InstrNode { + MachineInstr *getCode() const { + return nullptr; + } + }; + + struct StmtNode : public InstrNode { + MachineInstr *getCode() const { + return CodeNode::getCode<MachineInstr*>(); + } + }; + + struct BlockNode : public CodeNode { + MachineBasicBlock *getCode() const { + return CodeNode::getCode<MachineBasicBlock*>(); + } + void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G); + }; + + struct FuncNode : public CodeNode { + MachineFunction *getCode() const { + return CodeNode::getCode<MachineFunction*>(); + } + NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const; + NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G); + }; + + struct DataFlowGraph { + DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi); + + NodeBase *ptr(NodeId N) const; + template <typename T> T ptr(NodeId N) const { + return static_cast<T>(ptr(N)); + } + NodeId id(const NodeBase *P) const; + + template <typename T> NodeAddr<T> addr(NodeId N) const { + return { ptr<T>(N), N }; + } + + NodeAddr<FuncNode*> getFunc() const { + return Func; + } + MachineFunction &getMF() const { + return MF; + } + const TargetInstrInfo &getTII() const { + return TII; + } + const TargetRegisterInfo &getTRI() const { + return TRI; + } + const MachineDominatorTree &getDT() const { + return MDT; + } + const MachineDominanceFrontier &getDF() const { + return MDF; + } + const RegisterAliasInfo &getRAI() const { + return RAI; + } + + struct DefStack { + DefStack() = default; + bool empty() const { return Stack.empty() || top() == bottom(); } + private: + typedef NodeAddr<DefNode*> value_type; + struct Iterator { + typedef DefStack::value_type value_type; + Iterator &up() { Pos = DS.nextUp(Pos); return *this; } + Iterator &down() { Pos = DS.nextDown(Pos); return *this; } + value_type operator*() const { + assert(Pos >= 1); + return DS.Stack[Pos-1]; + } + const value_type *operator->() const { + assert(Pos >= 1); + return &DS.Stack[Pos-1]; + } + bool operator==(const Iterator &It) const { return Pos == It.Pos; } + bool operator!=(const Iterator &It) const { return Pos != It.Pos; } + private: + Iterator(const DefStack &S, bool Top); + // Pos-1 is the index in the StorageType object that corresponds to + // the top of the DefStack. 
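+        // Pos == 0 is the "bottom" sentinel; it never designates an element.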
+ const DefStack &DS; + unsigned Pos; + friend struct DefStack; + }; + public: + typedef Iterator iterator; + iterator top() const { return Iterator(*this, true); } + iterator bottom() const { return Iterator(*this, false); } + unsigned size() const; + + void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); } + void pop(); + void start_block(NodeId N); + void clear_block(NodeId N); + private: + friend struct Iterator; + typedef std::vector<value_type> StorageType; + bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const { + return (P.Addr == nullptr) && (N == 0 || P.Id == N); + } + unsigned nextUp(unsigned P) const; + unsigned nextDown(unsigned P) const; + StorageType Stack; + }; + + typedef std::map<RegisterRef,DefStack> DefStackMap; + + void build(); + void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); + void markBlock(NodeId B, DefStackMap &DefM); + void releaseBlock(NodeId B, DefStackMap &DefM); + + NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + NodeList getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + void unlinkUse(NodeAddr<UseNode*> UA); + void unlinkDef(NodeAddr<DefNode*> DA); + + // Some useful filters. + template <uint16_t Kind> + static bool IsRef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == Kind; + } + template <uint16_t Kind> + static bool IsCode(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == Kind; + } + static bool IsDef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Def; + } + static bool IsUse(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Use; + } + static bool IsPhi(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == NodeAttrs::Phi; + } + + private: + void reset(); + + NodeAddr<NodeBase*> newNode(uint16_t Attrs); + NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B); + NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, + uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner); + NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI); + NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB); + NodeAddr<FuncNode*> newFunc(MachineFunction *MF); + + template <typename Predicate> + std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> + locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const; + + typedef std::map<NodeId,RegisterSet> BlockRefsMap; + + void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In); + 
void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM); + void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void removeUnusedPhis(); + + template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA, + NodeAddr<T> TA, DefStack &DS); + void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA); + void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA); + + TimerGroup TimeG; + NodeAddr<FuncNode*> Func; + NodeAllocator Memory; + + MachineFunction &MF; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + const TargetOperandInfo &TOI; + }; // struct DataFlowGraph + + template <typename Predicate> + NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P, + bool NextOnly, const DataFlowGraph &G) { + // Get the "Next" reference in the circular list that references RR and + // satisfies predicate "Pred". + auto NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + if (RA.Addr->getRegRef() == RR && P(NA)) + return NA; + if (NextOnly) + break; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } else { + // We've hit the beginning of the chain. + assert(NA.Addr->getType() == NodeAttrs::Code); + NodeAddr<CodeNode*> CA = NA; + NA = CA.Addr->getFirstMember(G); + } + } + // Return the equivalent of "nullptr" if such a node was not found. + return NodeAddr<RefNode*>(); + } + + template <typename Predicate> + NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const { + NodeList MM; + auto M = getFirstMember(G); + if (M.Id == 0) + return MM; + + while (M.Addr != this) { + if (P(M)) + MM.push_back(M); + M = G.addr<NodeBase*>(M.Addr->getNext()); + } + return MM; + } + + + template <typename T> struct Print; + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P); + + template <typename T> + struct Print { + Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} + const T &Obj; + const DataFlowGraph &G; + }; + + template <typename T> + struct PrintNode : Print<NodeAddr<T>> { + PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) + : Print<NodeAddr<T>>(x, g) {} + }; +} // namespace rdf + +#endif // RDF_GRAPH_H diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp new file mode 100644 index 0000000..1d9bd37 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp @@ -0,0 +1,848 @@ +//===--- RDFLiveness.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Computation of the liveness information from the data-flow graph. +// +// The main functionality of this code is to compute block live-in +// information. With the live-in information in place, the placement +// of kill flags can also be recalculated. +// +// The block live-in calculation is based on the ideas from the following +// publication: +// +// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin. +// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs." 
+// ACM Transactions on Architecture and Code Optimization, Association for +// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance +// and Embedded Architectures and Compilers", 8 (4), +// <10.1145/2086696.2086706>. <hal-00647369> +// +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace rdf { + template<> + raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) { + OS << '{'; + for (auto I : P.Obj) { + OS << ' ' << Print<RegisterRef>(I.first, P.G) << '{'; + for (auto J = I.second.begin(), E = I.second.end(); J != E; ) { + OS << Print<NodeId>(*J, P.G); + if (++J != E) + OS << ','; + } + OS << '}'; + } + OS << " }"; + return OS; + } +} + +// The order in the returned sequence is the order of reaching defs in the +// upward traversal: the first def is the closest to the given reference RefA, +// the next one is further up, and so on. +// The list ends at a reaching phi def, or when the reference from RefA is +// covered by the defs in the list (see FullChain). +// This function provides two modes of operation: +// (1) Returning the sequence of reaching defs for a particular reference +// node. This sequence will terminate at the first phi node [1]. +// (2) Returning a partial sequence of reaching defs, where the final goal +// is to traverse past phi nodes to the actual defs arising from the code +// itself. +// In mode (2), the register reference for which the search was started +// may be different from the reference node RefA, for which this call was +// made, hence the argument RefRR, which holds the original register. +// Also, some definitions may have already been encountered in a previous +// call that will influence register covering. The register references +// already defined are passed in through DefRRs. +// In mode (1), the "continuation" considerations do not apply, and the +// RefRR is the same as the register in RefA, and the set DefRRs is empty. +// +// [1] It is possible for multiple phi nodes to be included in the returned +// sequence: +// SubA = phi ... +// SubB = phi ... +// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB) +// However, these phi nodes are independent from one another in terms of +// the data-flow. + +NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, + NodeAddr<RefNode*> RefA, bool FullChain, const RegisterSet &DefRRs) { + SetVector<NodeId> DefQ; + SetVector<NodeId> Owners; + + // The initial queue should not have reaching defs for shadows. The + // whole point of a shadow is that it will have a reaching def that + // is not aliased to the reaching defs of the related shadows. + NodeId Start = RefA.Id; + auto SNA = DFG.addr<RefNode*>(Start); + if (NodeId RD = SNA.Addr->getReachingDef()) + DefQ.insert(RD); + + // Collect all the reaching defs, going up until a phi node is encountered, + // or there are no more reaching defs. From this set, the actual set of + // reaching defs will be selected. + // The traversal upwards must go on until a covering def is encountered. + // It is possible that a collection of non-covering (individually) defs + // will be sufficient, but keep going until a covering one is found. 
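+  // Note on the traversal: DefQ is a SetVector, so the index-based loop
+  // below is a worklist traversal. Elements inserted during the iteration
+  // extend the queue, duplicate insertions are ignored, and so each def
+  // is visited at most once.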
+ for (unsigned i = 0; i < DefQ.size(); ++i) { + auto TA = DFG.addr<DefNode*>(DefQ[i]); + if (TA.Addr->getFlags() & NodeAttrs::PhiRef) + continue; + // Stop at the covering/overwriting def of the initial register reference. + RegisterRef RR = TA.Addr->getRegRef(); + if (RAI.covers(RR, RefRR)) { + uint16_t Flags = TA.Addr->getFlags(); + if (!(Flags & NodeAttrs::Preserving)) + continue; + } + // Get the next level of reaching defs. This will include multiple + // reaching defs for shadows. + for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA)) + if (auto RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + DefQ.insert(RD); + } + + // Remove all non-phi defs that are not aliased to RefRR, and collect + // the owners of the remaining defs. + SetVector<NodeId> Defs; + for (auto N : DefQ) { + auto TA = DFG.addr<DefNode*>(N); + bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef; + if (!IsPhi && !RAI.alias(RefRR, TA.Addr->getRegRef())) + continue; + Defs.insert(TA.Id); + Owners.insert(TA.Addr->getOwner(DFG).Id); + } + + // Return the MachineBasicBlock containing a given instruction. + auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* { + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent(); + assert(IA.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<PhiNode*> PA = IA; + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG); + return BA.Addr->getCode(); + }; + // Less(A,B) iff instruction A is further down in the dominator tree than B. + auto Less = [&Block,this] (NodeId A, NodeId B) -> bool { + if (A == B) + return false; + auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B); + MachineBasicBlock *BA = Block(OA), *BB = Block(OB); + if (BA != BB) + return MDT.dominates(BB, BA); + // They are in the same block. + bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt; + bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt; + if (StmtA) { + if (!StmtB) // OB is a phi and phis dominate statements. + return true; + auto CA = NodeAddr<StmtNode*>(OA).Addr->getCode(); + auto CB = NodeAddr<StmtNode*>(OB).Addr->getCode(); + // The order must be linear, so tie-break such equalities. + if (CA == CB) + return A < B; + return MDT.dominates(CB, CA); + } else { + // OA is a phi. + if (StmtB) + return false; + // Both are phis. There is no ordering between phis (in terms of + // the data-flow), so tie-break this via node id comparison. + return A < B; + } + }; + + std::vector<NodeId> Tmp(Owners.begin(), Owners.end()); + std::sort(Tmp.begin(), Tmp.end(), Less); + + // The vector is a list of instructions, so that defs coming from + // the same instruction don't need to be artificially ordered. + // Then, when computing the initial segment, and iterating over an + // instruction, pick the defs that contribute to the covering (i.e. is + // not covered by previously added defs). Check the defs individually, + // i.e. first check each def if is covered or not (without adding them + // to the tracking set), and then add all the selected ones. + + // The reason for this is this example: + // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes). + // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be + // covered if we added A first, and A would be covered + // if we added B first. 
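+  // The code below implements this: for each instruction, the candidate
+  // defs are first collected in Ds (tested against RRs without modifying
+  // it), and only afterwards are their registers added to RRs.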
+ + NodeList RDefs; + RegisterSet RRs = DefRRs; + + auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getKind() == NodeAttrs::Def && + Defs.count(TA.Id); + }; + for (auto T : Tmp) { + if (!FullChain && RAI.covers(RRs, RefRR)) + break; + auto TA = DFG.addr<InstrNode*>(T); + bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA); + NodeList Ds; + for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) { + auto QR = DA.Addr->getRegRef(); + // Add phi defs even if they are covered by subsequent defs. This is + // for cases where the reached use is not covered by any of the defs + // encountered so far: the phi def is needed to expose the liveness + // of that use to the entry of the block. + // Example: + // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2. + // d2<R3>(d1,,u3), ... + // ..., u3<D1>(d2) This use needs to be live on entry. + if (FullChain || IsPhi || !RAI.covers(RRs, QR)) + Ds.push_back(DA); + } + RDefs.insert(RDefs.end(), Ds.begin(), Ds.end()); + for (NodeAddr<DefNode*> DA : Ds) { + // When collecting a full chain of definitions, do not consider phi + // defs to actually define a register. + uint16_t Flags = DA.Addr->getFlags(); + if (!FullChain || !(Flags & NodeAttrs::PhiRef)) + if (!(Flags & NodeAttrs::Preserving)) + RRs.insert(DA.Addr->getRegRef()); + } + } + + return RDefs; +} + + +static const RegisterSet NoRegs; + +NodeList Liveness::getAllReachingDefs(NodeAddr<RefNode*> RefA) { + return getAllReachingDefs(RefA.Addr->getRegRef(), RefA, false, NoRegs); +} + + +void Liveness::computePhiInfo() { + NodeList Phis; + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + Phis.insert(Phis.end(), Ps.begin(), Ps.end()); + } + + // phi use -> (map: reaching phi -> set of registers defined in between) + std::map<NodeId,std::map<NodeId,RegisterSet>> PhiUp; + std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. + + // Go over all phis. + for (NodeAddr<PhiNode*> PhiA : Phis) { + // Go over all defs and collect the reached uses that are non-phi uses + // (i.e. the "real uses"). + auto &RealUses = RealUseMap[PhiA.Id]; + auto PhiRefs = PhiA.Addr->members(DFG); + + // Have a work queue of defs whose reached uses need to be found. + // For each def, add to the queue all reached (non-phi) defs. + SetVector<NodeId> DefQ; + NodeSet PhiDefs; + for (auto R : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Def>(R)) + continue; + DefQ.insert(R.Id); + PhiDefs.insert(R.Id); + } + for (unsigned i = 0; i < DefQ.size(); ++i) { + NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]); + NodeId UN = DA.Addr->getReachedUse(); + while (UN != 0) { + NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN); + if (!(A.Addr->getFlags() & NodeAttrs::PhiRef)) + RealUses[getRestrictedRegRef(A)].insert(A.Id); + UN = A.Addr->getSibling(); + } + NodeId DN = DA.Addr->getReachedDef(); + while (DN != 0) { + NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN); + for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) { + uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags(); + // Must traverse the reached-def chain. Consider: + // def(D0) -> def(R0) -> def(R0) -> use(D0) + // The reachable use of D0 passes through a def of R0. + if (!(Flags & NodeAttrs::PhiRef)) + DefQ.insert(T.Id); + } + DN = A.Addr->getSibling(); + } + } + // Filter out these uses that appear to be reachable, but really + // are not. For example: + // + // R1:0 = d1 + // = R1:0 u2 Reached by d1. 
+  //   R0 = d3
+  //      = R1:0   u4   Still reached by d1: indirectly through
+  //                    the def d3.
+  //   R1 = d5
+  //      = R1:0   u6   Not reached by d1 (covered collectively
+  //                    by d3 and d5), but following reached
+  //                    defs and uses from d1 will lead here.
+  auto HasDef = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool {
+    return PhiDefs.count(DA.Id);
+  };
+  for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
+    // For each reached register UI->first, there is a set UI->second of
+    // uses of it. For each such use, check if it is reached by this phi,
+    // i.e. check if the set of its reaching defs intersects the set of
+    // this phi's defs.
+    auto &Uses = UI->second;
+    for (auto I = Uses.begin(), E = Uses.end(); I != E; ) {
+      auto UA = DFG.addr<UseNode*>(*I);
+      NodeList RDs = getAllReachingDefs(UI->first, UA);
+      if (std::any_of(RDs.begin(), RDs.end(), HasDef))
+        ++I;
+      else
+        I = Uses.erase(I);
+    }
+    if (Uses.empty())
+      UI = RealUses.erase(UI);
+    else
+      ++UI;
+  }
+
+  // If this phi reaches some "real" uses, add it to the queue for upward
+  // propagation.
+  if (!RealUses.empty())
+    PhiUQ.push_back(PhiA.Id);
+
+  // Go over all phi uses and check if the reaching def is another phi.
+  // Collect the phis that are among the reaching defs of these uses.
+  // While traversing the list of reaching defs for each phi use, collect
+  // the set of registers defined between this phi (PhiA) and the owner phi
+  // of the reaching def.
+  for (auto I : PhiRefs) {
+    if (!DFG.IsRef<NodeAttrs::Use>(I))
+      continue;
+    NodeAddr<UseNode*> UA = I;
+    auto &UpMap = PhiUp[UA.Id];
+    RegisterSet DefRRs;
+    for (NodeAddr<DefNode*> DA : getAllReachingDefs(UA)) {
+      if (DA.Addr->getFlags() & NodeAttrs::PhiRef)
+        UpMap[DA.Addr->getOwner(DFG).Id] = DefRRs;
+      else
+        DefRRs.insert(DA.Addr->getRegRef());
+    }
+  }
+  }
+
+  if (Trace) {
+    dbgs() << "Phi-up-to-phi map:\n";
+    for (auto I : PhiUp) {
+      dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
+      for (auto R : I.second)
+        dbgs() << ' ' << Print<NodeId>(R.first, DFG)
+               << Print<RegisterSet>(R.second, DFG);
+      dbgs() << " }\n";
+    }
+  }
+
+  // Propagate the reached registers up in the phi chain.
+  //
+  // The following type of situation needs careful handling:
+  //
+  //   phi d1<R1:0>  (1)
+  //        |
+  //   ... d2<R1>
+  //        |
+  //   phi u3<R1:0>  (2)
+  //        |
+  //   ... u4<R1>
+  //
+  // The phi node (2) defines a register pair R1:0, and reaches a "real"
+  // use u4 of just R1. The same phi node is also known to reach (upwards)
+  // the phi node (1). However, the use u4 is not reached by phi (1),
+  // because of the intervening definition d2 of R1. The data flow between
+  // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+  //
+  // When propagating uses up the phi chains, get all the reaching defs
+  // for a given phi use, and traverse the list until the propagated ref
+  // is covered, or until reaching the final phi. Only assume that the
+  // reference reaches the phi in the latter case.
+
+  for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+    auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
+    auto &RealUses = RealUseMap[PA.Id];
+    for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+      NodeAddr<UseNode*> UA = U;
+      auto &UpPhis = PhiUp[UA.Id];
+      for (auto UP : UpPhis) {
+        bool Changed = false;
+        auto &MidDefs = UP.second;
+        // Collect the set UpReached of uses that are reached by the current
+        // phi PA, and are not covered by any intervening def between PA and
+        // the upward phi UP.
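+        // In the example above: for the phi use u3 of phi (2), MidDefs
+        // is { R1 } (collected from d2), so the register R1 of the real
+        // use u4 is covered by MidDefs and is not propagated up to (1).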
+ RegisterSet UpReached; + for (auto T : RealUses) { + if (!isRestricted(PA, UA, T.first)) + continue; + if (!RAI.covers(MidDefs, T.first)) + UpReached.insert(T.first); + } + if (UpReached.empty()) + continue; + // Update the set PRUs of real uses reached by the upward phi UP with + // the actual set of uses (UpReached) that the UP phi reaches. + auto &PRUs = RealUseMap[UP.first]; + for (auto R : UpReached) { + unsigned Z = PRUs[R].size(); + PRUs[R].insert(RealUses[R].begin(), RealUses[R].end()); + Changed |= (PRUs[R].size() != Z); + } + if (Changed) + PhiUQ.push_back(UP.first); + } + } + } + + if (Trace) { + dbgs() << "Real use map:\n"; + for (auto I : RealUseMap) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG); + NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first); + NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG); + if (!Ds.empty()) { + RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(); + dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>'; + } else { + dbgs() << "<noreg>"; + } + dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n'; + } + } +} + + +void Liveness::computeLiveIns() { + // Populate the node-to-block map. This speeds up the calculations + // significantly. + NBMap.clear(); + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + MachineBasicBlock *BB = BA.Addr->getCode(); + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + NBMap.insert(std::make_pair(RA.Id, BB)); + NBMap.insert(std::make_pair(IA.Id, BB)); + } + } + + MachineFunction &MF = DFG.getMF(); + + // Compute IDF first, then the inverse. + decltype(IIDF) IDF; + for (auto &B : MF) { + auto F1 = MDF.find(&B); + if (F1 == MDF.end()) + continue; + SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end()); + for (unsigned i = 0; i < IDFB.size(); ++i) { + auto F2 = MDF.find(IDFB[i]); + if (F2 != MDF.end()) + IDFB.insert(F2->second.begin(), F2->second.end()); + } + // Add B to the IDF(B). This will put B in the IIDF(B). + IDFB.insert(&B); + IDF[&B].insert(IDFB.begin(), IDFB.end()); + } + + for (auto I : IDF) + for (auto S : I.second) + IIDF[S].insert(I.first); + + computePhiInfo(); + + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + + // Build the phi live-on-entry map. + for (NodeAddr<BlockNode*> BA : Blocks) { + MachineBasicBlock *MB = BA.Addr->getCode(); + auto &LON = PhiLON[MB]; + for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) + for (auto S : RealUseMap[P.Id]) + LON[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << "Phi live-on-entry map:\n"; + for (auto I : PhiLON) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + // Build the phi live-on-exit map. Each phi node has some set of reached + // "real" uses. Propagate this set backwards into the block predecessors + // through the reaching defs of the corresponding phi uses. + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + for (NodeAddr<PhiNode*> PA : Phis) { + auto &RUs = RealUseMap[PA.Id]; + if (RUs.empty()) + continue; + + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<PhiUseNode*> UA = U; + if (UA.Addr->getReachingDef() == 0) + continue; + + // Mark all reached "real" uses of P as live on exit in the + // predecessor. + // Remap all the RUs so that they have a correct reaching def. 
+ auto PrA = DFG.addr<BlockNode*>(UA.Addr->getPredecessor()); + auto &LOX = PhiLOX[PrA.Addr->getCode()]; + for (auto R : RUs) { + RegisterRef RR = R.first; + if (!isRestricted(PA, UA, RR)) + RR = getRestrictedRegRef(UA); + // The restricted ref may be different from the ref that was + // accessed in the "real use". This means that this phi use + // is not the one that carries this reference, so skip it. + if (!RAI.alias(R.first, RR)) + continue; + for (auto D : getAllReachingDefs(RR, UA)) + LOX[RR].insert(D.Id); + } + } // for U : phi uses + } // for P : Phis + } // for B : Blocks + + if (Trace) { + dbgs() << "Phi live-on-exit map:\n"; + for (auto I : PhiLOX) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + RefMap LiveIn; + traverse(&MF.front(), LiveIn); + + // Add function live-ins to the live-in set of the function entry block. + auto &EntryIn = LiveMap[&MF.front()]; + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) + EntryIn.insert({I->first,0}); + + if (Trace) { + // Dump the liveness map + for (auto &B : MF) { + BitVector LV(TRI.getNumRegs()); + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + LV.set(I->PhysReg); + dbgs() << "BB#" << B.getNumber() << "\t rec = {"; + for (int x = LV.find_first(); x >= 0; x = LV.find_next(x)) + dbgs() << ' ' << Print<RegisterRef>({unsigned(x),0}, DFG); + dbgs() << " }\n"; + dbgs() << "\tcomp = " << Print<RegisterSet>(LiveMap[&B], DFG) << '\n'; + } + } +} + + +void Liveness::resetLiveIns() { + for (auto &B : DFG.getMF()) { + // Remove all live-ins. + std::vector<unsigned> T; + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + T.push_back(I->PhysReg); + for (auto I : T) + B.removeLiveIn(I); + // Add the newly computed live-ins. + auto &LiveIns = LiveMap[&B]; + for (auto I : LiveIns) { + assert(I.Sub == 0); + B.addLiveIn(I.Reg); + } + } +} + + +void Liveness::resetKills() { + for (auto &B : DFG.getMF()) + resetKills(&B); +} + + +void Liveness::resetKills(MachineBasicBlock *B) { + auto CopyLiveIns = [] (MachineBasicBlock *B, BitVector &LV) -> void { + for (auto I = B->livein_begin(), E = B->livein_end(); I != E; ++I) + LV.set(I->PhysReg); + }; + + BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs()); + CopyLiveIns(B, LiveIn); + for (auto SI : B->successors()) + CopyLiveIns(SI, Live); + + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) { + MachineInstr *MI = &*I; + if (MI->isDebugValue()) + continue; + + MI->clearKillInfo(); + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.reset(*SR); + } + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + bool IsLive = false; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) { + if (!Live[*SR]) + continue; + IsLive = true; + break; + } + if (IsLive) + continue; + Op.setIsKill(true); + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.set(*SR); + } + } +} + + +// For shadows, determine if RR is aliased to a reaching def of any other +// shadow associated with RA. If it is not, then RR is "restricted" to RA, +// and so it can be considered a value specific to RA. This is important +// for accurately determining values associated with phi uses. 
+// For non-shadows, this function returns "true". +bool Liveness::isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const { + NodeId Start = RA.Id; + for (NodeAddr<RefNode*> TA = DFG.getNextShadow(IA, RA); + TA.Id != 0 && TA.Id != Start; TA = DFG.getNextShadow(IA, TA)) { + NodeId RD = TA.Addr->getReachingDef(); + if (RD == 0) + continue; + if (RAI.alias(RR, DFG.addr<DefNode*>(RD).Addr->getRegRef())) + return false; + } + return true; +} + + +RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const { + assert(DFG.IsRef<NodeAttrs::Use>(RA)); + if (RA.Addr->getFlags() & NodeAttrs::Shadow) { + NodeId RD = RA.Addr->getReachingDef(); + assert(RD); + RA = DFG.addr<DefNode*>(RD); + } + return RA.Addr->getRegRef(); +} + + +unsigned Liveness::getPhysReg(RegisterRef RR) const { + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return 0; + return RR.Sub ? TRI.getSubReg(RR.Reg, RR.Sub) : RR.Reg; +} + + +// Helper function to obtain the basic block containing the reaching def +// of the given use. +MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const { + auto F = NBMap.find(RN); + if (F != NBMap.end()) + return F->second; + llvm_unreachable("Node id not in map"); +} + + +void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { + // The LiveIn map, for each (physical) register, contains the set of live + // reaching defs of that register that are live on entry to the associated + // block. + + // The summary of the traversal algorithm: + // + // R is live-in in B, if there exists a U(R), such that rdef(R) dom B + // and (U \in IDF(B) or B dom U). + // + // for (C : children) { + // LU = {} + // traverse(C, LU) + // LiveUses += LU + // } + // + // LiveUses -= Defs(B); + // LiveUses += UpwardExposedUses(B); + // for (C : IIDF[B]) + // for (U : LiveUses) + // if (Rdef(U) dom C) + // C.addLiveIn(U) + // + + // Go up the dominator tree (depth-first). + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) { + RefMap L; + MachineBasicBlock *SB = I->getBlock(); + traverse(SB, L); + + for (auto S : L) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << LLVM_FUNCTION_NAME << " in BB#" << B->getNumber() + << " after recursion into"; + for (auto I : *N) + dbgs() << ' ' << I->getBlock()->getNumber(); + dbgs() << "\n LiveIn: " << Print<RefMap>(LiveIn, DFG); + dbgs() << "\n Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Add phi uses that are live on exit from this block. + RefMap &PUs = PhiLOX[B]; + for (auto S : PUs) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + + if (Trace) { + dbgs() << "after LOX\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Stop tracking all uses defined in this block: erase those records + // where the reaching def is located in B and which cover all reached + // uses. + auto Copy = LiveIn; + LiveIn.clear(); + + for (auto I : Copy) { + auto &Defs = LiveIn[I.first]; + NodeSet Rest; + for (auto R : I.second) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DDR = DA.Addr->getRegRef(); + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Defs from a different block need to be preserved. Defs from this + // block will need to be processed further, except for phi defs, the + // liveness of which is handled through the PhiLON/PhiLOX maps. 
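+      // A non-phi, non-preserving def from B stops the propagation of
+      // I.first only if it covers both the tracked register and every
+      // use it reaches; otherwise it is added to Rest, and its own
+      // reaching defs (from other blocks) are substituted below.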
+ if (B != BA.Addr->getCode()) + Defs.insert(R); + else { + bool IsPreserving = DA.Addr->getFlags() & NodeAttrs::Preserving; + if (IA.Addr->getKind() != NodeAttrs::Phi && !IsPreserving) { + bool Covering = RAI.covers(DDR, I.first); + NodeId U = DA.Addr->getReachedUse(); + while (U && Covering) { + auto DUA = DFG.addr<UseNode*>(U); + RegisterRef Q = DUA.Addr->getRegRef(); + Covering = RAI.covers(DA.Addr->getRegRef(), Q); + U = DUA.Addr->getSibling(); + } + if (!Covering) + Rest.insert(R); + } + } + } + + // Non-covering defs from B. + for (auto R : Rest) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DRR = DA.Addr->getRegRef(); + RegisterSet RRs; + for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) { + NodeAddr<InstrNode*> IA = TA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Preserving defs do not count towards covering. + if (!(TA.Addr->getFlags() & NodeAttrs::Preserving)) + RRs.insert(TA.Addr->getRegRef()); + if (BA.Addr->getCode() == B) + continue; + if (RAI.covers(RRs, DRR)) + break; + Defs.insert(TA.Id); + } + } + } + + emptify(LiveIn); + + if (Trace) { + dbgs() << "after defs in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Scan the block for upward-exposed uses and add them to the tracking set. + for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) { + NodeAddr<InstrNode*> IA = I; + if (IA.Addr->getKind() != NodeAttrs::Stmt) + continue; + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + RegisterRef RR = UA.Addr->getRegRef(); + for (auto D : getAllReachingDefs(UA)) + if (getBlockWithRef(D.Id) != B) + LiveIn[RR].insert(D.Id); + } + } + + if (Trace) { + dbgs() << "after uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Phi uses should not be propagated up the dominator tree, since they + // are not dominated by their corresponding reaching defs. + auto &Local = LiveMap[B]; + auto &LON = PhiLON[B]; + for (auto R : LON) + Local.insert(R.first); + + if (Trace) { + dbgs() << "after phi uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(Local, DFG) << '\n'; + } + + for (auto C : IIDF[B]) { + auto &LiveC = LiveMap[C]; + for (auto S : LiveIn) + for (auto R : S.second) + if (MDT.properlyDominates(getBlockWithRef(R), C)) + LiveC.insert(S.first); + } +} + + +void Liveness::emptify(RefMap &M) { + for (auto I = M.begin(), E = M.end(); I != E; ) + I = I->second.empty() ? M.erase(I) : std::next(I); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h new file mode 100644 index 0000000..4c1e8f3 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h @@ -0,0 +1,106 @@ +//===--- RDFLiveness.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Recalculate the liveness information given a data flow graph. +// This includes block live-ins and kill flags. 
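+//
+// A typical driver sequence (a sketch: it assumes a DataFlowGraph has
+// been constructed for the same function, with the constructor arguments
+// matching the graph's members):
+//
+//   DataFlowGraph G(MF, TII, TRI, MDT, MDF, RAI, TOI);
+//   G.build();
+//   Liveness LV(MF.getRegInfo(), G);
+//   LV.computeLiveIns();  // compute block live-in sets from the graph
+//   LV.resetLiveIns();    // rewrite the MachineBasicBlock live-in lists
+//   LV.resetKills();      // recompute kill flags from the new live-ins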
+ +#ifndef RDF_LIVENESS_H +#define RDF_LIVENESS_H + +#include "RDFGraph.h" +#include "llvm/ADT/DenseMap.h" +#include <map> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineRegisterInfo; + class TargetRegisterInfo; + class MachineDominatorTree; + class MachineDominanceFrontier; +} + +namespace rdf { + struct Liveness { + public: + typedef std::map<MachineBasicBlock*,RegisterSet> LiveMapType; + typedef std::map<RegisterRef,NodeSet> RefMap; + + Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) + : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()), + RAI(g.getRAI()), MRI(mri), Empty(), Trace(false) {} + + NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA, + bool FullChain = false, const RegisterSet &DefRRs = RegisterSet()); + NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA); + + LiveMapType &getLiveMap() { return LiveMap; } + const LiveMapType &getLiveMap() const { return LiveMap; } + const RefMap &getRealUses(NodeId P) const { + auto F = RealUseMap.find(P); + return F == RealUseMap.end() ? Empty : F->second; + } + + void computePhiInfo(); + void computeLiveIns(); + void resetLiveIns(); + void resetKills(); + void resetKills(MachineBasicBlock *B); + + void trace(bool T) { Trace = T; } + + private: + const DataFlowGraph &DFG; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + MachineRegisterInfo &MRI; + LiveMapType LiveMap; + const RefMap Empty; + bool Trace; + + // Cache of mapping from node ids (for RefNodes) to the containing + // basic blocks. Not computing it each time for each node reduces + // the liveness calculation time by a large fraction. + typedef DenseMap<NodeId,MachineBasicBlock*> NodeBlockMap; + NodeBlockMap NBMap; + + // Phi information: + // + // map: NodeId -> (map: RegisterRef -> NodeSet) + // phi id -> (map: register -> set of reached non-phi uses) + std::map<NodeId, RefMap> RealUseMap; + + // Inverse iterated dominance frontier. + std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF; + + // Live on entry. + std::map<MachineBasicBlock*,RefMap> PhiLON; + + // Phi uses are considered to be located at the end of the block that + // they are associated with. The reaching def of a phi use dominates the + // block that the use corresponds to, but not the block that contains + // the phi itself. To include these uses in the liveness propagation (up + // the dominator tree), create a map: block -> set of uses live on exit. 
+ std::map<MachineBasicBlock*,RefMap> PhiLOX; + + bool isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const; + RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const; + unsigned getPhysReg(RegisterRef RR) const; + MachineBasicBlock *getBlockWithRef(NodeId RN) const; + void traverse(MachineBasicBlock *B, RefMap &LiveIn); + void emptify(RefMap &M); + }; +} + +#endif // RDF_LIVENESS_H diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index 70141a9..72afec1 100644 --- a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -17,8 +17,6 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; - class MSP430InstPrinter : public MCInstPrinter { public: MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index ff5b0b6..183dee3 100644 --- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MSP430MCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MSP430MCAsmInfo(const Triple &TT); - }; +class MSP430MCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MSP430MCAsmInfo(const Triple &TT); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp index ffcf222..606abc2 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -64,7 +64,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; unsigned BlockSize = 0; for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 29bc8b3..18f38b7 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -69,10 +69,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); // Provide all sorts of operation actions - - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(MSP430::SP); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? 
@@ -508,9 +504,10 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Create the SelectionDAG nodes corresponding to a load //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i16); - InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + InVal = DAG.getLoad( + VA.getLocVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } InVals.push_back(InVal); @@ -1231,8 +1228,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, } const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // Create loop block MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -1320,8 +1316,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 72b1780..d4f82bd 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -44,11 +44,10 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16mr)) @@ -72,11 +71,10 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16rm)) diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp index 54154a8..47b0e27 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -50,9 +50,9 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const { MCSymbol *MSP430MCInstLower:: GetJumpTableSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout &DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); @@ -67,9 +67,9 @@ GetJumpTableSymbol(const MachineOperand &MO) const { 
MCSymbol *MSP430MCInstLower:: GetConstantPoolIndexSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout &DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "CPI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp index 0f75399..b442fc0 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===// +//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h index fcc5f5b..2d93731 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==// +//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 5107d2a..d4e061f 100644 --- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsRegisterInfo.h" +#include "MipsTargetObjectFile.h" #include "MipsTargetStreamer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" @@ -106,7 +107,6 @@ class MipsAsmParser : public MCTargetAsmParser { return static_cast<MipsTargetStreamer &>(TS); } - MCSubtargetInfo &STI; MipsABIInfo ABI; SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions; MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a @@ -114,6 +114,12 @@ class MipsAsmParser : public MCTargetAsmParser { // selected. This usually happens after an '.end func' // directive. bool IsLittleEndian; + bool IsPicEnabled; + bool IsCpRestoreSet; + int CpRestoreOffset; + unsigned CpSaveLocation; + /// If true, then CpSaveLocation is a register, otherwise it's an offset. + bool CpSaveLocationIsRegister; // Print a warning along with its fix-it message at the given range. 
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg, @@ -141,50 +147,41 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID) override; - MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy + OperandMatchResultTy parseMemOperand(OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterNameWithoutDollar(OperandVector &Operands, StringRef Identifier, SMLoc S); - - MipsAsmParser::OperandMatchResultTy - matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); - - MipsAsmParser::OperandMatchResultTy parseAnyRegister(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseJumpTarget(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterPair (OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseMovePRegPair(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterList (OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands, + SMLoc S); + OperandMatchResultTy parseAnyRegister(OperandVector &Operands); + OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseJumpTarget(OperandVector &Operands); + OperandMatchResultTy parseInvNum(OperandVector &Operands); + OperandMatchResultTy parseLSAImm(OperandVector &Operands); + OperandMatchResultTy parseRegisterPair(OperandVector &Operands); + OperandMatchResultTy parseMovePRegPair(OperandVector &Operands); + OperandMatchResultTy parseRegisterList(OperandVector &Operands); bool searchSymbolAlias(OperandVector &Operands); bool parseOperand(OperandVector &, StringRef Mnemonic); - bool needsExpansion(MCInst &Inst); + enum MacroExpanderResultTy { + MER_NotAMacro, + MER_Success, + MER_Fail, + }; // Expands assembly pseudo instructions. - // Returns false on success, true otherwise. 
- bool expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + MacroExpanderResultTy + tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg, - bool Is32BitImm, SMLoc IDLoc, + bool Is32BitImm, bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, @@ -194,11 +191,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, bool Is32BitAddress, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -209,24 +205,43 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, const bool IsMips64, + const bool Signed); + + bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandUlw(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + void createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); void createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, SmallVectorImpl<MCInst> &Instructions); + void createCpRestoreMemOp(bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -239,8 +254,11 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetMips0Directive(); bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); + bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. bool parseDirectiveCpLoad(SMLoc Loc); + bool parseDirectiveCpRestore(SMLoc Loc); bool parseDirectiveCPSetup(); + bool parseDirectiveCPReturn(); bool parseDirectiveNaN(); bool parseDirectiveSet(); bool parseDirectiveOption(); @@ -337,6 +355,7 @@ class MipsAsmParser : public MCTargetAsmParser { // FeatureMipsGP64 | FeatureMips1) // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4). 
void selectArch(StringRef ArchFeature) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset FeatureBits = STI.getFeatureBits(); FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask; STI.setFeatureBits(FeatureBits); @@ -346,7 +365,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void setFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (!(STI.getFeatureBits()[Feature])) { + if (!(getSTI().getFeatureBits()[Feature])) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -354,7 +374,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void clearFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (STI.getFeatureBits()[Feature]) { + if (getSTI().getFeatureBits()[Feature]) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -363,26 +384,25 @@ class MipsAsmParser : public MCTargetAsmParser { void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { setFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { clearFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } public: enum MipsMatchResultTy { - Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY + Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "MipsGenAsmMatcher.inc" #undef GET_OPERAND_DIAGNOSTIC_TYPES - }; - MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), + : MCTargetAsmParser(Options, sti), ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()), sti.getCPU(), Options)) { MCAsmParserExtension::Initialize(parser); @@ -390,15 +410,15 @@ public: parser.addAliasForDirective(".asciiz", ".asciz"); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); - + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); + // Create an assembler options environment for the user to modify. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); @@ -407,6 +427,12 @@ public: CurrentFn = nullptr; + IsPicEnabled = + (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_); + + IsCpRestoreSet = false; + CpRestoreOffset = -1; + Triple TheTriple(sti.getTargetTriple()); if ((TheTriple.getArch() == Triple::mips) || (TheTriple.getArch() == Triple::mips64)) @@ -418,70 +444,103 @@ public: /// True if all of $fcc0 - $fcc7 exist for the current ISA. 
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } - bool isGP64bit() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; } - bool isFP64bit() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; } + bool isGP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureGP64Bit]; + } + bool isFP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureFP64Bit]; + } const MipsABIInfo &getABI() const { return ABI; } bool isABI_N32() const { return ABI.IsN32(); } bool isABI_N64() const { return ABI.IsN64(); } bool isABI_O32() const { return ABI.IsO32(); } - bool isABI_FPXX() const { return STI.getFeatureBits()[Mips::FeatureFPXX]; } + bool isABI_FPXX() const { + return getSTI().getFeatureBits()[Mips::FeatureFPXX]; + } bool useOddSPReg() const { - return !(STI.getFeatureBits()[Mips::FeatureNoOddSPReg]); + return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]); } bool inMicroMipsMode() const { - return STI.getFeatureBits()[Mips::FeatureMicroMips]; + return getSTI().getFeatureBits()[Mips::FeatureMicroMips]; + } + bool hasMips1() const { + return getSTI().getFeatureBits()[Mips::FeatureMips1]; + } + bool hasMips2() const { + return getSTI().getFeatureBits()[Mips::FeatureMips2]; + } + bool hasMips3() const { + return getSTI().getFeatureBits()[Mips::FeatureMips3]; + } + bool hasMips4() const { + return getSTI().getFeatureBits()[Mips::FeatureMips4]; + } + bool hasMips5() const { + return getSTI().getFeatureBits()[Mips::FeatureMips5]; } - bool hasMips1() const { return STI.getFeatureBits()[Mips::FeatureMips1]; } - bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; } - bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; } - bool hasMips4() const { return STI.getFeatureBits()[Mips::FeatureMips4]; } - bool hasMips5() const { return STI.getFeatureBits()[Mips::FeatureMips5]; } bool hasMips32() const { - return STI.getFeatureBits()[Mips::FeatureMips32]; + return getSTI().getFeatureBits()[Mips::FeatureMips32]; } bool hasMips64() const { - return STI.getFeatureBits()[Mips::FeatureMips64]; + return getSTI().getFeatureBits()[Mips::FeatureMips64]; } bool hasMips32r2() const { - return STI.getFeatureBits()[Mips::FeatureMips32r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r2]; } bool hasMips64r2() const { - return STI.getFeatureBits()[Mips::FeatureMips64r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r2]; } bool hasMips32r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]); } bool hasMips64r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]); } bool hasMips32r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]); } bool hasMips64r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]); } bool hasMips32r6() const { - return STI.getFeatureBits()[Mips::FeatureMips32r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r6]; } bool hasMips64r6() const { - return STI.getFeatureBits()[Mips::FeatureMips64r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r6]; } - bool hasDSP() const { return STI.getFeatureBits()[Mips::FeatureDSP]; } - bool hasDSPR2() const { return STI.getFeatureBits()[Mips::FeatureDSPR2]; } - bool hasMSA() const { return STI.getFeatureBits()[Mips::FeatureMSA]; } + bool hasDSP() const { + 
return getSTI().getFeatureBits()[Mips::FeatureDSP]; + } + bool hasDSPR2() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR2]; + } + bool hasDSPR3() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR3]; + } + bool hasMSA() const { + return getSTI().getFeatureBits()[Mips::FeatureMSA]; + } bool hasCnMips() const { - return (STI.getFeatureBits()[Mips::FeatureCnMips]); + return (getSTI().getFeatureBits()[Mips::FeatureCnMips]); + } + + bool inPicMode() { + return IsPicEnabled; } bool inMips16Mode() const { - return STI.getFeatureBits()[Mips::FeatureMips16]; + return getSTI().getFeatureBits()[Mips::FeatureMips16]; + } + + bool useTraps() const { + return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV]; } bool useSoftFloat() const { - return STI.getFeatureBits()[Mips::FeatureSoftFloat]; + return getSTI().getFeatureBits()[Mips::FeatureSoftFloat]; } /// Warn if RegIndex is the same as the current AT. @@ -869,6 +928,16 @@ public: Inst.addOperand(MCOperand::createReg(getHWRegsReg())); } + template <unsigned Bits, int Offset = 0, int AdjustOffset = 0> + void addConstantUImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + uint64_t Imm = getConstantImm() - Offset; + Imm &= (1 << Bits) - 1; + Imm += Offset; + Imm += AdjustOffset; + Inst.addOperand(MCOperand::createImm(Imm)); + } + void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCExpr *Expr = getImm(); @@ -878,7 +947,9 @@ public: void addMemOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(getMemBase()->getGPR32Reg())); + Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit() + ? getMemBase()->getGPR64Reg() + : getMemBase()->getGPR32Reg())); const MCExpr *Expr = getMemOff(); addExpr(Inst, Expr); @@ -924,10 +995,16 @@ public: bool isRegIdx() const { return Kind == k_RegisterIndex; } bool isImm() const override { return Kind == k_Immediate; } bool isConstantImm() const { - return isImm() && dyn_cast<MCConstantExpr>(getImm()); + return isImm() && isa<MCConstantExpr>(getImm()); + } + bool isConstantImmz() const { + return isConstantImm() && getConstantImm() == 0; } - template <unsigned Bits> bool isUImm() const { - return isImm() && isConstantImm() && isUInt<Bits>(getConstantImm()); + template <unsigned Bits, int Offset = 0> bool isConstantUImm() const { + return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset); + } + template <unsigned Bits> bool isConstantSImm() const { + return isConstantImm() && isInt<Bits>(getConstantImm()); } bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. 
@@ -936,10 +1013,15 @@ public: } bool isMem() const override { return Kind == k_Memory; } bool isConstantMemOff() const { - return isMem() && dyn_cast<MCConstantExpr>(getMemOff()); + return isMem() && isa<MCConstantExpr>(getMemOff()); } template <unsigned Bits> bool isMemWithSimmOffset() const { - return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()); + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) + && getMemBase()->isGPRAsmReg(); + } + template <unsigned Bits> bool isMemWithSimmOffsetGPR() const { + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) && + getMemBase()->isGPRAsmReg(); } bool isMemWithGRPMM16Base() const { return isMem() && getMemBase()->isMM16AsmReg(); @@ -953,13 +1035,23 @@ public: && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP); } + template <unsigned Bits, unsigned ShiftLeftAmount> + bool isScaledUImm() const { + return isConstantImm() && + isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm()); + } bool isRegList16() const { if (!isRegList()) return false; int Size = RegList.List->size(); - if (Size < 2 || Size > 5 || *RegList.List->begin() != Mips::S0 || - RegList.List->back() != Mips::RA) + if (Size < 2 || Size > 5) + return false; + + unsigned R0 = RegList.List->front(); + unsigned R1 = RegList.List->back(); + if (!((R0 == Mips::S0 && R1 == Mips::RA) || + (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; int PrevReg = *RegList.List->begin(); @@ -1304,9 +1396,123 @@ static bool hasShortDelaySlot(unsigned Opcode) { } } +static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) { + if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) { + return &SRExpr->getSymbol(); + } + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) { + const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS()); + const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS()); + + if (LHSSym) + return LHSSym; + + if (RHSSym) + return RHSSym; + + return nullptr; + } + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return getSingleMCSymbol(UExpr->getSubExpr()); + + return nullptr; +} + +static unsigned countMCSymbolRefExpr(const MCExpr *Expr) { + if (isa<MCSymbolRefExpr>(Expr)) + return 1; + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) + return countMCSymbolRefExpr(BExpr->getLHS()) + + countMCSymbolRefExpr(BExpr->getRHS()); + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return countMCSymbolRefExpr(UExpr->getSubExpr()); + + return 0; +} + +namespace { +void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(Op1); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, Instructions); +} + +void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, Instructions); +} + +void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createImm(Imm1)); + tmpInst.addOperand(MCOperand::createImm(Imm2)); 
+ tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(MCOperand::createReg(Reg1)); + tmpInst.addOperand(Op2); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, + Instructions); +} + +void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, + Instructions); +} + +void emitAppropriateDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + if (ShiftAmount >= 32) { + emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc, + Instructions); + return; + } + + emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, Instructions); +} +} // end anonymous namespace. + bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + bool ExpandedJalSym = false; Inst.setLoc(IDLoc); @@ -1365,12 +1571,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: + case Mips::BEQZC16_MMR6: case Mips::BNEZ16_MM: + case Mips::BNEZC16_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(8, Offset.getImm())) + if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 2LL)) return Error(IDLoc, "branch to misaligned address"); @@ -1415,32 +1623,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } break; - case Mips::CINS: - case Mips::CINS32: - case Mips::EXTS: - case Mips::EXTS32: - assert(MCID.getNumOperands() == 4 && "unexpected number of operands"); - // Check length - Opnd = Inst.getOperand(3); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > 31) - return Error(IDLoc, "immediate operand value out of range"); - // Check position - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > (Opcode == Mips::CINS || - Opcode == Mips::EXTS ? 63 : 31)) - return Error(IDLoc, "immediate operand value out of range"); - if (Imm > 31) { - Inst.setOpcode(Opcode == Mips::CINS ? 
Mips::CINS32 : Mips::EXTS32); - Inst.getOperand(2).setImm(Imm - 32); - } - break; - case Mips::SEQi: case Mips::SNEi: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); @@ -1454,6 +1636,81 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } } + // This expansion is not in a function called by tryExpandInstruction() + // because the pseudo-instruction doesn't have a distinct opcode. + if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) && + inPicMode()) { + warnIfNoMacro(IDLoc); + + const MCExpr *JalExpr = Inst.getOperand(0).getExpr(); + + // We can do this expansion if there's only 1 symbol in the argument + // expression. + if (countMCSymbolRefExpr(JalExpr) > 1) + return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode"); + + // FIXME: This is checking the expression can be handled by the later stages + // of the assembler. We ought to leave it to those later stages but + // we can't do that until we stop evaluateRelocExpr() rewriting the + // expressions into non-equivalent forms. + const MCSymbol *JalSym = getSingleMCSymbol(JalExpr); + + // FIXME: Add support for label+offset operands (currently causes an error). + // FIXME: Add support for forward-declared local symbols. + // FIXME: Add expansion for when the LargeGOT option is enabled. + if (JalSym->isInSection() || JalSym->isTemporary()) { + if (isABI_O32()) { + // If it's a local symbol and the O32 ABI is being used, we expand to: + // lw $25, 0($gp) + // R_(MICRO)MIPS_GOT16 label + // addiu $25, $25, 0 + // R_(MICRO)MIPS_LO16 label + // jalr $25 + const MCExpr *Got16RelocExpr = evaluateRelocExpr(JalExpr, "got"); + const MCExpr *Lo16RelocExpr = evaluateRelocExpr(JalExpr, "lo"); + + emitRRX(Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Got16RelocExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, Mips::T9, Mips::T9, + MCOperand::createExpr(Lo16RelocExpr), IDLoc, Instructions); + } else if (isABI_N32() || isABI_N64()) { + // If it's a local symbol and the N32/N64 ABIs are being used, + // we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_GOT_DISP label + // jalr $25 + const MCExpr *GotDispRelocExpr = evaluateRelocExpr(JalExpr, "got_disp"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(GotDispRelocExpr), IDLoc, Instructions); + } + } else { + // If it's an external/weak symbol, we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_CALL16 label + // jalr $25 + const MCExpr *Call16RelocExpr = evaluateRelocExpr(JalExpr, "call16"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Call16RelocExpr), IDLoc, Instructions); + } + + MCInst JalrInst; + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + JalrInst.addOperand(MCOperand::createReg(Mips::RA)); + JalrInst.addOperand(MCOperand::createReg(Mips::T9)); + + // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR. + // This relocation is supposed to be an optimization hint for the linker + // and is not necessary for correctness. + + Inst = JalrInst; + ExpandedJalSym = true; + } + if (MCID.mayLoad() || MCID.mayStore()) { // Check the offset of memory operand, if it is a symbol // reference or immediate we may have to expand instructions. 
@@ -1500,17 +1757,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, int MemOffset = Op.getImm(); MCOperand &DstReg = Inst.getOperand(0); MCOperand &BaseReg = Inst.getOperand(1); - if (isIntN(9, MemOffset) && (MemOffset % 4 == 0) && + if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) && getContext().getRegisterInfo()->getRegClass( Mips::GPRMM16RegClassID).contains(DstReg.getReg()) && - BaseReg.getReg() == Mips::GP) { - MCInst TmpInst; - TmpInst.setLoc(IDLoc); - TmpInst.setOpcode(Mips::LWGP_MM); - TmpInst.addOperand(MCOperand::createReg(DstReg.getReg())); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); - TmpInst.addOperand(MCOperand::createImm(MemOffset)); - Instructions.push_back(TmpInst); + (BaseReg.getReg() == Mips::GP || + BaseReg.getReg() == Mips::GP_64)) { + + emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset, + IDLoc, Instructions); return false; } } @@ -1597,7 +1851,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < -1 || Imm > 14) return Error(IDLoc, "immediate operand value out of range"); break; + case Mips::TEQ_MM: + case Mips::TGE_MM: + case Mips::TGEU_MM: + case Mips::TLT_MM: + case Mips::TLTU_MM: + case Mips::TNE_MM: case Mips::SB16_MM: + case Mips::SB16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1607,6 +1868,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1616,6 +1878,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1623,93 +1886,111 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < 0 || Imm > 60 || (Imm % 4 != 0)) return Error(IDLoc, "immediate operand value out of range"); break; - case Mips::CACHE: - case Mips::PREF: - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (!isUInt<5>(Imm)) - return Error(IDLoc, "immediate operand value out of range"); - break; case Mips::ADDIUPC_MM: MCOperand Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); int Imm = Opnd.getImm(); - if ((Imm % 4 != 0) || !isIntN(25, Imm)) + if ((Imm % 4 != 0) || !isInt<25>(Imm)) return Error(IDLoc, "immediate operand value out of range"); break; } } - if (needsExpansion(Inst)) { - if (expandInstruction(Inst, IDLoc, Instructions)) - return true; - } else + MacroExpanderResultTy ExpandResult = + tryExpandInstruction(Inst, IDLoc, Instructions); + switch (ExpandResult) { + case MER_NotAMacro: Instructions.push_back(Inst); + break; + case MER_Success: + break; + case MER_Fail: + return true; + } // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); - return false; -} + if ((Inst.getOpcode() == Mips::JalOneReg || + Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) && + isPicAndNotNxxAbi()) { + if (IsCpRestoreSet) { + // We need a NOP between the JALR and the LW: + // If .set reorder has been used, we've already emitted a NOP. 
+ // If .set noreorder has been used, we need to emit a NOP at this point. + if (!AssemblerOptions.back()->isReorder()) + createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); -bool MipsAsmParser::needsExpansion(MCInst &Inst) { + // Load the $gp from the stack. + SmallVector<MCInst, 3> LoadInsts; + createCpRestoreMemOp(true /*IsLoad*/, CpRestoreOffset /*StackOffset*/, + IDLoc, LoadInsts); - switch (Inst.getOpcode()) { - case Mips::LoadImm32: - case Mips::LoadImm64: - case Mips::LoadAddrImm32: - case Mips::LoadAddrReg32: - case Mips::B_MM_Pseudo: - case Mips::LWM_MM: - case Mips::SWM_MM: - case Mips::JalOneReg: - case Mips::JalTwoReg: - case Mips::BneImm: - case Mips::BeqImm: - case Mips::BLT: - case Mips::BLE: - case Mips::BGE: - case Mips::BGT: - case Mips::BLTU: - case Mips::BLEU: - case Mips::BGEU: - case Mips::BGTU: - case Mips::Ulhu: - case Mips::Ulw: - return true; - default: - return false; + for (const MCInst &Inst : LoadInsts) + Instructions.push_back(Inst); + + } else + Warning(IDLoc, "no .cprestore used in PIC mode"); } + + return false; } -bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +MipsAsmParser::MacroExpanderResultTy +MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { switch (Inst.getOpcode()) { - default: llvm_unreachable("unimplemented expansion"); + default: + return MER_NotAMacro; case Mips::LoadImm32: - return expandLoadImm(Inst, true, IDLoc, Instructions); + return expandLoadImm(Inst, true, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadImm64: - return expandLoadImm(Inst, false, IDLoc, Instructions); + return expandLoadImm(Inst, false, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadAddrImm32: - return expandLoadAddressImm(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrImm64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister, + Inst.getOperand(1), + Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::LoadAddrReg32: - return expandLoadAddressReg(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrReg64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert(Inst.getOperand(1).isReg() && "expected register operand kind"); + assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), + Inst.getOperand(1).getReg(), Inst.getOperand(2), + Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::B_MM_Pseudo: - return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions); + case Mips::B_MMR6_Pseudo: + return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::SWM_MM: case Mips::LWM_MM: - return expandLoadStoreMultiple(Inst, IDLoc, Instructions); + return expandLoadStoreMultiple(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::JalOneReg: case Mips::JalTwoReg: - return expandJalWithRegs(Inst, IDLoc, Instructions); + return expandJalWithRegs(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; case Mips::BneImm: case Mips::BeqImm: - return expandBranchImm(Inst, IDLoc, Instructions); + return expandBranchImm(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::BLT: case Mips::BLE: case Mips::BGE: @@ -1718,78 +1999,97 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, case Mips::BLEU: case Mips::BGEU: case Mips::BGTU: - return expandCondBranches(Inst, IDLoc, Instructions); + case Mips::BLTL: + case Mips::BLEL: + case Mips::BGEL: + case Mips::BGTL: + case Mips::BLTUL: + case Mips::BLEUL: + case Mips::BGEUL: + case Mips::BGTUL: + case Mips::BLTImmMacro: + case Mips::BLEImmMacro: + case Mips::BGEImmMacro: + case Mips::BGTImmMacro: + case Mips::BLTUImmMacro: + case Mips::BLEUImmMacro: + case Mips::BGEUImmMacro: + case Mips::BGTUImmMacro: + case Mips::BLTLImmMacro: + case Mips::BLELImmMacro: + case Mips::BGELImmMacro: + case Mips::BGTLImmMacro: + case Mips::BLTULImmMacro: + case Mips::BLEULImmMacro: + case Mips::BGEULImmMacro: + case Mips::BGTULImmMacro: + return expandCondBranches(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::SDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, true) ? MER_Fail + : MER_Success; + case Mips::DSDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, true) ? MER_Fail + : MER_Success; + case Mips::UDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, false) ? MER_Fail + : MER_Success; + case Mips::DUDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, false) ? MER_Fail + : MER_Success; + case Mips::Ulh: + return expandUlh(Inst, true, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulhu: - return expandUlhu(Inst, IDLoc, Instructions); + return expandUlh(Inst, false, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulw: - return expandUlw(Inst, IDLoc, Instructions); + return expandUlw(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; + case Mips::NORImm: + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ADDi: + case Mips::ADDiu: + case Mips::SLTi: + case Mips::SLTiu: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ANDi: + case Mips::ORi: + case Mips::XORi: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isUInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ROL: + case Mips::ROR: + return expandRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ROLImm: + case Mips::RORImm: + return expandRotationImm(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROL: + case Mips::DROR: + return expandDRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROLImm: + case Mips::DRORImm: + return expandDRotationImm(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; } } -namespace { -void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions); -} - - -void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRRR(unsigned Opcode, unsigned DstReg, unsigned SrcReg, - unsigned SrcReg2, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc, - Instructions); -} - -void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc, - Instructions); -} - -template <int16_t ShiftAmount> -void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - if (ShiftAmount >= 32) - emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions); - else if (ShiftAmount > 0) - emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions); - - // There's no need for an ORi if the immediate is 0. - if (Operand.isImm() && Operand.getImm() == 0) - return; - - emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions); -} - -template <unsigned ShiftAmount> -void createLShiftOri(int64_t Value, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - createLShiftOri<ShiftAmount>(MCOperand::createImm(Value), RegNo, IDLoc, - Instructions); -} -} - bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { // Create a JALR instruction which is going to replace the pseudo-JAL. @@ -1800,8 +2100,11 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, if (Opcode == Mips::JalOneReg) { // jal $rs => jalr $rs - if (inMicroMipsMode()) { - JalrInst.setOpcode(Mips::JALR16_MM); + if (IsCpRestoreSet && inMicroMipsMode()) { + JalrInst.setOpcode(Mips::JALRS16_MM); + JalrInst.addOperand(FirstRegOp); + } else if (inMicroMipsMode()) { + JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM); JalrInst.addOperand(FirstRegOp); } else { JalrInst.setOpcode(Mips::JALR); @@ -1810,30 +2113,47 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, } } else if (Opcode == Mips::JalTwoReg) { // jal $rd, $rs => jalr $rd, $rs - JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); JalrInst.addOperand(FirstRegOp); const MCOperand SecondRegOp = Inst.getOperand(1); JalrInst.addOperand(SecondRegOp); } Instructions.push_back(JalrInst); - // If .set reorder is active, emit a NOP after it. 
- if (AssemblerOptions.back()->isReorder()) { - // This is a 32-bit NOP because these 2 pseudo-instructions - // do not have a short delay slot. - MCInst NopInst; - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - Instructions.push_back(NopInst); + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. + const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) { + createNop(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc, Instructions); } return false; } +/// Can the value be represented by a unsigned N-bit value and a shift left? +template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) { + unsigned BitNum = findFirstSet(x); + + return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum); +} + +/// Load (or add) an immediate into a register. +/// +/// @param ImmValue The immediate to load. +/// @param DstReg The register that will hold the immediate. +/// @param SrcReg A register to add to the immediate or Mips::NoRegister +/// for a simple initialization. +/// @param Is32BitImm Is ImmValue 32-bit or 64-bit? +/// @param IsAddress True if the immediate represents an address. False if it +/// is an integer. +/// @param IDLoc Location of the immediate in the source file. +/// @param Instructions The instructions emitted by this expansion. bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, - unsigned SrcReg, bool Is32BitImm, SMLoc IDLoc, + unsigned SrcReg, bool Is32BitImm, + bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { if (!Is32BitImm && !isGP64bit()) { Error(IDLoc, "instruction requires a 64-bit architecture"); @@ -1852,6 +2172,9 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, } } + unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg(); + unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu; + bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; @@ -1866,111 +2189,129 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, TmpReg = ATReg; } - // FIXME: gas has a special case for values that are 000...1111, which - // becomes a li -1 and then a dsrl if (isInt<16>(ImmValue)) { - // li d,j => addiu d,$zero,j if (!UseSrcReg) - SrcReg = Mips::ZERO; + SrcReg = ZeroReg; + + // This doesn't quite follow the usual ABI expectations for N32 but matches + // traditional assembler behaviour. N32 would normally use addiu for both + // integers and addresses. 
+ if (IsAddress && !Is32BitImm) { + emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); + return false; + } + emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); - } else if (isUInt<16>(ImmValue)) { - // li d,j => ori d,$zero,j + return false; + } + + if (isUInt<16>(ImmValue)) { unsigned TmpReg = DstReg; if (SrcReg == DstReg) { - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) + TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - TmpReg = ATReg; } - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, Instructions); if (UseSrcReg) - emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { + emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); - // For all other values which are representable as a 32-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(j) uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; if (!Is32BitImm && !isInt<32>(ImmValue)) { - // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the + // Traditional behaviour seems to special case this particular value. It's + // not clear why other masks are handled differently. + if (ImmValue == 0xffffffff) { + emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, Instructions); + emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + // Expand to an ORi instead of a LUi to avoid sign-extending into the // upper 32 bits. - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, Instructions); emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions); - } else - emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); - createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); - - } else if ((ImmValue & (0xffffLL << 48)) == 0) { - warnIfNoMacro(IDLoc); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } - // <------- lo32 ------> - // <------- hi32 ------> - // <- hi16 -> <- lo16 -> - // _________________________________ - // | | | | - // | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________| - // - // For any 64-bit value that is representable as a 48-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + if (isShiftedUIntAtAnyPosition<16>(ImmValue)) { + if (Is32BitImm) { + Error(IDLoc, "instruction requires a 32-bit immediate"); + return true; + } - emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions); - createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // 
Traditionally, these immediates are shifted as little as possible and as + // such we align the most significant bit to bit 15 of our temporary. + unsigned FirstSet = findFirstSet((uint64_t)ImmValue); + unsigned LastSet = findLastSet((uint64_t)ImmValue); + unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet)); + uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff; + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, Instructions); + emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else { - warnIfNoMacro(IDLoc); + return false; + } - // <------- hi32 ------> <------- lo32 ------> - // <- hi16 -> <- lo16 -> - // ___________________________________________ - // | | | | | - // | 16-bits | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________|__________| - // - // For all other values which are representable as a 64-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(hi32(j)) - // dsll d,d,16 - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits63To48 = (ImmValue >> 48) & 0xffff; - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + warnIfNoMacro(IDLoc); - emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions); - createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions); + // The remaining case is packed with a sequence of dsll and ori with zeros + // being omitted and any neighbouring dsll's being coalesced. + // The highest 32-bit's are equivalent to a 32-bit immediate load. - // When Bits31To16 is 0, do a left shift of 32 bits instead of doing - // two left shifts of 16 bits. - if (Bits31To16 == 0) { - createLShiftOri<32>(Bits15To0, TmpReg, IDLoc, Instructions); - } else { - createLShiftOri<16>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register. + if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false, + IDLoc, Instructions)) + return false; + + // Shift and accumulate into the register. If a 16-bit chunk is zero, then + // skip it and defer the shift to the next chunk. + unsigned ShiftCarriedForwards = 16; + for (int BitNum = 16; BitNum >= 0; BitNum -= 16) { + uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff; + + if (ImmChunk != 0) { + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, Instructions); + ShiftCarriedForwards = 0; } - if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + ShiftCarriedForwards += 16; } + ShiftCarriedForwards -= 16; + + // Finish any remaining shifts left by trailing zeros. 
+ if (ShiftCarriedForwards) + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; } @@ -1982,63 +2323,38 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, assert(DstRegOp.isReg() && "expected register operand kind"); if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) + Is32BitImm, false, IDLoc, Instructions)) return true; return false; } -bool -MipsAsmParser::expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &SrcRegOp = Inst.getOperand(1); - assert(SrcRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(2); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - SrcRegOp.getReg(), Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; - } - - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), SrcRegOp.getReg(), - Is32BitImm, IDLoc, Instructions)) +bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, + bool Is32BitAddress, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // la can't produce a usable address when addresses are 64-bit. + if (Is32BitAddress && ABI.ArePtrs64bit()) { + // FIXME: Demote this to a warning and continue as if we had 'dla' instead. + // We currently can't do this because we depend on the equality + // operator and N64 can end up with a GPR32/GPR64 mismatch. + Error(IDLoc, "la used to load 64-bit address"); + // Continue as if we had 'dla' instead. + Is32BitAddress = false; + } + + // dla requires 64-bit addresses. 
+ if (!Is32BitAddress && !ABI.ArePtrs64bit()) { + Error(IDLoc, "instruction requires a 64-bit architecture"); return true; - - return false; -} - -bool -MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(1); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - Mips::NoRegister, Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; } - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) - return true; + if (!Offset.isImm()) + return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg, + Is32BitAddress, IDLoc, Instructions); - return false; + return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true, + IDLoc, Instructions); } bool MipsAsmParser::loadAndAddSymbolAddress( @@ -2046,67 +2362,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress( SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { warnIfNoMacro(IDLoc); - if (Is32BitSym && isABI_N64()) - Warning(IDLoc, "instruction loads the 32-bit address of a 64-bit symbol"); - - MCInst tmpInst; - const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymExpr); - const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_HI, getContext()); - const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_LO, getContext()); + const MCExpr *Symbol = cast<MCExpr>(SymExpr); + const MipsMCExpr *HiExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_HI, Symbol, getContext()); + const MipsMCExpr *LoExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_LO, Symbol, getContext()); bool UseSrcReg = SrcReg != Mips::NoRegister; + // This is the 64-bit symbol address expansion. + if (ABI.ArePtrs64bit() && isGP64bit()) { + // We always need AT for the 64-bit expansion. + // If it is not available we exit. 
+ unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + const MipsMCExpr *HighestExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHEST, Symbol, getContext()); + const MipsMCExpr *HigherExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHER, Symbol, getContext()); + + if (UseSrcReg && (DstReg == SrcReg)) { + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %highest(sym) + // daddiu $at, $at, %higher(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %hi(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %lo(sym) + // daddu $rd, $at, $rd + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym) + // lui $at, %hi(sym) + // daddiu $rd, $rd, %higher(sym) + // daddiu $at, $at, %lo(sym) + // dsll32 $rd, $rd, 0 + // daddu $rd, $rd, $at + // (daddu $rd, $rd, $rs) + emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, Instructions); + emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // And now, the 32-bit symbol address expansion: + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %hi(sym) + // ori $at, $at, %lo(sym) + // addu $rd, $at, $rd + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym) + // ori $rd, $rd, %lo(sym) + // (addu $rd, $rd, $rs) unsigned TmpReg = DstReg; if (UseSrcReg && (DstReg == SrcReg)) { - // At this point we need AT to perform the expansions and we exit if it is - // not available. + // If $rs is the same as $rd, we need to use AT. + // If it is not available we exit. 
unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } - if (!Is32BitSym) { - // If it's a 64-bit architecture, expand to: - // la d,sym => lui d,highest(sym) - // ori d,d,higher(sym) - // dsll d,d,16 - // ori d,d,hi16(sym) - // dsll d,d,16 - // ori d,d,lo16(sym) - const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHEST, getContext()); - const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHER, getContext()); - - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HighestExpr)); - Instructions.push_back(tmpInst); - - createLShiftOri<0>(MCOperand::createExpr(HigherExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(HiExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(), - Instructions); - } else { - // Otherwise, expand to: - // la d,sym => lui d,hi16(sym) - // ori d,d,lo16(sym) - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HiExpr)); - Instructions.push_back(tmpInst); - - emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(), - Instructions); - } + emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitSym, Instructions); + emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + else + assert(DstReg == TmpReg); return false; } @@ -2125,12 +2476,13 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( Inst.addOperand(MCOperand::createExpr(Offset.getExpr())); } else { assert(Offset.isImm() && "expected immediate operand kind"); - if (isIntN(11, Offset.getImm())) { + if (isInt<11>(Offset.getImm())) { // If offset fits into 11 bits then this instruction becomes microMIPS // 16-bit unconditional branch instruction. - Inst.setOpcode(Mips::B16_MM); + if (inMicroMipsMode()) + Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM); } else { - if (!isIntN(17, Offset.getImm())) + if (!isInt<17>(Offset.getImm())) Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) Error(IDLoc, "branch to misaligned address"); @@ -2143,8 +2495,10 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( } Instructions.push_back(Inst); - // If .set reorder is active, emit a NOP after the branch instruction. - if (AssemblerOptions.back()->isReorder()) + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. 
+ const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(true, IDLoc, Instructions); return false; @@ -2175,30 +2529,21 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, } int64_t ImmValue = ImmOp.getImm(); - if (ImmValue == 0) { - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); - } else { + if (ImmValue == 0) + emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc, + Instructions); + else { warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; - if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), IDLoc, - Instructions)) + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true, + IDLoc, Instructions)) return true; - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(ATReg)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); + emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, Instructions); } return false; } @@ -2206,7 +2551,6 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) { - MCInst TempInst; unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; unsigned TmpRegNum; @@ -2227,8 +2571,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, HiOffset++; } else ExprOffset = Inst.getOperand(2).getExpr(); - // All instructions will have the same location. - TempInst.setLoc(IDLoc); // These are some of the types of expansions we perform here: // 1) lw $8, sym => lui $8, %hi(sym) // lw $8, %lo(sym)($8) @@ -2267,40 +2609,20 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, return; } - TempInst.setOpcode(Mips::LUi); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(HiOffset)); - else { - const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi"); - TempInst.addOperand(MCOperand::createExpr(HiExpr)); - } - // Add the instruction to the list. - Instructions.push_back(TempInst); - // Prepare TempInst for next instruction. - TempInst.clear(); + emitRX(Mips::LUi, TmpRegNum, + isImmOpnd ? MCOperand::createImm(HiOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "hi")), + IDLoc, Instructions); // Add temp register to base. - if (BaseRegNum != Mips::ZERO) { - TempInst.setOpcode(Mips::ADDu); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(BaseRegNum)); - Instructions.push_back(TempInst); - TempInst.clear(); - } + if (BaseRegNum != Mips::ZERO) + emitRRR(Mips::ADDu, TmpRegNum, TmpRegNum, BaseRegNum, IDLoc, Instructions); // And finally, create original instruction with low part // of offset and new base. 
- TempInst.setOpcode(Inst.getOpcode()); - TempInst.addOperand(MCOperand::createReg(RegOpNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(LoOffset)); - else { - const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo"); - TempInst.addOperand(MCOperand::createExpr(LoExpr)); - } - Instructions.push_back(TempInst); - TempInst.clear(); + emitRRX(Inst.getOpcode(), RegOpNum, TmpRegNum, + isImmOpnd + ? MCOperand::createImm(LoOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "lo")), + IDLoc, Instructions); } bool @@ -2316,10 +2638,16 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 && Inst.getOperand(OpNum - 1).getImm() >= 0 && - Inst.getOperand(OpNum - 2).getReg() == Mips::SP && - Inst.getOperand(OpNum - 3).getReg() == Mips::RA) + (Inst.getOperand(OpNum - 2).getReg() == Mips::SP || + Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) && + (Inst.getOperand(OpNum - 3).getReg() == Mips::RA || + Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) { // It can be implemented as SWM16 or LWM16 instruction. - NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + if (inMicroMipsMode() && hasMips32r6()) + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6; + else + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + } Inst.setOpcode(NewOpcode); Instructions.push_back(Inst); @@ -2328,44 +2656,126 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + bool EmittedNoMacroWarning = false; unsigned PseudoOpcode = Inst.getOpcode(); unsigned SrcReg = Inst.getOperand(0).getReg(); - unsigned TrgReg = Inst.getOperand(1).getReg(); + const MCOperand &TrgOp = Inst.getOperand(1); const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr(); unsigned ZeroSrcOpcode, ZeroTrgOpcode; - bool ReverseOrderSLT, IsUnsigned, AcceptsEquality; + bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality; + + unsigned TrgReg; + if (TrgOp.isReg()) + TrgReg = TrgOp.getReg(); + else if (TrgOp.isImm()) { + warnIfNoMacro(IDLoc); + EmittedNoMacroWarning = true; + + TrgReg = getATReg(IDLoc); + if (!TrgReg) + return true; + + switch(PseudoOpcode) { + default: + llvm_unreachable("unknown opcode for branch pseudo-instruction"); + case Mips::BLTImmMacro: + PseudoOpcode = Mips::BLT; + break; + case Mips::BLEImmMacro: + PseudoOpcode = Mips::BLE; + break; + case Mips::BGEImmMacro: + PseudoOpcode = Mips::BGE; + break; + case Mips::BGTImmMacro: + PseudoOpcode = Mips::BGT; + break; + case Mips::BLTUImmMacro: + PseudoOpcode = Mips::BLTU; + break; + case Mips::BLEUImmMacro: + PseudoOpcode = Mips::BLEU; + break; + case Mips::BGEUImmMacro: + PseudoOpcode = Mips::BGEU; + break; + case Mips::BGTUImmMacro: + PseudoOpcode = Mips::BGTU; + break; + case Mips::BLTLImmMacro: + PseudoOpcode = Mips::BLTL; + break; + case Mips::BLELImmMacro: + PseudoOpcode = Mips::BLEL; + break; + case Mips::BGELImmMacro: + PseudoOpcode = Mips::BGEL; + break; + case Mips::BGTLImmMacro: + PseudoOpcode = Mips::BGTL; + break; + case Mips::BLTULImmMacro: + PseudoOpcode = Mips::BLTUL; + break; + case Mips::BLEULImmMacro: + PseudoOpcode = Mips::BLEUL; + break; + case Mips::BGEULImmMacro: + PseudoOpcode = Mips::BGEUL; + break; + case Mips::BGTULImmMacro: + PseudoOpcode = Mips::BGTUL; + break; + } + + if (loadImmediate(TrgOp.getImm(), TrgReg, 
Mips::NoRegister, !isGP64bit(), + false, IDLoc, Instructions)) + return true; + } switch (PseudoOpcode) { case Mips::BLT: case Mips::BLTU: + case Mips::BLTL: + case Mips::BLTUL: AcceptsEquality = false; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BLTU); + IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL)); + IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL)); ZeroSrcOpcode = Mips::BGTZ; ZeroTrgOpcode = Mips::BLTZ; break; case Mips::BLE: case Mips::BLEU: + case Mips::BLEL: + case Mips::BLEUL: AcceptsEquality = true; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BLEU); + IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL)); + IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL)); ZeroSrcOpcode = Mips::BGEZ; ZeroTrgOpcode = Mips::BLEZ; break; case Mips::BGE: case Mips::BGEU: + case Mips::BGEL: + case Mips::BGEUL: AcceptsEquality = true; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BGEU); + IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL)); + IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL)); ZeroSrcOpcode = Mips::BLEZ; ZeroTrgOpcode = Mips::BGEZ; break; case Mips::BGT: case Mips::BGTU: + case Mips::BGTL: + case Mips::BGTUL: AcceptsEquality = false; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BGTU); + IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL)); + IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL)); ZeroSrcOpcode = Mips::BLTZ; ZeroTrgOpcode = Mips::BGTZ; break; @@ -2373,7 +2783,6 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, llvm_unreachable("unknown opcode for branch pseudo-instruction"); } - MCInst BranchInst; bool IsTrgRegZero = (TrgReg == Mips::ZERO); bool IsSrcRegZero = (SrcReg == Mips::ZERO); if (IsSrcRegZero && IsTrgRegZero) { @@ -2381,51 +2790,37 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // with GAS' behaviour. However, they may not generate the most efficient // code in some circumstances. 
if (PseudoOpcode == Mips::BLT) { - BranchInst.setOpcode(Mips::BLTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BLE) { - BranchInst.setOpcode(Mips::BLEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGE) { - BranchInst.setOpcode(Mips::BGEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGT) { - BranchInst.setOpcode(Mips::BGTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BGTU) { - BranchInst.setOpcode(Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } if (AcceptsEquality) { // If both registers are $0 and the pseudo-branch accepts equality, it // will always be taken, so we emit an unconditional branch. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2449,11 +2844,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // the pseudo-branch will always be taken, so we emit an unconditional // branch. // This only applies to unsigned pseudo-branches. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2470,21 +2862,17 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // Because only BLEU and BGEU branch on equality, we can use the // AcceptsEquality variable to decide when to emit the BEQZ. - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand( - MCOperand::createReg(IsSrcRegZero ? 
TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE, + IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } // If we have a signed pseudo-branch and one of the registers is $0, // we can use an appropriate compare-to-zero branch. We select which one // to use in the switch statement above. - BranchInst.setOpcode(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode); - BranchInst.addOperand(MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode, + IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr), + IDLoc, Instructions); return false; } @@ -2494,7 +2882,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, if (!ATRegNum) return true; - warnIfNoMacro(IDLoc); + if (!EmittedNoMacroWarning) + warnIfNoMacro(IDLoc); // SLT fits well with 2 of our 4 pseudo-branches: // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and @@ -2511,23 +2900,135 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // The same applies to the unsigned variants, except that SLTu is used // instead of SLT. - MCInst SetInst; - SetInst.setOpcode(IsUnsigned ? Mips::SLTu : Mips::SLT); - SetInst.addOperand(MCOperand::createReg(ATRegNum)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? TrgReg : SrcReg)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? SrcReg : TrgReg)); - Instructions.push_back(SetInst); - - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(ATRegNum)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum, + ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg, + IDLoc, Instructions); + + emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL) + : (AcceptsEquality ? Mips::BEQ : Mips::BNE), + ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } -bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, + const bool IsMips64, const bool Signed) { + if (hasMips32r6()) { + Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); + return false; + } + + warnIfNoMacro(IDLoc); + + const MCOperand &RsRegOp = Inst.getOperand(0); + assert(RsRegOp.isReg() && "expected register operand kind"); + unsigned RsReg = RsRegOp.getReg(); + + const MCOperand &RtRegOp = Inst.getOperand(1); + assert(RtRegOp.isReg() && "expected register operand kind"); + unsigned RtReg = RtRegOp.getReg(); + unsigned DivOp; + unsigned ZeroReg; + + if (IsMips64) { + DivOp = Signed ? Mips::DSDIV : Mips::DUDIV; + ZeroReg = Mips::ZERO_64; + } else { + DivOp = Signed ? 
Mips::SDIV : Mips::UDIV; + ZeroReg = Mips::ZERO; + } + + bool UseTraps = useTraps(); + + if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) { + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) + Warning(IDLoc, "dividing zero by zero"); + if (IsMips64) { + if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } else { + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + return false; + } + } + + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) { + Warning(IDLoc, "division by zero"); + if (Signed) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } + + // FIXME: The values for these two BranchTarget variables may be different in + // micromips. These magic numbers need to be removed. + unsigned BranchTargetNoTraps; + unsigned BranchTarget; + + if (UseTraps) { + BranchTarget = IsMips64 ? 12 : 8; + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + } else { + BranchTarget = IsMips64 ? 20 : 16; + BranchTargetNoTraps = 8; + // Branch to the li instruction. + emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, + Instructions); + } + + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + + if (!UseTraps) + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + + if (!Signed) { + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; + } + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, Instructions); + if (IsMips64) { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, Instructions); + emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, Instructions); + } else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, Instructions); + } + + if (UseTraps) + emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, Instructions); + else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, Instructions); + emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, Instructions); + emitII(Mips::BREAK, 0x6, 0, IDLoc, Instructions); + } + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; +} + +bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { if (hasMips32r6() || hasMips64r6()) { Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); return false; @@ -2562,7 +3063,7 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, LoadedOffsetInAT = true; if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2590,33 +3091,15 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, unsigned SllReg = LoadedOffsetInAT ? 
DstReg : ATReg; - MCInst TmpInst; - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(FirstLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(FirstLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(SecondLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(SecondLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::SLL); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createImm(8)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::OR); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(ATReg)); - Instructions.push_back(TmpInst); + emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg, + FirstLbuOffset, IDLoc, Instructions); + + emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc, + Instructions); + + emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, Instructions); + + emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, Instructions); return false; } @@ -2654,7 +3137,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, warnIfNoMacro(IDLoc); if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2677,37 +3160,373 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, RightLoadOffset = LoadedOffsetInAT ? 
3 : (OffsetValue + 3); } - MCInst LeftLoadInst; - LeftLoadInst.setOpcode(Mips::LWL); - LeftLoadInst.addOperand(DstRegOp); - LeftLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - LeftLoadInst.addOperand(MCOperand::createImm(LeftLoadOffset)); - Instructions.push_back(LeftLoadInst); + emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc, + Instructions); - MCInst RightLoadInst; - RightLoadInst.setOpcode(Mips::LWR); - RightLoadInst.addOperand(DstRegOp); - RightLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - RightLoadInst.addOperand(MCOperand::createImm(RightLoadOffset )); - Instructions.push_back(RightLoadInst); + emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset, IDLoc, + Instructions); return false; } +bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + assert (Inst.getNumOperands() == 3 && "Invalid operand count"); + assert (Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned ATReg = Mips::NoRegister; + unsigned FinalDstReg = Mips::NoRegister; + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue); + + unsigned FinalOpcode = Inst.getOpcode(); + + if (DstReg == SrcReg) { + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + FinalDstReg = DstReg; + DstReg = ATReg; + } + + if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Instructions)) { + switch (FinalOpcode) { + default: + llvm_unreachable("unimplemented expansion"); + case (Mips::ADDi): + FinalOpcode = Mips::ADD; + break; + case (Mips::ADDiu): + FinalOpcode = Mips::ADDu; + break; + case (Mips::ANDi): + FinalOpcode = Mips::AND; + break; + case (Mips::NORImm): + FinalOpcode = Mips::NOR; + break; + case (Mips::ORi): + FinalOpcode = Mips::OR; + break; + case (Mips::SLTi): + FinalOpcode = Mips::SLT; + break; + case (Mips::SLTiu): + FinalOpcode = Mips::SLTu; + break; + case (Mips::XORi): + FinalOpcode = Mips::XOR; + break; + } + + if (FinalDstReg == Mips::NoRegister) + emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, Instructions); + else + emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, + Instructions); + return false; + } + return true; +} + +bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (DReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::ROL) { + emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::ROR) { + emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROL: + FirstShift = Mips::SRLV; + SecondShift = Mips::SLLV; + break; + case Mips::ROR: + FirstShift = 
Mips::SLLV; + SecondShift = Mips::SRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (Inst.getOpcode() == Mips::ROLImm) { + uint64_t MaxShift = 32; + uint64_t ShiftValue = ImmValue; + if (ImmValue != 0) + ShiftValue = MaxShift - ImmValue; + emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::RORImm) { + emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + if (ImmValue == 0) { + emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROLImm: + FirstShift = Mips::SLL; + SecondShift = Mips::SRL; + break; + case Mips::RORImm: + FirstShift = Mips::SRL; + SecondShift = Mips::SLL; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips64r2()) { + + if (TmpReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::DROL) { + emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::DROR) { + emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips64()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROL: + FirstShift = Mips::DSRLV; + SecondShift = Mips::DSLLV; + break; + case Mips::DROR: + FirstShift = Mips::DSLLV; + SecondShift = Mips::DSRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + 
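+  // Worked example for the plain-MIPS64 path above (register choices are
+  // illustrative, not taken from the patch): "drol $4, $5, $6" expands to
+  //   dsubu $at, $zero, $6   // negate the rotate amount; DSRLV sees 64 - n
+  //   dsrlv $at, $5, $at     // bits rotated in from the top
+  //   dsllv $4, $5, $6       // bits shifted up by n
+  //   or    $4, $4, $at      // combine the two halves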
return true; +} + +bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm() % 64; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + MCInst TmpInst; + + if (hasMips64r2()) { + + unsigned FinalOpcode = Mips::NOP; + if (ImmValue == 0) + FinalOpcode = Mips::DROTR; + else if (ImmValue % 32 == 0) + FinalOpcode = Mips::DROTR32; + else if ((ImmValue >= 1) && (ImmValue <= 32)) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR32; + else + FinalOpcode = Mips::DROTR; + } else if (ImmValue >= 33) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR; + else + FinalOpcode = Mips::DROTR32; + } + + uint64_t ShiftValue = ImmValue % 32; + if (Inst.getOpcode() == Mips::DROLImm) + ShiftValue = (32 - ImmValue % 32) % 32; + + emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + + return false; + } + + if (hasMips64()) { + + if (ImmValue == 0) { + emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROLImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSLL; + SecondShift = Mips::DSRL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL; + } + break; + case Mips::DRORImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSRL; + SecondShift = Mips::DSLL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL; + } + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst NopInst; - if (hasShortDelaySlot) { - NopInst.setOpcode(Mips::MOVE16_MM); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - } else { - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - } - Instructions.push_back(NopInst); + if (hasShortDelaySlot) + emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, Instructions); + else + emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, Instructions); } void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, @@ -2717,6 +3536,24 @@ void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, Instructions); } +void MipsAsmParser::createCpRestoreMemOp( + bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // If the offset can not fit into 16 bits, we need to expand. + if (!isInt<16>(StackOffset)) { + MCInst MemInst; + MemInst.setOpcode(IsLoad ? 
Mips::LW : Mips::SW); + MemInst.addOperand(MCOperand::createReg(Mips::GP)); + MemInst.addOperand(MCOperand::createReg(Mips::SP)); + MemInst.addOperand(MCOperand::createImm(StackOffset)); + expandMemInst(MemInst, IDLoc, Instructions, IsLoad, true /*HasImmOpnd*/); + return; + } + + emitRRI(IsLoad ? Mips::LW : Mips::SW, Mips::GP, Mips::SP, StackOffset, IDLoc, + Instructions); +} + unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { // As described by the Mips32r2 spec, the registers Rd and Rs for // jalr.hb must be different. @@ -2729,6 +3566,17 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands, + uint64_t ErrorInfo) { + if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) { + SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc(); + if (ErrorLoc == SMLoc()) + return Loc; + return ErrorLoc; + } + return Loc; +} + bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2745,7 +3593,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (processInstruction(Inst, IDLoc, Instructions)) return true; for (unsigned i = 0; i < Instructions.size(); i++) - Out.EmitInstruction(Instructions[i], STI); + Out.EmitInstruction(Instructions[i], getSTI()); return false; } case Match_MissingFeature: @@ -2757,7 +3605,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc(); + ErrorLoc = Operands[ErrorInfo]->getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -2768,6 +3616,58 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "invalid instruction"); case Match_RequiresDifferentSrcAndDst: return Error(IDLoc, "source and destination must be different"); + case Match_Immz: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'"); + case Match_UImm1_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 1-bit unsigned immediate"); + case Match_UImm2_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 2-bit unsigned immediate"); + case Match_UImm2_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 4"); + case Match_UImm3_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 3-bit unsigned immediate"); + case Match_UImm4_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 4-bit unsigned immediate"); + case Match_UImm5_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 5-bit unsigned immediate"); + case Match_UImm5_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 32"); + case Match_UImm5_32: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 32 .. 63"); + case Match_UImm5_33: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 33 .. 64"); + case Match_UImm5_0_Report_UImm6: + // This is used on UImm5 operands that have a corresponding UImm5_32 + // operand to avoid confusing the user. 
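+    // (For instance, when a 0..31 operand and its 32..63 twin both exist,
+    //  the combined range 0..63 is what "6-bit unsigned immediate" conveys.)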
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_UImm5_Lsl2: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected both 7-bit unsigned immediate and multiple of 4"); + case Match_UImm6_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_SImm6: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit signed immediate"); + case Match_UImm7_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 7-bit unsigned immediate"); + case Match_UImm8_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 8-bit unsigned immediate"); + case Match_UImm10_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 10-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); @@ -3264,7 +4164,7 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); // Get the next token. if (Tok.isNot(AsmToken::LParen)) { MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]); - if (Mnemonic.getToken() == "la") { + if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); @@ -3598,12 +4498,15 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. - if (RegNo == Mips::RA) { + if ((isGP64bit() && RegNo == Mips::RA_64) || + (!isGP64bit() && RegNo == Mips::RA)) { Regs.push_back(RegNo); } else { unsigned TmpReg = PrevReg + 1; while (TmpReg <= RegNo) { - if ((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) { + if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || + (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && + isGP64bit())) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } @@ -3615,16 +4518,23 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { RegRange = false; } else { - if ((PrevReg == Mips::NoRegister) && (RegNo != Mips::S0) && - (RegNo != Mips::RA)) { + if ((PrevReg == Mips::NoRegister) && + ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || + (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) { Error(E, "$16 or $31 expected"); return MatchOperand_ParseFail; - } else if (((RegNo < Mips::S0) || (RegNo > Mips::S7)) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA || + (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && + !isGP64bit()) || + ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || + (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && + isGP64bit()))) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || + (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && + isGP64bit()))) { Error(E, "consecutive register numbers expected"); return MatchOperand_ParseFail; } @@ -4152,6 +5062,7 @@ bool MipsAsmParser::parseSetPopDirective() { if (AssemblerOptions.size() == 2) return reportParseError(Loc, ".set pop with no .set push"); + MCSubtargetInfo &STI = copySTI(); AssemblerOptions.pop_back(); setAvailableFeatures( 
ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures())); @@ -4225,6 +5136,7 @@ bool MipsAsmParser::parseSetMips0Directive() { return reportParseError("unexpected token, expected end of statement"); // Reset assembler options to their initial values. + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures())); STI.setFeatureBits(AssemblerOptions.front()->getFeatures()); @@ -4366,6 +5278,14 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) { return true; } +// Used to determine if .cpload, .cprestore, and .cpsetup have any effect. +// In this class, it is only used for .cprestore. +// FIXME: Only keep track of IsPicEnabled in one place, instead of in both +// MipsTargetELFStreamer and MipsAsmParser. +bool MipsAsmParser::isPicAndNotNxxAbi() { + return inPicMode() && !(isABI_N32() || isABI_N64()); +} + bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); @@ -4398,6 +5318,54 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { return false; } +bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { + MCAsmParser &Parser = getParser(); + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + + if (inMips16Mode()) { + reportParseError(".cprestore is not supported in Mips16 mode"); + return false; + } + + // Get the stack offset value. + const MCExpr *StackOffset; + int64_t StackOffsetVal; + if (Parser.parseExpression(StackOffset)) { + reportParseError("expected stack offset value"); + return false; + } + + if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) { + reportParseError("stack offset is not an absolute expression"); + return false; + } + + if (StackOffsetVal < 0) { + Warning(Loc, ".cprestore with negative stack offset has no effect"); + IsCpRestoreSet = false; + } else { + IsCpRestoreSet = true; + CpRestoreOffset = StackOffsetVal; + } + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + // Store the $gp on the stack. + SmallVector<MCInst, 3> StoreInsts; + createCpRestoreMemOp(false /*IsLoad*/, CpRestoreOffset /*StackOffset*/, Loc, + StoreInsts); + + getTargetStreamer().emitDirectiveCpRestore(StoreInsts, CpRestoreOffset); + Parser.Lex(); // Consume the EndOfStatement. 
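+  // A hypothetical use, assuming O32 PIC code (not taken from the patch):
+  //   .cprestore 16   // emits "sw $gp, 16($sp)" at this point; the
+  //                   // directive's contract is that $gp is also reloaded
+  //                   // from that slot after PIC calls, which is handled
+  //                   // during instruction expansion elsewhere in the parser.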
+ return false; +} + bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); unsigned FuncReg; @@ -4427,16 +5395,19 @@ bool MipsAsmParser::parseDirectiveCPSetup() { ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch) { - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - Save = Tok.getIntVal(); - SaveIsReg = false; - Parser.Lex(); - } else { - reportParseError("expected save register or stack offset"); + const MCExpr *OffsetExpr; + int64_t OffsetVal; + SMLoc ExprLoc = getLexer().getLoc(); + + if (Parser.parseExpression(OffsetExpr) || + !OffsetExpr->evaluateAsAbsolute(OffsetVal)) { + reportParseError(ExprLoc, "expected save register or stack offset"); Parser.eatToEndOfStatement(); return false; } + + Save = OffsetVal; + SaveIsReg = false; } else { MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]); if (!SaveOpnd.isGPRAsmReg()) { @@ -4462,11 +5433,20 @@ bool MipsAsmParser::parseDirectiveCPSetup() { } const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr); + CpSaveLocation = Save; + CpSaveLocationIsRegister = SaveIsReg; + getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(), SaveIsReg); return false; } +bool MipsAsmParser::parseDirectiveCPReturn() { + getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation, + CpSaveLocationIsRegister); + return false; +} + bool MipsAsmParser::parseDirectiveNaN() { MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { @@ -4655,6 +5635,9 @@ bool MipsAsmParser::parseDirectiveOption() { StringRef Option = Tok.getIdentifier(); if (Option == "pic0") { + // MipsAsmParser needs to know if the current PIC mode changes. + IsPicEnabled = false; + getTargetStreamer().emitDirectiveOptionPic0(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4666,6 +5649,9 @@ bool MipsAsmParser::parseDirectiveOption() { } if (Option == "pic2") { + // MipsAsmParser needs to know if the current PIC mode changes. 
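+      // (This is the PIC flag that isPicAndNotNxxAbi() reads when deciding
+      //  whether .cprestore applies; see the FIXME near that helper about
+      //  keeping the PIC state in one place.)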
+ IsPicEnabled = true; + getTargetStreamer().emitDirectiveOptionPic2(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4924,6 +5910,8 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpload") return parseDirectiveCpLoad(DirectiveID.getLoc()); + if (IDVal == ".cprestore") + return parseDirectiveCpRestore(DirectiveID.getLoc()); if (IDVal == ".dword") { parseDataDirective(8, DirectiveID.getLoc()); return false; @@ -4974,6 +5962,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnt(*Sym); CurrentFn = Sym; + IsCpRestoreSet = false; return false; } @@ -5002,6 +5991,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnd(SymbolName); CurrentFn = nullptr; + IsCpRestoreSet = false; return false; } @@ -5073,6 +6063,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitFrame(StackReg, FrameSizeVal, ReturnRegOpnd.getGPR32Reg()); + IsCpRestoreSet = false; return false; } @@ -5173,6 +6164,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpsetup") return parseDirectiveCPSetup(); + if (IDVal == ".cpreturn") + return parseDirectiveCPReturn(); + if (IDVal == ".module") return parseDirectiveModule(); diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index a34ba3b..3c1a771 100644 --- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. 
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -241,17 +248,42 @@ static DecodeStatus DecodeMem(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -261,6 +293,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -284,6 +321,11 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -330,6 +372,11 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -340,23 +387,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, uint64_t Address, const void *Decoder); -// Decode the immediate field of an LSA instruction which -// is off by one. -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -830,9 +869,24 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (IsMicroMips) { Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + if (hasMips32r6()) { + DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n"); + // Calling the auto-generated decoder function for microMIPS32R6 + // (and microMIPS64R6) 16-bit instructions. 
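+      // (If this R6 table rejects the halfword, control falls through to the
+      //  generic MicroMips16 table below, and after that to the 32-bit
+      //  tables; only when everything fails is Size set to the minimum
+      //  instruction width so the disassembler can resynchronize.)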
+ Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + } DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n"); - // Calling the auto-generated decoder function. + // Calling the auto-generated decoder function for microMIPS 16-bit + // instructions. Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -847,24 +901,33 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (hasMips32r6()) { DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n"); // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32r632, Instr, Insn, Address, - this, STI); - } else { - DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); - // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } } + + DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } + // This is an invalid instruction. Let the disassembler move forward by the + // minimum instruction size. + Size = 2; return MCDisassembler::Fail; } Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false); - if (Result == MCDisassembler::Fail) + if (Result == MCDisassembler::Fail) { + Size = 4; return MCDisassembler::Fail; + } if (hasCOP3()) { DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); @@ -925,6 +988,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return Result; } + Size = 4; return MCDisassembler::Fail; } @@ -1079,10 +1143,66 @@ static DecodeStatus DecodeMem(MCInst &Inst, Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - if(Inst.getOpcode() == Mips::SC || - Inst.getOpcode() == Mips::SCD){ + if (Inst.getOpcode() == Mips::SC || + Inst.getOpcode() == Mips::SCD) Inst.addOperand(MCOperand::createReg(Reg)); - } + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); + unsigned Reg = fieldFromInstruction(Insn, 16, 5); + unsigned Base = fieldFromInstruction(Insn, 21, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, 
Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1125,11 +1245,28 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = fieldFromInstruction(Insn, 7, 9); + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Hint = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + Inst.addOperand(MCOperand::createImm(Hint)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1142,6 +1279,24 @@ static DecodeStatus DecodeCacheOpR6(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1157,6 +1312,21 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Immediate = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Immediate)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); @@ -1220,8 +1390,11 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst, return MCDisassembler::Fail; break; case Mips::SB16_MM: + case Mips::SB16_MMR6: case Mips::SH16_MM: + case Mips::SH16_MMR6: case Mips::SW16_MM: + case Mips::SW16_MMR6: if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder) == MCDisassembler::Fail) return MCDisassembler::Fail; @@ -1240,14 +1413,17 @@ static DecodeStatus 
DecodeMemMMImm4(MCInst &Inst, Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::SB16_MM: + case Mips::SB16_MMR6: Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 1)); break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 2)); break; } @@ -1291,7 +1467,16 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = SignExtend32<4>(Insn & 0xf); + int Offset; + switch (Inst.getOpcode()) { + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + Offset = fieldFromInstruction(Insn, 4, 4); + break; + default: + Offset = SignExtend32<4>(Insn & 0xf); + break; + } if (DecodeRegListOperand16(Inst, Insn, Address, Decoder) == MCDisassembler::Fail) @@ -1303,6 +1488,27 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE_MM) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1659,6 +1865,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1700,6 +1916,14 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 8 : Value)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -1716,12 +1940,12 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - // We add one to the immediate field as it was encoded as 'imm - 1'. 
- Inst.addOperand(MCOperand::createImm(Insn + 1)); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, + const void *Decoder) { + Value &= ((1 << Bits) - 1); + Inst.addOperand(MCOperand::createImm(Value + Offset)); return MCDisassembler::Success; } @@ -1736,15 +1960,6 @@ static DecodeStatus DecodeInsSize(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - int Size = (int) Insn + 1; - Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size))); - return MCDisassembler::Success; -} - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4)); @@ -1792,15 +2007,21 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, - Mips::S6, Mips::FP}; + Mips::S6, Mips::S7, Mips::FP}; unsigned RegNum; unsigned RegLst = fieldFromInstruction(Insn, 21, 5); + // Empty register lists are not allowed. if (RegLst == 0) return MCDisassembler::Fail; RegNum = RegLst & 0xf; + + // RegLst values 10-15, and 26-31 are reserved. + if (RegNum > 9) + return MCDisassembler::Fail; + for (unsigned i = 0; i < RegNum; i++) Inst.addOperand(MCOperand::createReg(Regs[i])); @@ -1814,7 +2035,16 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; - unsigned RegLst = fieldFromInstruction(Insn, 4, 2); + unsigned RegLst; + switch(Inst.getOpcode()) { + default: + RegLst = fieldFromInstruction(Insn, 4, 2); + break; + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + RegLst = fieldFromInstruction(Insn, 8, 2); + break; + } unsigned RegNum = RegLst & 0x3; for (unsigned i = 0; i <= RegNum; i++) diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index a5637b1..a7b7d2e 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -235,7 +235,9 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { case Mips::SWM32_MM: case Mips::LWM32_MM: case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: opNum = MI->getNumOperands() - 2; break; } diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 713f35c..0e61ea6 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -73,8 +73,6 @@ enum CondCode { const char *MipsFCCToString(Mips::CondCode CC); } // end namespace Mips -class TargetMachine; - class MipsInstPrinter : public MCInstPrinter { public: MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 8e6c9e6..cdcc392 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -23,7 +23,7 @@ static const MCPhysReg Mips64IntRegs[8] = { Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64}; } -const ArrayRef<MCPhysReg> 
MipsABIInfo::GetByValArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -31,7 +31,7 @@ const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { llvm_unreachable("Unhandled ABI"); } -const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -78,7 +78,6 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU, .Case("mips32r3", MipsABIInfo::O32()) .Case("mips32r5", MipsABIInfo::O32()) .Case("mips32r6", MipsABIInfo::O32()) - .Case("mips16", MipsABIInfo::O32()) .Case("mips3", MipsABIInfo::N64()) .Case("mips4", MipsABIInfo::N64()) .Case("mips5", MipsABIInfo::N64()) @@ -107,6 +106,10 @@ unsigned MipsABIInfo::GetNullPtr() const { return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO; } +unsigned MipsABIInfo::GetZeroReg() const { + return AreGprs64bit() ? Mips::ZERO_64 : Mips::ZERO; +} + unsigned MipsABIInfo::GetPtrAdduOp() const { return ArePtrs64bit() ? Mips::DADDu : Mips::ADDu; } @@ -115,6 +118,10 @@ unsigned MipsABIInfo::GetPtrAddiuOp() const { return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu; } +unsigned MipsABIInfo::GetGPRMoveOp() const { + return ArePtrs64bit() ? Mips::OR64 : Mips::OR; +} + unsigned MipsABIInfo::GetEhDataReg(unsigned I) const { static const unsigned EhDataReg[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index 40c5681..ffa2c76 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -47,10 +47,10 @@ public: ABI GetEnumValue() const { return ThisABI; } /// The registers to use for byval arguments. - const ArrayRef<MCPhysReg> GetByValArgRegs() const; + ArrayRef<MCPhysReg> GetByValArgRegs() const; /// The registers to use for the variable argument list. - const ArrayRef<MCPhysReg> GetVarArgRegs() const; + ArrayRef<MCPhysReg> GetVarArgRegs() const; /// Obtain the size of the area allocated by the callee for arguments. /// CallingConv::FastCall affects the value for O32. @@ -67,9 +67,12 @@ public: unsigned GetFramePtr() const; unsigned GetBasePtr() const; unsigned GetNullPtr() const; + unsigned GetZeroReg() const; unsigned GetPtrAdduOp() const; unsigned GetPtrAddiuOp() const; + unsigned GetGPRMoveOp() const; inline bool ArePtrs64bit() const { return IsN64(); } + inline bool AreGprs64bit() const { return IsN32() || IsN64(); } unsigned GetEhDataReg(unsigned I) const; }; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 328e717..e4865e2 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -63,15 +63,19 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // address range. Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 16-bit signed immediate. 
- if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC19_S2: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 19-bit signed immediate. - if (!isIntN(19, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC19 fixup"); + if (!isInt<19>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup"); + return 0; + } break; case Mips::fixup_Mips_26: // So far we are only using this type for jumps. @@ -104,45 +108,57 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 7-bit signed immediate. - if (!isIntN(7, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC7 fixup"); + if (!isInt<7>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC10_S1: Value -= 2; // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 10-bit signed immediate. - if (!isIntN(10, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC10 fixup"); + if (!isInt<10>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC16_S1: Value -= 4; // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 2; // We now check if Value can be encoded as a 16-bit signed immediate. - if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC18_S3: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 8; // We now check if Value can be encoded as a 18-bit signed immediate. - if (!isIntN(18, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC18 fixup"); + if (!isInt<18>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC21_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 21-bit signed immediate. - if (!isIntN(21, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC21 fixup"); + if (!isInt<21>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC26_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 26-bit signed immediate. 
- if (!isIntN(26, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup"); + if (!isInt<26>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup"); + return 0; + } break; } @@ -232,6 +248,18 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +bool MipsAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const { + if (Name == "R_MIPS_NONE") { + MappedKind = (MCFixupKind)Mips::fixup_Mips_NONE; + return true; + } + if (Name == "R_MIPS_32") { + MappedKind = FK_Data_4; + return true; + } + return MCAsmBackend::getFixupKind(Name, MappedKind); +} + const MCFixupKindInfo &MipsAsmBackend:: getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = { @@ -239,6 +267,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 0, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, @@ -304,6 +333,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 16, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index b3d5a49..1c9af92 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -41,6 +41,7 @@ public: void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; + bool getFixupKind(StringRef Name, MCFixupKind &MappedKind) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; unsigned getNumFixupKinds() const override { diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 9b29527..5b9f02b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -68,6 +68,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, unsigned Kind = (unsigned)Fixup.getKind(); switch (Kind) { + case Mips::fixup_Mips_NONE: + return ELF::R_MIPS_NONE; case Mips::fixup_Mips_16: case FK_Data_2: return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16; @@ -325,13 +327,24 @@ static void setMatch(MipsRelocationEntry &Hi, MipsRelocationEntry &Lo) { // matching LO; // - prefer LOs without a pair; // - prefer LOs with higher offset; + +static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) { + const ELFRelocationEntry &A = *AP; + const ELFRelocationEntry &B = *BP; + if (A.Offset != B.Offset) + return B.Offset - A.Offset; + if (B.Type != A.Type) + return A.Type - B.Type; + return 0; +} + void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { if (Relocs.size() < 2) return; - // The default function sorts entries by Offset in descending order. - MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); + // Sorts entries by Offset in descending order. + array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel); // Init MipsRelocs from Relocs. 
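+  // (For in-range offsets, cmpRel sorts descending by Offset and then
+  //  ascending by Type: e.g. entries at offsets {4, 8, 8} come out as
+  //  8, 8, 4, with the two equal-offset entries ordered by Type.)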
std::vector<MipsRelocationEntry> MipsRelocs; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index b45d9cf..e7d687e 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -63,7 +63,7 @@ void MipsELFStreamer::SwitchSection(MCSection *Section, } void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { MCELFStreamer::EmitValueImpl(Value, Size, Loc); Labels.clear(); } diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index af9311f..a241cde 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -60,8 +60,7 @@ public: /// Overriding this function allows us to dismiss all labels that are /// candidates for marking as microMIPS when .word directive is emitted. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; /// Emits all the option records stored up until the point it's called. void EmitMipsOptionRecords(); diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index e601963..3652f4b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -23,8 +23,11 @@ namespace Mips { // in MipsAsmBackend.cpp. // enum Fixups { + // Branch fixups resulting in R_MIPS_NONE. + fixup_Mips_NONE = FirstTargetFixupKind, + // Branch fixups resulting in R_MIPS_16. - fixup_Mips_16 = FirstTargetFixupKind, + fixup_Mips_16, // Pure 32 bit data fixup resulting in - R_MIPS_32. fixup_Mips_32, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index 5d23fcb..d4ccf03 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MipsMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MipsMCAsmInfo(const Triple &TheTriple); - }; +class MipsMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MipsMCAsmInfo(const Triple &TheTriple); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index e36263d..4b030eb 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -190,6 +190,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, else NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips); + // Check whether it is Dsp instruction. + if (NewOpcode == -1) + NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp); + if (NewOpcode != -1) { if (Fixups.size() > N) Fixups.pop_back(); @@ -346,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. 
If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. @@ -745,7 +766,8 @@ getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { // Register is encoded in bits 9-5, offset is encoded in bits 4-0. assert(MI.getOperand(OpNo).isReg() && - MI.getOperand(OpNo).getReg() == Mips::SP && + (MI.getOperand(OpNo).getReg() == Mips::SP || + MI.getOperand(OpNo).getReg() == Mips::SP_64) && "Unexpected base register!"); unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) >> 2; @@ -769,6 +791,19 @@ getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 8-0. + assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI); + + return (OffBits & 0x1FF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -792,6 +827,19 @@ getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 15-0. 
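+  // (Packing sketch with assumed values: base register encoding 5,
+  //  offset 0x1234 -> (5 << 16) | 0x1234 == 0x51234.)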
+ assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI); + + return (OffBits & 0xFFFF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -801,7 +849,9 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, default: break; case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: OpNo = MI.getNumOperands() - 2; break; } @@ -815,15 +865,6 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, return ((OffBits >> 2) & 0x0F); } -unsigned -MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - assert(MI.getOperand(OpNo).isImm()); - unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); - return SizeEncoding - 1; -} - // FIXME: should be called getMSBEncoding // unsigned @@ -838,13 +879,15 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo, return Position + Size - 1; } +template <unsigned Bits, int Offset> unsigned -MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { assert(MI.getOperand(OpNo).isImm()); - // The immediate is encoded as 'immediate - 1'. - return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1; + unsigned Value = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); + Value -= Offset; + return Value; } unsigned diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 911cc2f..fdacd17 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. 
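(Editorial aside, not part of the diff: the new getMemEncodingMMImm9 and getMemEncodingMMImm16 helpers above both place the encoded 5-bit base register in bits 20-16 and truncate the offset into the low bits, 9 bits for mem_mm_9 operands and 16 bits for mem_mm_16. A minimal standalone C++ sketch of the 9-bit packing follows; encodeMemMMImm9 is a hypothetical name for illustration, not an LLVM API.)

#include <cassert>
#include <cstdint>

// Mirrors the diff's (OffBits & 0x1FF) | RegBits, where RegBits is the
// encoded base-register field already shifted into bits 20-16.
uint32_t encodeMemMMImm9(uint32_t BaseRegField, int32_t Offset) {
  return (static_cast<uint32_t>(Offset) & 0x1FF) | (BaseRegField << 16);
}

int main() {
  uint32_t Enc = encodeMemMMImm9(/*BaseRegField=*/29, /*Offset=*/-4);
  assert(((Enc >> 16) & 0x1F) == 29); // base register lands in bits 20-16
  assert((Enc & 0x1FF) == 0x1FC);     // -4 truncated to a 9-bit field
}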
@@ -172,23 +179,27 @@ public: unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - // getLSAImmEncoding - Return binary encoding of LSA immediate. - unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + /// Subtract Offset then encode as a N-bit unsigned integer. + template <unsigned Bits, int Offset> + unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index fd2ed17..e889972 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -51,8 +51,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS MipsMCExprs at the moment. 
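(Editorial aside, not part of the diff: the getUImmWithOffsetEncoding<Bits, Offset> template declared above generalizes the retired getLSAImmEncoding, which hard-coded "immediate - 1". The assembly-level operand is biased down by Offset before being stored in an N-bit field, and a disassembler adds the bias back; for LSA, whose operand is now typed uimm2_plus1 in the .td changes, the operand range 1..4 is stored as 0..3. A standalone C++ sketch of the round trip under those assumptions; both function names are hypothetical.)

#include <cassert>
#include <cstdint>

template <unsigned Bits, int Offset>
uint32_t encodeUImmWithOffset(uint32_t Value) {
  uint32_t Field = Value - Offset; // mirrors "Value -= Offset" in the diff
  assert(Field < (1u << Bits) && "operand out of range for the field");
  return Field;
}

template <unsigned Bits, int Offset>
uint32_t decodeUImmWithOffset(uint32_t Field) {
  return (Field & ((1u << Bits) - 1)) + Offset;
}

int main() {
  assert((encodeUImmWithOffset<2, 1>(4)) == 3); // lsa shift 4 -> field 3
  assert((decodeUImmWithOffset<2, 1>(3)) == 4);
}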
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index e4da2df..e5fa755 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -89,9 +89,15 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() { void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + forbidModuleDirective(); +} void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { } +void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) {} void MipsTargetStreamer::emitDirectiveModuleFP() {} @@ -358,6 +364,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetAsmStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + OS << "\t.cprestore\t" << Offset << "\n"; +} + void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -373,7 +385,13 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, OS << ", "; - OS << Sym.getName() << "\n"; + OS << Sym.getName(); + forbidModuleDirective(); +} + +void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + OS << "\t.cpreturn"; forbidModuleDirective(); } @@ -595,8 +613,9 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHT_REL); + MCSymbol *Sym = Context.getOrCreateSymbol(Name); const MCSymbolRefExpr *ExprRef = - MCSymbolRefExpr::create(Name, MCSymbolRefExpr::VK_None, Context); + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); MCA.registerSection(*Sec); Sec->setAlignment(4); @@ -622,10 +641,25 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; OS.PopSection(); + + // .end also implicitly sets the size. 
+ MCSymbol *CurPCSym = Context.createTempSymbol(); + OS.EmitLabel(CurPCSym); + const MCExpr *Size = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context), + ExprRef, Context); + int64_t AbsSize; + if (!Size->evaluateAsAbsolute(AbsSize, MCA)) + llvm_unreachable("Function size must be evaluatable as absolute"); + Size = MCConstantExpr::create(AbsSize, Context); + static_cast<MCSymbolELF *>(Sym)->setSize(Size); } void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; + + // .ent also acts like an implicit '.type symbol, STT_FUNC' + static_cast<const MCSymbolELF &>(Symbol).setType(ELF::STT_FUNC); } void MipsTargetELFStreamer::emitDirectiveAbiCalls() { @@ -752,6 +786,24 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + // .cprestore offset + // When PIC mode is enabled and the O32 ABI is used, this directive expands + // to: + // sw $gp, offset($sp) + // and adds a corresponding LW after every JAL. + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + if (!Pic || (getABI().IsN32() || getABI().IsN64())) + return; + + for (const MCInst &Inst : StoreInsts) + getStreamer().EmitInstruction(Inst, STI); +} + void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -766,7 +818,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // Either store the old $gp in a register or on the stack if (IsReg) { // move $save, $gpreg - Inst.setOpcode(Mips::DADDu); + Inst.setOpcode(Mips::OR64); Inst.addOperand(MCOperand::createReg(RegOrOffset)); Inst.addOperand(MCOperand::createReg(Mips::GP)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); @@ -810,6 +862,30 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + // The .cpreturn directive only emits anything for the N32 and N64 ABIs, and + // only when PIC is enabled. + if (!Pic || !(getABI().IsN32() || getABI().IsN64())) + return; + + MCInst Inst; + // Restore the old $gp either from a register or from the stack + if (SaveLocationIsRegister) { + Inst.setOpcode(Mips::OR); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(SaveLocation)); + Inst.addOperand(MCOperand::createReg(Mips::ZERO)); + } else { + Inst.setOpcode(Mips::LD); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(Mips::SP)); + Inst.addOperand(MCOperand::createImm(SaveLocation)); + } + getStreamer().EmitInstruction(Inst, STI); + + forbidModuleDirective(); +} + void MipsTargetELFStreamer::emitMipsAbiFlags() { MCAssembler &MCA = getStreamer().getAssembler(); MCContext &Context = MCA.getContext(); diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td index 187a022..400f6eef 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -16,6 +16,64 @@ class MMR6Arch<string opstr> { string BaseOpcode = opstr; } +// Class used for microMIPS32r6 and microMIPS64r6 instructions.
+class MicroMipsR6Inst16 : PredicateControl { + string DecoderNamespace = "MicroMipsR6"; + let InsnPredicates = [HasMicroMips32r6]; +} + +class BC16_FM_MM16R6 { + bits<10> offset; + + bits<16> Inst; + + let Inst{15-10} = 0x33; + let Inst{9-0} = offset; +} + +class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 { + bits<3> rs; + bits<7> offset; + + bits<16> Inst; + + let Inst{15-10} = op; + let Inst{9-7} = rs; + let Inst{6-0} = offset; +} + +class POOL16C_JALRC_FM_MM16R6<bits<5> op> { + bits<5> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = rs; + let Inst{4-0} = op; +} + +class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> { + bits<5> imm; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = imm; + let Inst{4-0} = op; +} + +class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> { + bits<2> rt; + bits<4> addr; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-8} = rt; + let Inst{7-4} = addr; + let Inst{3-0} = funct; +} + class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rd; bits<5> rt; @@ -71,6 +129,64 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { let Inst{15-0} = imm16; } +class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LB32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LBU32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000101; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = addr{8-0}; +} + class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { bits<5> rd; @@ -124,6 +240,69 @@ class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst { let Inst{9-0} = funct; } +class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = 0; + let Inst{15-11} = op; + let Inst{10-6} = 0; + let Inst{5-0} = 0; +} + +class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> { + bits<5> rt; + bits<5> rd; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rd; + let Inst{15-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_RDHWR_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<3> sel; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = 0; + let Inst{13-11} = sel; + let Inst{10} = 0; + let Inst{9-0} = 0b0111000000; +} + +class POOL32A_SYNC_FM_MMR6 { + bits<5> stype; + + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = stype; + let Inst{15-6} = 0b0110101101; + let Inst{5-0} = 0b111100; +} + +class POOL32I_SYNCI_FM_MMR6 { + bits<21> addr; + bits<5> base = addr{20-16}; + 
bits<16> immediate = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = 0b01100; + let Inst{20-16} = base; + let Inst{15-0} = immediate; +} + class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst { bits<5> rs; bits<5> rt; @@ -198,6 +377,78 @@ class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } +class SB32_SH32_STORE_FM_MMR6<bits<6> op> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_FM_MMR6 { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b111111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LOAD_UPPER_IMM_FM_MMR6 { + bits<5> rt; + bits<16> imm16; + + bits<32> Inst; + + let Inst{31-26} = 0b000100; + let Inst{25-21} = rt; + let Inst{20-16} = 0; + let Inst{15-0} = imm16; +} + class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rt; bits<16> offset; @@ -222,12 +473,13 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { let Inst{15-0} = offset; } -class ERET_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> { +class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; let Inst{31-26} = 0x00; let Inst{25-16} = 0x00; - let Inst{15-6} = 0x3cd; + let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } @@ -262,7 +514,8 @@ class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { let Inst{5-0} = 0x0; } -class EIDI_MMR6_ENC<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { +class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt @@ -287,3 +540,323 @@ class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<in let Inst{10} = rotate; let Inst{9-0} = funct; } + +class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-0} = addr{15-0}; +} + +class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt, + bits<3> funct> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + 
bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10} = 0; + let Inst{9-8} = fmt; + let Inst{7-0} = funct; +} + +class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-6} = Cond.Value; + let Inst{5-0} = format; +} + +class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0; +} + +class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0001; +} + +class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0000; +} + +class 
POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = op; +} + +class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> { + bits<4> code_; + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-6} = code_; + let Inst{5-0} = op; +} + +class POOL16A_SUBU16_FM_MMR6 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0b1; +} + +class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst { + bits<5> rt; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0x00; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = funct; + let Inst{5-0} = 0x3c; +} + +class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td index 53bde13..31b5db0 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand<OtherVT> { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -28,6 +35,9 @@ class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>; class AUI_MMR6_ENC : AUI_FM_MMR6; class BALC_MMR6_ENC : BRANCH_OFF26_FM<0b101101>; class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>; +class BC16_MMR6_ENC : BC16_FM_MM16R6; +class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>; +class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>; class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>; class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">; class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011101>; @@ -42,13 +52,19 @@ class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>; class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>; class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>; class 
EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>; -class EI_MMR6_ENC : EIDI_MMR6_ENC<"ei", 0x15d>; -class ERET_MMR6_ENC : ERET_FM_MMR6<"eret">; +class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>; +class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>; +class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>; +class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"deret", 0b1110001101>; class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">; +class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>; class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>; class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>; +class JRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0x3>; +class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>; class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>; class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>; +class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>; class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>; class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>; class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>; @@ -59,15 +75,99 @@ class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>; class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>; class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>; class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>; +class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>; class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>; class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>; class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>; class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>; +class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>; class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>; class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>; class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>; +class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>; +class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>; +class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>; +class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>; +class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>; +class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>; +class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>; +class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>; +class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>; +class LB_MMR6_ENC : LB32_FM_MMR6; +class LBU_MMR6_ENC : LBU32_FM_MMR6; +class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>; +class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>; +class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>; +class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6; +class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">; +class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">; +class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6; +class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">; +class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>; +class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">; class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>; class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>; +class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>; +class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>; +class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>; +class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>; +class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>; +class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>; +class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>; +class CEIL_L_D_MMR6_ENC :
POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>; +class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>; +class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>; +class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>; +class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>; +class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>; +class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>; +class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>; +class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>; +class RSQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.s", 0, 0b00001000>; +class RSQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.d", 1, 0b00001000>; +class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>; +class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>; +class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>; +class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>; +class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>; +class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; +class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; +class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; +class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; + +class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; +class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6; +class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16; +class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6; +class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>; +class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16; +class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16; +class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>; +class LI16_MMR6_ENC : LI_FM_MM16; +class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>; +class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>; +class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6; +class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>; class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, RegisterOperand GPROpnd> @@ -108,6 +208,43 @@ class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm, list<Register> Defs = [RA]; } +/// Floating Point Instructions +class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>; +class 
FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>; +class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>; +class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>; +class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>; +class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>; +class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>; +class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>; +class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>; +class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>; +class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>; +class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>; +class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>; +class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>; +class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>; +class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>; +class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>; +class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>; +class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>; +class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>; +class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>; +class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>; +class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>; +class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>; + +class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>; +class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>; +class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>; +class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>; +class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>; +class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>; +class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>; +class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>; +class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>; +class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>; + //===----------------------------------------------------------------------===// // // Instruction Descriptions @@ -130,11 +267,34 @@ class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd> bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list<Register> Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; + +class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), + !strconcat("bc16", "\t$offset"), [], + II_BC, FrmI>, + MMR6Arch<"bc16">, MicroMipsR6Inst16 { + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let hasDelaySlot = 0; + let AdditionalPredicates = [RelocPIC]; + let Defs = [AT]; +} + +class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm> + : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 0; + let Defs = [AT]; +} +class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">; +class BNEZC16_MMR6_DESC : 
BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">; + class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd>; class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd>; @@ -162,6 +322,35 @@ class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd, class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd>; class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd>; +class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : + CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, + GPROpnd> { + string DecoderMethod = "DecodePrefeOpMM"; +} + +class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9, GPR32Opnd>; +class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9, GPR32Opnd>; + +class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins MemOpnd:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeLoadByte15"; + bit mayLoad = 1; +} +class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd>; +class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd>; + +class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> + : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd> { + let DecoderMethod = "DecodeLoadByte9"; +} +class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd>; +class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd>; + class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rt); @@ -174,10 +363,22 @@ class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd>; class EHB_MMR6_DESC : Barrier<"ehb">; class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd>; +class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd>; class ERET_MMR6_DESC : ER_FT<"eret">; +class DERET_MMR6_DESC : ER_FT<"deret">; class ERETNC_MMR6_DESC : ER_FT<"eretnc">; +class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let isCall = 1; + let hasDelaySlot = 0; + let Defs = [RA]; +} +class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>; + class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, RegisterOperand GPROpnd> : MMR6Arch<opstr> { @@ -200,6 +401,27 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, list<Register> Defs = [AT]; } +class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [], II_JR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isBranch = 1; + let isIndirectBranch = 1; +} +class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>; + +class JRCADDIUSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm", + [], II_JRADDIUSP, FrmR>, + MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isTerminator = 1; + let isBarrier = 1; + let isBranch = 1; + let isIndirectBranch = 1; +} + class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rd); @@ -241,7 +463,7 @@ class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class 
LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { @@ -264,6 +486,18 @@ class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd>; class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd>; +class PAUSE_MMR6_DESC : Barrier<"pause">; +class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel); + string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel"); + list<dag> Pattern = []; + InstrItinClass Itinerary = II_RDHWR; + Format Form = FrmR; +} + +class WAIT_MMR6_DESC : WaitMM<"wait">; +class SSNOP_MMR6_DESC : Barrier<"ssnop">; class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>; class DIV_MMR6_DESC : ArithLogicR<"div", GPR32Opnd>; class DIVU_MMR6_DESC : ArithLogicR<"divu", GPR32Opnd>; @@ -277,13 +511,426 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", simm16, GPR32Opnd>; class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd>; class XORI_MMR6_DESC : ArithLogicI<"xori", simm16, GPR32Opnd>; +class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, + ComplexPattern Addr = addr> : + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), + [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { + let DecoderMethod = "DecodeMem"; + let mayStore = 1; +} +class SW_MMR6_DESC : Store<"sw", GPR32Opnd>; +class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9>; + +class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm> { + dag InOperandList = (ins RO:$rs); + dag OutOperandList = (outs RO:$rt); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs"); + list<dag> Pattern = []; + Format f = FrmR; + string BaseOpcode = instr_asm; + bit hasSideEffects = 0; +} +class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd>; +class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd>; + +/// Floating Point Instructions +class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC, + InstrItinClass Itin, bit isComm, + SDPatternOperator OpNode = null_frag> : HARDFLOAT { + dag OutOperandList = (outs RC:$fd); + dag InOperandList = (ins RC:$ft, RC:$fs); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]; + InstrItinClass Itinerary = Itin; + bit isCommutable = isComm; +} +class FADD_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>; +class FADD_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>; +class FSUB_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>; +class FSUB_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>; +class FMUL_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>; +class FMUL_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>; +class FDIV_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>; +class FDIV_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>; +class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>, HARDFLOAT; +class 
MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>, HARDFLOAT; +class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>, HARDFLOAT; +class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>, HARDFLOAT; + +class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} +class FMOV_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>; +class FMOV_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>; +class FNEG_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>; +class FNEG_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>; + +class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>, HARDFLOAT; +class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>, HARDFLOAT; +class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>, HARDFLOAT; +class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>, HARDFLOAT; + +class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>, HARDFLOAT; +class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>, HARDFLOAT; +class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>, HARDFLOAT; +class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>, HARDFLOAT; + +class CVT_MMR6_DESC_BASE< + string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} + +class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd, + II_CVT>; +class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd, + II_CVT>, FGR_64; +class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd, + II_CVT>, FGR_64; + +multiclass CMP_CC_MMR6<bits<6> format, string Typestr, + RegisterOperand FGROpnd> { + def CMP_AF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_EQ_#NAME : 
POOL32F_CMP_FM< + !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SAF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; +} + +class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; + list<Predicate> EncodingPredicates = [HasStdEnc]; +} + +class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd, + II_ABS, fabs>; +class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd, + II_ABS, fabs>; +class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", 
FGR64Opnd, + FGR64Opnd, II_FLOOR>; +class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd, + AFGR64Opnd, II_FLOOR>; +class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd, + FGR64Opnd, II_CEIL>; +class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd, + AFGR64Opnd, II_CEIL>; +class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd, + FGR64Opnd, II_TRUNC>; +class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>; +class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd, + II_SQRT_D, fsqrt>; +class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; + +class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO> + : Store<opstr, RO>, MMR6Arch<opstr> { + let DecoderMethod = "DecodeMemMMImm16"; +} +class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd>; + +class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins RO:$rt, mem_mm_9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeStoreEvaOpMM"; + bit mayStore = 1; +} +class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd>; +class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd>; +class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd>; +class SHE_MMR6_DESC : 
STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd>; +class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> : + MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins mem_mm_12:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeMemMMImm9"; + bit mayLoad = 1; +} +class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd>; +class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd>; +class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, + MMR6Arch<"addu16">; +class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, + MMR6Arch<"and16">; +class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, + MMR6Arch<"andi16">; +class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16">; +class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, + MMR6Arch<"or16">; +class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, + MMR6Arch<"sll16">; +class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, + MMR6Arch<"srl16">; +class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class LI16_MMR6_DESC : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, + MMR6Arch<"srl16">, MicroMipsR6Inst16, IsAsCheapAsAMove; +class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16">, MMR6Arch<"sdbbp16">, + MicroMipsR6Inst16; +class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; +class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; + +class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins mem:$addr); + string AsmString = "lw\t$rt, $addr"; + let DecoderMethod = "DecodeMemMMImm16"; + let canFoldAsLoad = 1; + let mayLoad = 1; + list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))]; + InstrItinClass Itinerary = II_LW; +} + +class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{ + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$imm16); + string AsmString = "lui\t$rt, $imm16"; + list<dag> Pattern = []; + bit hasSideEffects = 0; + bit isReMaterializable = 1; + InstrItinClass Itinerary = II_LUI; + Format Form = FrmI; +} + +class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins i32imm:$stype); + string AsmString = !strconcat("sync", "\t$stype"); + list<dag> Pattern = [(MipsSync imm:$stype)]; + InstrItinClass Itinerary = NoItinerary; + bit HasSideEffects = 1; +} + +class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> { + let DecoderMethod = "DecodeSynciR6"; +} + +class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins GPR32Opnd:$rd); + string AsmString = !strconcat("rdpgpr", "\t$rt, $rd"); +} + +class SDBBP_MMR6_DESC : MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins uimm20:$code_); + string AsmString = !strconcat("sdbbp", "\t$code_"); + list<dag> Pattern = []; +} + +class LWM16_MMR6_DESC + : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr), + !strconcat("lwm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"lwm16">, MicroMipsR6Inst16 { + let DecoderMethod = 
"DecodeMemMMReglistImm4Lsl2"; + let mayLoad = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SWM16_MMR6_DESC + : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr), + !strconcat("swm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"swm16">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMReglistImm4Lsl2"; + let mayStore = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO, + SDPatternOperator OpNode, InstrItinClass Itin, + Operand MemOpnd> + : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMImm4"; + let mayStore = 1; +} +class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei8, II_SB, mem_mm_4>; +class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei16, II_SH, mem_mm_4_lsl1>; +class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd, + store, II_SW, mem_mm_4_lsl2>; + +class SWSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), + !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>, + MMR6Arch<"sw">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMSPImm5Lsl2"; + let mayStore = 1; +} + //===----------------------------------------------------------------------===// // // Instruction Definitions // //===----------------------------------------------------------------------===// -let DecoderNamespace = "MicroMips32r6" in { +let DecoderNamespace = "MicroMipsR6" in { def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6; def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6; def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6; @@ -298,6 +945,11 @@ def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6; def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6; def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6; def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6; +def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6; +def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC, + ISA_MICROMIPS32R6; def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC, ISA_MICROMIPS32R6; def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC, @@ -320,13 +972,21 @@ def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6; def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6; def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6; def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6; -def ERET_MMR6 : R6MMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6; +def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6; def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC, ISA_MICROMIPS32R6; +def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC, + ISA_MICROMIPS32R6; def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, 
JIALC_MMR6_DESC, ISA_MICROMIPS32R6; def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6; +def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6; +def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC, + ISA_MICROMIPS32R6; def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6; def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6; +def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6; def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6; def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6; def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6; @@ -337,17 +997,211 @@ def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6; def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6; def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6; def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6; +def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6; def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6; def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6; def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC, ISA_MICROMIPS32R6; def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC, ISA_MICROMIPS32R6; +def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6; def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6; def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6; def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6; +def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6; +def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6; +def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC, + ISA_MICROMIPS32R6; +def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC, + ISA_MICROMIPS32R6; +def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6; +def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6; +def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6; +def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6; +def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6; +def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6; +def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6; +def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6; +def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6; +def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6; def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6; def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; +let DecoderMethod = "DecodeMemMMImm16" in { + def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, 
ISA_MICROMIPS32R6; +} +let DecoderMethod = "DecodeMemMMImm9" in { + def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6; +} +/// Floating Point Instructions +def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC, + ISA_MICROMIPS32R6; +defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd>; +defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd>; +def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6; +def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6; +def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_L_D_MMR6 : StdMMR6Rel, 
FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_S_MMR6 : StdMMR6Rel, RSQRT_S_MMR6_ENC, RSQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_D_MMR6 : StdMMR6Rel, RSQRT_D_MMR6_ENC, RSQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6; +def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6; +def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6; +def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6; +def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6; +def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6; +def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6; +def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6; +def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6; +def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC, + ISA_MICROMIPS32R6; +def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC, + ISA_MICROMIPS32R6; +def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC, + ISA_MICROMIPS32R6; +def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6; +def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, 
ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// @@ -357,4 +1211,23 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; //===----------------------------------------------------------------------===// def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6; +def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), + !strconcat("b", "\t$offset")> { + string DecoderNamespace = "MicroMipsR6"; +} +def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"rdhwr $rt, $rs", + (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, + ISA_MICROMIPS32R6; + +//===----------------------------------------------------------------------===// +// +// MicroMips arbitrary patterns that map to one or more instructions +// +//===----------------------------------------------------------------------===// + +def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr), + (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td new file mode 100644 index 0000000..da305a2 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td @@ -0,0 +1,86 @@ +//=- MicroMips64r6InstrFormats.td - Instruction Formats -*- tablegen -* -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes microMIPS64r6 instruction formats. 
+// +//===----------------------------------------------------------------------===// + +class DAUI_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b111100; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> { + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = funct; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> { + bits<5> rt; + bits<5> rs; + bits<5> size; + bits<5> pos; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = size; + let Inst{10-6} = pos; + let Inst{5-0} = funct; +} + +class POOL32S_DALIGN_FM_MMR6 { + bits<5> rs; + bits<5> rt; + bits<5> rd; + bits<3> bp; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = bp; + let Inst{7-6} = 0b00; + let Inst{5-0} = 0b011100; +} + +class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct> + : MMR6Arch<instr_asm> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rd; + let Inst{20-16} = rs; + let Inst{15-11} = rt; + let Inst{10-9} = 0b00; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td new file mode 100644 index 0000000..ec1aef8 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -0,0 +1,119 @@ +//=- MicroMips64r6InstrInfo.td - Instruction Information -*- tablegen -*- -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips64r6 instructions. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction Encodings +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_ENC : DAUI_FM_MMR6; +class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>; +class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>; +class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>; +class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>; +class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>; +class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6; +class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>; +class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>; +class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>; +class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>; + +//===----------------------------------------------------------------------===// +// +// Instruction Descriptions +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); + list<dag> Pattern = []; +} +class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd>; + +class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $imm"); + string Constraints = "$rs = $rt"; +} +class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd>; +class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd>; + +class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd, + Operand SizeOpnd, SDPatternOperator Op = null_frag> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size"); + list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))]; + InstrItinClass Itinerary = II_EXT; + Format Form = FrmR; + string BaseOpcode = instr_asm; +} +// TODO: Add 'pos + size' constraint check to dext* instructions +// DEXT: 0 < pos + size <= 63 +// DEXTM, DEXTU: 32 < pos + size <= 64 +class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5, + uimm5_plus1, MipsExt>; +class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5, + uimm5_plus33, MipsExt>; +class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32, + uimm5_plus1, MipsExt>; + +class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + Operand ImmOpnd> : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp"); + list<dag> Pattern = []; +} + +class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>; + +class DDIV_MM64R6_DESC : ArithLogicR<"ddiv", GPR32Opnd>; +class DMOD_MM64R6_DESC : ArithLogicR<"dmod", GPR32Opnd>; +class DDIVU_MM64R6_DESC : ArithLogicR<"ddivu", GPR32Opnd>; +class DMODU_MM64R6_DESC : ArithLogicR<"dmodu", GPR32Opnd>; + 
+//===----------------------------------------------------------------------===// +// +// Instruction Definitions +// +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "MicroMipsR6" in { + def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6; + def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6; + def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6; + def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC, + ISA_MICROMIPS64R6; + def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC, + ISA_MICROMIPS64R6; + def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC, + ISA_MICROMIPS64R6; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td new file mode 100644 index 0000000..f11c09a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -0,0 +1,244 @@ +//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class MMDSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; + string BaseOpcode = opstr; + string Arch = "mmdsp"; + let DecoderNamespace = "MicroMips"; +} + +class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; +} + +class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10-0} = op; +} + +class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-0} = op; +} + +class 
POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<3> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-13} = sa; + let Inst{12-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11} = 0b0; + let Inst{10-0} = op; +} + +class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> imm; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = imm; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} + +class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<7> mask; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-14} = mask; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<10> imm; + + let Inst{31-26} = 0; + let Inst{25-16} = imm; + let Inst{15-11} = rd; + let Inst{10} = 0; + let Inst{9-0} = op; +} + +class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<8> imm; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-13} = imm; + let Inst{12} = 0; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<6> shift; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-22} = 0b0000; + let Inst{21-16} = shift; + let Inst{15-14} = ac; + let Inst{13-10} = 0b0000; + let Inst{9-0} = op; +} + +class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = 0b00000; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td new 
file mode 100644 index 0000000..b342e23 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -0,0 +1,528 @@ +//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips DSP instructions. +// +//===----------------------------------------------------------------------===// + +// Instruction encoding. +class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>; +class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>; +class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>; +class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>; +class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>; +class ADDQH_W_MMR2_ENC: POOL32A_3R_FMT<"addqh.w", 0b00010001101>; +class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>; +class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>; +class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>; +class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>; +class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>; +class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>; +class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>; +class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>; +class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>; +class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>; +class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>; +class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>; +class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>; +class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>; +class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>; +class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>; +class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>; +class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>; +class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>; +class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>; +class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>; +class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>; +class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>; +class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>; +class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>; +class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>; +class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>; +class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>; +class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>; +class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>; +class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>; +class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>; +class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>; +class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>; +class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>; +class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>; +class SHRA_R_QB_MMR2_ENC : 
POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>; +class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>; +class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>; +class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>; +class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>; +class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>; +class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>; +class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>; +class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>; +class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>; +class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>; +class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>; +class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>; +class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>; +class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>; +class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>; +class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>; +class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>; +class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>; +class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>; +class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>; +class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>; +class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>; +class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>; +class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>; +class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>; +class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>; +class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>; +class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>; +class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>; +class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>; +class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>; +class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>; +class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>; +class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>; +class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>; +class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>; +class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>; +class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>; +class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>; +class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>; +class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>; +class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>; +class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>; +class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>; +class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>; +class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>; +class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>; +class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>; +class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>; +class 
DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>;
+class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>;
+class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>;
+class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>;
+class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>;
+class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>;
+class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>;
+class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>;
+class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>;
+class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>;
+class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>;
+class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>;
+class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
+class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
+class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
+class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+    : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mtlo", 0b11000001>;
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
+class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>;
+class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>;
+class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
+class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
+class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
+class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+
+// Instruction desc.
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand ROD, + RegisterOperand ROS = ROD> { + dag OutOperandList = (outs ROD:$rt); + dag InOperandList = (ins ROS:$rs); + string AsmString = !strconcat(opstr, "\t$rt, $rs"); + list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))]; + InstrItinClass Itinerary = itin; +} +class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>; + +class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + SDPatternOperator ImmPat, InstrItinClass itin, + RegisterOperand RO, Operand ImmOpnd> { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, ImmOpnd:$sa); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); + list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))]; + InstrItinClass Itinerary = itin; + bit hasSideEffects = 1; +} +class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>, + Defs<[DSPOutFlag22]>; +class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>, + Defs<[DSPOutFlag22]>; +class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; +class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, 
uimm4>; +class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>; +class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; + +class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand RO> { + dag OutOperandList = (outs RO:$rd); + dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs"); + list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))]; + InstrItinClass Itinerary = itin; +} +class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>, + Defs<[DSPOutFlag22]>; +class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>; +class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; +class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; +class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; +class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; +class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; + +class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs"); + InstrItinClass Itinerary = itin; +} +class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm"); + InstrItinClass Itinerary = itin; +} + +class EXTP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTPDP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPDPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTR_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_R_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_RS_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_S_H_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; +class 
EXTRV_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_R_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_RS_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_S_H_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; + +class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + +class RADDU_W_QB_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins DSPROpnd:$rs); + string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))]; + InstrItinClass Itinerary = NoItinerary; + string BaseOpcode = "raddu.w.qb"; +} + +class RDDSP_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$mask); + string AsmString = !strconcat("rddsp", "\t$rt, $mask"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPL_QB_MM_DESC { + dag OutOperandList = (outs DSPROpnd:$rt); + dag InOperandList = (ins uimm16:$imm); + string AsmString = !strconcat("repl.qb", "\t$rt, $imm"); + list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph, + NoItinerary, DSPROpnd, + GPR32Opnd>; +class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb, + NoItinerary, DSPROpnd, + GPR32Opnd>; + +class WRDSP_MM_DESC { + dag OutOperandList = (outs); + dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask); + string AsmString = !strconcat("wrdsp", "\t$rt, $mask"); + list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)]; + InstrItinClass Itinerary = NoItinerary; +} + +// Instruction defs. 
+// microMIPS DSP Rev 1 +def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC; +def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC; +def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC; +def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC; +def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC; +def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC; +def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC; +def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC; +def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC; +def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC; +def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC; +def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC; +def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC; +def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC; +def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC; +def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC; +def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC; +def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC; +def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC; +def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC; +def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC; +def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC; +def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC; +def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC; +def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC; +def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC; +def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC; +def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC; +def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC; +def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC; +def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC; +def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC; +def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC; +def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC; +def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC; +def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC; +def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC; +def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC, + PRECEQU_PH_QBLA_MM_DESC; +def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC; +def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC, + PRECEQU_PH_QBRA_MM_DESC; +def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC; +def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC; +def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC; +def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC; +def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC; +def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC; +def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC; +def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC; +def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC; +def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC; +def EXTPDPV_MM : DspMMRel, 
EXTPDPV_MM_ENC, EXTPDPV_MM_DESC; +def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC; +def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC; +def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC; +def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC; +def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC; +def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC; +def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC; +def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC; +def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC; +def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC; +def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC; +def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC; +def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC; +def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC; +def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC; +def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; +def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; +def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC; +def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC; +def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC; +def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC; +def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC; +def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC; +def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC; +def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC; +def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC; +def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC; +def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC; +def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC; +def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC; +// microMIPS DSP Rev 2 +def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, + ISA_DSPR2; +def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def 
ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC, + ISA_DSPR2; +def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2; +def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2; +def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2; +def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2; +def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC, + ISA_DSPR2; +def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC, + ISA_DSPR2; +def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC, + PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC, + PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2; + +// Instruction alias. 
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td index 004b0d5..756e6c9 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td @@ -37,23 +37,14 @@ def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM_MM<1>; -def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>, +def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>, BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6; -def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>, +def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; - -def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, - ROUND_W_FM_MM<0, 0x6c>; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, - ROUND_W_FM_MM<0, 0x2c>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ROUND_W_FM_MM<0, 0xec>; -def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, - ROUND_W_FM_MM<0, 0xac>; -def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, - fsqrt>, ROUND_W_FM_MM<0, 0x28>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, ROUND_W_FM_MM<1, 0x6c>; @@ -61,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; @@ -146,3 +137,14 @@ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, MADDS_FM_MM<0x2a>; } + +let AdditionalPredicates = [InMicroMips] in { + def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, + II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>; + def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>; + def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, + ROUND_W_FM_MM<0, 0x6c>; + def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, + fsqrt>, ROUND_W_FM_MM<0, 0x28>; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td index 560afa4..b736367 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td @@ -389,6 +389,22 @@ class LW_FM_MM<bits<6> op> : MMArch { let Inst{15-0} = addr{15-0}; } +class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; 
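// Worked illustration (using the parameters LBE_MM passes to this class
// later in this patch: op = 0x18, fmt = 0x6, funct = 0x4) of how the full
// word packs, once the remaining field assignments below are applied:
//   Inst{31-26} = 0b011000, Inst{25-21} = rt, Inst{20-16} = base,
//   Inst{15-12} = 0b0110, Inst{11-9} = 0b100, Inst{8-0} = offset
// This leaves only a 9-bit offset for these EVA forms, versus the 12 bits
// available to the mem_mm_12-based encodings elsewhere in this file.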
+ let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class LWL_FM_MM<bits<4> funct> { bits<5> rt; bits<21> addr; @@ -402,6 +418,22 @@ class LWL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = type; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class CMov_F_I_FM_MM<bits<7> func> : MMArch { bits<5> rd; bits<5> rs; @@ -655,6 +687,22 @@ class LL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class LLE_FM_MM<bits<4> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = funct; + let Inst{11-9} = 0x6; + let Inst{8-0} = offset; +} + class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch { bits<5> ft; bits<5> fs; @@ -895,7 +943,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch { let Inst{11-0} = addr{11-0}; } -class LWM_FM_MM16<bits<4> funct> : MMArch { +class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl { bits<2> rt; bits<4> addr; @@ -922,6 +970,37 @@ class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch { let Inst{11-0} = offset; } +class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0xA; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch { + bits<5> index; + bits<5> base; + bits<5> hint; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = hint; + let Inst{10-9} = 0x0; + let Inst{8-0} = funct; +} + class BARRIER_FM_MM<bits<5> op> : MMArch { bits<32> Inst; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td index 3939384..99f0f44 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td @@ -13,11 +13,6 @@ def simm12 : Operand<i32> { let DecoderMethod = "DecodeSimm12"; } -def uimm5_lsl2 : Operand<OtherVT> { - let EncoderMethod = "getUImm5Lsl2Encoding"; - let DecoderMethod = "DecodeUImm5lsl2"; -} - def uimm6_lsl2 : Operand<i32> { let EncoderMethod = "getUImm6Lsl2Encoding"; let DecoderMethod = "DecodeUImm6Lsl2"; @@ -30,6 +25,7 @@ def simm9_addiusp : Operand<i32> { def uimm3_shift : Operand<i32> { let EncoderMethod = "getUImm3Mod8Encoding"; + let DecoderMethod = "DecodePOOL16BEncodedField"; } def simm3_lsa2 : Operand<i32> { @@ -105,6 +101,14 @@ def mem_mm_gp_imm7_lsl2 : Operand<i32> { let EncoderMethod = "getMemEncodingMMGPImm7Lsl2"; } +def mem_mm_9 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops GPR32, simm9); + let EncoderMethod = "getMemEncodingMMImm9"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def mem_mm_12 : Operand<i32> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops GPR32, simm12); @@ -113,6 +117,14 @@ def mem_mm_12 : Operand<i32> { let OperandType = "OPERAND_MEMORY"; } +def mem_mm_16 : Operand<i32> { + let PrintMethod = "printMemOperand"; 
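// Like mem_mm_9 above, mem_mm_16 pairs a GPR32 base register with a signed
// immediate of the width its name advertises and names the matching custom
// encoder; only the immediate operand and the EncoderMethod differ between
// the mem_mm_9, mem_mm_12 and mem_mm_16 variants.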
+ let MIOperandInfo = (ops GPR32, simm16); + let EncoderMethod = "getMemEncodingMMImm16"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def MipsMemUimm4AsmOperand : AsmOperandClass { let Name = "MemOffsetUimm4"; let SuperClasses = [MipsMemAsmOperand]; @@ -166,7 +178,7 @@ def simm23_lsl2 : Operand<i32> { class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 0; @@ -251,6 +263,13 @@ class LLBaseMM<string opstr, RegisterOperand RO> : let mayLoad = 1; } +class LLEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayLoad = 1; +} + class SCBaseMM<string opstr, RegisterOperand RO> : InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { @@ -259,6 +278,14 @@ class SCBaseMM<string opstr, RegisterOperand RO> : let Constraints = "$rt = $dst"; } +class SCEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayStore = 1; + let Constraints = "$rt = $dst"; +} + class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, InstrItinClass Itin = NoItinerary> : InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), @@ -392,7 +419,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> : // 16-bit Jump and Link (Call) class JumpLinkRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [(MipsJmpLink RO:$rs)], IIBranch, FrmR> { + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -401,7 +428,7 @@ class JumpLinkRegMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Reg class JumpRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JR, FrmR> { let hasDelaySlot = 1; let isBranch = 1; let isIndirectBranch = 1; @@ -410,7 +437,7 @@ class JumpRegMM16<string opstr, RegisterOperand RO> : // Base class for JRADDIUSP instruction. 
class JumpRAddiuStackMM16 : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm", - [], IIBranch, FrmR> { + [], II_JRADDIUSP, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -420,7 +447,7 @@ class JumpRAddiuStackMM16 : // 16-bit Jump and Link (Call) - Short Delay Slot class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JALRS, FrmR> { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -429,7 +456,7 @@ class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Register Compact - No delay slot class JumpRegCMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JRC, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -444,7 +471,7 @@ class BrkSdbbp16MM<string opstr> : class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -455,18 +482,18 @@ class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : let isCall = 1, hasDelaySlot = 1, Defs = [RA] in { class JumpLinkMM<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [], IIBranch, FrmJ, opstr> { + [], II_JALS, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTargetMM"; } class JumpLinkRegMM<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALRS, FrmR>; class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>; + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>; } class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, @@ -475,6 +502,10 @@ class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index), !strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>; +class PrefetchIndexed<string opstr> : + InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint), + !strconcat(opstr, "\t$hint, ${index}(${base})"), [], NoItinerary, FrmOther>; + class AddImmUPC<string opstr, RegisterOperand RO> : InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm), !strconcat(opstr, "\t$rs, $imm"), [], NoItinerary, FrmR>; @@ -543,7 +574,7 @@ class LoadMultMM16<string opstr, class UncondBranchMM16<string opstr> : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat(opstr, "\t$offset"), - [], IIBranch, FrmI> { + [], II_B, FrmI> { let isBranch = 1; let isTerminator = 1; let isBarrier = 1; @@ -553,21 +584,24 @@ class UncondBranchMM16<string opstr> : } def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, - ARITH_FM_MM16<0>; -def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, - ARITH_FM_MM16<1>; -def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>; + ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, - LOGIC_FM_MM16<0x2>; -def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, - LOGIC_FM_MM16<0x3>; -def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, - 
LOGIC_FM_MM16<0x1>; -def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>; + LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6; +def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>, + ISA_MICROMIPS_NOT_32R6_64R6; +def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>, + ISA_MICROMIPS_NOT_32R6_64R6; +def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>, + ISA_MICROMIPS_NOT_32R6_64R6; def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, - SHIFT_FM_MM16<0>; + SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, - SHIFT_FM_MM16<1>; + SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; + +def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; +def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6; def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU, mem_mm_4>, LOAD_STORE_FM_MM16<0x02>; def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU, @@ -597,7 +631,8 @@ def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>; def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16; def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16, IsAsCheapAsAMove; -def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>; +def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>, + ISA_MICROMIPS32_NOT_MIPS32R6; def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>; def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>; def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>; @@ -607,8 +642,18 @@ def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>, def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>, BEQNEZ_FM_MM16<0x2b>; def B16_MM : UncondBranchMM16<"b16">, B16_FM; -def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>; -def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>; +def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>, + ISA_MICROMIPS_NOT_32R6_64R6; +def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>, + ISA_MICROMIPS_NOT_32R6_64R6; + +let DecoderNamespace = "MicroMips" in { + /// Load and Store Instructions - multiple + def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>, + ISA_MICROMIPS32_NOT_MIPS32R6; + def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>, + ISA_MICROMIPS32_NOT_MIPS32R6; +} class WaitMM<string opstr> : InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [], @@ -701,6 +746,18 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>; } + let DecoderMethod = "DecodeMemMMImm9" in { + def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>; + def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>; + def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>; + def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>; + def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>; + def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>; + def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>; + def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9gpr>, + POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>; + } + def LWXS_MM : 
LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>; def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>; @@ -714,12 +771,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { LWL_FM_MM<0x8>; def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>, LWL_FM_MM<0x9>; + let DecoderMethod = "DecodeMemMMImm9" in { + def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>; + def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>; + def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>; + def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6; + } /// Load and Store Instructions - multiple def SWM32_MM : StoreMultMM<"swm32">, LWM_FM_MM<0xd>; def LWM32_MM : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>; - def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>; - def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>; /// Load and Store Pair Instructions def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>; @@ -777,11 +842,11 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { SEB_FM_MM<0x0ec>, ISA_MIPS32R2; /// Word Swap Bytes Within Halfwords - def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>, - ISA_MIPS32R2; - - def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, - EXT_FM_MM<0x2c>; + def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, + SEB_FM_MM<0x1ec>, ISA_MIPS32R2; + // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction + def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, + MipsExt>, EXT_FM_MM<0x2c>; def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM_MM<0x0c>; @@ -854,12 +919,22 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>; def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>; + def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>; + def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>; + let DecoderMethod = "DecodeCacheOpMM" in { def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12>, CACHE_PREF_FM_MM<0x08, 0x6>; def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12>, CACHE_PREF_FM_MM<0x18, 0x2>; } + + let DecoderMethod = "DecodePrefeOpMM" in { + def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x2>; + def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x3>; + } def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>; def EHB_MM : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>; def PAUSE_MM : MMRel, Barrier<"pause">, BARRIER_FM_MM<0x5>; @@ -870,7 +945,13 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>; def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM; - def RDHWR_MM : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM; + + def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>; +} + +let DecoderNamespace = "MicroMips" in { + def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>, + RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6; } let Predicates = [InMicroMips] in { @@ -928,7 +1009,7 @@ class UncondBranchMMPseudo<string opstr> : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), !strconcat(opstr, "\t$offset")>; - def B_MM_Pseudo : UncondBranchMMPseudo<"b">; 
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS; def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>; def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>; @@ -937,4 +1018,17 @@ class UncondBranchMMPseudo<string opstr> : let Predicates = [InMicroMips] in { def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"teq $rs, $rt", + (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tge $rs, $rt", + (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tlt $rs, $rt", + (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tltu $rs, $rt", + (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tne $rs, $rt", + (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; } diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td index dbb5f7b..35352b6 100644 --- a/contrib/llvm/lib/Target/Mips/Mips.td +++ b/contrib/llvm/lib/Target/Mips/Mips.td @@ -154,9 +154,14 @@ def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">; def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true", "Mips DSP-R2 ASE", [FeatureDSP]>; +def FeatureDSPR3 + : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE", + [ FeatureDSP, FeatureDSPR2 ]>; def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">; +def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">; + def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true", "microMips mode">; @@ -164,10 +169,19 @@ def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips", "true", "Octeon cnMIPS Support", [FeatureMips64r2]>; +def FeatureUseTCCInDIV : SubtargetFeature< + "use-tcc-in-div", + "UseTCCInDIV", "false", + "Force the assembler to use trapping">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// +def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl", + "MipsSubtarget::CPU::P5600", + "The P5600 Processor", [FeatureMips32r5]>; + class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, MipsGenericItineraries, Features>; @@ -187,12 +201,11 @@ def : Proc<"mips64r2", [FeatureMips64r2]>; def : Proc<"mips64r3", [FeatureMips64r3]>; def : Proc<"mips64r5", [FeatureMips64r5]>; def : Proc<"mips64r6", [FeatureMips64r6]>; -def : Proc<"mips16", [FeatureMips16]>; def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>; +def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>; def MipsAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; - let MnemonicContainsDot = 1; } def MipsAsmParserVariant : AsmParserVariant { diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index 46cc99c..26426c0 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -39,7 +39,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const Mips16InstrInfo &TII = *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + uint64_t StackSize = MFI->getStackSize(); // No need to allocate space on the stack. @@ -107,7 +111,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); // // Registers RA, S0,S1 are the callee saved registers and they diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 893fc7c..b2bc7e7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -40,26 +40,17 @@ namespace { const MipsTargetMachine &TM; }; - class InlineAsmHelper { - LLVMContext &C; - BasicBlock *BB; - public: - InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) : - C(C_), BB(BB_) { - } - - void Out(StringRef AsmString) { - std::vector<llvm::Type *> AsmArgTypes; - std::vector<llvm::Value*> AsmArgs; - - llvm::FunctionType *AsmFTy = llvm::FunctionType::get(Type::getVoidTy(C), - AsmArgTypes, false); - llvm::InlineAsm *IA = llvm::InlineAsm::get(AsmFTy, AsmString, "", true, - /* IsAlignStack */ false, - llvm::InlineAsm::AD_ATT); - CallInst::Create(IA, AsmArgs, "", BB); - } - }; + static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) { + std::vector<llvm::Type *> AsmArgTypes; + std::vector<llvm::Value *> AsmArgs; + + llvm::FunctionType *AsmFTy = + llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false); + llvm::InlineAsm *IA = + llvm::InlineAsm::get(AsmFTy, AsmText, "", true, + /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT); + CallInst::Create(IA, AsmArgs, "", BB); + } char Mips16HardFloat::ID = 0; } @@ -182,7 +173,7 @@ static bool needsFPReturnHelper(Function &F) { return whichFPReturnVariant(RetType) != NoFPRet; } -static bool needsFPReturnHelper(const FunctionType &FT) { +static bool needsFPReturnHelper(FunctionType &FT) { Type* RetType = FT.getReturnType(); return whichFPReturnVariant(RetType) != NoFPRet; } @@ -195,63 +186,72 @@ static bool needsFPHelperFromSig(Function &F) { // We swap between FP and Integer registers to allow Mips16 and Mips32 to // interoperate // -static void swapFPIntParams(FPParamVariant PV, Module *M, InlineAsmHelper &IAH, - bool LE, bool ToFP) { - //LLVMContext &Context = M->getContext(); - std::string MI = ToFP? "mtc1 ": "mfc1 "; +static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE, + bool ToFP) { + std::string MI = ToFP ? 
"mtc1 ": "mfc1 "; + std::string AsmText; + switch (PV) { case FSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; break; + case FFSig: - IAH.Out(MI +"$$4,$$f12"); - IAH.Out(MI + "$$5,$$f14"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f14\n"; break; + case FDSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; if (LE) { - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } break; + case DDSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DFSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } - IAH.Out(MI + "$$6,$$f14"); + AsmText += MI + "$$6, $$f14\n"; break; + case NoSig: - return; + break; } + + return AsmText; } // @@ -282,68 +282,77 @@ static void assureFPCallStub(Function &F, Module *M, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); - IAH.Out(".set reorder"); FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType()); FPParamVariant PV = whichFPParamVariantNeeded(F); - swapFPIntParams(PV, M, IAH, LE, true); + + std::string AsmText; + AsmText += ".set reorder\n"; + AsmText += swapFPIntParams(PV, M, LE, true); if (RV != NoFPRet) { - IAH.Out("move $$18, $$31"); - IAH.Out("jal " + Name); + AsmText += "move $$18, $$31\n"; + AsmText += "jal " + Name + "\n"; } else { - IAH.Out("lui $$25,%hi(" + Name + ")"); - IAH.Out("addiu $$25,$$25,%lo(" + Name + ")" ); + AsmText += "lui $$25, %hi(" + Name + ")\n"; + AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n"; } + switch (RV) { case FRet: - IAH.Out("mfc1 $$2,$$f0"); + AsmText += "mfc1 $$2, $$f0\n"; break; + case DRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case CFRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } break; + case CDRet: if (LE) { - IAH.Out("mfc1 $$4,$$f2"); - IAH.Out("mfc1 $$5,$$f3"); - IAH.Out("mfc1 $$2,$$f0"); - 
IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$4, $$f2\n"; + AsmText += "mfc1 $$5, $$f3\n"; + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$5,$$f2"); - IAH.Out("mfc1 $$4,$$f3"); - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$5, $$f2\n"; + AsmText += "mfc1 $$4, $$f3\n"; + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case NoFPRet: break; } + if (RV != NoFPRet) - IAH.Out("jr $$18"); + AsmText += "jr $$18\n"; else - IAH.Out("jr $$25"); + AsmText += "jr $$25\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(Context, BB); } // // Functions that are llvm intrinsics and don't need helpers. // -static const char *IntrinsicInline[] = { +static const char *const IntrinsicInline[] = { "fabs", "fabsf", "llvm.ceil.f32", "llvm.ceil.f64", "llvm.copysign.f32", "llvm.copysign.f64", @@ -395,7 +404,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Type *T = RVal->getType(); FPReturnVariant RV = whichFPReturnVariant(T); if (RV == NoFPRet) continue; - static const char* Helper[NoFPRet] = { + static const char *const Helper[NoFPRet] = { "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc", "__mips16_ret_dc" }; @@ -419,11 +428,11 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, CallInst::Create(F, Params, "", &Inst ); } else if (const CallInst *CI = dyn_cast<CallInst>(I)) { const Value* V = CI->getCalledValue(); - const Type* T = nullptr; + Type* T = nullptr; if (V) T = V->getType(); - const PointerType *PFT=nullptr; + PointerType *PFT = nullptr; if (T) PFT = dyn_cast<PointerType>(T); - const FunctionType *FT=nullptr; + FunctionType *FT = nullptr; if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType()); Function *F_ = CI->getCalledFunction(); if (FT && needsFPReturnHelper(*FT) && @@ -469,20 +478,21 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); + + std::string AsmText; if (PicMode) { - IAH.Out(".set noreorder"); - IAH.Out(".cpload $$25"); - IAH.Out(".set reorder"); - IAH.Out(".reloc 0,R_MIPS_NONE," + Name); - IAH.Out("la $$25," + LocalName); - } - else { - IAH.Out("la $$25," + Name); - } - swapFPIntParams(PV, M, IAH, LE, false); - IAH.Out("jr $$25"); - IAH.Out(LocalName + " = " + Name); + AsmText += ".set noreorder\n"; + AsmText += ".cpload $$25\n"; + AsmText += ".set reorder\n"; + AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n"; + AsmText += "la $$25, " + LocalName + "\n"; + } else + AsmText += "la $$25, " + Name + "\n"; + AsmText += swapFPIntParams(PV, M, LE, false); + AsmText += "jr $$25\n"; + AsmText += LocalName + " = " + Name + "\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(FStub->getContext(), BB); } @@ -535,7 +545,7 @@ bool Mips16HardFloat::runOnModule(Module &M) { FPParamVariant V = whichFPParamVariantNeeded(*F); if (V != NoSig) { Modified = true; - createFPFnStub(F, &M, V, TM); + createFPFnStub(&*F, &M, V, TM); } } return Modified; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index bce2c1e..5a1c2c67 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -73,7 +73,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = 
MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 3522cbb..e748325 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -530,8 +530,7 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const { // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -592,8 +591,7 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -657,8 +655,7 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index a49572e..da8ada4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -196,7 +196,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB, void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); const BitVector Reserved = RI.getReservedRegs(MF); @@ -263,7 +263,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Reg1, unsigned Reg2) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; // // li reg1, constant // move reg2, sp @@ -446,7 +446,7 @@ const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const { void Mips16InstrInfo::BuildAddiuSpImm (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm); } diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td index 10fff03..dad6ea4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td @@ -530,19 +530,19 @@ class MayStore { // Purpose: Add Immediate Unsigned Word (2-Operand, Extended) // To add a constant to a 32-bit integer. 
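// IIM16Alu, which replaces the generic IIAlu throughout the defs below, is
// an instruction itinerary class, i.e. a scheduling key only. For example,
//   def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>;
// selects the MIPS16 ALU scheduling model for the instruction; the
// itinerary has no effect on its encoding or on instruction selection.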
// -def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>; +def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>; -def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let AddedComplexity = 5; } -def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let isCodeGenOnly = 1; } def AddiuRxRyOffMemX16: - FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>; + FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>; // @@ -550,7 +550,7 @@ def AddiuRxRyOffMemX16: // Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended) // To add a constant to the program counter. // -def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; +def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>; // // Format: ADDIU sp, immediate MIPS16e @@ -558,14 +558,14 @@ def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; // To add a constant to the stack pointer. // def AddiuSpImm16 - : FI816_SP_ins<0b011, "addiu", IIAlu> { + : FI816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; let AddedComplexity = 5; } def AddiuSpImmX16 - : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> { + : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; } @@ -576,14 +576,14 @@ def AddiuSpImmX16 // To add 32-bit integers. // -def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>; +def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>; // // Format: AND rx, ry MIPS16e // Purpose: AND // To do a bitwise logical AND. -def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; +def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>; // @@ -591,7 +591,7 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; // Purpose: Branch on Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // @@ -599,7 +599,7 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // Purpose: Branch on Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // // Format: B offset MIPS16e @@ -607,27 +607,27 @@ def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // To do an unconditional PC-relative branch. // -def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16; +def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16; // Format: B offset MIPS16e // Purpose: Unconditional Branch // To do an unconditional PC-relative branch. // -def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; +def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. 
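// As with BEQZ above, each branch comes in a 16-bit form and an
// "(Extended)" X16 form: the FEXT_* format classes model the MIPS16e
// EXTEND-prefixed 32-bit encoding, which widens the immediate field of the
// underlying 16-bit instruction (here, a larger PC-relative offset).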
// -def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // @@ -641,11 +641,11 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>; // Purpose: Branch on T Equal to Zero (Extended) // To test special register T then do a PC-relative conditional branch. // -def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } @@ -669,11 +669,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">, // To test special register T then do a PC-relative conditional branch. // -def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 { +def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 { +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 { let Uses = [T8]; } @@ -695,7 +695,7 @@ def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">, // Purpose: Compare // To compare the contents of two GPRs. // -def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { +def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> { let Defs = [T8]; } @@ -704,7 +704,7 @@ def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { // Purpose: Compare Immediate // To compare a constant with the contents of a GPR. // -def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -713,7 +713,7 @@ def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Compare Immediate (Extended) // To compare a constant with the contents of a GPR. // -def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -723,7 +723,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Divide Word // To divide 32-bit signed integers. // -def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { +def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> { let Defs = [HI0, LO0]; } @@ -732,7 +732,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { // Purpose: Divide Unsigned Word // To divide 32-bit unsigned integers. // -def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { +def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> { let Defs = [HI0, LO0]; } // @@ -742,13 +742,13 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { // region and preserve the current ISA. // -def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> { +def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> { let hasDelaySlot = 0; // not true, but we add the nop for now let isCall=1; let Defs = [RA]; } -def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { +def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 { let hasDelaySlot = 0; // not true, but we add the nop for now let isBranch=1; let Defs = [RA]; @@ -761,7 +761,7 @@ def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { // address register. 
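// The flag block on JrRa16 below reads as a unit: isBranch and
// isIndirectBranch describe the control flow to the branch analyses,
// isTerminator and isBarrier tell block layout that nothing falls through,
// and hasDelaySlot asks the delay-slot filler to run; the compact
// JrcRa16/JrcRx16 variants drop hasDelaySlot.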
// -def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { +def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let hasDelaySlot = 1; @@ -769,14 +769,14 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { let isBarrier=1; } -def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> { +def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; let isBarrier=1; } -def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> { +def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; @@ -825,16 +825,16 @@ def LhuRxRyOffMemX16: // Purpose: Load Immediate // To load a constant into a GPR. // -def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>; +def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>; // // Format: LI rx, immediate MIPS16e // Purpose: Load Immediate (Extended) // To load a constant into a GPR. // -def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; +def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>; -def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> { +def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> { let isCodeGenOnly = 1; } @@ -863,21 +863,21 @@ def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad; // Purpose: Move // To move the contents of a GPR to a GPR. // -def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>; +def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>; // // Format: MOVE ry, r32 MIPS16e //Purpose: Move // To move the contents of a GPR to a GPR. // -def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>; +def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>; // // Format: MFHI rx MIPS16e // Purpose: Move From HI Register // To copy the special purpose HI register to a GPR. // -def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { +def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> { let Uses = [HI0]; let hasSideEffects = 0; } @@ -887,7 +887,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { // Purpose: Move From LO Register // To copy the special purpose LO register to a GPR. // -def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { +def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> { let Uses = [LO0]; let hasSideEffects = 0; } @@ -895,13 +895,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { // // Pseudo Instruction for mult // -def MultRxRy16: FMULT16_ins<"mult", IIAlu> { +def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; } -def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { +def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -912,7 +912,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { // Purpose: Multiply Word // To multiply 32-bit signed integers. // -def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { +def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -923,7 +923,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { // Purpose: Multiply Unsigned Word // To multiply 32-bit unsigned integers. // -def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { +def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -934,21 +934,21 @@ def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { // Purpose: Negate // To negate an integer value. 
// -def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>; +def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIM16Alu>; // // Format: NOT rx, ry MIPS16e // Purpose: Not // To complement an integer value // -def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>; +def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>; // // Format: OR rx, ry MIPS16e // Purpose: Or // To do a bitwise logical OR. // -def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>; +def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>; // // Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize} @@ -1012,7 +1012,7 @@ def SbRxRyOffMemX16: // Sign-extend least significant byte in register rx. // def SebRx16 - : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>; + : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>; // // Format: SEH rx MIPS16e @@ -1020,7 +1020,7 @@ def SebRx16 // Sign-extend least significant word in register rx. // def SehRx16 - : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>; + : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>; // // The Sel(T) instructions are pseudos @@ -1149,21 +1149,21 @@ def ShRxRyOffMemX16: // Purpose: Shift Word Left Logical (Extended) // To execute a left-shift of a word by a fixed number of bits-0 to 31 bits. // -def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; +def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>; // // Format: SLLV ry, rx MIPS16e // Purpose: Shift Word Left Logical Variable // To execute a left-shift of a word by a variable number of bits. // -def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>; +def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>; // Format: SLTI rx, immediate MIPS16e // Purpose: Set on Less Than Immediate // To record the result of a less-than comparison with a constant. // // -def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1173,7 +1173,7 @@ def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1184,7 +1184,7 @@ def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">; // To record the result of a less-than comparison with a constant. // // -def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } @@ -1194,7 +1194,7 @@ def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } // @@ -1209,7 +1209,7 @@ def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">; // Purpose: Set on Less Than // To record the result of a less-than comparison. // -def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{ +def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{ let Defs = [T8]; } @@ -1219,7 +1219,7 @@ def SltCCRxRy16: FCCRR16_ins<"slt">; // Purpose: Set on Less Than Unsigned // To record the result of an unsigned less-than comparison. // -def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{ +def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{ let Defs = [T8]; } @@ -1236,7 +1236,7 @@ def SltuCCRxRy16: FCCRR16_ins<"sltu">; // To execute an arithmetic right-shift of a word by a variable // number of bits. 
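// A quick worked example of the arithmetic/logical split among the shift
// defs below: shifting 0xFFFFFFF0 right by 2, SRAV sign-extends and yields
// 0xFFFFFFFC, while SRLV shifts in zeros and yields 0x3FFFFFFC.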
// -def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; +def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>; // @@ -1245,7 +1245,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; // To execute an arithmetic right-shift of a word by a fixed // number of bits-1 to 8 bits. // -def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; +def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>; // @@ -1254,7 +1254,7 @@ def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; // To execute a logical right-shift of a word by a variable // number of bits. // -def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; +def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>; // @@ -1263,14 +1263,14 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; // To execute a logical right-shift of a word by a fixed // number of bits-1 to 31 bits. // -def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>; +def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>; // // Format: SUBU rz, rx, ry MIPS16e // Purpose: Subtract Unsigned Word // To subtract 32-bit integers // -def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>; +def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>; // // Format: SW ry, offset(rx) MIPS16e @@ -1294,7 +1294,7 @@ def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins // Purpose: Xor // To do a bitwise logical XOR. // -def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>; +def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>; class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [InMips16Mode]; @@ -1380,7 +1380,7 @@ def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> { let isCall=1, hasDelaySlot=0 in def JumpLinkReg16: FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs), - "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> { + "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> { let Defs = [RA]; } diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index d6ab8a6..82d2c8e 100644 --- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -186,54 +186,56 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr, RegisterOperand FGROpnd>{ - def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, - CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, - CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, - CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, - CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + let AdditionalPredicates = [NotInMicroMips] in { + def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, ISA_MIPS32R6, HARDFLOAT; - def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, - CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, - CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, - CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, - 
ISA_MIPS32R6, HARDFLOAT; - def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, - CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, - CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, - CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, - CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, - CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, - CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, - CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, - CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, - CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, ISA_MIPS32R6, HARDFLOAT; + def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + } } //===----------------------------------------------------------------------===// @@ 
-557,7 +559,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); list<dag> Pattern = []; - string DecoderMethod = "DecodeCacheOpR6"; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; } class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; @@ -595,7 +597,7 @@ class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { dag OutOperandList = (outs GPROpnd:$rt); @@ -685,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>; @@ -702,39 +706,51 @@ def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6; def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; -def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; -def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6; def MUHU : R6MMR6Rel, 
MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; +let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; +} def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -743,7 +759,9 @@ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// +let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6; +} def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -752,84 +770,78 @@ def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// -// f32 comparisons supported via another comparison -def : MipsPat<(setone f32:$lhs, f32:$rhs), - (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f32:$lhs, f32:$rhs), - (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; - -// f64 comparisons supported via another comparison -def : 
MipsPat<(setone f64:$lhs, f64:$rhs), - (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f64:$lhs, f64:$rhs), - (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +// comparisons supported via another comparison +multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> { +def : MipsPat<(setone VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seto VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(setune VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seteq VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setgt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setge VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setlt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setle VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setne VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +} + +defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6; +defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6; // i32 selects +multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp, + Instruction SLTiOp, Instruction SLTiuOp, + Instruction SELEQZOp, Instruction SELNEZOp, + SDPatternOperator imm_type, ValueType Opg> { +// reg, immz +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>; + +// reg, immZExt16[_64] +def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; +def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; + +// reg, immSExt16Plus1 +def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>; +def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>; + +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz), + (SELEQZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (setne 
RC:$cond, immz)), RC:$t, immz), + (SELNEZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f), + (SELNEZOp RC:$f, RC:$cond)>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f), + (SELEQZOp RC:$f, RC:$cond)>; +} + +defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ, + immZExt16, i32>, ISA_MIPS32R6; + def : MipsPat<(select i32:$cond, i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, + (OR (SELNEZ i32:$t, i32:$cond), + (SELEQZ i32:$f, i32:$cond))>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t, - i32:$f), - (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), - i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; - def : MipsPat<(select i32:$cond, i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), - (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; + (SELNEZ i32:$t, i32:$cond)>, + ISA_MIPS32R6; def : MipsPat<(select i32:$cond, immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), - (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; + (SELEQZ i32:$f, i32:$cond)>, + ISA_MIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td index f917eca..cbdcdd7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -16,10 +16,6 @@ //===----------------------------------------------------------------------===// // Unsigned Operand -def uimm5_64 : Operand<i64> { - let PrintMethod = "printUnsignedImm"; -} - def uimm16_64 : Operand<i64> { let PrintMethod = "printUnsignedImm"; } @@ -276,12 +272,20 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>; let isCodeGenOnly = 1 in def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM; -def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>; -def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>; -def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>; +let AdditionalPredicates = [NotInMicroMips] in { + // TODO: Add 'pos + size' constraint check to dext* instructions + // DEXT: 0 < pos + size <= 63 + // DEXTM, DEXTU: 32 < pos + size <= 64 + def DEXT : 
ExtBase<"dext", GPR64Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<3>; + def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, MipsExt>, + EXT_FM<1>; + def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1, + MipsExt>, EXT_FM<2>; +} def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>; -def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>; +def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>; let isCodeGenOnly = 1, rs = 0, shamt = 0 in { @@ -341,11 +345,11 @@ class SetCC64_I<string opstr, PatFrag cond_op>: } class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op, - RegisterOperand RO, bits<64> shift = 1> : - InstSE<(outs), (ins RO:$rs, uimm5_64:$p, opnd:$offset), + RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> : + InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset), !strconcat(opstr, "\t$rs, $p, $offset"), [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)), - bb:$offset)], IIBranch, FrmI, opstr> { + bb:$offset)], II_BBIT, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -363,14 +367,17 @@ def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>, ADD_FM<0x1c, 0x28>; // Branch on Bit Clear /+32 -def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd>, BBIT_FM<0x32>; -def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, 0x100000000>, +def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x32>; +def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x36>; // Branch on Bit Set /+32 -def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd>, BBIT_FM<0x3a>; -def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, 0x100000000>, - BBIT_FM<0x3e>; +def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x3a>; +def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x3e>; // Multiply Doubleword to GPR let Defs = [HI0, LO0, P0, P1, P2] in @@ -544,10 +551,25 @@ def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>; } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>; +def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", + (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, + GPR_64; +def : MipsInstAlias<"move $dst, $src", (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, GPR_64; def : MipsInstAlias<"daddu $rs, $rt, $imm", @@ -617,6 +639,38 @@ def : MipsInstAlias<"syncw", (SYNC 0x4), 0>; def : MipsInstAlias<"syncws", (SYNC 0x5), 0>; } +// cnMIPS Aliases. 
+ +// bbit* with $p 32-63 converted to bbit*32 with $p 0-31 +def : MipsInstAlias<"bbit0 $rs, $p, $offset", + (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"bbit1 $rs, $p, $offset", + (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; + +// exts with $pos 32-63 is converted to exts32 with $pos 0-31 +def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"exts $rt, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + +// cins with $pos 32-63 is converted to cins32 with $pos 0-31 +def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"cins $rt, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// @@ -625,3 +679,8 @@ class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), !strconcat(instr_asm, "\t$rt, $imm64")> ; def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>; + +def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr), + "dla\t$rt, $addr">; +def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64), + "dla\t$rt, $imm64">; diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td index 6b546e8..6f34dbe 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td @@ -62,7 +62,7 @@ class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; -class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>; +class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1>; class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; @@ -81,10 +81,12 @@ class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; // //===----------------------------------------------------------------------===// -def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; -def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; -def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; -def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; +let AdditionalPredicates = [NotInMicroMips] in { + def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; + def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; + def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; + def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; +} def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index fdba064..9575293 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ 
b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -169,12 +169,12 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { @@ -202,7 +202,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { llvm_unreachable("Pseudo opcode found in EmitInstruction()"); MCInst TmpInst0; - MCInstLowering.Lower(I, TmpInst0); + MCInstLowering.Lower(&*I, TmpInst0); EmitToStreamer(*OutStreamer, TmpInst0); } while ((++I != E) && I->isInsideBundle()); // Delay slot check } @@ -405,7 +405,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // If this is a landing pad, it isn't a fall through. If it has no preds, // then nothing falls through to it. - if (MBB->isLandingPad() || MBB->pred_empty()) + if (MBB->isEHPad() || MBB->pred_empty()) return false; // If there isn't exactly one predecessor, it can't be a fall through. @@ -559,7 +559,6 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); bool closeP = false; @@ -608,7 +607,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, } case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" + O << getDataLayout().getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); if (MO.getOffset()) O << "+" << MO.getOffset(); @@ -1009,7 +1008,7 @@ void MipsAsmPrinter::EmitFPCallStub( // // Mov $18, $31 - EmitInstrRegRegReg(*STI, Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO); + EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO); EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true); diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp index b808129..d82063e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp @@ -29,22 +29,16 @@ static bool isF128SoftLibCall(const char *CallSym) { "powl", "rintl", "sinl", "sqrtl", "truncl"}; - const char *const *End = LibCalls + array_lengthof(LibCalls); - // Check that LibCalls is sorted alphabetically. - MipsTargetLowering::LTStr Comp; - -#ifndef NDEBUG - for (const char *const *I = LibCalls; I < End - 1; ++I) - assert(Comp(*I, *(I + 1))); -#endif - - return std::binary_search(LibCalls, End, CallSym, Comp); + auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; + assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp)); + return std::binary_search(std::begin(LibCalls), std::end(LibCalls), + CallSym, Comp); } /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
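The isF128SoftLibCall hunk above replaces a hand-rolled NDEBUG sortedness loop with std::is_sorted plus std::binary_search over the static name table. A self-contained sketch of that idiom, with a trimmed illustrative table rather than the full LibCalls list:

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iterator>

static bool isInSortedTable(const char *Sym) {
  // Must stay alphabetically sorted or binary_search's result is undefined.
  static const char *const Table[] = {"powl", "rintl", "truncl"};
  auto Cmp = [](const char *A, const char *B) { return strcmp(A, B) < 0; };
  assert(std::is_sorted(std::begin(Table), std::end(Table), Cmp));
  return std::binary_search(std::begin(Table), std::end(Table), Sym, Cmp);
}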
-static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { if (Ty->isFP128Ty()) return true; diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td index 93e1908..0b4b778 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td @@ -427,3 +427,28 @@ def CSR_Mips16RetHelper : CalleeSavedRegs<(add V0, V1, FP, (sequence "A%u", 3, 0), (sequence "S%u", 7, 0), (sequence "D%u", 15, 10))>; + +def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT)>; + +def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT, LO0, HI0)>; + +def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "V%u_64", 1, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + RA_64, FP_64, GP_64, AT_64)>; + +def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + (sequence "V%u_64", 1, 0), + RA_64, FP_64, GP_64, AT_64, + LO0_64, HI0_64)>; diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 96553d2..ea8c587 100644 --- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -560,7 +560,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -598,12 +598,12 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) if (*I == NextBB) @@ -656,11 +656,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + computeBlockSize(&*I); // Compute block offsets. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -879,7 +879,7 @@ MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -967,8 +967,8 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; - if (++NextBlock == MF->end()) { + MachineFunction::const_iterator NextBlock = ++Water->getIterator(); + if (NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; } else { @@ -1261,7 +1261,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1371,8 +1371,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. // we first see if a longer form of the instruction could have reached @@ -1389,7 +1388,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1406,7 +1405,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1431,9 +1430,7 @@ // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); - - + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1645,7 +1642,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. 
to BB#" diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td index b5d52ce..f959bd4 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td @@ -7,10 +7,30 @@ // //===----------------------------------------------------------------------===// +class DspMMRel; + +def Dsp2MicroMips : InstrMapping { + let FilterClass = "DspMMRel"; + // Instructions with the same BaseOpcode and isNVStore values form a row. + let RowFields = ["BaseOpcode"]; + // Instructions with the same predicate sense form a column. + let ColFields = ["Arch"]; + // The key column is the unpredicated instructions. + let KeyCol = ["dsp"]; + // Value columns are PredSense=true and PredSense=false + let ValueCols = [["dsp"], ["mmdsp"]]; +} + def HasDSP : Predicate<"Subtarget->hasDSP()">, AssemblerPredicate<"FeatureDSP">; def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, AssemblerPredicate<"FeatureDSPR2">; +def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">, + AssemblerPredicate<"FeatureDSPR3">; + +class ISA_DSPR2 { + list<Predicate> InsnPredicates = [HasDSPR2]; +} // Fields. class Field6<bits<6> val> { @@ -20,14 +40,22 @@ class Field6<bits<6> val> { def SPECIAL3_OPCODE : Field6<0b011111>; def REGIMM_OPCODE : Field6<0b000001>; -class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasDSP]; +class DSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + string BaseOpcode = opstr; + string Arch = "dsp"; } class PseudoDSP<dag outs, dag ins, list<dag> pattern, - InstrItinClass itin = IIPseudo>: - MipsPseudo<outs, ins, pattern, itin> { - let Predicates = [HasDSP]; + InstrItinClass itin = IIPseudo> + : MipsPseudo<outs, ins, pattern, itin>, PredicateControl { + let InsnPredicates = [HasDSP]; +} + +class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; } // ADDU.QB sub-class format. 
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td index d268384..da6f174 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// // ImmLeaf +def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>; def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>; def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>; def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>; +def immZExt7 : ImmLeaf<i32, [{return isUInt<7>(Imm);}]>; def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>; def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>; def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>; @@ -263,6 +265,7 @@ class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -273,6 +276,7 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -293,6 +297,7 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -304,6 +309,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -314,6 +320,7 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -323,6 +330,7 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $imm"); list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -332,17 +340,19 @@ class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, SDPatternOperator ImmPat, InstrItinClass itin, - RegisterOperand RO> { + RegisterOperand RO, Operand ImmOpnd> { dag OutOperandList = (outs RO:$rd); - dag InOperandList = (ins RO:$rt, uimm16:$rs_sa); + dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa); string AsmString = 
!strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))]; InstrItinClass Itinerary = itin; bit hasSideEffects = 1; + string BaseOpcode = instr_asm; } class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -353,6 +363,7 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -363,17 +374,19 @@ class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - SDPatternOperator ImmOp, InstrItinClass itin> { + Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> { dag OutOperandList = (outs GPR32Opnd:$rt); - dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src); + dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src); string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); list<dag> Pattern = [(set GPR32Opnd:$rt, - (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))]; + (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -382,6 +395,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -390,15 +404,17 @@ class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { dag OutOperandList = (outs ACC64DSPOpnd:$ac); - dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin); + dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin); string AsmString = !strconcat(instr_asm, "\t$ac, $shift"); list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -408,6 +424,7 @@ class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -417,6 +434,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -426,15 +444,17 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString 
= !strconcat(instr_asm, "\t$rd, $mask"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass itin> { dag OutOperandList = (outs); - dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask); + dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask); string AsmString = !strconcat(instr_asm, "\t$rs, $mask"); list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -444,6 +464,7 @@ class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -454,6 +475,7 @@ class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))]; InstrItinClass Itinerary = itin; bit isCommutable = 1; + string BaseOpcode = instr_asm; } class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -465,6 +487,7 @@ class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; InstrItinClass Itinerary = itin; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, @@ -474,6 +497,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $ac"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> { @@ -481,6 +505,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> : @@ -506,6 +531,7 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } //===----------------------------------------------------------------------===// @@ -639,7 +665,7 @@ class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra", // Shift class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm3>, Defs<[DSPOutFlag22]>; class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, @@ -647,13 +673,13 @@ class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, Defs<[DSPOutFlag22]>; class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm4>, Defs<[DSPOutFlag22]>; class 
SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, @@ -661,7 +687,8 @@ class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, Defs<[DSPOutFlag22]>; class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph, - immZExt4, NoItinerary, DSPROpnd>, + immZExt4, NoItinerary, DSPROpnd, + uimm4>, Defs<[DSPOutFlag22]>; class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, @@ -669,19 +696,21 @@ class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, Defs<[DSPOutFlag22]>; class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph, - immZExt4, NoItinerary, DSPROpnd>; + immZExt4, NoItinerary, DSPROpnd, + uimm4>; class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w, - immZExt5, NoItinerary, GPR32Opnd>, + immZExt5, NoItinerary, GPR32Opnd, + uimm5>, Defs<[DSPOutFlag22]>; class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, @@ -689,7 +718,8 @@ class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, Defs<[DSPOutFlag22]>; class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w, - immZExt5, NoItinerary, GPR32Opnd>; + immZExt5, NoItinerary, GPR32Opnd, + uimm5>; class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; @@ -1039,32 +1069,33 @@ class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w", // Shift class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb, - immZExt3, NoItinerary, DSPROpnd>; + immZExt3, NoItinerary, DSPROpnd, + uimm3>; class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; // Misc -class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5, +class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5, NoItinerary>; -class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2, +class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2, NoItinerary>; -class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5, - NoItinerary>; +class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5, + immZExt5, NoItinerary>; // Pseudos. def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, @@ -1072,80 +1103,80 @@ def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, // Instruction defs. 
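The SHLL_QB_R2_DESC_BASE changes above give each fixed-shift DSP instruction an assembler operand sized to its lane width: uimm3 for .qb (bytes), uimm4 for .ph (halfwords), uimm5 for .w (words), matching the immZExt3/immZExt4/immZExt5 selection predicates. Those ImmLeaf checks reduce to plain range tests; a stand-alone sketch (not LLVM's isUInt, though it behaves the same for small N):

#include <cstdint>

constexpr bool isUIntN(unsigned N, uint64_t X) {
  return X < (UINT64_C(1) << N); // only valid for N < 64
}
constexpr bool isValidQBShift(uint64_t I) { return isUIntN(3, I); } // shll.qb: 0-7
constexpr bool isValidPHShift(uint64_t I) { return isUIntN(4, I); } // shll.ph: 0-15
constexpr bool isValidWShift(uint64_t I)  { return isUIntN(5, I); } // shll_s.w: 0-31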
// MIPS DSP Rev 1 -def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC; -def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC; -def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC; -def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC; -def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC; -def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; -def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC; -def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; -def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC; -def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC; -def ADDSC : ADDSC_ENC, ADDSC_DESC; -def ADDWC : ADDWC_ENC, ADDWC_DESC; +def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC; +def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC; +def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC; +def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC; +def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; +def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; +def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC; +def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC; +def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC; +def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC; def MODSUB : MODSUB_ENC, MODSUB_DESC; -def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC; -def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; -def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC; -def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; -def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; -def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; -def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; -def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; -def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; -def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; -def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; -def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; -def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; -def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; -def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; -def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; -def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; -def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC; -def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC; -def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC; -def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC; -def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC; -def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC; -def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC; -def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; -def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC; -def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC; -def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC; -def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; -def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC; -def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC; -def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC; -def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC; -def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; -def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; -def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; -def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; -def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; +def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC; +def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; +def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC; +def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; +def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, 
PRECRQ_PH_W_DESC; +def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; +def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; +def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; +def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; +def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; +def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; +def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; +def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; +def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; +def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; +def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; +def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; +def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC; +def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC; +def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC; +def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC; +def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC; +def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC; +def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC; +def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; +def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC; +def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC; +def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC; +def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; +def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC; +def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC; +def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC; +def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC; +def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; +def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; +def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; -def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; -def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; -def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; -def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; -def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; -def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; -def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; -def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; -def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC; -def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC; -def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC; -def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC; -def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC; -def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, 
MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; +def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; +def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; +def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; +def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; +def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; +def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; +def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC; +def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC; +def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC; +def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC; +def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC; +def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC; def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC; def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC; def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC; @@ -1156,87 +1187,85 @@ def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC; def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC; def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC; def BITREV : BITREV_ENC, BITREV_DESC; -def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC; -def REPL_QB : REPL_QB_ENC, REPL_QB_DESC; -def REPL_PH : REPL_PH_ENC, REPL_PH_DESC; -def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; -def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; -def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; -def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC; +def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC; +def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC; +def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC; +def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC; +def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC; +def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; -def INSV : INSV_ENC, INSV_DESC; -def EXTP : EXTP_ENC, EXTP_DESC; -def EXTPV : EXTPV_ENC, EXTPV_DESC; -def EXTPDP : EXTPDP_ENC, EXTPDP_DESC; -def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC; -def EXTR_W : EXTR_W_ENC, EXTR_W_DESC; -def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC; -def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC; -def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC; -def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC; -def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; -def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC; -def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC; -def SHILO : SHILO_ENC, SHILO_DESC; -def SHILOV : SHILOV_ENC, SHILOV_DESC; -def MTHLIP : MTHLIP_ENC, MTHLIP_DESC; -def RDDSP : RDDSP_ENC, RDDSP_DESC; -def WRDSP : WRDSP_ENC, WRDSP_DESC; +def INSV : DspMMRel, INSV_ENC, INSV_DESC; +def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; +def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC; +def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC; +def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC; +def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC; +def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC; +def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC; +def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC; +def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC; +def EXTRV_RS_W : DspMMRel, 
EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; +def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC; +def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC; +def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC; +def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC; +def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC; +def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC; +let AdditionalPredicates = [NotInMicroMips] in { + def WRDSP : WRDSP_ENC, WRDSP_DESC; +} // MIPS DSP Rev 2 -let Predicates = [HasDSPR2] in { - -def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC; -def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC; -def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC; -def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC; -def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC; -def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC; -def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC; -def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC; -def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC; -def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC; -def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC; -def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC; -def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC; -def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC; -def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC; -def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC; -def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC; -def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC; -def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC; -def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC; -def MUL_PH : MUL_PH_ENC, MUL_PH_DESC; -def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC; -def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC; -def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC; -def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC; -def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC; -def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC; -def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC; -def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC; -def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC; -def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC; -def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC; -def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC; -def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC; -def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; -def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; -def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; -def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC; -def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC; -def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC; -def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC; -def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC; -def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC; -def APPEND : APPEND_ENC, APPEND_DESC; -def BALIGN : BALIGN_ENC, BALIGN_DESC; -def PREPEND : PREPEND_ENC, PREPEND_DESC; - -} +def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2; +def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2; +def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2; +def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2; +def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def ADDQH_PH : 
DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2; +def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2; +def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2; +def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2; +def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2; +def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2; +def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2; +def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2; +def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2; +def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2; +def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2; +def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2; +def APPEND : APPEND_ENC, APPEND_DESC, ISA_DSPR2; +def BALIGN : BALIGN_ENC, BALIGN_DESC, ISA_DSPR2; +def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2; // Pseudos. let isPseudo = 1, isCodeGenOnly = 1 in { @@ -1415,3 +1444,8 @@ let AddedComplexity = 20 in { def : IndexedLoadPat<sextloadi16, LHX>; def : IndexedLoadPat<load, LWX>; } + +// Instruction alias. 
+let AdditionalPredicates = [NotInMicroMips] in { + def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 4faeb33..8313d90 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -355,9 +355,8 @@ void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB, for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) if (*SI != &SuccBB) - for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(), - LE = (*SI)->livein_end(); LI != LE; ++LI) - Uses.set(*LI); + for (const auto &LI : (*SI)->liveins()) + Uses.set(LI.PhysReg); } bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) { @@ -431,7 +430,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { (*MI.memoperands_begin())->getPseudoValue()) { if (isa<FixedStackPseudoSourceValue>(PSV)) return false; - return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack(); + return !PSV->isConstant(nullptr) && !PSV->isStack(); } return true; @@ -598,7 +597,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Get instruction with delay slot. MachineBasicBlock::instr_iterator DSI(I); - if (InMicroMipsMode && TII->GetInstSizeInBytes(std::next(DSI)) == 2 && + if (InMicroMipsMode && TII->GetInstSizeInBytes(&*std::next(DSI)) == 2 && DSI->isCall()) { // If instruction in delay slot is 16b change opcode to // corresponding instruction with short delay slot. @@ -713,8 +712,9 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const { if (DisableBackwardSearch) return false; - RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo()); - MemDefsUses MemDU(*TM.getDataLayout(), MBB.getParent()->getFrameInfo()); + auto *Fn = MBB.getParent(); + RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo()); + MemDefsUses MemDU(Fn->getDataLayout(), Fn->getFrameInfo()); ReverseIter Filler; RegDU.init(*Slot); @@ -763,6 +763,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { BB2BrMap BrMap; std::unique_ptr<InspectMemInstr> IM; Iter Filler; + auto *Fn = MBB.getParent(); // Iterate over SuccBB's predecessor list. for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(), @@ -772,15 +773,15 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { // Do not allow moving instructions which have unallocatable register operands // across basic block boundaries. - RegDU.setUnallocatableRegs(*MBB.getParent()); + RegDU.setUnallocatableRegs(*Fn); // Only allow moving loads from stack or constants if any of the SuccBB's // predecessors have multiple successors. if (HasMultipleSuccs) { IM.reset(new LoadFromStackOrConst()); } else { - const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo(); - IM.reset(new MemDefsUses(*TM.getDataLayout(), MFI)); + const MachineFrameInfo *MFI = Fn->getFrameInfo(); + IM.reset(new MemDefsUses(Fn->getDataLayout(), MFI)); } if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot, @@ -800,12 +801,13 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const { // Select the successor with the largest edge weight.
auto &Prob = getAnalysis<MachineBranchProbabilityInfo>(); - MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), - [&](const MachineBasicBlock *Dst0, - const MachineBasicBlock *Dst1) { - return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1); - }); - return S->isLandingPad() ? nullptr : S; + MachineBasicBlock *S = *std::max_element( + B.succ_begin(), B.succ_end(), + [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) { + return Prob.getEdgeProbability(&B, Dst0) < + Prob.getEdgeProbability(&B, Dst1); + }); + return S->isEHPad() ? nullptr : S; } std::pair<MipsInstrInfo::BranchType, MachineInstr *> diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td new file mode 100644 index 0000000..11e191a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td @@ -0,0 +1,84 @@ +//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips EVA instruction formats. +// +//===----------------------------------------------------------------------===// + +class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, StdArch { + let DecoderNamespace = "Mips"; + let EncodingPredicates = [HasStdEnc]; +} + +//===----------------------------------------------------------------------===// +// +// Field Values +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA +def OPCODE6_LBE : OPCODE6<0b101100>; +def OPCODE6_LBuE : OPCODE6<0b101000>; +def OPCODE6_LHE : OPCODE6<0b101101>; +def OPCODE6_LHuE : OPCODE6<0b101001>; +def OPCODE6_LWE : OPCODE6<0b101111>; + +def OPCODE6_SBE : OPCODE6<0b011100>; +def OPCODE6_SHE : OPCODE6<0b011101>; +def OPCODE6_SWE : OPCODE6<0b011111>; + +// load/store left/right EVA +def OPCODE6_LWLE : OPCODE6<0b011001>; +def OPCODE6_LWRE : OPCODE6<0b011010>; +def OPCODE6_SWLE : OPCODE6<0b100001>; +def OPCODE6_SWRE : OPCODE6<0b100010>; + +// Load-linked EVA, Store-conditional EVA +def OPCODE6_LLE : OPCODE6<0b101110>; +def OPCODE6_SCE : OPCODE6<0b011110>; + +def OPCODE6_TLBINV : OPCODE6<0b000011>; +def OPCODE6_TLBINVF : OPCODE6<0b000100>; + +def OPCODE6_CACHEE : OPCODE6<0b011011>; +def OPCODE6_PREFE : OPCODE6<0b100011>; + +def OPGROUP_COP0 : OPGROUP<0b010000>; + +//===----------------------------------------------------------------------===// +// +// Encoding Formats +// +//===----------------------------------------------------------------------===// + +class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-7} = offset; + let Inst{6} = 0; + let Inst{5-0} = Operation.Value; +} + +class TLB_FM<OPCODE6 Operation> : MipsEVAInst { + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP0.Value; + let Inst{25} = 1; // CO + let Inst{24-6} = 0; + let Inst{5-0} = Operation.Value; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td new file mode 100644 index 0000000..36c9694 --- /dev/null +++
b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td @@ -0,0 +1,192 @@ +//===- MipsEVAInstrInfo.td - EVA ASE instructions ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips EVA ASE instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction encodings +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA encodings +class LBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>; +class LBuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>; +class LHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>; +class LHuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>; +class LWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>; + +class SBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>; +class SHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>; +class SWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>; + +// load/store left/right EVA encodings +class LWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>; +class LWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>; +class SWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>; +class SWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>; + +// Load-linked EVA, Store-conditional EVA encodings +class LLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>; +class SCE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>; + +class TLBINV_ENC : TLB_FM<OPCODE6_TLBINV>; +class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>; + +class CACHEE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>; +class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>; + +//===----------------------------------------------------------------------===// +// +// Instruction descriptions +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA descriptions +class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit canFoldAsLoad = 1; + bit mayLoad = 1; +} + +class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd>; +class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd>; +class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd>; +class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd>; +class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd>; + +class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + SDPatternOperator OpNode = null_frag> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit mayStore = 1; +} + +class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd>; +class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd>; +class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd>; + +// Load/Store Left/Right EVA descriptions +class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src); + string
AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + string Constraints = "$src = $rt"; + bit canFoldAsLoad = 1; +} + +class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd>; +class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd>; + +class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; +} + +class SWLE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd>; +class SWRE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd>; + +// Load-linked EVA, Store-conditional EVA descriptions +class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayLoad = 1; + string DecoderMethod = "DecodeMemEVA"; +} + +class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd>; + +class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$dst); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayStore = 1; + string Constraints = "$rt = $dst"; + string DecoderMethod = "DecodeMemEVA"; +} + +class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd>; + +class TLB_DESC_BASE<string instr_asm> { + dag OutOperandList = (outs); + dag InOperandList = (ins); + string AsmString = instr_asm; + list<dag> Pattern = []; +} + +class TLBINV_DESC : TLB_DESC_BASE<"tlbinv">; +class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf">; + +class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); + string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; +} + +class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem>; +class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem>; + +//===----------------------------------------------------------------------===// +// +// Instruction definitions +// +//===----------------------------------------------------------------------===// + +/// Load and Store EVA Instructions +def LBE : LBE_ENC, LBE_DESC, INSN_EVA; +def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA; +def LHE : LHE_ENC, LHE_DESC, INSN_EVA; +def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def LWE : LWE_ENC, LWE_DESC, INSN_EVA; +} +def SBE : SBE_ENC, SBE_DESC, INSN_EVA; +def SHE : SHE_ENC, SHE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def SWE : SWE_ENC, SWE_DESC, INSN_EVA; +} + +/// load/store left/right EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6; +} + +/// Load-linked EVA, Store-conditional EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LLE : LLE_ENC, LLE_DESC, INSN_EVA; +def SCE : SCE_ENC, SCE_DESC, INSN_EVA; +} + +def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA; +def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA; + +def CACHEE : CACHEE_ENC,
CACHEE_DESC, INSN_EVA; +def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA; diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp index 5152a07..e9eaf81 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -192,10 +192,10 @@ public: TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) { MFI = funcInfo.MF->getInfo<MipsFunctionInfo>(); Context = &funcInfo.Fn->getContext(); + bool ISASupported = !Subtarget->hasMips32r6() && Subtarget->hasMips32(); TargetSupported = - ((TM.getRelocationModel() == Reloc::PIC_) && - ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) && - (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()))); + ISASupported && (TM.getRelocationModel() == Reloc::PIC_) && + (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()); UnsupportedFPMode = Subtarget->isFP64bit(); } @@ -236,32 +236,36 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, std::swap(LHS, RHS); unsigned Opc; - if (ISDOpc == ISD::AND) { + switch (ISDOpc) { + case ISD::AND: Opc = Mips::AND; - } else if (ISDOpc == ISD::OR) { + break; + case ISD::OR: Opc = Mips::OR; - } else if (ISDOpc == ISD::XOR) { + break; + case ISD::XOR: Opc = Mips::XOR; - } else + break; + default: llvm_unreachable("unexpected opcode"); + } unsigned LHSReg = getRegForValue(LHS); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); - if (!ResultReg) - return 0; - - unsigned RHSReg; if (!LHSReg) return 0; + unsigned RHSReg; if (const auto *C = dyn_cast<ConstantInt>(RHS)) RHSReg = materializeInt(C, MVT::i32); else RHSReg = getRegForValue(RHS); - if (!RHSReg) return 0; + unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + if (!ResultReg) + return 0; + emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg); return ResultReg; } @@ -747,7 +751,7 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addFrameIndex(FI) @@ -798,7 +802,7 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(SrcReg) @@ -912,8 +916,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ)) .addReg(CondReg) .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } return false; @@ -1057,22 +1060,16 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) { // entirely within FPRs. unsigned DestReg = createResultReg(&Mips::GPR32RegClass); unsigned TempReg = createResultReg(&Mips::FGR32RegClass); - unsigned Opc; - - if (SrcVT == MVT::f32) - Opc = Mips::TRUNC_W_S; - else - Opc = Mips::TRUNC_W_D32; + unsigned Opc = (SrcVT == MVT::f32) ? 
Mips::TRUNC_W_S : Mips::TRUNC_W_D32; // Generate the convert. emitInst(Opc, TempReg).addReg(SrcReg); - emitInst(Mips::MFC1, DestReg).addReg(TempReg); updateValueMap(I, DestReg); return true; } -// + bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, unsigned &NumBytes) { @@ -1196,7 +1193,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); (void)(MMO); // if (!emitStore(ArgVT, ArgReg, Addr, MMO)) @@ -1607,19 +1604,23 @@ bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg) { + int64_t Imm; + switch (SrcVT.SimpleTy) { default: return false; case MVT::i1: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(1); + Imm = 1; break; case MVT::i8: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xff); + Imm = 0xff; break; case MVT::i16: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xffff); + Imm = 0xffff; break; } + + emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Imm); return true; } diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index fab2fdf..5680130 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -117,6 +117,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::GPRel: return "MipsISD::GPRel"; case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; case MipsISD::Ret: return "MipsISD::Ret"; + case MipsISD::ERet: return "MipsISD::ERet"; case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; case MipsISD::FPCmp: return "MipsISD::FPCmp"; @@ -276,8 +277,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -326,6 +325,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -390,10 +391,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + if 
(!Subtarget.isGP64bit()) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + } setInsertFencesForAtomic(true); @@ -437,9 +438,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP); - setExceptionPointerRegister(ABI.IsN64() ? Mips::A0_64 : Mips::A0); - setExceptionSelectorRegister(ABI.IsN64() ? Mips::A1_64 : Mips::A1); - MaxStoresPerMemcpy = 16; isMicroMips = Subtarget.inMicroMipsMode(); @@ -836,6 +834,14 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); } +bool MipsTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget.hasMips32(); +} + +bool MipsTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget.hasMips32(); +} + void MipsTargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -866,7 +872,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return lowerJumpTable(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); - case ISD::SELECT_CC: return lowerSELECT_CC(Op, DAG); case ISD::SETCC: return lowerSETCC(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); @@ -1092,8 +1097,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, exitMBB); @@ -1204,8 +1208,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, sinkMBB); MF->insert(It, exitMBB); @@ -1330,15 +1333,20 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); unsigned LL, SC, ZERO, BNE, BEQ; - if (Size == 4) { - LL = isMicroMips ? Mips::LL_MM : Mips::LL; - SC = isMicroMips ? Mips::SC_MM : Mips::SC; + if (Size == 4) { + if (isMicroMips) { + LL = Mips::LL_MM; + SC = Mips::SC_MM; + } else { + LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL; + SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC; + } ZERO = Mips::ZERO; BNE = Mips::BNE; BEQ = Mips::BEQ; } else { - LL = Mips::LLD; - SC = Mips::SCD; + LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD; + SC = Subtarget.hasMips64r6() ? 
Mips::SCD_R6 : Mips::SCD; ZERO = Mips::ZERO_64; BNE = Mips::BNE64; BEQ = Mips::BEQ64; @@ -1356,8 +1364,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); @@ -1440,8 +1447,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, sinkMBB); @@ -1586,9 +1592,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, - MachinePointerInfo::getJumpTable(), MemVT, false, false, - false, 0); + Addr = + DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + MemVT, false, false, false, 0); Chain = Addr.getValue(1); if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) { @@ -1640,20 +1647,6 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const SDLoc(Op)); } -SDValue MipsTargetLowering:: -lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc DL(Op); - EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = - DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); - - return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), - Op.getOperand(3)); -} - SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); @@ -1690,14 +1683,15 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64()); if (LargeGOT) - return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, - MipsII::MO_GOT_LO16, DAG.getEntryNode(), - MachinePointerInfo::getGOT()); + return getAddrGlobalLargeGOT( + N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, + DAG.getEntryNode(), + MachinePointerInfo::getGOT(DAG.getMachineFunction())); - return getAddrGlobal(N, SDLoc(N), Ty, DAG, - (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP - : MipsII::MO_GOT16, - DAG.getEntryNode(), MachinePointerInfo::getGOT()); + return getAddrGlobal( + N, SDLoc(N), Ty, DAG, + (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16, + DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op, @@ -1719,6 +1713,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const // Local Exec TLS Model. 
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -1813,7 +1810,8 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const static_cast<const MipsTargetObjectFile *>( getTargetMachine().getObjFileLowering()); - if (TLOF->IsConstantInSmallSection(N->getConstVal(), getTargetMachine())) + if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(), + getTargetMachine())) // %gp_rel relocation return getAddrGPRel(N, SDLoc(N), Ty, DAG); @@ -2946,8 +2944,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - Function::const_arg_iterator FuncArg = - DAG.getMachineFunction().getFunction()->arg_begin(); + const Function *Func = DAG.getMachineFunction().getFunction(); + Function::const_arg_iterator FuncArg = Func->arg_begin(); + + if (Func->hasFnAttribute("interrupt") && !Func->arg_empty()) + report_fatal_error( + "Functions with the interrupt attribute cannot have arguments!"); CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg); MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(), @@ -3019,7 +3021,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // We ought to be able to use LocVT directly but O32 sets it to i32 // when allocating floating point values to integer registers. // This shouldn't influence how we load the value into registers unless - // we are targetting softfloat. + // we are targeting softfloat. if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat()) LocVT = VA.getValVT(); } @@ -3033,9 +3035,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue ArgValue = DAG.getLoad( + LocVT, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); OutChains.push_back(ArgValue.getValue(1)); ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG); @@ -3098,8 +3101,20 @@ MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const } SDValue -MipsTargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool IsVarArg, +MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, + SDLoc DL, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + + MipsFI->setISR(); + + return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps); +} + +SDValue +MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, SelectionDAG &DAG) const { @@ -3192,7 +3207,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - // Return on Mips is always a "jr $ra" + // ISRs must use "eret". 
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt")) + return LowerInterruptReturn(RetOps, DL, DAG); + + // Standard return on Mips is a "jr $ra" return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps); } @@ -3300,7 +3319,7 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix, // Search for the first numeric character. StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1; - I = std::find_if(B, E, std::ptr_fun(isdigit)); + I = std::find_if(B, E, isdigit); Prefix = StringRef(B, I - B); @@ -3669,7 +3688,7 @@ void MipsTargetLowering::passByValArg( unsigned NumRegs = LastReg - FirstReg; if (NumRegs) { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes); unsigned I = 0; @@ -3755,7 +3774,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain, SDLoc DL, SelectionDAG &DAG, CCState &State) const { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); unsigned Idx = State.getFirstUnallocated(ArgRegs); unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); @@ -3812,7 +3831,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size, if (State->getCallingConv() != CallingConv::Fast) { unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); - const ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); // FIXME: The O32 case actually describes no shadow registers. const MCPhysReg *ShadowRegs = ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs; @@ -3860,8 +3879,7 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b3d861d..0dc683e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -67,6 +67,10 @@ namespace llvm { // Return Ret, + // Interrupt, exception, error trap Return + ERet, + + // Software Exception Return. EH_RETURN, // Node used to extract integer from accumulator. @@ -231,6 +235,9 @@ namespace llvm { return MVT::i32; } + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -258,17 +265,25 @@ namespace llvm { EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; - struct LTStr { - bool operator()(const char *S1, const char *S2) const { - return strcmp(S1, S2) < 0; - } - }; - void HandleByVal(CCState *, unsigned &, unsigned) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? 
Mips::A0_64 : Mips::A0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? Mips::A1_64 : Mips::A1; + } + /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Mips doesn't have any special address spaces so we just reserve @@ -290,9 +305,10 @@ namespace llvm { unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty), getTargetNode(N, Ty, DAG, GOTFlag)); - SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, - MachinePointerInfo::getGOT(), false, false, - false, 0); + SDValue Load = + DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(N, Ty, DAG, LoFlag)); @@ -414,7 +430,6 @@ namespace llvm { SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const; @@ -487,6 +502,9 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, SDLoc DL, + SelectionDAG &DAG) const; + bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; // Inline asm support diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td index cb91225..377260f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin, multiclass ROUND_M<string opstr, InstrItinClass Itin> { def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32; - def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,24 +267,25 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; +def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; -def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, +def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; -def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", 
FGR32Opnd, FGR32Opnd, II_FLOOR>, +def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, @@ -301,14 +302,17 @@ let DecoderNamespace = "Mips64" in { ABSS_FM<0xb, 16>, FGR_64; def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>, ABSS_FM<0xb, 17>, FGR_64; + } } def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x20, 20>; -def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; -def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +let AdditionalPredicates = [NotInMicroMips] in { + def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, + ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; + def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +} def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_32; @@ -320,8 +324,10 @@ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>, let DecoderNamespace = "Mips64" in { def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_64; - def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x20, 21>, FGR_64; + let AdditionalPredicates = [NotInMicroMips] in { + def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x20, 21>, FGR_64; + } def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x21, 20>, FGR_64; def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>, @@ -345,8 +351,8 @@ def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>, defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>; defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>; -def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>, - ABSS_FM<0x4, 16>, ISA_MIPS2; +def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2; defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2; // The odd-numbered registers are only referenced when doing loads, @@ -503,13 +509,13 @@ let AdditionalPredicates = [NoNaNsFPMath], def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; -def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>, +def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>, BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6; -def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, IIBranch, MIPS_BRANCH_F, 0>, +def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>, BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6; -def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>, +def
BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>, BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6; -def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, IIBranch, MIPS_BRANCH_T, 0>, +def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>, BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6; /// Floating Point Compare diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td index 5f4fcc3..45baf27 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td @@ -132,7 +132,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern, // These are aliases that require C++ handling to convert to the target // instruction, while InstAliases can be handled directly by tblgen. class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>: - MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> { + MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl { let isPseudo = 1; let Pattern = []; } @@ -644,16 +644,16 @@ class BRK_FM<bits<6> funct> : StdArch // Exception return format <Cop0|1|0|funct> //===----------------------------------------------------------------------===// -class ER_FM<bits<6> funct> : StdArch +class ER_FM<bits<6> funct, bit LLBit> : StdArch { bits<32> Inst; let Inst{31-26} = 0x10; let Inst{25} = 1; - let Inst{24-6} = 0; + let Inst{24-7} = 0; + let Inst{6} = LLBit; let Inst{5-0} = funct; } - //===----------------------------------------------------------------------===// // Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0> //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bb23cc0..b1d6950 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -60,8 +60,8 @@ MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag, - MFI.getObjectSize(FI), Align); + return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI), + Flag, MFI.getObjectSize(FI), Align); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index ab98c90..ffda491 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -77,6 +77,9 @@ def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def MipsERet : SDNode<"MipsISD::ERet", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>; + // These are target-independent nodes, but have target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; @@ -157,7 +160,7 @@ def HasMips3 : Predicate<"Subtarget->hasMips3()">, def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">, AssemblerPredicate<"FeatureMips4_32">; def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">, - AssemblerPredicate<"FeatureMips4_32">; + AssemblerPredicate<"!FeatureMips4_32">; def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">, AssemblerPredicate<"FeatureMips4_32r2">; def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">, @@ -166,6 +169,8 @@ def HasMips32 : Predicate<"Subtarget->hasMips32()">, AssemblerPredicate<"FeatureMips32">; def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; +def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">, + AssemblerPredicate<"FeatureMips32r5">; def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">, @@ -176,6 +181,8 @@ def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, AssemblerPredicate<"!FeatureGP64Bit">; def HasMips64 : Predicate<"Subtarget->hasMips64()">, AssemblerPredicate<"FeatureMips64">; +def NotMips64 : Predicate<"!Subtarget->hasMips64()">, + AssemblerPredicate<"!FeatureMips64">; def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">, @@ -184,6 +191,8 @@ def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">, AssemblerPredicate<"!FeatureMips64r6">; def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">, AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">; +def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">, + AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">; def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">, AssemblerPredicate<"FeatureMips16">; def HasCnMips : Predicate<"Subtarget->hasCnMips()">, @@ -201,6 +210,12 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, def IsLE : Predicate<"Subtarget->isLittle()">; def IsBE : Predicate<"!Subtarget->isLittle()">; def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">; +def HasEVA : Predicate<"Subtarget->hasEVA()">, + AssemblerPredicate<"FeatureEVA,FeatureMips32r2">; +def HasMSA : Predicate<"Subtarget->hasMSA()">, + AssemblerPredicate<"FeatureMSA">; + //===----------------------------------------------------------------------===// // Mips GPR size adjectives. 
@@ -242,6 +257,7 @@ class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; } class ISA_MIPS32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6]; } +class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; } class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; } class ISA_MIPS64_NOT_64R6 { list<Predicate> InsnPredicates = [HasMips64, NotMips64r6]; @@ -249,9 +265,21 @@ class ISA_MIPS64_NOT_64R6 { class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; } class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; } class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; } +class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; } class ISA_MICROMIPS32R6 { list<Predicate> InsnPredicates = [HasMicroMips32r6]; } +class ISA_MICROMIPS64R6 { + list<Predicate> InsnPredicates = [HasMicroMips64r6]; +} +class ISA_MICROMIPS32_NOT_MIPS32R6 { + list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6]; +} + +class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; } +class INSN_EVA_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA]; +} // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; } @@ -283,6 +311,28 @@ class INSN_MIPS5_32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6]; } +class ASE_CNMIPS { + list<Predicate> InsnPredicates = [HasCnMips]; +} + +class ASE_MSA { + list<Predicate> InsnPredicates = [HasMSA]; +} + +class ASE_MSA_NOT_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, NotMips64]; +} + +class ASE_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, HasMips64]; +} + +// Class used for separating microMIPSr6 and microMIPS (r3) instructions. +// It can be used only on instructions that don't inherit PredicateControl. +class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl { + let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6]; +} + +//===----------------------------------------------------------------------===// class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl { @@ -335,6 +385,81 @@ include "MipsInstrFormats.td" // Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===// +class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []> + : AsmOperandClass { + let Name = "ConstantSImm" # Bits; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isConstantSImm<" # Bits # ">"; + let SuperClasses = Supers; + let DiagnosticType = "SImm" # Bits; +} + +class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [], + int Offset = 0> : AsmOperandClass { + let Name = "ConstantUImm" # Bits # "_" # Offset; + let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">"; + let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">"; + let SuperClasses = Supers; + let DiagnosticType = "UImm" # Bits # "_" # Offset; +} + +def ConstantUImm10AsmOperandClass + : ConstantUImmAsmOperandClass<10, []>; +def ConstantUImm8AsmOperandClass + : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>; +def ConstantUImm7AsmOperandClass + : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass]>; +def ConstantUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantSImm6AsmOperandClass + : ConstantSImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantUImm5Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 1>; +def ConstantUImm5Plus32AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; +def ConstantUImm5Plus33AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 33>; +def ConstantUImm5Plus32NormalizeAsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> { + let Name = "ConstantUImm5_32_Norm"; + // We must also subtract 32 when we render the operand. 
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>"; +} +def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass { + let Name = "UImm5Lsl2"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isScaledUImm<5, 2>"; + let SuperClasses = [ConstantUImm6AsmOperandClass]; + let DiagnosticType = "UImm5_Lsl2"; +} +def ConstantUImm5ReportUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> { + let Name = "ConstantUImm5_0_Report_UImm6"; + let DiagnosticType = "UImm5_0_Report_UImm6"; +} +def ConstantUImm5AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>; +def ConstantUImm4AsmOperandClass + : ConstantUImmAsmOperandClass< + 4, [ConstantUImm5AsmOperandClass, + ConstantUImm5Plus32AsmOperandClass, + ConstantUImm5Plus32NormalizeAsmOperandClass]>; +def ConstantUImm3AsmOperandClass + : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>; +def ConstantUImm2Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>; +def ConstantUImm2AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>; +def ConstantUImm1AsmOperandClass + : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>; +def ConstantImmzAsmOperandClass : AsmOperandClass { + let Name = "ConstantImmz"; + let RenderMethod = "addConstantUImmOperands<1>"; + let PredicateMethod = "isConstantImmz"; + let SuperClasses = [ConstantUImm1AsmOperandClass]; + let DiagnosticType = "Immz"; +} + def MipsJumpTargetAsmOperand : AsmOperandClass { let Name = "JumpTarget"; let ParserMethod = "parseJumpTarget"; @@ -360,6 +485,10 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def simm6 : Operand<i32> { + let ParserMatchClass = ConstantSImm6AsmOperandClass; + let OperandType = "OPERAND_IMMEDIATE"; +} def simm9 : Operand<i32>; def simm10 : Operand<i32>; def simm11 : Operand<i32>; @@ -380,23 +509,12 @@ def simm18_lsl3 : Operand<i32> { let ParserMatchClass = MipsJumpTargetAsmOperand; } -def simm20 : Operand<i32> { -} +def simm20 : Operand<i32>; +def simm32 : Operand<i32>; def uimm20 : Operand<i32> { } -def MipsUImm10AsmOperand : AsmOperandClass { - let Name = "UImm10"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseImm"; - let PredicateMethod = "isUImm<10>"; -} - -def uimm10 : Operand<i32> { - let ParserMatchClass = MipsUImm10AsmOperand; -} - def simm16_64 : Operand<i64> { let DecoderMethod = "DecodeSimm16"; } @@ -404,23 +522,71 @@ def simm16_64 : Operand<i64> { // Zero def uimmz : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantImmzAsmOperandClass; +} + +// Unsigned Operands +foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10} in + def uimm # I : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +def uimm2_plus1 : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<2, 1>"; + let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass; } -// Unsigned Operand -def uimm2 : Operand<i32> { +def uimm5_plus1 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass; } -def uimm3 : Operand<i32> { +def uimm5_plus32 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = 
ConstantUImm5Plus32AsmOperandClass; } -def uimm5 : Operand<i32> { +def uimm5_plus33 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass; } -def uimm6 : Operand<i32> { +def uimm5_plus32_normalize : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +def uimm5_lsl2 : Operand<OtherVT> { + let EncoderMethod = "getUImm5Lsl2Encoding"; + let DecoderMethod = "DecodeUImm5lsl2"; + let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass; +} + +def uimm5_plus32_normalize_64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +foreach I = {5} in + def uimm # I # _64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +// Like uimm5_64 but reports a less confusing error for 32-63 when +// an instruction alias permits that. +def uimm5_64_report_uimm6 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass; } def uimm16 : Operand<i32> { @@ -435,6 +601,22 @@ def MipsMemAsmOperand : AsmOperandClass { let ParserMethod = "parseMemOperand"; } +def MipsMemSimm9AsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffset<9>"; +} + +def MipsMemSimm9GPRAsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9GPR"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffsetGPR<9>"; +} + def MipsMemSimm11AsmOperand : AsmOperandClass { let Name = "MemOffsetSimm11"; let SuperClasses = [MipsMemAsmOperand]; @@ -485,6 +667,13 @@ def mem_msa : mem_generic { def mem_simm9 : mem_generic { let MIOperandInfo = (ops ptr_rc, simm9); let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9AsmOperand; +} + +def mem_simm9gpr : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9GPRAsmOperand; } def mem_simm11 : mem_generic { @@ -512,12 +701,6 @@ def PtrRC : Operand<iPTR> { let ParserMatchClass = GPR32AsmOperand; } -// size operand of ext instruction -def size_ext : Operand<i32> { - let EncoderMethod = "getSizeExtEncoding"; - let DecoderMethod = "DecodeExtSize"; -} - // size operand of ins instruction def size_ins : Operand<i32> { let EncoderMethod = "getSizeInsEncoding"; @@ -657,7 +840,7 @@ class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin, [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR, opstr>; -// Load Upper Imediate +// Load Upper Immediate class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>: InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"), [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove { @@ -675,14 +858,19 @@ class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, let mayLoad = 1; } -class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, +class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, InstrItinClass 
Itin = NoItinerary, ComplexPattern Addr = addr> : - InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"), + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { let DecoderMethod = "DecodeMem"; let mayStore = 1; } +class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> : + StoreMemory<opstr, RO, mem, OpNode, Itin, Addr>; + // Load/Store Left/Right let canFoldAsLoad = 1 in class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO, @@ -740,7 +928,7 @@ class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset), !strconcat(opstr, "\t$rs, $rt, $offset"), - [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], II_BCC, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -752,7 +940,7 @@ class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), !strconcat(opstr, "\t$rs, $offset"), - [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], II_BCCZ, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -778,7 +966,7 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type, class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, SDPatternOperator targetoperator, string bopstr> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> { + [(operator targetoperator:$target)], II_J, FrmJ, bopstr> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -788,7 +976,7 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, // Unconditional branch class UncondBranch<Instruction BEQInst> : - PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>, PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -802,7 +990,7 @@ class UncondBranch<Instruction BEQInst> : let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in class JumpFR<string opstr, RegisterOperand RO, SDPatternOperator operator = null_frag>: - InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch, + InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR, FrmR, opstr>; // Indirect branch @@ -815,23 +1003,23 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> { + [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTarget"; } class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst, Register RetReg, RegisterOperand ResRO = RO>: - PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>, PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>; class JumpLinkReg<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALR, FrmR, 
opstr>; class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> { let hasDelaySlot = DelaySlot; } @@ -840,17 +1028,17 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, hasExtraSrcRegAllocReq = 1, Defs = [AT] in { class TailCall<Instruction JumpInst> : - PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>, + PseudoSE<(outs), (ins calltarget:$target), [], II_J>, PseudoInstExpansion<(JumpInst jmptarget:$target)>; class TailCallReg<RegisterOperand RO, Instruction JRInst, RegisterOperand ResRO = RO> : - PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, PseudoInstExpansion<(JRInst ResRO:$rs)>; } class BAL_BR_Pseudo<Instruction RealInst> : - PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>, PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -997,9 +1185,10 @@ class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO, [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>; // Subword Swap -class SubwordSwap<string opstr, RegisterOperand RO>: - InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], - NoItinerary, FrmR, opstr> { +class SubwordSwap<string opstr, RegisterOperand RO, + InstrItinClass itin = NoItinerary>: + InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin, + FrmR, opstr> { let hasSideEffects = 0; } @@ -1010,8 +1199,8 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> : // Ext and Ins class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd, - SDPatternOperator Op = null_frag>: - InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size), + Operand SizeOpnd, SDPatternOperator Op = null_frag> : + InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size), !strconcat(opstr, " $rt, $rs, $pos, $size"), [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT, FrmR, opstr>, ISA_MIPS32R2; @@ -1074,6 +1263,9 @@ class TrapBase<Instruction RealInst> let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>; +let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, hasSideEffects=1 in +def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>; + let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), [(callseq_start timm:$amt)]>; @@ -1215,10 +1407,11 @@ def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel, LW_FM<0x21>; def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>; let AdditionalPredicates = [NotInMicroMips] in { -def LW : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, +def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, LW_FM<0x23>; } -def SB : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>; +def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, + LW_FM<0x28>; def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>; let AdditionalPredicates = [NotInMicroMips] in { def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>; @@ -1259,15 +1452,17 @@ let 
DecoderNamespace = "COP3_" in { } } -def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; -def SYNCI : MMRel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; +def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; +def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; -def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; -def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; -def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; -def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; -def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; -def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +let AdditionalPredicates = [NotInMicroMips] in { + def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; + def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; + def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; + def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; + def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; + def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +} def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, ISA_MIPS2_NOT_32R6_64R6; @@ -1290,14 +1485,15 @@ def TRAP : TrapBase<BREAK>; def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6; let AdditionalPredicates = [NotInMicroMips] in { -def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; + def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18, 0x0>, INSN_MIPS3_32; + def ERETNC : MMRel, ER_FT<"eretnc">, ER_FM<0x18, 0x1>, ISA_MIPS32R5; + def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f, 0x0>, ISA_MIPS32; } -def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; let AdditionalPredicates = [NotInMicroMips] in { -def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; } -def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { @@ -1359,7 +1555,8 @@ def TAILCALL_R : TailCallReg<GPR32Opnd, JR>; // Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64 // then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA. class PseudoIndirectBranchBase<RegisterOperand RO> : - MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> { + MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], + II_IndirectBranchPseudo> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -1369,12 +1566,12 @@ class PseudoIndirectBranchBase<RegisterOperand RO> : def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>; -// Return instructions are matched as a RetRA instruction, then ar expanded +// Return instructions are matched as a RetRA instruction, then are expanded // into PseudoReturn/PseudoReturn64 after register allocation. Finally, // MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the // ISA. 
class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs), - [], IIBranch> { + [], II_ReturnPseudo> { let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; @@ -1441,8 +1638,11 @@ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32_NOT_32R6_64R6; -/// Word Swap Bytes Within Halfwords -def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; +let AdditionalPredicates = [NotInMicroMips] in { + /// Word Swap Bytes Within Halfwords + def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>, + ISA_MIPS32R2; +} /// No operation. def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; @@ -1485,10 +1685,12 @@ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; - +let AdditionalPredicates = [NotInMicroMips] in { def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM; - -def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>; +} +// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction +def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<0>; def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>; /// Move Control Registers From/To CPU Registers @@ -1499,9 +1701,9 @@ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd>, MFC3OP_FM<0x12, 4>; class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther, asmstr>; -def SSNOP : MMRel, Barrier<"ssnop">, BARRIER_FM<1>; +def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop">, BARRIER_FM<1>; def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>; -def PAUSE : MMRel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; +def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; // JR_HB and JALR_HB are defined here using the new style naming // scheme because some of this code is shared with Mips32r6InstrInfo.td @@ -1562,11 +1764,60 @@ def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>, def PREF : MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>, INSN_MIPS3_32_NOT_32R6_64R6; +def ROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "rol\t$rs, $rt, $rd">; +def ROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "rol\t$rs, $rt, $imm">; +def : MipsInstAlias<"rol $rd, $rs", + (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"rol $rd, $imm", + (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def ROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "ror\t$rs, $rt, $rd">; +def RORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "ror\t$rs, $rt, $imm">; +def : MipsInstAlias<"ror $rd, $rs", + (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"ror $rd, $imm", + (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def DROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "drol\t$rs, $rt, $rd">, ISA_MIPS64; +def DROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "drol\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $rs", + (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $imm", 
+ (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + +def DROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "dror\t$rs, $rt, $rd">, ISA_MIPS64; +def DRORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "dror\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $rs", + (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $imm", + (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", - (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>, + (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, + GPR_32 { + let AdditionalPredicates = [NotInMicroMips]; +} +def : MipsInstAlias<"move $dst, $src", + (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, GPR_32 { let AdditionalPredicates = [NotInMicroMips]; } @@ -1630,27 +1881,27 @@ def : MipsInstAlias<"beqz $rs,$offset", def : MipsInstAlias<"beqzl $rs,$offset", (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; def : MipsInstAlias<"syscall", (SYSCALL 0), 1>; - + def : MipsInstAlias<"break", (BREAK 0, 0), 1>; def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>; let AdditionalPredicates = [NotInMicroMips] in { -def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; -} -def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; - -def : MipsInstAlias<"teq $rs, $rt", - (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tge $rs, $rt", - (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tgeu $rs, $rt", - (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tlt $rs, $rt", - (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tltu $rs, $rt", - (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tne $rs, $rt", - (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; - + def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; + def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; +} +let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"teq $rs, $rt", + (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tge $rs, $rt", + (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tlt $rs, $rt", + (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tltu $rs, $rt", + (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tne $rs, $rt", + (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; +} def : MipsInstAlias<"sll $rd, $rt, $rs", (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"sub, $rd, $rs, $imm", @@ -1678,7 +1929,7 @@ def : MipsInstAlias<"sync", class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadImm32 : LoadImmediate32<"li", uimm5, GPR32Opnd>; +def LoadImm32 : LoadImmediate32<"li", simm32, GPR32Opnd>; class LoadAddressFromReg32<string instr_asm, Operand MemOpnd, RegisterOperand RO> : @@ -1689,13 +1940,16 @@ def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>; class LoadAddressFromImm32<string 
instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadAddrImm32 : LoadAddressFromImm32<"la", uimm5, GPR32Opnd>; +def LoadAddrImm32 : LoadAddressFromImm32<"la", simm32, GPR32Opnd>; def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs), "jal\t$rd, $rs"> ; def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs), "jal\t$rs"> ; +def NORImm : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "nor\t$rs, $rt, $imm"> ; + let hasDelaySlot = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins imm64:$imm64, brtarget:$offset), @@ -1718,12 +1972,62 @@ def BLTU : CondBranchPseudo<"bltu">; def BLEU : CondBranchPseudo<"bleu">; def BGEU : CondBranchPseudo<"bgeu">; def BGTU : CondBranchPseudo<"bgtu">; +def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +class CondBranchImmPseudo<string instr_asm> : + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + !strconcat(instr_asm, "\t$rs, $imm, $offset")>; + +def BLTImmMacro : CondBranchImmPseudo<"blt">; +def BLEImmMacro : CondBranchImmPseudo<"ble">; +def BGEImmMacro : CondBranchImmPseudo<"bge">; +def BGTImmMacro : CondBranchImmPseudo<"bgt">; +def BLTUImmMacro : CondBranchImmPseudo<"bltu">; +def BLEUImmMacro : CondBranchImmPseudo<"bleu">; +def BGEUImmMacro : CondBranchImmPseudo<"bgeu">; +def BGTUImmMacro : CondBranchImmPseudo<"bgtu">; +def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +// FIXME: Predicates are removed because instructions are matched regardless of +// predicates, because PredicateControl was not in the hierarchy. This was +// done to emit more precise error message from expansion function. +// Once the tablegen-erated errors are made better, this needs to be fixed and +// predicates needs to be restored. 
+ +def SDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "div\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def UDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "divu\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def DSDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddiv\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def DUDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddivu\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), + "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulhu\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulw\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -1771,8 +2075,6 @@ def : MipsPat<(MipsSync (i32 immz)), (SYNC 0)>, ISA_MIPS2; // Call -def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), (JAL texternalsym:$dst)>; //def : MipsPat<(MipsJmpLink GPR32:$dst), @@ -1939,6 +2241,16 @@ let AddedComplexity = 40 in { } } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Floating Point Support //===----------------------------------------------------------------------===// @@ -1964,6 +2276,10 @@ include "MipsDSPInstrInfo.td" include "MipsMSAInstrFormats.td" include "MipsMSAInstrInfo.td" +// EVA +include "MipsEVAInstrFormats.td" +include "MipsEVAInstrInfo.td" + // Micromips include "MicroMipsInstrFormats.td" include "MicroMipsInstrInfo.td" @@ -1972,3 +2288,11 @@ include "MicroMipsInstrFPU.td" // Micromips r6 include "MicroMips32r6InstrFormats.td" include "MicroMips32r6InstrInfo.td" + +// Micromips64 r6 +include "MicroMips64r6InstrFormats.td" +include "MicroMips64r6InstrInfo.td" + +// Micromips DSP +include "MicroMipsDSPInstrFormats.td" +include "MicroMipsDSPInstrInfo.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp index 90f8cc0..49fb99a 100644 --- a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp @@ -148,7 +148,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) { // Insert NewMBB and fix control flow. MachineBasicBlock *Tgt = getTargetMBB(*FirstBr); NewMBB->transferSuccessors(MBB); - NewMBB->removeSuccessor(Tgt); + NewMBB->removeSuccessor(Tgt, true); MBB->addSuccessor(NewMBB); MBB->addSuccessor(Tgt); MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB); @@ -161,7 +161,7 @@ void MipsLongBranch::initMBBInfo() { // Split the MBBs if they have two branches. Each basic block should have at // most one branch after this loop is executed. 
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;) - splitMBB(I++); + splitMBB(&*I++); MF->RenumberBlocks(); MBBInfos.clear(); @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); @@ -434,7 +433,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); } else // Change branch destination and reverse condition. - replaceBranch(*MBB, I.Br, DL, FallThroughMBB); + replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB); } static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td index bff2d0f..7d25ea5 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td @@ -7,18 +7,12 @@ // //===----------------------------------------------------------------------===// -def HasMSA : Predicate<"Subtarget->hasMSA()">, - AssemblerPredicate<"FeatureMSA">; - -class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasMSA]; +class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, ASE_MSA { + let EncodingPredicates = [HasStdEnc]; let Inst{31-26} = 0b011110; } -class MSA64Inst : MSAInst { - let Predicates = [HasMSA, HasMips64]; -} - class MSACBranch : MSAInst { let Inst{31-26} = 0b010001; } @@ -27,10 +21,6 @@ class MSASpecial : MSAInst { let Inst{31-26} = 0b000000; } -class MSA64Special : MSA64Inst { - let Inst{31-26} = 0b000000; -} - class MSAPseudo<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo>: MipsPseudo<outs, ins, pattern, itin> { @@ -100,7 +90,7 @@ class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst { +class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { bits<5> rs; bits<5> wd; @@ -293,7 +283,7 @@ class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<4> n; bits<5> ws; bits<5> rd; @@ -345,7 +335,7 @@ class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<6> n; bits<5> rs; bits<5> wd; @@ -450,7 +440,7 @@ class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial { let Inst{5-0} = minor; } -class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special { +class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial { bits<5> rs; bits<5> rt; bits<5> rd; diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td index 970e98e..eacfcec 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td @@ -63,30 +63,13 @@ def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT", def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT", SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>; 
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>; +def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>; def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>; def immZExt6Ptr : ImmLeaf<iPTR, [{return isUInt<6>(Imm);}]>; // Operands -// The immediate of an LSA instruction needs special handling -// as the encoded value should be subtracted by one. -def uimm2LSAAsmOperand : AsmOperandClass { - let Name = "LSAImm"; - let ParserMethod = "parseLSAImm"; - let RenderMethod = "addImmOperands"; -} - -def LSAImm : Operand<i32> { - let PrintMethod = "printUnsignedImm"; - let EncoderMethod = "getLSAImmEncoding"; - let DecoderMethod = "DecodeLSAImm"; - let ParserMatchClass = uimm2LSAAsmOperand; -} - -def uimm4 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def uimm4_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } @@ -95,10 +78,6 @@ def uimm6_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } -def uimm8 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def simm5 : Operand<i32>; def vsplat_uimm1 : Operand<vAny> { @@ -639,7 +618,6 @@ class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>; class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>; class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>; class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>; -class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>; class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>; @@ -1195,47 +1173,14 @@ class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass Itinerary = itin; } -// This class is deprecated and will be removed soon. -class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm3:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm4:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm5:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. 
-class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { +class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, + InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm6:$m); + dag InOperandList = (ins ROWS:$ws, ImmOp:$m); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))]; + list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))]; InstrItinClass Itinerary = itin; } @@ -1291,13 +1236,14 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode, } class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + RegisterOperand ROWD, RegisterOperand ROWS, + Operand ImmOp, ImmLeaf Imm, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n); + dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n); string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws, - immZExt4:$n))]; + Imm:$n))]; string Constraints = "$wd = $wd_in"; InstrItinClass Itinerary = itin; } @@ -1479,7 +1425,7 @@ class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> { dag InOperandList = (ins ROWD:$wt, brtarget:$offset); string AsmString = !strconcat(instr_asm, "\t$wt, $offset"); list<dag> Pattern = []; - InstrItinClass Itinerary = IIBranch; + InstrItinClass Itinerary = NoItinerary; bit isBranch = 1; bit isTerminator = 1; bit hasDelaySlot = 1; @@ -1519,13 +1465,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty, } class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2); + dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2); string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, - immZExt6:$n, + Imm:$n, ROWS:$ws, immz:$n2))]; InstrItinClass Itinerary = itin; @@ -1934,8 +1881,6 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16, GPR32Opnd, MSA128HOpnd>; class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32, GPR32Opnd, MSA128WOpnd>; -class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64, - GPR64Opnd, MSA128DOpnd>; class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32, MSA128W>; @@ -2346,13 +2291,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, 
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, MSA128DOpnd>; class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -2381,7 +2326,7 @@ class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD, RegisterOperand RORS = RORD, RegisterOperand RORT = RORD, InstrItinClass itin = NoItinerary > { dag OutOperandList = (outs RORD:$rd); - dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa); + dag InOperandList = (ins RORS:$rs, RORT:$rt, uimm2_plus1:$sa); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa"); list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt, (shl RORS:$rs, @@ -2561,23 +2506,23 @@ class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>; class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; -class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, - MSA128BOpnd>; -class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, - MSA128HOpnd>; -class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, - MSA128WOpnd>; -class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, - MSA128DOpnd>; - -class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, - MSA128BOpnd>; -class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, - MSA128HOpnd>; -class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, - MSA128WOpnd>; -class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, - MSA128DOpnd>; +class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, + immZExt6, MSA128DOpnd>; + +class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, + immZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2589,13 +2534,17 @@ class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>; class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>; class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b, - MSA128BOpnd>; + MSA128BOpnd, MSA128BOpnd, uimm4, + immZExt4>; class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h, - MSA128HOpnd>; + MSA128HOpnd, MSA128HOpnd, uimm3, + immZExt3>; class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w, - MSA128WOpnd>; + MSA128WOpnd, MSA128WOpnd, uimm2, + immZExt2>; class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d, - MSA128DOpnd>; + MSA128DOpnd, MSA128DOpnd, uimm1, + immZExt1>; class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>; class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>; @@ -2648,14 
+2597,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>; class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; -class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b, - MSA128BOpnd>; -class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h, - MSA128HOpnd>; -class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w, - MSA128WOpnd>; -class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d, - MSA128DOpnd>; +class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, + immZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2676,14 +2625,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>; class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; -class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b, - MSA128BOpnd>; -class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h, - MSA128HOpnd>; -class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w, - MSA128WOpnd>; -class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d, - MSA128DOpnd>; +class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, + immZExt6, MSA128DOpnd>; class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode, ValueType TyNode, RegisterOperand ROWD, @@ -2991,12 +2940,11 @@ def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC; def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC; def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC; def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC; -def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC; +def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64; def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC; def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC; -def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC; -def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC; +def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64; def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC; def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC; @@ -3108,7 +3056,7 @@ def FFQR_D : FFQR_D_ENC, FFQR_D_DESC; def FILL_B : FILL_B_ENC, FILL_B_DESC; def FILL_H : FILL_H_ENC, FILL_H_DESC; def FILL_W : FILL_W_ENC, FILL_W_DESC; -def FILL_D : FILL_D_ENC, FILL_D_DESC; +def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64; def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC; def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC; @@ -3238,7 +3186,7 @@ def ILVR_D : ILVR_D_ENC, ILVR_D_DESC; def INSERT_B : INSERT_B_ENC, INSERT_B_DESC; def INSERT_H : INSERT_H_ENC, INSERT_H_DESC; def INSERT_W : INSERT_W_ENC, INSERT_W_DESC; -def INSERT_D : INSERT_D_ENC, INSERT_D_DESC; +def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64; // 
INSERT_FW_PSEUDO defined after INSVE_W // INSERT_FD_PSEUDO defined after INSVE_D @@ -3280,7 +3228,7 @@ def LDI_W : LDI_W_ENC, LDI_W_DESC; def LDI_D : LDI_D_ENC, LDI_D_DESC; def LSA : LSA_ENC, LSA_DESC; -def DLSA : DLSA_ENC, DLSA_DESC; +def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64; def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC; def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC; @@ -3787,6 +3735,28 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64, def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8, MSA128B, NoItinerary>; +// Vector extraction with fixed index. +// +// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than +// COPY_U_W, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a MIPS64 can execute MIPS32 code without getting +// different register values. +def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; +def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; + +// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than +// COPY_U_D, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64 +// code without getting different register values. +def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; +def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; + // Vector extraction with variable index def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)), (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws, diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp index 0d1ee04..c7d2738 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp @@ -24,42 +24,6 @@ static cl::opt<bool> FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true), cl::desc("Always use $gp as the global base register.")); -// class MipsCallEntry. -MipsCallEntry::MipsCallEntry(StringRef N) { -#ifndef NDEBUG - Name = N; - Val = nullptr; -#endif -} - -MipsCallEntry::MipsCallEntry(const GlobalValue *V) { -#ifndef NDEBUG - Val = V; -#endif -} - -bool MipsCallEntry::isConstant(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::isAliased(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const { - return false; -} - -void MipsCallEntry::printCustom(raw_ostream &O) const { - O << "MipsCallEntry: "; -#ifndef NDEBUG - if (Val) - O << Val->getName(); - else - O << Name; -#endif -} - MipsFunctionInfo::~MipsFunctionInfo() {} bool MipsFunctionInfo::globalBaseRegSet() const { @@ -111,27 +75,32 @@ void MipsFunctionInfo::createEhDataRegsFI() { } } +void MipsFunctionInfo::createISRRegFI() { + // ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers. + // The current implementation only supports Mips32r2+ not Mips64rX. Status + // is always 32 bits, ErrorPC is 32 or 64 bits dependant on architecture, + // however Mips32r2+ is the supported architecture. 
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass; + + for (int I = 0; I < 2; ++I) + ISRDataRegFI[I] = MF.getFrameInfo()->CreateStackObject( + RC->getSize(), RC->getAlignment(), false); +} + bool MipsFunctionInfo::isEhDataRegFI(int FI) const { return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1] || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(StringRef Name) { - std::unique_ptr<const MipsCallEntry> &E = ExternalCallEntries[Name]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Name); - - return MachinePointerInfo(E.get()); +bool MipsFunctionInfo::isISRRegFI(int FI) const { + return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]); +} +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) { + return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) { - std::unique_ptr<const MipsCallEntry> &E = GlobalCallEntries[Val]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Val); - - return MachinePointerInfo(E.get()); +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { + return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV)); } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h index 32436ef..a2f6ee0 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h @@ -15,12 +15,10 @@ #define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H #include "Mips16HardFloatInfo.h" -#include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/ValueMap.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -30,31 +28,13 @@ namespace llvm { -/// \brief A class derived from PseudoSourceValue that represents a GOT entry -/// resolved by lazy-binding. -class MipsCallEntry : public PseudoSourceValue { -public: - explicit MipsCallEntry(StringRef N); - explicit MipsCallEntry(const GlobalValue *V); - bool isConstant(const MachineFrameInfo *) const override; - bool isAliased(const MachineFrameInfo *) const override; - bool mayAlias(const MachineFrameInfo *) const override; - -private: - void printCustom(raw_ostream &O) const override; -#ifndef NDEBUG - std::string Name; - const GlobalValue *Val; -#endif -}; - /// MipsFunctionInfo - This class is derived from MachineFunction private /// Mips target-specific information for each MachineFunction. class MipsFunctionInfo : public MachineFunctionInfo { public: MipsFunctionInfo(MachineFunction &MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0), - VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false), + VarArgsFrameIndex(0), CallsEhReturn(false), IsISR(false), SaveS2(false), MoveF64ViaSpillFI(-1) {} ~MipsFunctionInfo(); @@ -86,13 +66,21 @@ public: int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; } bool isEhDataRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object - /// representing a GOT entry for an external function. 
- MachinePointerInfo callPtrInfo(StringRef Name); + /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue + /// object representing a GOT entry for an external function. + MachinePointerInfo callPtrInfo(const char *ES); + + // Functions with the "interrupt" attribute require special prologues, + // epilogues and additional spill slots. + bool isISR() const { return IsISR; } + void setISR() { IsISR = true; } + void createISRRegFI(); + int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; } + bool isISRRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object + /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object /// representing a GOT entry for a global function. - MachinePointerInfo callPtrInfo(const GlobalValue *Val); + MachinePointerInfo callPtrInfo(const GlobalValue *GV); void setSaveS2() { SaveS2 = true; } bool hasSaveS2() const { return SaveS2; } @@ -136,17 +124,18 @@ private: /// Frame objects for spilling eh data registers. int EhDataRegFI[4]; + /// ISR - Whether the function is an Interrupt Service Routine. + bool IsISR; + + /// Frame objects for spilling C0_STATUS, C0_EPC + int ISRDataRegFI[2]; + // saveS2 bool SaveS2; /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the /// O32 FPXX ABI is enabled. -1 is used to denote invalid index. int MoveF64ViaSpillFI; - - /// MipsCallEntry maps. - StringMap<std::unique_ptr<const MipsCallEntry>> ExternalCallEntries; - ValueMap<const GlobalValue *, std::unique_ptr<const MipsCallEntry>> - GlobalCallEntries; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index f6647e6..28e5a42 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -84,6 +84,16 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>(); + const Function *F = MF->getFunction(); + if (F->hasFnAttribute("interrupt")) { + if (Subtarget.hasMips64()) + return Subtarget.hasMips64r6() ? CSR_Interrupt_64R6_SaveList + : CSR_Interrupt_64_SaveList; + else + return Subtarget.hasMips32r6() ? CSR_Interrupt_32R6_SaveList + : CSR_Interrupt_32_SaveList; + } + if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; @@ -284,6 +294,16 @@ getFrameRegister(const MachineFunction &MF) const { } bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { + // Avoid realigning functions that explicitly do not want to be realigned. + // Normally, we should report an error when a function should be dynamically + // realigned but also has the attribute no-realign-stack. Unfortunately, + // with this attribute, MachineFrameInfo clamps each new object's alignment + // to that of the stack's alignment as specified by the ABI. As a result, + // the information of whether we have objects with larger alignment + // requirement than the stack's alignment is already lost at this point. + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64; unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64; @@ -306,42 +326,3 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { // sized objects. 
return MF.getRegInfo().canReserveReg(BP); } - -bool MipsRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - bool CanRealign = canRealignStack(MF); - - // Avoid realigning functions that explicitly do not want to be realigned. - // Normally, we should report an error when a function should be dynamically - // realigned but also has the attribute no-realign-stack. Unfortunately, - // with this attribute, MachineFrameInfo clamps each new object's alignment - // to that of the stack's alignment as specified by the ABI. As a result, - // the information of whether we have objects with larger alignment - // requirement than the stack's alignment is already lost at this point. - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - const Function *F = MF.getFunction(); - if (F->hasFnAttribute(Attribute::StackAlignment)) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment(); - if (MFI->getMaxAlignment() > StackAlignment) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - return false; -} diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h index ee1f6bc..5de68a2 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h @@ -61,9 +61,7 @@ public: RegScavenger *RS = nullptr) const; // Stack realignment queries. - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; /// Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 096b3be..a4abd62 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -17,6 +17,7 @@ #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -319,6 +320,15 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool FP64) const { + const MachineOperand &Op1 = I->getOperand(1); + const MachineOperand &Op2 = I->getOperand(2); + + if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) { + unsigned DstReg = I->getOperand(0).getReg(); + BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg); + return true; + } + // For fpxx and when mfhc1 is not available, use: // spill + reload via ldc1 // @@ -335,8 +345,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) || (FP64 && !Subtarget.useOddSPReg())) { unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); - unsigned N = I->getOperand(2).getImm(); + unsigned SrcReg = Op1.getReg(); + unsigned N = Op2.getImm(); int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N)); // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are @@ -352,8 +362,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC); - TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); return true; } @@ -376,12 +385,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND; @@ -407,6 +416,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptPrologueStub(MF, MBB); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.size()) { @@ -491,7 +503,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. 
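    // ("move" here is the ABI's canonical GPR-to-GPR move; this change
    // fetches it via ABI.GetGPRMoveOp() -- an OR with $zero -- instead of
    // hard-coding ADDu, matching the copyPhysReg change further below.)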
-    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO)
+    BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO)
       .setMIFlag(MachineInstr::FrameSetup);
 
     // emit ".cfi_def_cfa_register $fp"
@@ -514,7 +526,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
   if (hasBP(MF)) {
     // move $s7, $sp
     unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
-    BuildMI(MBB, MBBI, dl, TII.get(ADDu), BP)
+    BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP)
       .addReg(SP)
       .addReg(ZERO);
   }
@@ -522,6 +534,135 @@
   }
 }
 
+void MipsSEFrameLowering::emitInterruptPrologueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Report an error if the target doesn't support MIPS32r2 or later.
+  // The epilogue relies on the use of the "ehb" to clear execution
+  // hazards. Pre-R2 MIPS relies on an implementation-defined number
+  // of "ssnop"s to clear the execution hazard. Support for ssnop hazard
+  // clearing is not provided, so reject that configuration.
+  if (!STI.hasMips32r2())
+    report_fatal_error(
+        "\"interrupt\" attribute is not supported on pre-MIPS32R2 or "
+        "MIPS16 targets.");
+
+  // The GP register contains the "user" value, so we cannot perform
+  // any gp-relative loads until we restore the "kernel" or "system" gp
+  // value. Until support is written we shall only accept the static
+  // relocation model.
+  if (STI.getRelocationModel() != Reloc::Static)
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "static relocation model on MIPS at the present time.");
+
+  if (!STI.isABI_O32() || STI.hasMips64())
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "O32 ABI on MIPS32R2+ at the present time.");
+
+  // Perform ISR handling like GCC
+  StringRef IntKind =
+      MF.getFunction()->getFnAttribute("interrupt").getValueAsString();
+  const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+  // EIC interrupt handling needs to read the Cause register to disable
+  // interrupts.
+  if (IntKind == "eic") {
+    // Coprocessor registers are always live per se.
+    MBB.addLiveIn(Mips::COP013);
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0)
+        .addReg(Mips::COP013)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0)
+        .addReg(Mips::K0)
+        .addImm(10)
+        .addImm(6)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Fetch and spill EPC
+  MBB.addLiveIn(Mips::COP014);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP014)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(0), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Fetch and spill Status
+  MBB.addLiveIn(Mips::COP012);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP012)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(1), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Build the configuration for disabling lower-priority interrupts. Non-EIC
+  // interrupts need to be masked off with zero, EIC from the Cause register.
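+  // For reference: MIPS32r2 "ins rt, rs, pos, size" replaces the bit field
+  // rt[pos .. pos+size-1] with the low 'size' bits of rs, leaving the rest
+  // of rt unchanged; "ins $k1, $zero, 1, 4" further below, for example,
+  // clears Status.EXL, Status.ERL and both Status.KSU bits at once.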
+ unsigned InsPosition = 8; + unsigned InsSize = 0; + unsigned SrcReg = Mips::ZERO; + + // If the interrupt we're tied to is the EIC, switch the source for the + // masking off interrupts to the cause register. + if (IntKind == "eic") { + SrcReg = Mips::K0; + InsPosition = 10; + InsSize = 6; + } else + InsSize = StringSwitch<unsigned>(IntKind) + .Case("sw0", 1) + .Case("sw1", 2) + .Case("hw0", 3) + .Case("hw1", 4) + .Case("hw2", 5) + .Case("hw3", 6) + .Case("hw4", 7) + .Case("hw5", 8) + .Default(0); + assert(InsSize != 0 && "Unknown interrupt type!"); + + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(SrcReg) + .addImm(InsPosition) + .addImm(InsSize) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Mask off KSU, ERL, EXL + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(1) + .addImm(4) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Disable the FPU as we are not spilling those register sets. + if (!STI.useSoftFloat()) + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(29) + .addImm(1) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Set the new status + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); +} + void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -533,12 +674,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); - DebugLoc dl = MBBI->getDebugLoc(); + DebugLoc DL = MBBI->getDebugLoc(); MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); // if framepointer enabled, restore the stack pointer. if (hasFP(MF)) { @@ -549,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, --I; // Insert instruction "move $sp, $fp" at this location. - BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO); + BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO); } if (MipsFI->callsEhReturn()) { @@ -568,6 +709,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, } } + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptEpilogueStub(MF, MBB); + // Get the number of bytes from FrameInfo uint64_t StackSize = MFI->getStackSize(); @@ -578,13 +722,59 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, TII.adjustStackPtr(SP, StackSize, MBB, MBBI); } +void MipsSEFrameLowering::emitInterruptEpilogueStub( + MachineFunction &MF, MachineBasicBlock &MBB) const { + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Perform ISR handling like GCC + const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass; + + // Disable Interrupts. 
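+  // (DI clears Status.IE; the EHB that follows is the MIPS32r2
+  // execution-hazard barrier, guaranteeing the disable has taken effect
+  // before EPC and Status are rewritten below.)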
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::DI), Mips::ZERO); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); + + // Restore EPC + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(0), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) + .addReg(Mips::K1) + .addImm(0); + + // Restore Status + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(1), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0); +} + +int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsABIInfo ABI = STI.getABI(); + + if (MFI->isFixedObjectIndex(FI)) + FrameReg = hasFP(MF) ? ABI.GetFramePtr() : ABI.GetStackPtr(); + else + FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr(); + + return MFI->getObjectOffset(FI) + MFI->getStackSize() - + getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); +} + bool MipsSEFrameLowering:: spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); const TargetInstrInfo &TII = *STI.getInstrInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -599,6 +789,26 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (!IsRAAndRetAddrIsTaken) EntryBlock->addLiveIn(Reg); + // ISRs require HI/LO to be spilled into kernel registers to be then + // spilled to the stack frame. + bool IsLOHI = (Reg == Mips::LO0 || Reg == Mips::LO0_64 || + Reg == Mips::HI0 || Reg == Mips::HI0_64); + const Function *Func = MBB.getParent()->getFunction(); + if (IsLOHI && Func->hasFnAttribute("interrupt")) { + DebugLoc DL = MI->getDebugLoc(); + + unsigned Op = 0; + if (!STI.getABI().ArePtrs64bit()) { + Op = (Reg == Mips::HI0) ? Mips::MFHI : Mips::MFLO; + Reg = Mips::K0; + } else { + Op = (Reg == Mips::HI0) ? Mips::MFHI64 : Mips::MFLO64; + Reg = Mips::K0_64; + } + BuildMI(MBB, MI, DL, TII.get(Op), Mips::K0) + .setMIFlag(MachineInstr::FrameSetup); + } + // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); @@ -622,7 +832,8 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { } /// Mark \p Reg and all registers aliasing it in the bitset. -void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) { +static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, + unsigned Reg) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) SavedRegs.set(*AI); @@ -648,6 +859,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MipsFI->callsEhReturn()) MipsFI->createEhDataRegsFI(); + // Create spill slots for Coprocessor 0 registers if function is an ISR. + if (MipsFI->isISR()) + MipsFI->createISRRegFI(); + // Expand pseudo instructions which load, store or copy accumulators. // Add an emergency spill slot if a pseudo was expanded. 
if (ExpandPseudo(MF).expand()) { diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h index 9cb32e6..63cd3ce 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h @@ -27,6 +27,9 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -37,8 +40,13 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; -}; +private: + void emitInterruptEpilogueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; + void emitInterruptPrologueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; +}; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 2ebfbd1..6f001ea 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -136,7 +136,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC; const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI(); diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index b319fd0..efe22fb 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1181,6 +1181,10 @@ bool MipsSETargetLowering::isEligibleForTailCallOptimization( if (!EnableMipsTailCalls) return false; + // Exception has to be cleared with eret. + if (FI.isISR()) + return false; + // Return false if either the callee or caller has a byval argument. if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg()) return false; @@ -1786,9 +1790,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); case Intrinsic::mips_fadd_w: - case Intrinsic::mips_fadd_d: + case Intrinsic::mips_fadd_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away case Intrinsic::mips_fceq_w: case Intrinsic::mips_fceq_d: @@ -1831,9 +1837,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), ISD::SETUNE); case Intrinsic::mips_fdiv_w: - case Intrinsic::mips_fdiv_d: + case Intrinsic::mips_fdiv_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. 
return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ffint_u_w: case Intrinsic::mips_ffint_u_d: return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0), @@ -1856,6 +1864,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::mips_fexp2_w: case Intrinsic::mips_fexp2_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode( ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1), @@ -1869,11 +1878,14 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3)); case Intrinsic::mips_fmul_w: - case Intrinsic::mips_fmul_d: + case Intrinsic::mips_fmul_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_fmsub_w: case Intrinsic::mips_fmsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1), DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy, @@ -1886,9 +1898,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_fsqrt_d: return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1)); case Intrinsic::mips_fsub_w: - case Intrinsic::mips_fsub_d: + case Intrinsic::mips_fsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ftrunc_u_w: case Intrinsic::mips_ftrunc_u_d: return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index 786307b..d4aeaf9 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -88,7 +88,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (isMicroMips) Opc = Mips::MOVE16_MM; else - Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + Opc = Mips::OR, ZeroReg = Mips::ZERO; } else if (Mips::CCRRegClass.contains(SrcReg)) Opc = Mips::CFC1; else if (Mips::FGR32RegClass.contains(SrcReg)) @@ -141,7 +141,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = Mips::FMOV_D64; else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg. 
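  // (As in the 32-bit case above, register moves are now emitted as an OR
  // with $zero rather than a DADDu with $zero; both encodings implement a
  // plain move.)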
if (Mips::GPR64RegClass.contains(SrcReg)) - Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64; + Opc = Mips::OR64, ZeroReg = Mips::ZERO_64; else if (Mips::HI64RegClass.contains(SrcReg)) Opc = Mips::MFHI64, SrcReg = 0; else if (Mips::LO64RegClass.contains(SrcReg)) @@ -182,7 +182,6 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, int64_t Offset) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore); unsigned Opc = 0; @@ -213,6 +212,33 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::ST_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::ST_D; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + + // Hi, Lo are normally caller save but they are callee save + // for interrupt handling. + const Function *Func = MBB.getParent()->getFunction(); + if (Func->hasFnAttribute("interrupt")) { + if (Mips::HI32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::HI64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64); + SrcReg = Mips::K0_64; + } else if (Mips::LO32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::LO64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64); + SrcReg = Mips::K0_64; + } + } assert(Opc && "Register class not handled!"); BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)) @@ -228,6 +254,11 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); unsigned Opc = 0; + const Function *Func = MBB.getParent()->getFunction(); + bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") && + (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 || + DestReg == Mips::HI0 || DestReg == Mips::HI0_64); + if (Mips::GPR32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; else if (Mips::GPR64RegClass.hasSubClassEq(RC)) @@ -254,10 +285,44 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LD_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::LD_D; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; assert(Opc && "Register class not handled!"); - BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset) - .addMemOperand(MMO); + + if (!ReqIndirectLoad) + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + else { + // Load HI/LO through K0. Notably the DestReg is encoded into the + // instruction itself. 
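+    // (HI and LO have no load path of their own, so the value is reloaded
+    // into $k0/$k0_64 first and then transferred with mthi/mtlo; picking
+    // MTHI versus MTLO below is what encodes the destination register.)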
+ unsigned Reg = Mips::K0; + unsigned LdOp = Mips::MTLO; + if (DestReg == Mips::HI0) + LdOp = Mips::MTHI; + + if (Subtarget.getABI().ArePtrs64bit()) { + Reg = Mips::K0_64; + if (DestReg == Mips::HI0_64) + LdOp = Mips::MTHI64; + else + LdOp = Mips::MTLO64; + } + + BuildMI(MBB, I, DL, get(Opc), Reg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg); + } } bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { @@ -271,6 +336,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case Mips::RetRA: expandRetRA(MBB, MI); break; + case Mips::ERet: + expandERet(MBB, MI); + break; case Mips::PseudoMFHI: Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI; expandPseudoMFHiLo(MBB, MI, Opc); @@ -360,7 +428,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { MipsABIInfo ABI = Subtarget.getABI(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned ADDu = ABI.GetPtrAdduOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); @@ -438,6 +506,11 @@ void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB, BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA); } +void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::ERET)); +} + std::pair<bool, bool> MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MachineFunction &MF) const { @@ -471,8 +544,6 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2); MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc)); MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc)); - LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); - HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. @@ -483,6 +554,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } + + LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); + HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); } void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h index bebbabf..5d73545 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -82,6 +82,8 @@ private: void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + std::pair<bool, bool> compareOpndSize(unsigned Opc, const MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index 132c3a1..b1e2885 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -126,17 +126,19 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, } bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex); - + bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex); // The following stack frame objects are always referenced relative to $sp: // 1. Outgoing arguments. // 2. 
Pointer to dynamically allocated stack space. // 3. Locations for callee-saved registers. // 4. Locations for eh data registers. + // 5. Locations for ISR saved Coprocessor 0 registers 12 & 14. // Everything else is referenced relative to whatever register // getFrameRegister() returns. unsigned FrameReg; - if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI) + if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI || + IsISRRegFI) FrameReg = ABI.GetStackPtr(); else if (RegInfo->needsStackRealignment(MF)) { if (MFI->hasVarSizedObjects() && !MFI->isFixedObjectIndex(FrameIndex)) diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td index 54b5d28..37f9e49 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSchedule.td +++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td @@ -16,8 +16,8 @@ def IMULDIV : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for Mips //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IIBranch : InstrItinClass; +// IIM16Alu is a placeholder class for most MIPS16 instructions. +def IIM16Alu : InstrItinClass; def IIPseudo : InstrItinClass; def II_ABS : InstrItinClass; @@ -28,7 +28,19 @@ def II_ADD_D : InstrItinClass; def II_ADD_S : InstrItinClass; def II_AND : InstrItinClass; def II_ANDI : InstrItinClass; +def II_B : InstrItinClass; def II_BADDU : InstrItinClass; +def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32 +def II_BC : InstrItinClass; +def II_BC1F : InstrItinClass; +def II_BC1FL : InstrItinClass; +def II_BC1T : InstrItinClass; +def II_BC1TL : InstrItinClass; +def II_BCC : InstrItinClass; // beq and bne +def II_BCCZ : InstrItinClass; // b[gl][et]z +def II_BCCZAL : InstrItinClass; // bgezal and bltzal +def II_BCCZALS : InstrItinClass; // bgezals and bltzals +def II_BCCZC : InstrItinClass; // beqzc, bnezc def II_CEIL : InstrItinClass; def II_CFC1 : InstrItinClass; def II_CLO : InstrItinClass; @@ -68,21 +80,39 @@ def II_DSUB : InstrItinClass; def II_EXT : InstrItinClass; // Any EXT instruction def II_FLOOR : InstrItinClass; def II_INS : InstrItinClass; // Any INS instruction +def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo. +def II_J : InstrItinClass; +def II_JAL : InstrItinClass; +def II_JALR : InstrItinClass; +def II_JALRC : InstrItinClass; +def II_JALRS : InstrItinClass; +def II_JALS : InstrItinClass; +def II_JR : InstrItinClass; +def II_JRADDIUSP : InstrItinClass; +def II_JRC : InstrItinClass; +def II_ReturnPseudo : InstrItinClass; // Return pseudo. 
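+// Load/store itinerary classes. The II_*E classes below are the EVA
+// (Enhanced Virtual Addressing) variants, e.g. II_LBE for lbe, matching the
+// HasEVA subtarget feature introduced by this change.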
def II_LB : InstrItinClass; +def II_LBE : InstrItinClass; def II_LBU : InstrItinClass; +def II_LBUE : InstrItinClass; def II_LD : InstrItinClass; def II_LDC1 : InstrItinClass; def II_LDL : InstrItinClass; def II_LDR : InstrItinClass; def II_LDXC1 : InstrItinClass; def II_LH : InstrItinClass; +def II_LHE : InstrItinClass; def II_LHU : InstrItinClass; +def II_LHUE : InstrItinClass; def II_LUI : InstrItinClass; def II_LUXC1 : InstrItinClass; def II_LW : InstrItinClass; +def II_LWE : InstrItinClass; def II_LWC1 : InstrItinClass; def II_LWL : InstrItinClass; +def II_LWLE : InstrItinClass; def II_LWR : InstrItinClass; +def II_LWRE : InstrItinClass; def II_LWU : InstrItinClass; def II_LWXC1 : InstrItinClass; def II_MADD : InstrItinClass; @@ -134,6 +164,7 @@ def II_ROTRV : InstrItinClass; def II_ROUND : InstrItinClass; def II_SAVE : InstrItinClass; def II_SB : InstrItinClass; +def II_SBE : InstrItinClass; def II_SD : InstrItinClass; def II_SDC1 : InstrItinClass; def II_SDL : InstrItinClass; @@ -144,6 +175,7 @@ def II_SEH : InstrItinClass; def II_SEQ_SNE : InstrItinClass; // seq and sne def II_SEQI_SNEI : InstrItinClass; // seqi and snei def II_SH : InstrItinClass; +def II_SHE : InstrItinClass; def II_SLL : InstrItinClass; def II_SLLV : InstrItinClass; def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu @@ -159,11 +191,15 @@ def II_SUB_D : InstrItinClass; def II_SUB_S : InstrItinClass; def II_SUXC1 : InstrItinClass; def II_SW : InstrItinClass; +def II_SWE : InstrItinClass; def II_SWC1 : InstrItinClass; def II_SWL : InstrItinClass; +def II_SWLE : InstrItinClass; def II_SWR : InstrItinClass; +def II_SWRE : InstrItinClass; def II_SWXC1 : InstrItinClass; def II_TRUNC : InstrItinClass; +def II_WSBH : InstrItinClass; def II_XOR : InstrItinClass; def II_XORI : InstrItinClass; @@ -171,7 +207,7 @@ def II_XORI : InstrItinClass; // Mips Generic instruction itineraries. 
//===----------------------------------------------------------------------===// def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, + InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>, @@ -240,7 +276,29 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, + InstrItinData<II_B , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_IndirectBranchPseudo, [InstrStage<1, [ALU]>]>, + InstrItinData<II_J , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>, InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>, @@ -313,3 +371,5 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]> ]>; + +include "MipsScheduleP5600.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td new file mode 100644 index 0000000..d32ae4f --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td @@ -0,0 +1,392 @@ +//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +def MipsP5600Model : SchedMachineModel { + int IssueWidth = 2; // 2x dispatched per cycle + int MicroOpBufferSize = 48; // min(48, 48, 64) + int LoadLatency = 4; + int MispredictPenalty = 8; // TODO: Estimated + + let CompleteModel = 1; +} + +let SchedModel = MipsP5600Model in { + +// ALQ Pipelines +// ============= + +def P5600ALQ : ProcResource<1> { let BufferSize = 16; } +def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; } + +// ALU Pipeline +// ------------ + +def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>; + +// and, lui, nor, or, slti, sltiu, sub, subu, xor +def : ItinRW<[P5600WriteALU], + [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUBU, II_XOR]>; + +// AGQ Pipelines +// ============= + +def P5600AGQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; } + +def P5600AL2Div : ProcResource<1>; +// Pseudo-resource used to block CTISTD when handling multi-pipeline splits. +def P5600CTISTD : ProcResource<1>; + +// CTISTD Pipeline +// --------------- + +def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>; +def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> { + let Latency = 2; +} + +// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx, +// jalr, jr.hb, jr +def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR]>; +def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR]>; + +// LDST Pipeline +// ------------- + +def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 4; +} + +def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + let Latency = 4; +} + +def P5600WritePref : SchedWriteRes<[P5600IssueLDST]>; + +def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2 + // not during 0, 1, and 2. + let ResourceCycles = [ 1, 3 ]; +} + +def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 2; +} + +def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>; +def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 0; +} + +// l[bhw], l[bh]u, ll +def : ItinRW<[P5600WriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LWU]>; + +// lw[lr] +def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWR]>; + +// s[bhw], sw[lr] +def : ItinRW<[P5600WriteStore], [II_SB, II_SH, II_SW, II_SWL, II_SWR]>; + +// pref +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WritePref], []>; + +// sc +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WriteStore], []>; + +// LDST is also used in moves from general purpose registers to floating point +// and MSA. 
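+// (The zero latency on the def below presumably models only the GPR-side
+// uop of the transfer; the visible latency is charged to the consuming
+// FPU/MSA write via the WriteSequences under 'Tricky Instructions'.)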
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 0;
+}
+
+// AL2 Pipeline
+// ------------
+
+def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>;
+def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 2;
+}
+def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; }
+def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; }
+def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 5;
+}
+
+// clo, clz, di, mfhi, mflo
+def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_MFHI_MFLO]>;
+
+// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
+def : ItinRW<[P5600WriteAL2ShadowMov], [II_RDHWR]>;
+
+// mov[nz]
+def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+
+// divu?
+def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
+def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+
+// mul
+def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+// multu?
+def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+// maddu?, msubu?, mthi, mtlo
+def : ItinRW<[P5600WriteAL2MAdd],
+             [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+
+// ext, ins
+def : ItinRW<[P5600WriteAL2BitExt],
+             [II_EXT, II_INS]>;
+
+// Either ALU or AL2 Pipelines
+// ---------------------------
+//
+// Some instructions can choose between ALU and AL2, but once dispatched to
+// ALQ or AGQ respectively, they are committed to that path.
+// The decision is based on the outcome of the most recent selection when the
+// choice was last available. For now, we assume ALU is always chosen.
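+// The constant-true SchedPredicate [{1}] below hard-wires that assumption:
+// the variant always resolves to P5600WriteALU, while the AL2 alternative
+// is kept in place for when a real selection predicate is implemented.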
+ +def P5600WriteEitherALU : SchedWriteVariant< + // FIXME: Implement selection predicate + [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>, + SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]> + ]>; + +// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu, +// xori +def : ItinRW<[P5600WriteEitherALU], + [II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH, + II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV, + II_SRAV, II_SRLV]>; + +// FPU Pipelines +// ============= + +def P5600FPQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; } + +def P5600FPUDivSqrt : ProcResource<2>; + +def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; } +def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; } +def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 23 / 27 + let Latency = 23; // Using common case + let ResourceCycles = [ 1, 23 ]; +} +def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 31 / 35 + let Latency = 31; // Using common case + let ResourceCycles = [ 1, 31 ]; +} +def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 19 / 23 + let Latency = 19; // Using common case + let ResourceCycles = [ 1, 19 ]; +} +def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 27 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 35 / 39 + let Latency = 35; // Using common case + let ResourceCycles = [ 1, 35 ]; +} +def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; } +def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>; + +// FPUS is also used in moves from floating point and MSA registers to general +// purpose registers. +def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> { + let Latency = 0; +} + +// FPUL is also used in moves from floating point and MSA registers to general +// purpose registers. 
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
+// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
+// sdxc1, sdc1, st.[bhwd], swc1, swxc1
+def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
+                                II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+
+// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd]
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// Long Pipe
+// ----------
+//
+// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
+// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
+// trunc.w.[ds], trunc.w.ps
+def : ItinRW<[P5600WriteFPUL],
+             [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
+              II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+
+// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
+// Operand 0 is read on cycle 5. All other operands are read on cycle 0.
+def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+             [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
+              II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+
+// madd.ps, msub.ps, nmadd.ps, nmsub.ps
+// Operands 0 and 1 are read on cycle 5. All others are read on cycle 0.
+// (none of these instructions exist in the backend yet)
+
+// Load Pipe
+// ---------
+//
+// This is typically used in conjunction with the load pipeline under the AGQ.
+// All the instructions are in the 'Tricky Instructions' section.
+
+def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> {
+  let Latency = 4;
+}
+
+// Tricky Instructions
+// ===================
+//
+// These instructions are split across multiple uops (in different pipelines)
+// that must cooperate to complete the operation.
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits,
+                                            P5600WriteMoveOtherUnitsToFPU]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                            P5600WriteGPRFromBypass]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
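+// (In practice the aggregation means a sequence such as P5600WriteStoreFPUS
+// below reserves both of its issue ports as though they were busy in the
+// same cycle, which is likely somewhat pessimistic.)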
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
+                                       P5600WriteLoadOtherUnitsToFPU]>;
+
+// ctc1, mtc1, mthc1
+def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+
+// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
+def : ItinRW<[P5600WriteMoveFPUToGPR],
+             [II_BC1F, II_BC1T, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+
+// swc1, swxc1, st.[bhwd]
+def : ItinRW<[P5600WriteStoreFPUS], [II_SWC1, II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+
+// l[dw]x?c1, ld.[bhwd]
+def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
+
+// Unsupported Instructions
+// ========================
+//
+// The following instruction classes are never valid on P5600.
+// II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR,
+// II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA,
+// II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU,
+// II_JALRC, II_LD, II_LD[LR], II_LUXC1, II_RESTORE, II_SAVE, II_SD, II_SDC1,
+// II_SDL, II_SDR, II_SDXC1
+//
+// The following instructions are never valid on P5600.
+// addq.ph, rdhwr, repl.ph, repl.qb, subq.ph, subu_s.qb
+//
+// Guesswork
+// =========
+//
+// This section is largely temporary guesswork.
+ +// ceil.[lw].[ds], floor.[lw].[ds] +// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL +def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>; + +// rotrv +// Reason behind guess: rotr is in the same category and the two register forms +// generally follow the immediate forms in this category +def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp index 471b6e1..8a18b51 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp @@ -69,8 +69,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), - HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), - HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(), + HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), + Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM), + TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h index 1db8881..fbb01fe 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h @@ -42,9 +42,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo { Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6 }; + enum class CPU { P5600 }; + // Mips architecture version MipsArchEnum MipsArchVersion; + // Processor implementation (unused but required to exist by + // tablegen-erated code). + CPU ProcImpl; + // IsLittle - The target is Little Endian bool IsLittle; @@ -116,8 +122,8 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; - // HasDSP, HasDSPR2 -- supports DSP ASE. - bool HasDSP, HasDSPR2; + // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE. + bool HasDSP, HasDSPR2, HasDSPR3; // Allow mixed Mips16 and Mips32 in one source file bool AllowMixed16_32; @@ -130,6 +136,12 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMSA -- supports MSA ASE. bool HasMSA; + // UseTCCInDIV -- Enables the use of trapping in the assembler. + bool UseTCCInDIV; + + // HasEVA -- supports EVA ASE. 
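+  // (EVA supplies the kernel-mode loads and stores that access the user
+  // virtual address space -- lbe, lhe, lwe, sbe, she, swe and friends --
+  // which the II_*E itinerary classes in MipsSchedule.td describe.)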
+ bool HasEVA; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -189,7 +201,7 @@ public: } bool hasMips32r5() const { return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) || - hasMips64r2(); + hasMips64r5(); } bool hasMips32r6() const { return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) || @@ -228,9 +240,12 @@ public: } bool inMicroMipsMode() const { return InMicroMipsMode; } bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); } + bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); } bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } + bool hasDSPR3() const { return HasDSPR3; } bool hasMSA() const { return HasMSA; } + bool hasEVA() const { return HasEVA; } bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 1c77745..3e63872 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -233,7 +233,7 @@ void MipsPassConfig::addPreRegAlloc() { } TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp index 0f2db60..146f33b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -76,7 +76,7 @@ bool MipsTargetObjectFile:: IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind) const { return (IsGlobalInSmallSectionImpl(GV, TM) && - (Kind.isDataRel() || Kind.isBSS() || Kind.isCommon())); + (Kind.isData() || Kind.isBSS() || Kind.isCommon())); } /// Return true if this global address should be placed into small data/bss @@ -107,7 +107,8 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV, return false; Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } MCSection * @@ -120,7 +121,7 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. @@ -128,21 +129,20 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, } /// Return true if this constant should be placed into small data section. 
-bool MipsTargetObjectFile::
-IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const {
+bool MipsTargetObjectFile::IsConstantInSmallSection(
+    const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
   return (static_cast<const MipsTargetMachine &>(TM)
               .getSubtargetImpl()
               ->useSmallSection() &&
-          LocalSData && IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(
-                            CN->getType())));
+          LocalSData && IsInSmallSection(DL.getTypeAllocSize(CN->getType())));
 }
 
-MCSection *
-MipsTargetObjectFile::getSectionForConstant(SectionKind Kind,
-                                            const Constant *C) const {
-  if (IsConstantInSmallSection(C, *TM))
+/// Return the section into which this constant should be placed, using the
+/// small data section when appropriate.
+MCSection *MipsTargetObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+  if (IsConstantInSmallSection(DL, C, *TM))
     return SmallDataSection;
 
   // Otherwise, we work the same as ELF.
-  return TargetLoweringObjectFileELF::getSectionForConstant(Kind, C);
+  return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C);
 }
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
index 725f2ff..ba04343 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -36,10 +36,10 @@ class MipsTargetMachine;
                               const TargetMachine &TM) const override;
 
   /// Return true if this constant should be placed into small data section.
-  bool IsConstantInSmallSection(const Constant *CN,
+  bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN,
                                 const TargetMachine &TM) const;
 
-  MCSection *getSectionForConstant(SectionKind Kind,
+  MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
                                    const Constant *C) const override;
 };
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
index 6ce1be7..b3222f5 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -12,6 +12,7 @@
 #include "MCTargetDesc/MipsABIFlagsSection.h"
 #include "MCTargetDesc/MipsABIInfo.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
@@ -77,8 +78,12 @@ public:
   // PIC support
   virtual void emitDirectiveCpLoad(unsigned RegNo);
+  virtual void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
+                                      int Offset);
   virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                                     const MCSymbol &Sym, bool IsReg);
+  virtual void emitDirectiveCpreturn(unsigned SaveLocation,
+                                     bool SaveLocationIsRegister);
 
   // FP abiflags directives
   virtual void emitDirectiveModuleFP();
@@ -97,18 +102,18 @@ public:
   // structure values.
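  // (With this change the streamer holds the ABI by value in an
  // llvm::Optional instead of pointing into the predicate library, so
  // updateABIInfo() copies P.getABI() and getABI() asserts hasValue().)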
template <class PredicateLibrary> void updateABIInfo(const PredicateLibrary &P) { - ABI = &P.getABI(); + ABI = P.getABI(); ABIFlagsSection.setAllFromPredicates(P); } MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } const MipsABIInfo &getABI() const { - assert(ABI && "ABI hasn't been set!"); + assert(ABI.hasValue() && "ABI hasn't been set!"); return *ABI; } protected: - const MipsABIInfo *ABI; + llvm::Optional<MipsABIInfo> ABI; MipsABIFlagsSection ABIFlagsSection; bool GPRInfoSet; @@ -188,8 +193,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; // FP abiflags directives void emitDirectiveModuleFP() override; @@ -237,8 +246,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; void emitMipsAbiFlags(); }; diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 02c5a21..f0f223a 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -15,11 +15,9 @@ #define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; class NVPTXInstPrinter : public MCInstPrinter { diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index b432e06..9ac3c88 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -22,6 +22,7 @@ class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { virtual void anchor(); + public: explicit NVPTXMCAsmInfo(const Triple &TheTriple); }; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h index fe28214..e5fae85 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h @@ -41,24 +41,6 @@ enum CondCodes { }; } -inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { - switch (CC) { - case NVPTXCC::NE: - return "ne"; - case NVPTXCC::EQ: - return "eq"; - case NVPTXCC::LT: - return "lt"; - case NVPTXCC::LE: - return "le"; - case NVPTXCC::GT: - return "gt"; - case NVPTXCC::GE: - return "ge"; - } - llvm_unreachable("Unknown condition code"); -} - FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ecb0f0a..e8c3608 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -355,7 +355,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { if (isABI) { if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) { unsigned 
size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { + if (auto *ITy = dyn_cast<IntegerType>(Ty)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -635,9 +635,7 @@ static bool usedInGlobalVarDef(const Constant *C) { return false; if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { - if (GV->getName() == "llvm.used") - return false; - return true; + return GV->getName() != "llvm.used"; } for (const User *U : C->users()) @@ -682,7 +680,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; - const PointerType *Pty = gv->getType(); + PointerType *Pty = gv->getType(); if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) return false; @@ -720,7 +718,7 @@ static bool useFuncSeen(const Constant *C, void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { - const Function *F = FI; + const Function *F = &*FI; if (F->isDeclaration()) { if (F->use_empty()) @@ -870,9 +868,8 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); + for (const GlobalVariable &I : M.globals()) + VisitGlobalVariableForEmission(&I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && "Missed a global variable"); @@ -1029,10 +1026,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, GVar->getName().startswith("nvvm.")) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); if (GVar->hasExternalLinkage()) { @@ -1159,7 +1156,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1185,9 +1182,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, printScalarConstant(Initializer, O); } } else { - // The frontend adds zero-initializer to variables that don't have an - // initial value, so skip warning for this case. - if (!GVar->getInitializer()->isNullValue()) { + // The frontend adds zero-initializer to device and constant variables + // that don't have an initial value, and UndefValue to shared + // variables, so skip warning for this case. + if (!GVar->getInitializer()->isNullValue() && + !isa<UndefValue>(GVar->getInitializer())) { report_fatal_error("initial value of '" + GVar->getName() + "' is not allowed in addrspace(" + Twine(PTy->getAddressSpace()) + ")"); @@ -1205,7 +1204,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. 
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || @@ -1299,7 +1298,7 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, } std::string -NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { +NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { switch (Ty->getTypeID()) { default: llvm_unreachable("unexpected type"); @@ -1339,16 +1338,16 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1370,7 +1369,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); O << " .b8 "; getSymbol(GVar)->print(O, MAI); O << "["; @@ -1385,32 +1384,32 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, return; } -static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) { +static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { if (Ty->isSingleValueType()) - return TD->getPrefTypeAlignment(Ty); + return DL.getPrefTypeAlignment(Ty); - const ArrayType *ATy = dyn_cast<ArrayType>(Ty); + auto *ATy = dyn_cast<ArrayType>(Ty); if (ATy) - return getOpenCLAlignment(TD, ATy->getElementType()); + return getOpenCLAlignment(DL, ATy->getElementType()); - const StructType *STy = dyn_cast<StructType>(Ty); + auto *STy = dyn_cast<StructType>(Ty); if (STy) { unsigned int alignStruct = 1; // Go through each element of the struct and find the // largest alignment. 
for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) { Type *ETy = STy->getElementType(i); - unsigned int align = getOpenCLAlignment(TD, ETy); + unsigned int align = getOpenCLAlignment(DL, ETy); if (align > alignStruct) alignStruct = align; } return alignStruct; } - const FunctionType *FTy = dyn_cast<FunctionType>(Ty); + auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return TD->getPointerPrefAlignment(); - return TD->getPrefTypeAlignment(Ty); + return DL.getPointerPrefAlignment(); + return DL.getPrefTypeAlignment(Ty); } void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, @@ -1419,13 +1418,8 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, O << "_param_" << paramIndex; } -void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) { - CurrentFnSym->print(O, MAI); - O << "_param_" << paramIndex; -} - void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const AttributeSet &PAL = F->getAttributes(); const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); Function::const_arg_iterator I, E; @@ -1433,7 +1427,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool first = true; bool isKernelFunc = llvm::isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); - MVT thePointerTy = TLI->getPointerTy(*TD); + MVT thePointerTy = TLI->getPointerTy(DL); O << "(\n"; @@ -1485,9 +1479,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(Ty); + align = DL.getABITypeAlignment(Ty); - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1495,7 +1489,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { continue; } // Just a scalar - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); if (isKernelFunc) { if (PTy) { // Special handling for pointer arguments to kernel @@ -1519,7 +1513,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << ".ptr .global "; break; } - O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " "; + O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " "; } printParamName(I, paramIndex, O); continue; @@ -1556,7 +1550,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // param has byVal attribute. 
So should be a pointer - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1566,9 +1560,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(ETy); + align = DL.getABITypeAlignment(ETy); - unsigned sz = TD->getTypeAllocSize(ETy); + unsigned sz = DL.getTypeAllocSize(ETy); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1579,7 +1573,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Further, if a part is vector, print the above for // each vector element. SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts); + ComputeValueVTs(*TLI, DL, ETy, vtparts); for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; EVT elemtype = vtparts[i]; @@ -1786,10 +1780,10 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { - int s = TD->getTypeAllocSize(CPV->getType()); + int s = DL.getTypeAllocSize(CPV->getType()); if (s < Bytes) s = Bytes; aggBuffer->addZeros(s); @@ -1800,7 +1794,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, switch (CPV->getType()->getTypeID()) { case Type::IntegerTyID: { - const Type *ETy = CPV->getType(); + Type *ETy = CPV->getType(); if (ETy == Type::getInt8Ty(CPV->getContext())) { unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue(); ConvertIntToBytes<>(ptr, c); @@ -1817,7 +1811,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { int int32 = (int)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int32); aggBuffer->addBytes(ptr, 4, Bytes); @@ -1839,7 +1833,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { long long int64 = (long long)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int64); aggBuffer->addBytes(ptr, 8, Bytes); @@ -1860,7 +1854,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::FloatTyID: case Type::DoubleTyID: { const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); - const Type *Ty = CFP->getType(); + Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); ConvertFloatToBytes(ptr, float32); @@ -1881,7 +1875,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v, Cexpr); } - unsigned int s = TD->getTypeAllocSize(CPV->getType()); + unsigned int s = DL.getTypeAllocSize(CPV->getType()); aggBuffer->addZeros(s); break; } @@ -1891,7 
+1885,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::StructTyID: { if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) { - int ElementSize = TD->getTypeAllocSize(CPV->getType()); + int ElementSize = DL.getTypeAllocSize(CPV->getType()); bufferAggregateConstant(CPV, aggBuffer); if (Bytes > ElementSize) aggBuffer->addZeros(Bytes - ElementSize); @@ -1909,7 +1903,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); int Bytes; // Old constants @@ -1934,12 +1928,12 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, StructType *ST = cast<StructType>(CPV->getType()); for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { if (i == (e - 1)) - Bytes = TD->getStructLayout(ST)->getElementOffset(0) + - TD->getTypeAllocSize(ST) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(0) + + DL.getTypeAllocSize(ST) - + DL.getStructLayout(ST)->getElementOffset(i); else - Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(i + 1) - + DL.getStructLayout(ST)->getElementOffset(i); bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer); } } @@ -1951,18 +1945,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // -bool NVPTXAsmPrinter::isImageType(const Type *Ty) { - - std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty); - - if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") || - !PI->second.compare("struct._image2d_t") || - !PI->second.compare("struct._image3d_t"))) - return true; - - return false; -} - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { @@ -2054,7 +2036,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) // If the code isn't optimized, there may be outstanding folding // opportunities. Attempt to fold the expression using DataLayout as a // last resort before giving up. - if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout())) + if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout())) if (C != CE) return lowerConstantForGV(C, ProcessingGeneric); @@ -2083,7 +2065,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::GetElementPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Generate a symbolic expression for the byte address APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0); @@ -2109,7 +2091,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric); case Instruction::IntToPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. 
@@ -2120,7 +2102,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::PtrToInt: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Support only foldable casts to/from pointers that can be eliminated by // changing the pointer to the appropriately sized integer type. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index f6f7685..76bf179 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -212,28 +212,21 @@ private: MCOperand GetSymbolRef(const MCSymbol *Symbol); unsigned encodeVirtualRegister(unsigned Reg); - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {} - void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = nullptr); - void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool = false); - void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, raw_ostream &); - void emitFunctionExternParamList(const MachineFunction &MF); void emitFunctionParamList(const Function *, raw_ostream &O); void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O); void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF); - void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize); - bool isImageType(const Type *Ty); void printReturnValStr(const Function *, raw_ostream &O); void printReturnValStr(const MachineFunction &MF, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, @@ -271,7 +264,7 @@ private: // Build the map between type name and ID based on module's type // symbol table. - std::map<const Type *, std::string> TypeNameMap; + std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. 
std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; @@ -282,19 +275,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; - std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; + std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const; void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); - void printOperandProper(const MachineOperand &MO); - void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); - - static const char *getRegisterName(unsigned RegNo); void emitDemotedVars(const Function *, raw_ostream &); bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index 69a229e..95813c8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -98,7 +98,7 @@ private: /// This reordering exposes to optimizeMemoryInstruction more /// optimization opportunities on loads and stores. /// - /// If this function succesfully hoists an eliminable addrspacecast or V is + /// If this function successfully hoists an eliminable addrspacecast or V is /// already such an addrspacecast, it returns the transformed value (which is /// guaranteed to be an addrspacecast); otherwise, it returns nullptr. 
Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0); @@ -267,14 +267,14 @@ bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { return false; bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) { + for (BasicBlock &B : F) { + for (Instruction &I : B) { if (isa<LoadInst>(I)) { // V = load P - Changed |= optimizeMemoryInstruction(I, 0); + Changed |= optimizeMemoryInstruction(&I, 0); } else if (isa<StoreInst>(I)) { // store V, P - Changed |= optimizeMemoryInstruction(I, 1); + Changed |= optimizeMemoryInstruction(&I, 1); } } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 6fd09c4..62ca5e9 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -81,7 +81,7 @@ bool GenericToNVVM::runOnModule(Module &M) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;) { - GlobalVariable *GV = I++; + GlobalVariable *GV = &*I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { @@ -117,7 +117,7 @@ bool GenericToNVVM::runOnModule(Module &M) { Value *Operand = II->getOperand(i); if (isa<Constant>(Operand)) { II->setOperand( - i, remapConstant(&M, I, cast<Constant>(Operand), Builder)); + i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder)); } } } @@ -132,10 +132,8 @@ bool GenericToNVVM::runOnModule(Module &M) { // Walk through the metadata section and update the debug information // associated with the global variables in the default address space. 
- for (Module::named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); - I != E; I++) { - remapNamedMDNode(VM, I); + for (NamedMDNode &I : M.named_metadata()) { + remapNamedMDNode(VM, &I); } // Walk through the global variable initializers, and replace any use of @@ -318,9 +316,8 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, NewOperands[0], NewOperands[1]); case Instruction::FCmp: // CompareConstantExpr (fcmp) - assert(false && "Address space conversion should have no effect " - "on float point CompareConstantExpr (fcmp)!"); - return C; + llvm_unreachable("Address space conversion should have no effect " + "on float point CompareConstantExpr (fcmp)!"); case Instruction::ExtractElement: // ExtractElementConstantExpr return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]); @@ -364,8 +361,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, return Builder.CreateCast(Instruction::CastOps(C->getOpcode()), NewOperands[0], C->getType()); } - assert(false && "GenericToNVVM encountered an unsupported ConstantExpr"); - return C; + llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr"); } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 232a611..2d0098b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" +#include "NVPTXUtilities.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" @@ -530,7 +532,7 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { + if (auto *PT = dyn_cast<PointerType>(Src->getType())) { switch (PT->getAddressSpace()) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; @@ -544,6 +546,39 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, + unsigned CodeAddrSpace, MachineFunction *F) { + // To use non-coherent caching, the load has to be from global + // memory and we have to prove that the memory area is not written + // to anywhere for the duration of the kernel call, not even after + // the load. + // + // To ensure that there are no writes to the memory, we require the + // underlying pointer to be a noalias (__restrict) kernel parameter + // that is never used for a write. We can only do this for kernel + // functions since from within a device function, we cannot know if + // there were or will be writes to the memory from the caller - or we + // could, but then we would have to do inter-procedural analysis. + if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL || + !isKernelFunction(*F->getFunction())) { + return false; + } + + // We use GetUnderlyingObjects() here instead of + // GetUnderlyingObject() mainly because the former looks through phi + // nodes while the latter does not. We need to look through phi + // nodes to handle pointer induction variables. 
+ SmallVector<Value *, 8> Objs; + GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()), + Objs, F->getDataLayout()); + for (Value *Obj : Objs) { + auto *A = dyn_cast<const Argument>(Obj); + if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false; + } + + return true; +} + SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IID) { @@ -638,6 +673,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD); + if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool isVolatile = LD->isVolatile(); @@ -872,6 +911,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); + if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); @@ -1425,6 +1468,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1474,6 +1518,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1522,6 +1567,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1563,6 +1609,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1612,6 +1659,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1660,6 +1708,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1707,6 +1756,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1756,6 +1806,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1804,6 +1855,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1845,6 +1897,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1894,6 +1947,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1942,6 +1996,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: 
@@ -5039,7 +5094,7 @@ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, } if (!Src) return false; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + if (auto *PT = dyn_cast<PointerType>(Src->getType())) return (PT->getAddressSpace() == spN); return false; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b75cf40..be735f6 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -124,6 +124,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // condition branches. setJumpIsExpensive(true); + // Wide divides are _very_ slow. Try to reduce the width of the divide if + // possible. + addBypassSlowDiv(64, 32); + // By default, use the Source scheduling if (sched4reg) setSchedulingPreference(Sched::RegPressure); @@ -275,6 +279,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SELECT); // Now deduce the information based on the above mentioned // actions @@ -910,7 +915,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "("; if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { unsigned size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + if (auto *ITy = dyn_cast<IntegerType>(retTy)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -981,7 +986,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "_"; continue; } - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1318,7 +1323,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // struct or vector SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> Offsets; - const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); + auto *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), vtparts, &Offsets, 0); @@ -2007,15 +2012,6 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, - int idx, EVT v) const { - std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); - std::stringstream suffix; - suffix << idx; - *name += suffix.str(); - return DAG.getTargetExternalSymbol(name->c_str(), v); -} - SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { std::string ParamSym; @@ -2029,10 +2025,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); } -SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { - return getExtSymb(DAG, ".HLPPARAM", idx); -} - // Check to see if the kernel argument is image*_t or sampler_t bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { @@ -2040,8 +2032,8 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { "struct._image3d_t", "struct._sampler_t" }; - const Type *Ty = arg->getType(); - const PointerType *PTy = dyn_cast<PointerType>(Ty); + Type *Ty = arg->getType(); + auto *PTy = dyn_cast<PointerType>(Ty); if (!PTy) return 
false; @@ -2049,14 +2041,11 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { if (!context) return false; - const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); + auto *STy = dyn_cast<StructType>(PTy->getElementType()); const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : ""; - for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) - if (TypeName == specialTypes[i]) - return true; - - return false; + return std::find(std::begin(specialTypes), std::end(specialTypes), + TypeName) != std::end(specialTypes); } SDValue NVPTXTargetLowering::LowerFormalArguments( @@ -2082,10 +2071,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( std::vector<Type *> argTypes; std::vector<const Argument *> theArgs; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - theArgs.push_back(I); - argTypes.push_back(I->getType()); + for (const Argument &I : F->args()) { + theArgs.push_back(&I); + argTypes.push_back(I.getType()); } // argTypes.size() (or theArgs.size()) and Ins.size() need not match. // Ins.size() will be larger @@ -2545,20 +2533,6 @@ void NVPTXTargetLowering::LowerAsmOperandForConstraint( TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -// NVPTX suuport vector of legal types of any length in Intrinsics because the -// NVPTX specific type legalizer -// will legalize them to the PTX supported length. -bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { - if (isTypeLegal(VT)) - return true; - if (VT.isVector()) { - MVT eVT = VT.getVectorElementType(); - if (isTypeLegal(eVT)) - return true; - } - return false; -} - static unsigned getOpcForTextureInstr(unsigned Intrinsic) { switch (Intrinsic) { default: @@ -3747,9 +3721,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, // - [immAddr] if (AM.BaseGV) { - if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) - return false; - return true; + return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; } switch (AM.Scale) { @@ -3820,11 +3792,6 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { - return 4; -} - //===----------------------------------------------------------------------===// // NVPTX DAG Combining //===----------------------------------------------------------------------===// @@ -4057,6 +4024,67 @@ static SDValue PerformANDCombine(SDNode *N, return SDValue(); } +static SDValue PerformSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Currently this detects patterns for integer min and max and + // lowers them to PTX-specific intrinsics that enable hardware + // support. + + const SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) return SDValue(); + + const SDValue LHS = Cond.getOperand(0); + const SDValue RHS = Cond.getOperand(1); + const SDValue True = N->getOperand(1); + const SDValue False = N->getOperand(2); + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + const EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + + const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue Larger; // The larger of LHS and RHS when condition is true. 
+ switch (CC) { + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Larger = RHS; + break; + + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + Larger = LHS; + break; + + default: + return SDValue(); + } + const bool IsMax = (Larger == True); + const bool IsSigned = ISD::isSignedIntSetCC(CC); + + unsigned IntrinsicId; + if (VT == MVT::i32) { + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui; + } else { + assert(VT == MVT::i64); + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull; + } + + SDLoc DL(N); + return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -4113,25 +4141,16 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { APInt Val = CI->getAPIntValue(); if (LHSSign == Unsigned) { - if (Val.isIntN(OptSize)) { - return true; - } - return false; + return Val.isIntN(OptSize); } else { - if (Val.isSignedIntN(OptSize)) { - return true; - } - return false; + return Val.isSignedIntN(OptSize); } } else { OperandSignedness RHSSign; if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) return false; - if (LHSSign != RHSSign) - return false; - - return true; + return LHSSign == RHSSign; } } @@ -4247,6 +4266,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformSHLCombine(N, DCI, OptLevel); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SELECT: + return PerformSELECTCombine(N, DCI); } return SDValue(); } @@ -4509,25 +4530,26 @@ void NVPTXTargetLowering::ReplaceNodeResults( void NVPTXSection::anchor() {} NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { - delete TextSection; - delete DataSection; - delete BSSSection; - delete ReadOnlySection; - - delete StaticCtorSection; - delete StaticDtorSection; - delete LSDASection; - delete EHFrameSection; - delete DwarfAbbrevSection; - delete DwarfInfoSection; - delete DwarfLineSection; - delete DwarfFrameSection; - delete DwarfPubTypesSection; - delete DwarfDebugInlineSection; - delete DwarfStrSection; - delete DwarfLocSection; - delete DwarfARangesSection; - delete DwarfRangesSection; + delete static_cast<NVPTXSection *>(TextSection); + delete static_cast<NVPTXSection *>(DataSection); + delete static_cast<NVPTXSection *>(BSSSection); + delete static_cast<NVPTXSection *>(ReadOnlySection); + + delete static_cast<NVPTXSection *>(StaticCtorSection); + delete static_cast<NVPTXSection *>(StaticDtorSection); + delete static_cast<NVPTXSection *>(LSDASection); + delete static_cast<NVPTXSection *>(EHFrameSection); + delete static_cast<NVPTXSection *>(DwarfAbbrevSection); + delete static_cast<NVPTXSection *>(DwarfInfoSection); + delete static_cast<NVPTXSection *>(DwarfLineSection); + delete static_cast<NVPTXSection *>(DwarfFrameSection); + delete static_cast<NVPTXSection *>(DwarfPubTypesSection); + delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection); + delete static_cast<NVPTXSection *>(DwarfStrSection); + delete static_cast<NVPTXSection *>(DwarfLocSection); + delete static_cast<NVPTXSection *>(DwarfARangesSection); + delete static_cast<NVPTXSection *>(DwarfRangesSection); + delete static_cast<NVPTXSection *>(DwarfMacinfoSection); } MCSection * diff 
--git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index e5c3732..60914c1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -441,13 +441,9 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, - SelectionDAG &DAG) const; const char *getTargetNodeName(unsigned Opcode) const override; - bool isTypeSupportedInIntrinsic(MVT VT) const; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const override; @@ -459,8 +455,13 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; + bool isTruncateFree(Type *SrcTy, Type *DstTy) const override { + // Truncating 64-bit to 32-bit is free in SASS. + if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) + return false; + return SrcTy->getPrimitiveSizeInBits() == 64 && + DstTy->getPrimitiveSizeInBits() == 32; + } EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override { @@ -515,11 +516,7 @@ public: private: const NVPTXSubtarget &STI; // cache the subtarget here - - SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, - EVT = MVT::i32) const; SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; - SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx); SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 76d6597..9f3cf45 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -37,30 +37,31 @@ void NVPTXInstrInfo::copyPhysReg( const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC != SrcRC) - report_fatal_error("Attempted to created cross-class register copy"); - - if (DestRC == &NVPTX::Int32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int1RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int16RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else { + if (DestRC->getSize() != SrcRC->getSize()) + report_fatal_error("Copy one register into another with a different width"); + + unsigned Op; + if (DestRC == &NVPTX::Int1RegsRegClass) { + Op = NVPTX::IMOV1rr; + } else if (DestRC == &NVPTX::Int16RegsRegClass) { + Op = NVPTX::IMOV16rr; + } else if (DestRC == &NVPTX::Int32RegsRegClass) { + Op = (SrcRC == &NVPTX::Int32RegsRegClass ? 
NVPTX::IMOV32rr + : NVPTX::BITCONVERT_32_F2I); + } else if (DestRC == &NVPTX::Int64RegsRegClass) { + Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr + : NVPTX::BITCONVERT_64_F2I); + } else if (DestRC == &NVPTX::Float32RegsRegClass) { + Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr + : NVPTX::BITCONVERT_32_I2F); + } else if (DestRC == &NVPTX::Float64RegsRegClass) { + Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr + : NVPTX::BITCONVERT_64_I2F); + } else { llvm_unreachable("Bad register copy"); } + BuildMI(MBB, I, DL, get(Op), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, @@ -86,27 +87,6 @@ bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, return false; } -bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: - return false; - case NVPTX::INT_PTX_SREG_NTID_X: - case NVPTX::INT_PTX_SREG_NTID_Y: - case NVPTX::INT_PTX_SREG_NTID_Z: - case NVPTX::INT_PTX_SREG_TID_X: - case NVPTX::INT_PTX_SREG_TID_Y: - case NVPTX::INT_PTX_SREG_TID_Z: - case NVPTX::INT_PTX_SREG_CTAID_X: - case NVPTX::INT_PTX_SREG_CTAID_Y: - case NVPTX::INT_PTX_SREG_CTAID_Z: - case NVPTX::INT_PTX_SREG_NCTAID_X: - case NVPTX::INT_PTX_SREG_NCTAID_Y: - case NVPTX::INT_PTX_SREG_NCTAID_Z: - case NVPTX::INT_PTX_SREG_WARPSIZE: - return true; - } -} - bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const { bool isLoad = false; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 179c068..3e40722 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -56,7 +56,6 @@ public: unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const; - bool isReadSpecialReg(MachineInstr &MI) const; virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0bf72fe..f770c2a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -6,6 +6,8 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// \file // Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when // the size is large or is not a compile-time constant. 
// @@ -18,19 +20,20 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "nvptx" using namespace llvm; namespace { + // actual analysis class, which is a functionpass struct NVPTXLowerAggrCopies : public FunctionPass { static char ID; @@ -50,179 +53,299 @@ struct NVPTXLowerAggrCopies : public FunctionPass { return "Lower aggregate copies/intrinsics into loops"; } }; -} // namespace char NVPTXLowerAggrCopies::ID = 0; -// Lower MemTransferInst or load-store pair to loop -static void convertTransferToLoop( - Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len, - bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) { - Type *indType = len->getType(); +// Lower memcpy to loop. +void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); - // srcAddr and dstAddr are expected to be pointer types, + // SrcAddr and DstAddr are expected to be pointer types, // so no check is made here. - unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace(); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); // Cast pointers to (char *) - srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS)); - dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS)); + SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS)); + DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS)); - IRBuilder<> loop(loopBB); - // The loop index (ind) is a phi node. - PHINode *ind = loop.CreatePHI(indType, 0); - // Incoming value for ind is 0 - ind->addIncoming(ConstantInt::get(indType, 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); - // load from srcAddr+ind + // load from SrcAddr+LoopIndex // TODO: we can leverage the align parameter of llvm.memcpy for more efficient // word-sized loads and stores. 
- Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind), - srcVolatile); - // store at dstAddr+ind - loop.CreateStore(val, loop.CreateGEP(loop.getInt8Ty(), dstAddr, ind), - dstVolatile); - - // The value for ind coming from backedge is (ind + 1) - Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1)); - ind->addIncoming(newind, loopBB); - - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + Value *Element = + LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP( + LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex), + SrcIsVolatile); + // store at DstAddr+LoopIndex + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(), + DstAddr, LoopIndex), + DstIsVolatile); + + // The value for LoopIndex coming from backedge is (LoopIndex + 1) + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); + + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } -// Lower MemSetInst to loop -static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr, - Value *len, Value *val, LLVMContext &Context, - Function &F) { - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); +// Lower memmove to IR. memmove is required to correctly copy overlapping memory +// regions; therefore, it has to check the relative positions of the source and +// destination pointers and choose the copy direction accordingly. +// +// The code below is an IR rendition of this C function: +// +// void* memmove(void* dst, const void* src, size_t n) { +// unsigned char* d = dst; +// const unsigned char* s = src; +// if (s < d) { +// // copy backwards +// while (n--) { +// d[n] = s[n]; +// } +// } else { +// // copy forward +// for (size_t i = 0; i < n; ++i) { +// d[i] = s[i]; +// } +// } +// return dst; +// } +void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); + BasicBlock *OrigBB = ConvertedInst->getParent(); + + // Create the a comparison of src and dst, based on which we jump to either + // the forward-copy part of the function (if src >= dst) or the backwards-copy + // part (if src < dst). + // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else + // structure. Its block terminators (unconditional branches) are replaced by + // the appropriate conditional branches when the loop is built. 
+ ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT, + SrcAddr, DstAddr, "compare_src_dst"); + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm, + &ElseTerm); + + // Each part of the function consists of two blocks: + // copy_backwards: used to skip the loop when n == 0 + // copy_backwards_loop: the actual backwards loop BB + // copy_forward: used to skip the loop when n == 0 + // copy_forward_loop: the actual forward loop BB + BasicBlock *CopyBackwardsBB = ThenTerm->getParent(); + CopyBackwardsBB->setName("copy_backwards"); + BasicBlock *CopyForwardBB = ElseTerm->getParent(); + CopyForwardBB->setName("copy_forward"); + BasicBlock *ExitBB = ConvertedInst->getParent(); + ExitBB->setName("memmove_done"); + + // Initial comparison of n == 0 that lets us skip the loops altogether. Shared + // between both backwards and forward copy clauses. + ICmpInst *CompareN = + new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen, + ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0"); + + // Copying backwards. + BasicBlock *LoopBB = + BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + Value *IndexPtr = LoopBuilder.CreateSub( + LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr"); + Value *Element = LoopBuilder.CreateLoad( + LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element"); + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr)); + LoopBuilder.CreateCondBr( + LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)), + ExitBB, LoopBB); + LoopPhi->addIncoming(IndexPtr, LoopBB); + LoopPhi->addIncoming(CopyLen, CopyBackwardsBB); + BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm); + ThenTerm->eraseFromParent(); + + // Copying forward. + BasicBlock *FwdLoopBB = + BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB); + IRBuilder<> FwdLoopBuilder(FwdLoopBB); + PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr"); + Value *FwdElement = FwdLoopBuilder.CreateLoad( + FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element"); + FwdLoopBuilder.CreateStore( + FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi)); + Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd( + FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment"); + FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen), + ExitBB, FwdLoopBB); + FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB); + FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB); + + BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm); + ElseTerm->eraseFromParent(); +} - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); +// Lower memset to loop. 
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr, + Value *CopyLen, Value *SetValue, LLVMContext &Context, + Function &F) { + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); // Cast pointer to the type of value getting stored - dstAddr = - builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS)); + unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); + DstAddr = Builder.CreateBitCast(DstAddr, + PointerType::get(SetValue->getType(), dstAS)); - IRBuilder<> loop(loopBB); - PHINode *ind = loop.CreatePHI(len->getType(), 0); - ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0); + LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB); - loop.CreateStore(val, loop.CreateGEP(val->getType(), dstAddr, ind), false); + LoopBuilder.CreateStore( + SetValue, + LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex), + false); - Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1)); - ind->addIncoming(newind, loopBB); + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { - SmallVector<LoadInst *, 4> aggrLoads; - SmallVector<MemTransferInst *, 4> aggrMemcpys; - SmallVector<MemSetInst *, 4> aggrMemsets; + SmallVector<LoadInst *, 4> AggrLoads; + SmallVector<MemIntrinsic *, 4> MemCalls; const DataLayout &DL = F.getParent()->getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); - // - // Collect all the aggrLoads, aggrMemcpys and addrMemsets. - // + // Collect all aggregate loads and mem* calls. 
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ++II) { - if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (!load->hasOneUse()) + if (LoadInst *LI = dyn_cast<LoadInst>(II)) { + if (!LI->hasOneUse()) continue; - if (DL.getTypeStoreSize(load->getType()) < MaxAggrCopySize) + if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize) continue; - User *use = load->user_back(); - if (StoreInst *store = dyn_cast<StoreInst>(use)) { - if (store->getOperand(0) != load) + if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) { + if (SI->getOperand(0) != LI) continue; - aggrLoads.push_back(load); - } - } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) { - Value *len = intr->getLength(); - // If the number of elements being copied is greater - // than MaxAggrCopySize, lower it to a loop - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemcpys.push_back(intr); - } - } else { - // turn variable length memcpy/memmov into loop - aggrMemcpys.push_back(intr); + AggrLoads.push_back(LI); } - } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) { - Value *len = memsetintr->getLength(); - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemsets.push_back(memsetintr); + } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) { + // Convert intrinsic calls with variable size or with constant size + // larger than the MaxAggrCopySize threshold. + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) { + if (LenCI->getZExtValue() >= MaxAggrCopySize) { + MemCalls.push_back(IntrCall); } } else { - // turn variable length memset into loop - aggrMemsets.push_back(memsetintr); + MemCalls.push_back(IntrCall); } } } } - if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) && - (aggrMemsets.size() == 0)) + + if (AggrLoads.size() == 0 && MemCalls.size() == 0) { return false; + } // // Do the transformation of an aggr load/copy/set to a loop // - for (LoadInst *load : aggrLoads) { - StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); - Value *srcAddr = load->getOperand(0); - Value *dstAddr = store->getOperand(1); - unsigned numLoads = DL.getTypeStoreSize(load->getType()); - Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads); - - convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(), - store->isVolatile(), Context, F); - - store->eraseFromParent(); - load->eraseFromParent(); + for (LoadInst *LI : AggrLoads) { + StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + Value *SrcAddr = LI->getOperand(0); + Value *DstAddr = SI->getOperand(1); + unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); + Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads); + + convertMemCpyToLoop(/* ConvertedInst */ SI, + /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, + /* CopyLen */ CopyLen, + /* SrcIsVolatile */ LI->isVolatile(), + /* DstIsVolatile */ SI->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + SI->eraseFromParent(); + LI->eraseFromParent(); } - for (MemTransferInst *cpy : aggrMemcpys) { - convertTransferToLoop(/* splitAt */ cpy, - /* srcAddr */ cpy->getSource(), - /* dstAddr */ cpy->getDest(), - /* len */ cpy->getLength(), - /* srcVolatile */ cpy->isVolatile(), - /* dstVolatile */ cpy->isVolatile(), + // Transform mem* intrinsic calls. 
+ for (MemIntrinsic *MemCall : MemCalls) { + if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) { + convertMemCpyToLoop(/* ConvertedInst */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), /* Context */ Context, /* Function F */ F); - cpy->eraseFromParent(); - } - - for (MemSetInst *memsetinst : aggrMemsets) { - Value *len = memsetinst->getLength(); - Value *val = memsetinst->getValue(); - convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context, - F); - memsetinst->eraseFromParent(); + } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) { + convertMemMoveToLoop(/* ConvertedInst */ Memmove, + /* SrcAddr */ Memmove->getRawSource(), + /* DstAddr */ Memmove->getRawDest(), + /* CopyLen */ Memmove->getLength(), + /* SrcIsVolatile */ Memmove->isVolatile(), + /* DstIsVolatile */ Memmove->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) { + convertMemSetToLoop(/* ConvertedInst */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* CopyLen */ Memset->getLength(), + /* SetValue */ Memset->getValue(), + /* Context */ Context, + /* Function F */ F); + } + MemCall->eraseFromParent(); } return true; } +} // namespace + +namespace llvm { +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); +} + +INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) + FunctionPass *llvm::createLowerAggrCopies() { return new NVPTXLowerAggrCopies(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 93d0025..624052e 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -81,7 +81,7 @@ bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { // Check Load, Store, GEP, and BitCast Uses on alloca and make them // use the converted generic address, in order to expose non-generic // addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types - // of instructions this is unecessary and may introduce redudant + // of instructions this is unnecessary and may introduce redundant // address cast. const auto &AllocaUse = *UI++; auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp index b533f31..6656077 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -47,6 +47,36 @@ // ... // } // +// 3. Convert pointers in a byval kernel parameter to pointers in the global +// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g., +// +// struct S { +// int *x; +// int *y; +// }; +// __global__ void foo(S s) { +// int *b = s.y; +// // use b +// } +// +// "b" points to the global address space. 
In the IR level, +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// ; use %b +// } +// +// becomes +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// %b_global = addrspacecast i32* %b to i32 addrspace(1)* +// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32* +// ; use %b_generic +// } +// // TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes // don't cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// @@ -54,6 +84,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "NVPTXTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -71,9 +102,12 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; // handle byval parameters - void handleByValParam(Argument *); - // handle non-byval pointer parameters - void handlePointerParam(Argument *); + void handleByValParam(Argument *Arg); + // Knowing Ptr must point to the global address space, this function + // addrspacecasts Ptr to global and then back to generic. This allows + // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into + // loads/stores that appear later. + void markPointerAsGlobal(Value *Ptr); public: static char ID; // Pass identification, replacement for typeid @@ -104,7 +138,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args", // // The above code allocates some space in the stack and copies the incoming // struct from param space to local space. -// Then replace all occurences of %d by %temp. +// Then replace all occurrences of %d by %temp. // ============================================================================= void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); @@ -128,27 +162,33 @@ void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { new StoreInst(LI, AllocA, FirstInst); } -void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) { - assert(!Arg->hasByValAttr() && - "byval params should be handled by handleByValParam"); - - // Do nothing if the argument already points to the global address space. - if (Arg->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) +void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) { + if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) return; - Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin(); - Instruction *ArgInGlobal = new AddrSpaceCastInst( - Arg, PointerType::get(Arg->getType()->getPointerElementType(), + // Deciding where to emit the addrspacecast pair. + BasicBlock::iterator InsertPt; + if (Argument *Arg = dyn_cast<Argument>(Ptr)) { + // Insert at the functon entry if Ptr is an argument. + InsertPt = Arg->getParent()->getEntryBlock().begin(); + } else { + // Insert right after Ptr if Ptr is an instruction. 
+ InsertPt = ++cast<Instruction>(Ptr)->getIterator(); + assert(InsertPt != InsertPt->getParent()->end() && + "We don't call this function with Ptr being a terminator."); + } + + Instruction *PtrInGlobal = new AddrSpaceCastInst( + Ptr, PointerType::get(Ptr->getType()->getPointerElementType(), ADDRESS_SPACE_GLOBAL), - Arg->getName(), FirstInst); - Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(), - Arg->getName(), FirstInst); - // Replace with ArgInGeneric all uses of Args except ArgInGlobal. - Arg->replaceAllUsesWith(ArgInGeneric); - ArgInGlobal->setOperand(0, Arg); + Ptr->getName(), &*InsertPt); + Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(), + Ptr->getName(), &*InsertPt); + // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal. + Ptr->replaceAllUsesWith(PtrInGeneric); + PtrInGlobal->setOperand(0, Ptr); } - // ============================================================================= // Main function for this pass. // ============================================================================= @@ -157,12 +197,32 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { if (!isKernelFunction(F)) return false; + if (TM && TM->getDrvInterface() == NVPTX::CUDA) { + // Mark pointers in byval structs as global. + for (auto &B : F) { + for (auto &I : B) { + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (LI->getType()->isPointerTy()) { + Value *UO = GetUnderlyingObject(LI->getPointerOperand(), + F.getParent()->getDataLayout()); + if (Argument *Arg = dyn_cast<Argument>(UO)) { + if (Arg->hasByValAttr()) { + // LI is a load from a pointer within a byval kernel parameter. + markPointerAsGlobal(LI); + } + } + } + } + } + } + } + for (Argument &Arg : F.args()) { if (Arg.getType()->isPointerTy()) { if (Arg.hasByValAttr()) handleByValParam(&Arg); else if (TM && TM->getDrvInterface() == NVPTX::CUDA) - handlePointerParam(&Arg); + markPointerAsGlobal(&Arg); } } return true; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h index 46b4b33..81a606d 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h @@ -68,7 +68,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} @@ -110,7 +110,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. 
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 5fd69a6..17019d7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -72,7 +72,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) + if (I->isReturnBlock()) TFI.emitEpilogue(MF, *I); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h index 0d2627d..45a7309 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h @@ -19,15 +19,14 @@ #include <vector> namespace llvm { -/// NVPTXSection - Represents a section in PTX -/// PTX does not have sections. We create this class in order to use -/// the ASMPrint interface. +/// Represents a section in PTX PTX does not have sections. We create this class +/// in order to use the ASMPrint interface. /// -class NVPTXSection : public MCSection { +class NVPTXSection final : public MCSection { virtual void anchor(); public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - virtual ~NVPTXSection() {} + ~NVPTXSection() {} /// Override this as NVPTX has its own way of printing switching /// to a section. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 248f9e1..aa931b1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -53,6 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerKernelArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); } @@ -64,14 +65,15 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. 
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); - initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); - initializeNVPTXFavorNonGenericAddrSpacesPass( - *PassRegistry::getPassRegistry()); - initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry()); - initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry()); + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeNVVMReflectPass(PR); + initializeGenericToNVVMPass(PR); + initializeNVPTXAllocaHoistingPass(PR); + initializeNVPTXAssignValidGlobalNamesPass(PR); + initializeNVPTXFavorNonGenericAddrSpacesPass(PR); + initializeNVPTXLowerKernelArgsPass(PR); + initializeNVPTXLowerAllocaPass(PR); + initializeNVPTXLowerAggrCopiesPass(PR); } static std::string computeDataLayout(bool is64Bit) { @@ -139,6 +141,10 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + +private: + // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE. + void addEarlyCSEOrGVNPass(); }; } // end anonymous namespace @@ -148,11 +154,18 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(NVPTXTTIImpl(this, F)); }); } +void NVPTXPassConfig::addEarlyCSEOrGVNPass() { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + addPass(createEarlyCSEPass()); +} + void NVPTXPassConfig::addIRPasses() { // The following passes are known to not play well with virtual regs hanging // around after register allocation (which in our case, is *all* registers). @@ -161,13 +174,14 @@ void NVPTXPassConfig::addIRPasses() { // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); disablePass(&MachineCopyPropagationID); - disablePass(&BranchFolderPassID); disablePass(&TailDuplicateID); + addPass(createNVVMReflectPass()); addPass(createNVPTXImageOptimizerPass()); - TargetPassConfig::addIRPasses(); addPass(createNVPTXAssignValidGlobalNamesPass()); addPass(createGenericToNVVMPass()); + + // === Propagate special address spaces === addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); // NVPTXLowerKernelArgs emits alloca for byval parameters which can often // be eliminated by SROA. @@ -178,22 +192,38 @@ void NVPTXPassConfig::addIRPasses() { // them unused. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. addPass(createDeadCodeEliminationPass()); + + // === Straight-line scalar optimizations === addPass(createSeparateConstOffsetFromGEPPass()); + addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See // the example in reassociate-geps-and-slsr.ll. addPass(createStraightLineStrengthReducePass()); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE // for some of our benchmarks. 
- if (getOptLevel() == CodeGenOpt::Aggressive) - addPass(createGVNPass()); - else - addPass(createEarlyCSEPass()); + addEarlyCSEOrGVNPass(); // Run NaryReassociate after EarlyCSE/GVN to be more effective. addPass(createNaryReassociatePass()); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. addPass(createEarlyCSEPass()); + + // === LSR and other generic IR passes === + TargetPassConfig::addIRPasses(); + // EarlyCSE is not always strong enough to clean up what LSR produces. For + // example, GVN can combine + // + // %0 = add %a, %b + // %1 = add %b, %a + // + // and + // + // %0 = shl nsw %a, 2 + // %1 = shl %a, 2 + // + // but EarlyCSE can do neither of them. + addEarlyCSEOrGVNPass(); } bool NVPTXPassConfig::addInstSelector() { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 5ecdc87..683b9a3 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -41,6 +41,7 @@ public: DwarfLocSection = nullptr; DwarfARangesSection = nullptr; DwarfRangesSection = nullptr; + DwarfMacinfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); @@ -48,8 +49,7 @@ public: void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText()); - DataSection = - new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel()); + DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData()); BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); @@ -82,9 +82,11 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); + DwarfMacinfoSection = + new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override { return ReadOnlySection; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index e7250cd..6e679dd 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -89,12 +89,12 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { return false; } -unsigned NVPTXTTIImpl::getArithmeticInstrCost( +int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. 
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 5bcd1e2..0946a32 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -52,7 +52,7 @@ public: bool isSourceOfDivergence(const Value *V); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 1f178af..578b466 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -335,106 +335,7 @@ bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) { return false; } -bool llvm::isBarrierIntrinsic(Intrinsic::ID id) { - if ((id == Intrinsic::nvvm_barrier0) || - (id == Intrinsic::nvvm_barrier0_popc) || - (id == Intrinsic::nvvm_barrier0_and) || - (id == Intrinsic::nvvm_barrier0_or) || - (id == Intrinsic::cuda_syncthreads)) - return true; - return false; -} - -// Interface for checking all memory space transfer related intrinsics -bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) { - if (id == Intrinsic::nvvm_ptr_local_to_gen || - id == Intrinsic::nvvm_ptr_shared_to_gen || - id == Intrinsic::nvvm_ptr_global_to_gen || - id == Intrinsic::nvvm_ptr_constant_to_gen || - id == Intrinsic::nvvm_ptr_gen_to_global || - id == Intrinsic::nvvm_ptr_gen_to_shared || - id == Intrinsic::nvvm_ptr_gen_to_local || - id == Intrinsic::nvvm_ptr_gen_to_constant || - id == Intrinsic::nvvm_ptr_gen_to_param) { - return true; - } - - return false; -} - -// consider several special intrinsics in striping pointer casts, and -// provide an option to ignore GEP indicies for find out the base address only -// which could be used in simple alias disambigurate. -const Value * -llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) { - V = V->stripPointerCasts(); - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (ignore_GEP_indices) - if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } - break; - } - return V; -} - -// consider several special intrinsics in striping pointer casts, and -// - ignore GEP indicies for find out the base address only, and -// - tracking PHINode -// which could be used in simple alias disambigurate. 
-const Value * -llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) { - if (processed.find(V) != processed.end()) - return nullptr; - processed.insert(V); - - const Value *V2 = V->stripPointerCasts(); - if (V2 != V && processed.find(V2) != processed.end()) - return nullptr; - processed.insert(V2); - - V = V2; - - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } else if (const PHINode *PN = dyn_cast<PHINode>(V)) { - if (V != V2 && processed.find(V) != processed.end()) - return nullptr; - processed.insert(PN); - const Value *common = nullptr; - for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - const Value *pv = PN->getIncomingValue(i); - const Value *base = skipPointerTransfer(pv, processed); - if (base) { - if (!common) - common = base; - else if (common != base) - return PN; - } - } - if (!common) - return PN; - V = common; - } - break; - } - return V; -} - -// The following are some useful utilities for debuggung +// The following are some useful utilities for debugging BasicBlock *llvm::getParentBlock(Value *v) { if (BasicBlock *B = dyn_cast<BasicBlock>(v)) @@ -466,7 +367,7 @@ void llvm::dumpBlock(Value *v, char *blockName) { return; for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) { - BasicBlock *B = it; + BasicBlock *B = &*it; if (strcmp(B->getName().data(), blockName) == 0) { B->dump(); return; @@ -490,7 +391,7 @@ Instruction *llvm::getInst(Value *base, char *instName) { return nullptr; } -// Dump an instruction by nane +// Dump an instruction by name void llvm::dumpInst(Value *base, char *instName) { Instruction *I = getInst(base, instName); if (I) diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 7e2ce73..a5262cb 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -61,27 +61,6 @@ bool isKernelFunction(const llvm::Function &); bool getAlign(const llvm::Function &, unsigned index, unsigned &); bool getAlign(const llvm::CallInst &, unsigned index, unsigned &); -bool isBarrierIntrinsic(llvm::Intrinsic::ID); - -/// make_vector - Helper function which is useful for building temporary vectors -/// to pass into type construction of CallInst ctors. This turns a null -/// terminated list of pointers (or other value types) into a real live vector. -/// -template <typename T> inline std::vector<T> make_vector(T A, ...) 
{ - va_list Args; - va_start(Args, A); - std::vector<T> Result; - Result.push_back(A); - while (T Val = va_arg(Args, T)) - Result.push_back(Val); - va_end(Args); - return Result; -} - -bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id); -const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices); -const Value * -skipPointerTransfer(const Value *V, std::set<const Value *> &processed); BasicBlock *getParentBlock(Value *v); Function *getParentFunction(Value *v); void dumpBlock(Value *v, char *blockName); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td index a237247..e69bbba 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td @@ -26,7 +26,7 @@ let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in { def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V2I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v2i16 V2I16Regs:$src), imm:$c))], IMOV16rr>; @@ -34,7 +34,7 @@ def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V4I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v4i16 V4I16Regs:$src), imm:$c))], IMOV16rr>; @@ -42,7 +42,7 @@ def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V2I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v2i8 V2I8Regs:$src), imm:$c))], IMOV8rr>; @@ -50,7 +50,7 @@ def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V4I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v4i8 V4I8Regs:$src), imm:$c))], IMOV8rr>; @@ -58,7 +58,7 @@ def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V2I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + [(set Int32Regs:$dst, (extractelt (v2i32 V2I32Regs:$src), imm:$c))], IMOV32rr>; @@ -66,7 +66,7 @@ def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V2F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v2f32 V2F32Regs:$src), imm:$c))], FMOV32rr>; @@ -74,7 +74,7 @@ def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), (ins V2I64Regs:$src, i8imm:$c), "mov.u64 \t$dst, $src${c:vecelem};", - [(set Int64Regs:$dst, (vector_extract + [(set Int64Regs:$dst, (extractelt (v2i64 V2I64Regs:$src), imm:$c))], IMOV64rr>; @@ -82,7 +82,7 @@ def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), (ins V2F64Regs:$src, i8imm:$c), "mov.f64 \t$dst, $src${c:vecelem};", - [(set Float64Regs:$dst, (vector_extract + [(set Float64Regs:$dst, (extractelt (v2f64 V2F64Regs:$src), imm:$c))], FMOV64rr>; @@ -90,7 +90,7 @@ def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V4I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + 
[(set Int32Regs:$dst, (extractelt (v4i32 V4I32Regs:$src), imm:$c))], IMOV32rr>; @@ -98,7 +98,7 @@ def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V4F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v4f32 V4F32Regs:$src), imm:$c))], FMOV32rr>; } @@ -110,8 +110,7 @@ def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I8Regs:$dst, - (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v4i8 def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), @@ -119,8 +118,7 @@ def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I8Regs:$dst, - (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v2i16 def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), @@ -128,8 +126,8 @@ def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I16Regs:$dst, - (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v4i16 def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), @@ -137,8 +135,8 @@ def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I16Regs:$dst, - (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v2i32 def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), @@ -146,8 +144,8 @@ def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V2I32Regs:$dst, - (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v2f32 def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), @@ -155,8 +153,8 @@ def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V2F32Regs:$dst, - (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; // Insert v2i64 def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), @@ -164,8 +162,8 @@ def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u64 \t$dst${c:vecelem}, $val;", [(set V2I64Regs:$dst, - (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))], - IMOV64rr>; + (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))], + IMOV64rr>; // Insert v2f64 def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), @@ -173,8 +171,8 @@ def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f64 \t$dst${c:vecelem}, $val;", [(set V2F64Regs:$dst, - (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))], - FMOV64rr>; + (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))], + FMOV64rr>; // Insert v4i32 def V4i32Insert : NVPTXVecInst<(outs 
V4I32Regs:$dst), @@ -182,8 +180,8 @@ def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V4I32Regs:$dst, - (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v4f32 def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), @@ -191,8 +189,8 @@ def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V4F32Regs:$dst, - (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; } class BinOpAsmString<string c> { diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5e375b7..20ab5db 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -109,10 +109,10 @@ void NVVMReflect::setVarMap() { for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) { DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n"); SmallVector<StringRef, 4> NameValList; - StringRef(ReflectList[i]).split(NameValList, ","); + StringRef(ReflectList[i]).split(NameValList, ','); for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) { SmallVector<StringRef, 2> NameValPair; - NameValList[j].split(NameValPair, "="); + NameValList[j].split(NameValPair, '='); assert(NameValPair.size() == 2 && "name=val expected"); std::stringstream ValStream(NameValPair[1]); int Val; diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index a699a55..220c70a 100644 --- a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -243,7 +243,6 @@ namespace { struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; bool IsPPC64; bool IsDarwin; @@ -291,9 +290,9 @@ class PPCAsmParser : public MCTargetAsmParser { public: - PPCAsmParser(MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII) { + PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI), MII(MII) { // Check for 64-bit vs. 32-bit pointer mode. Triple TheTriple(STI.getTargetTriple()); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -1185,7 +1184,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::MFTB: { - if (STI.getFeatureBits()[PPC::FeatureMFTB]) { + if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) { assert(Inst.getNumOperands() == 2 && "Expecting two operands"); Inst.setOpcode(PPC::MFSPR); } @@ -1205,7 +1204,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Post-process instructions (typically extended mnemonics) ProcessInstruction(Inst, Operands); Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); @@ -1690,7 +1689,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // where th can be omitted when it is 0. dcbtst is the same. 
We take the // server form to be the default, so swap the operands if we're parsing for // an embedded core (they'll be swapped again upon printing). - if (STI.getFeatureBits()[PPC::FeatureBookE] && + if (getSTI().getFeatureBits()[PPC::FeatureBookE] && Operands.size() == 4 && (Name == "dcbt" || Name == "dcbtst")) { std::swap(Operands[1], Operands[3]); @@ -1730,10 +1729,19 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 93a503c..1fc84fb 100644 --- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -401,8 +401,6 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; - - MI.clear(); } return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI); diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 8e18783..53eb727 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -18,8 +18,6 @@ namespace llvm { -class MCOperand; - class PPCInstPrinter : public MCInstPrinter { bool IsDarwin; public: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 992be5b..dd99495 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -113,6 +113,10 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; } break; + case PPC::fixup_ppc_half16ds: + Target.print(errs()); + errs() << '\n'; + report_fatal_error("Invalid PC-relative half16ds relocation"); case FK_Data_4: case FK_PCRel_4: Type = ELF::R_PPC_REL32; @@ -305,13 +309,13 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_GOT: Type = ELF::R_PPC64_GOT16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_GOT_LO: Type = ELF::R_PPC64_GOT16_LO_DS; break; case MCSymbolRefExpr::VK_PPC_TOC: Type = ELF::R_PPC64_TOC16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_TOC_LO: Type = ELF::R_PPC64_TOC16_LO_DS; break; @@ -372,16 +376,16 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_None: Type = ELF::R_PPC64_ADDR64; - break; + break; case MCSymbolRefExpr::VK_PPC_DTPMOD: Type = ELF::R_PPC64_DTPMOD64; - break; + break; case MCSymbolRefExpr::VK_PPC_TPREL: Type = ELF::R_PPC64_TPREL64; - break; + break; case 
MCSymbolRefExpr::VK_PPC_DTPREL: Type = ELF::R_PPC64_DTPREL64; - break; + break; } break; case FK_Data_4: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 86ad385..e252ac9 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -20,18 +20,19 @@ namespace llvm { class Triple; - class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&); - }; - - class PPCELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit PPCELFMCAsmInfo(bool is64Bit, const Triple&); - }; +class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); +}; + +class PPCELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index a641780..d42a111 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -82,8 +82,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS PPCMCExprs at the moment. diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 9d72896..b54a0e1 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -241,12 +241,12 @@ bool PPCMachObjectWriter::recordScatteredRelocation( if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()? diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index 6075631..acea600 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -56,6 +56,14 @@ namespace PPC { PRED_BIT_UNSET = 1025 }; + // Bit for branch taken (plus) or not-taken (minus) hint + enum BranchHintBit { + BR_NO_HINT = 0x0, + BR_NONTAKEN_HINT = 0x2, + BR_TAKEN_HINT = 0x3, + BR_HINT_MASK = 0X3 + }; + /// Invert the specified predicate. != -> ==, < -> >=. 
Predicate InvertPredicate(Predicate Opcode); diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index ae8d8b4..a259ed3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -41,13 +41,16 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); + FunctionPass *createPPCBoolRetToIntPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCBoolRetToIntPass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td index 641b237..b03be12 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -50,6 +50,8 @@ def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; +def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software emulation for floating point">; def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", "Enable 64-bit registers usage for ppc32 [beta]">; def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true", @@ -137,6 +139,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", "Implement mftb using the mfspr instruction">; +def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true", + "Target supports add/load integer fusion.">; +def FeatureFloat128 : + SubtargetFeature<"float128", "HasFloat128", "true", + "Enable the __float128 data type for IEEE-754R Binary128.", + [FeatureVSX]>; def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", "Treat vector data stream cache control instructions as deprecated">; @@ -168,7 +176,8 @@ def ProcessorFeatures { FeatureMFTB, DeprecatedDST]; list<SubtargetFeature> Power8SpecificFeatures = [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; + FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic, + FeatureFusion]; list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); } @@ -309,7 +318,7 @@ def : ProcessorModel<"g5", G5Model, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"e500mc", PPCE500mcModel, - [DirectiveE500mc, FeatureMFOCRF, + [DirectiveE500mc, FeatureSTFIWX, FeatureICBT, FeatureBookE, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e5500", PPCE5500Model, @@ -403,6 +412,7 @@ def PPCAsmParserVariant : AsmParserVariant { // InstAlias definitions use immediate literals. Set RegisterPrefix // so that those are not misinterpreted as registers. 
string RegisterPrefix = "%"; + string BreakCharacters = "."; } def PPC : Target { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8e118ec..ec354c2 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -65,19 +65,20 @@ using namespace llvm; #define DEBUG_TYPE "asmprinter" namespace { - class PPCAsmPrinter : public AsmPrinter { - protected: - MapVector<MCSymbol*, MCSymbol*> TOC; - const PPCSubtarget *Subtarget; - StackMaps SM; - public: - explicit PPCAsmPrinter(TargetMachine &TM, - std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} - - const char *getPassName() const override { - return "PowerPC Assembly Printer"; - } +class PPCAsmPrinter : public AsmPrinter { +protected: + MapVector<MCSymbol *, MCSymbol *> TOC; + const PPCSubtarget *Subtarget; + StackMaps SM; + +public: + explicit PPCAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} + + const char *getPassName() const override { + return "PowerPC Assembly Printer"; + } MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); @@ -94,10 +95,8 @@ namespace { void EmitEndOfAsmFile(Module &M) override; - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); + void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); + void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget<PPCSubtarget>(); @@ -157,15 +156,15 @@ static const char *stripRegisterPrefix(const char *RegName) { return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } - + return RegName; } void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(OpNo); - + switch (MO.getType()) { case MachineOperand::MO_Register: { const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); @@ -184,8 +183,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); return; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); @@ -200,19 +199,19 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, !GV->isStrongDefinitionForLinker()) { if (!GV->hasHiddenVisibility()) { SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>() - .getGVStubEntry(SymToPrint); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (GV->isDeclaration() || GV->hasCommonLinkage() || GV->hasAvailableExternallyLinkage()) { SymToPrint = getSymbolWithGlobalValueBase(GV, 
"$non_lazy_ptr"); - - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>(). - getHiddenGVStubEntry(SymToPrint); + + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); @@ -295,16 +294,16 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, } case 'U': // Print 'u' for update form. case 'X': // Print 'x' for indexed form. - { - // FIXME: Currently for PowerPC memory operands are always loaded - // into a register, so we never get an update or indexed form. - // This is bad even for offset forms, since even if we know we - // have a value in -16(r1), we will generate a load into r<n> - // and then load from 0(r<n>). Until that issue is fixed, - // tolerate 'U' and 'X' but don't output anything. - assert(MI->getOperand(OpNo).isReg()); - return false; - } + { + // FIXME: Currently for PowerPC memory operands are always loaded + // into a register, so we never get an update or indexed form. + // This is bad even for offset forms, since even if we know we + // have a value in -16(r1), we will generate a load into r<n> + // and then load from 0(r<n>). Until that issue is fixed, + // tolerate 'U' and 'X' but don't output anything. + assert(MI->getOperand(OpNo).isReg()); + return false; + } } } @@ -315,7 +314,6 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, return false; } - /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. @@ -330,8 +328,7 @@ void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) { SM.serializeToStackMapSection(); } -void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); @@ -353,13 +350,12 @@ void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, // Emit nops. 
for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs> -void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); @@ -375,60 +371,59 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 0; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(32).addImm(16)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF)); // Save the current TOC pointer before the remote call. int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); ++EncodedBytes; - // If we're on ELFv1, then we need to load the actual function pointer // from the function descriptor. if (!Subtarget->isELFv2ABI()) { - // Load the new TOC pointer and the function address, but not r11 - // (needing this is rare, and loading it here would prevent passing it - // via a 'nest' parameter. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter. + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(ScratchReg) .addImm(0) .addReg(ScratchReg)); ++EncodedBytes; } - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8)); ++EncodedBytes; // Restore the TOC pointer after the call. 
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); @@ -439,7 +434,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP) .addExpr(SymVar)); EncodedBytes += 2; } @@ -454,7 +449,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, assert((NumBytes - EncodedBytes) % 4 == 0 && "Invalid number of NOP bytes requested!"); for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } /// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a @@ -499,16 +494,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { bool isDarwin = TM.getTargetTriple().isOSDarwin(); const Module *M = MF->getFunction()->getParent(); PICLevel::Level PL = M->getPICLevel(); - + // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); case TargetOpcode::STACKMAP: - return LowerSTACKMAP(*OutStreamer, SM, *MI); + return LowerSTACKMAP(SM, *MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(*OutStreamer, SM, *MI); + return LowerPATCHPOINT(SM, *MI); case PPC::MoveGOTtoLR: { // Transform %LR = MoveGOTtoLR @@ -533,17 +528,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MovePCtoLR: case PPC::MovePCtoLR8: { // Transform %LR = MovePCtoLR - // Into this, where the label is the PIC base: + // Into this, where the label is the PIC base: // bl L1$pb // L1$pb: MCSymbol *PICBase = MF->getPICBaseSymbol(); - + // Emit the 'bl'. - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL) - // FIXME: We would like an efficient form for this, so we don't have to do - // a lot of extra uniquing. - .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); - + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we + // don't have to do a lot of extra uniquing. + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + // Emit the label. 
OutStreamer->EmitLabel(PICBase); return; @@ -654,7 +650,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } - + case PPC::ADDIStocHA: { // Transform %Xd = ADDIStocHA %X2, <ga:@sym> LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); @@ -669,28 +665,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MO.isBlockAddress()) && "Invalid operand for ADDIStocHA!"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; - bool IsCommon = false; - bool IsAvailExt = false; + bool GlobalToc = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsCommon = GV->hasCommonLinkage(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - IsAvailExt = GV->hasAvailableExternallyLinkage(); - } else if (MO.isCPI()) + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) + } else if (MO.isJTI()) { MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) + } else if (MO.isBlockAddress()) { MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + } - if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || - MO.isJTI() || MO.isBlockAddress() || + if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -727,13 +717,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } else if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - MOSymbol = getSymbol(GValue); - if (GValue->getType()->getElementType()->isFunctionTy() || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage() || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert((GVFlags & PPCII::MO_NLP_FLAG) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. 
Must match ADDIStocHA.")); + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } const MCExpr *Exp = @@ -754,21 +745,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(2); assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert ( + !(GVFlags & PPCII::MO_NLP_FLAG) && + "Interposable definitions must use indirect access.")); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - } else if (MO.isCPI()) + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - - if (IsNonLocalFunction || IsExternal || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, @@ -840,13 +828,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::PPC32GOT: { - MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); - const MCExpr *SymGotTlsL = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, - OutContext); - const MCExpr *SymGotTlsHA = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, - OutContext); + MCSymbol *GOTSymbol = + OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *SymGotTlsL = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext); + const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI) .addReg(MI->getOperand(0).getReg()) .addExpr(SymGotTlsL)); @@ -1079,14 +1066,14 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { // linux/ppc32 - Normal entry label. - if (!Subtarget->isPPC64() && - (TM.getRelocationModel() != Reloc::PIC_ || + if (!Subtarget->isPPC64() && + (TM.getRelocationModel() != Reloc::PIC_ || MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small)) return AsmPrinter::EmitFunctionEntryLabel(); if (!Subtarget->isPPC64()) { const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - if (PPCFI->usesPICBase()) { + if (PPCFI->usesPICBase()) { MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); MCSymbol *PICBase = MF->getPICBaseSymbol(); OutStreamer->EmitLabel(RelocSymbol); @@ -1105,8 +1092,28 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { } // ELFv2 ABI - Normal entry label. - if (Subtarget->isELFv2ABI()) + if (Subtarget->isELFv2ABI()) { + // In the Large code model, we allow arbitrary displacements between + // the text section and its associated TOC section. We place the + // full 8-byte offset to the TOC in memory immediately preceding + // the function global entry point. 
+ if (TM.getCodeModel() == CodeModel::Large + && !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol(); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + MCSymbolRefExpr::create(GlobalEPSymbol, + OutContext), + OutContext); + + OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol()); + OutStreamer->EmitValue(TOCDeltaExpr, 8); + } return AsmPrinter::EmitFunctionEntryLabel(); + } // Emit an official procedure descriptor. MCSectionSubPair Current = OutStreamer->getCurrentSection(); @@ -1130,11 +1137,10 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->SwitchSection(Current.first, Current.second); } - bool PPCLinuxAsmPrinter::doFinalization(Module &M) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); - bool isPPC64 = TD->getPointerSizeInBits() == 64; + bool isPPC64 = DL.getPointerSizeInBits() == 64; PPCTargetStreamer &TS = static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer()); @@ -1174,10 +1180,25 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { // thus emit a prefix sequence along the following lines: // // func: + // .Lfunc_gepNN: + // # global entry point + // addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha + // addi r2,r2,(.TOC.-.Lfunc_gepNN)@l + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN + // # local entry point, followed by function body + // + // For the Large code model, we create + // + // .Lfunc_tocNN: + // .quad .TOC.-.Lfunc_gepNN # done by EmitFunctionEntryLabel + // func: + // .Lfunc_gepNN: // # global entry point - // addis r2,r12,(.TOC.-func)@ha - // addi r2,r2,(.TOC.-func)@l - // .localentry func, .-func + // ld r2,.Lfunc_tocNN-.Lfunc_gepNN(r12) + // add r2,r2,r12 + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN // # local entry point, followed by function body // // This ensures we have r2 set up correctly while executing the function @@ -1185,32 +1206,49 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { if (Subtarget->isELFv2ABI() // Only do all that if the function uses r2 in the first place. 
&& !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol(); + MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol(); OutStreamer->EmitLabel(GlobalEntryLabel); const MCSymbolRefExpr *GlobalEntryLabelExp = MCSymbolRefExpr::create(GlobalEntryLabel, OutContext); - MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); - const MCExpr *TOCDeltaExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), - GlobalEntryLabelExp, OutContext); + if (TM.getCodeModel() != CodeModel::Large) { + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); - const MCExpr *TOCDeltaHi = - PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) - .addReg(PPC::X2) - .addReg(PPC::X12) - .addExpr(TOCDeltaHi)); - - const MCExpr *TOCDeltaLo = - PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) - .addReg(PPC::X2) - .addReg(PPC::X2) - .addExpr(TOCDeltaLo)); - - MCSymbol *LocalEntryLabel = OutContext.createTempSymbol(); + const MCExpr *TOCDeltaHi = + PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + } else { + MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol(); + const MCExpr *TOCOffsetDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext), + GlobalEntryLabelExp, OutContext); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addExpr(TOCOffsetDeltaExpr) + .addReg(PPC::X12)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addReg(PPC::X12)); + } + + MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol(); OutStreamer->EmitLabel(LocalEntryLabel); const MCSymbolRefExpr *LocalEntryLabelExp = MCSymbolRefExpr::create(LocalEntryLabel, OutContext); @@ -1293,8 +1331,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { // Prime text sections so they are adjacent. This reduces the likelihood a // large data or debug section causes a branch to exceed 16M limit. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection()); if (TM.getRelocationModel() == Reloc::PIC_) { OutStreamer->SwitchSection( @@ -1325,7 +1363,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) { void PPCDarwinAsmPrinter:: EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Construct a local MCSubtargetInfo and shadow EmitToStreamer here. 
// This is because the MachineFunction won't exist (but have not yet been @@ -1338,8 +1376,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { S.EmitInstruction(Inst, *STI); }; - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); // .lazy_symbol_pointer MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection(); @@ -1353,12 +1391,12 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer->SwitchSection(StubSection); EmitAlignment(4); - + MCSymbol *Stub = Stubs[i].first; MCSymbol *RawSym = Stubs[i].second.getPointer(); MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext); - + OutStreamer->EmitLabel(Stub); OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); @@ -1463,20 +1501,19 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4); } } - + OutStreamer->AddBlankLine(); } - bool PPCDarwinAsmPrinter::doFinalization(Module &M) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Darwin/PPC always uses mach-o. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList(); if (!Stubs.empty()) EmitFunctionStubs(Stubs); @@ -1484,27 +1521,27 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (MAI->doesSupportExceptionHandling() && MMI) { // Add the (possibly multiple) personalities to the set of global values. // Only referenced functions get into the Personalities list. - const std::vector<const Function*> &Personalities = MMI->getPersonalities(); - for (std::vector<const Function*>::const_iterator I = Personalities.begin(), - E = Personalities.end(); I != E; ++I) { - if (*I) { - MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr"); + for (const Function *Personality : MMI->getPersonalities()) { + if (Personality) { + MCSymbol *NLPSym = + getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = - MMIMacho.getGVStubEntry(NLPSym); - StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true); + MMIMacho.getGVStubEntry(NLPSym); + StubSym = + MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true); } } } // Output stubs for dynamically-linked functions. Stubs = MMIMacho.GetGVStubList(); - + // Output macho stubs for external and common global variables. if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); EmitAlignment(isPPC64 ? 
3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1535,7 +1572,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (!Stubs.empty()) { OutStreamer->SwitchSection(getObjFileLowering().getDataSection()); EmitAlignment(isPPC64 ? 3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1573,7 +1610,7 @@ createPPCAsmPrinterPass(TargetMachine &tm, } // Force static initialization. -extern "C" void LLVMInitializePowerPCAsmPrinter() { +extern "C" void LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp new file mode 100644 index 0000000..7920240 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -0,0 +1,253 @@ +//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements converting i1 values to i32 if they could be more +// profitably allocated as GPRs rather than CRs. This pass will become totally +// unnecessary if Register Bank Allocation and Global Instruction Selection ever +// go upstream. +// +// Presently, the pass converts i1 Constants and Arguments to i32 if the +// transitive closure of their uses includes only PHINodes, CallInsts, and +// ReturnInsts. The rationale is that arguments are generally passed and returned +// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will +// actually save casts at the Machine Instruction level. +// +// It might be useful to expand this pass to add bit-wise operations to the list +// of safe transitive closure types. Also, we miss some opportunities when LLVM +// represents logical AND and OR operations with control flow rather than data +// flow. For example, by lowering the expression: return (A && B && C) +// +// as: return A ? B && C : false. +// +// There's code in SimplifyCFG that could be used to turn control flow into data +// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so +// this probably isn't good in general, but for the special case of i1, the +// Selects could be further lowered to bit operations that are fast everywhere. 
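The promotion is exposed through the createPPCBoolRetToIntPass() factory that this new file defines (declared in PPC.h). As a rough sketch of how the pass gets run, it can be scheduled through the legacy pass manager; the standalone driver below is an assumption for illustration, not how the in-tree pipeline wires it up:

    #include "PPC.h"                        // declares createPPCBoolRetToIntPass()
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    // Hypothetical driver: run the i1-to-i32 promotion over every function
    // definition in a module, the way a target pipeline would schedule it.
    static void runBoolRetToInt(llvm::Module &M) {
      llvm::legacy::FunctionPassManager FPM(&M);
      FPM.add(llvm::createPPCBoolRetToIntPass()); // added by this patch
      FPM.doInitialization();
      for (llvm::Function &F : M)
        if (!F.isDeclaration())
          FPM.run(F);
      FPM.doFinalization();
    }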
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "bool-ret-to-int" + +STATISTIC(NumBoolRetPromotion, + "Number of times a bool feeding a RetInst was promoted to an int"); +STATISTIC(NumBoolCallPromotion, + "Number of times a bool feeding a CallInst was promoted to an int"); +STATISTIC(NumBoolToIntPromotion, + "Total number of times a bool was promoted to an int"); + +class PPCBoolRetToInt : public FunctionPass { + + static SmallPtrSet<Value *, 8> findAllDefs(Value *V) { + SmallPtrSet<Value *, 8> Defs; + SmallVector<Value *, 8> WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast<User>(Curr)) + for (auto &Op : CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate an i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast<PHINode>(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast<Argument>(V); + Instruction *I = dyn_cast<Instruction>(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet<const PHINode *, 8> PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInst, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. 
All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast<PHINode>(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector<const PHINode *, 8> ToRemove; + for (const auto &P : Promotable) { + // Conditions 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) || + isa<DbgInfoIntrinsic>(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) || + isa<PHINode>(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast<PHINode>(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Conditions 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap<Value *, Value *> B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast<ReturnInst>(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast<CallInst>(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa<Instruction, Value *>)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast<PHINode>(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa<ReturnInst>(U.getUser())) + ++NumBoolRetPromotion; + if (isa<CallInst>(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. They were set to + // zero in the translate function. 
+ for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast<User>(Pair.first); + User *Second = dyn_cast<User>(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast<Instruction>(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index 940d55a..73a5305 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -91,7 +91,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; // The end of the previous block may have extra nops if this block has an // alignment requirement. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index fd150be..b6ac4d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -98,7 +98,7 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } private: @@ -112,6 +112,7 @@ namespace { const DataLayout *DL; DominatorTree *DT; const TargetLibraryInfo *LibInfo; + bool PreserveLCSSA; }; char PPCCTRLoops::ID = 0; @@ -147,7 +148,7 @@ INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) @@ -169,11 +170,12 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { bool PPCCTRLoops::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -250,8 +252,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr // we're definitely using CTR. 
case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; + case Intrinsic::ppc_mtctr: + return true; // VisualStudio defines setjmp as _setjmp #if defined(_MSC_VER) && defined(setjmp) && \ @@ -369,7 +371,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { true); if (VTy == MVT::Other) return true; - + if (TLI->isOperationLegalOrCustom(Opcode, VTy)) continue; else if (VTy.isVector() && @@ -537,7 +539,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). if (!Preheader || mightUseCTR(TT, Preheader)) - Preheader = InsertPreheaderForLoop(L, this); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -554,10 +556,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(CountType, 1)); - Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType, - Preheader->getTerminator()); + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + Value *ECValue = + SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); IRBuilder<> CountBuilder(Preheader->getTerminator()); Module *M = Preheader->getParent()->getParent(); @@ -677,7 +678,7 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { // any other instructions that might clobber the ctr register. for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); I != IE; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; if (!MDT->isReachableFromEntry(MBB)) continue; @@ -694,4 +695,3 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { return false; } #endif // NDEBUG - diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index fc89753..7cb1bb5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -71,15 +71,20 @@ protected: for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(), PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) { bool OtherReference = false, BlockChanged = false; + + if ((*PI)->empty()) + continue; + for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) { - MachineInstrBuilder MIB; + if (J == (*PI)->end()) + break; + if (J->getOpcode() == PPC::B) { if (J->getOperand(0).getMBB() == &ReturnMBB) { // This is an unconditional branch to the return. Replace the // branch with a blr. - MIB = - BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -90,10 +95,10 @@ protected: if (J->getOperand(2).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. 
- MIB = BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) - .addImm(J->getOperand(0).getImm()) - .addReg(J->getOperand(1).getReg()); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) + .addImm(J->getOperand(0).getImm()) + .addReg(J->getOperand(1).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -104,11 +109,11 @@ protected: if (J->getOperand(1).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. - MIB = BuildMI(**PI, J, J->getDebugLoc(), - TII->get(J->getOpcode() == PPC::BC ? - PPC::BCLR : PPC::BCLRn)) - .addReg(J->getOperand(0).getReg()); - MIB.copyImplicitOps(I); + BuildMI( + **PI, J, J->getDebugLoc(), + TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn)) + .addReg(J->getOperand(0).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -146,7 +151,7 @@ protected: } for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) - PredToRemove[i]->removeSuccessor(&ReturnMBB); + PredToRemove[i]->removeSuccessor(&ReturnMBB, true); if (Changed && !ReturnMBB.hasAddressTaken()) { // We now might be able to merge this blr-only block into its @@ -156,7 +161,7 @@ protected: if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) { // Move the blr into the preceding block. PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I); - PrevMBB.removeSuccessor(&ReturnMBB); + PrevMBB.removeSuccessor(&ReturnMBB, true); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 5f236f7..b451ebf 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -164,7 +164,8 @@ class PPCFastISel final : public FastISel { unsigned DestReg, bool IsZExt); unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT); unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT); - unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true); + unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt = true); unsigned PPCMaterialize32BitInt(int64_t Imm, const TargetRegisterClass *RC); unsigned PPCMaterialize64BitInt(int64_t Imm, @@ -292,10 +293,7 @@ bool PPCFastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast<Instruction>(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } // Given a value Obj, create an Address object Addr that represents its @@ -527,9 +525,9 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // VSX only provides an indexed load. if (Is32VSXLoad || Is64VSXLoad) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -660,9 +658,9 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // VSX only provides an indexed store. 
if (Is32VSXStore || Is64VSXStore) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -774,8 +772,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC)) .addImm(PPCPred).addReg(CondReg).addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1607,21 +1604,18 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (ValLocs.size() > 1) return false; - // Special case for returning a constant integer of any size. - // Materialize the constant as an i64 and copy it to the return - // register. We still need to worry about properly extending the sign. E.g: - // If the constant has only one bit, it means it is a boolean. Therefore - // we can't use PPCMaterializeInt because it extends the sign which will - // cause negations of the returned value to be incorrect as they are - // implemented as the flip of the least significant bit. - if (isa<ConstantInt>(*RV)) { - const Constant *C = cast<Constant>(RV); - + // Special case for returning a constant integer of any size - materialize + // the constant as an i64 and copy it to the return register. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) { CCValAssign &VA = ValLocs[0]; unsigned RetReg = VA.getLocReg(); - unsigned SrcReg = PPCMaterializeInt(C, MVT::i64, - VA.getLocInfo() == CCValAssign::SExt); + // We still need to worry about properly extending the sign. For example, + // we could have only a single bit or a constant that needs zero + // extension rather than sign extension. Make sure we pass the return + // value extension property to integer materialization. + unsigned SrcReg = + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -1761,8 +1755,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1898,10 +1892,9 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); CodeModel::Model CModel = TM.getCodeModel(); - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - (VT == MVT::f32) ? 4 : 8, Align); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align); unsigned Opc = (VT == MVT::f32) ? 
PPC::LFS : PPC::LFD; unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); @@ -1976,19 +1969,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - // If/when switches are implemented, jump tables should be handled - // on the "if" path here. - if (CModel == CodeModel::Large || - (GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker()) || - GV->isDeclaration() || GV->hasCommonLinkage() || - GV->hasAvailableExternallyLinkage()) + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); - else + } else { // Otherwise generate the ADDItocL. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL), DestReg).addReg(HighPartReg).addGlobalAddress(GV); + } } return DestReg; @@ -2085,12 +2074,11 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, // Materialize an integer constant into a register, and return // the register number (or zero if we failed to handle it). -unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, - bool UseSExt) { +unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt) { // If we're using CR bit registers for i1 values, handle that as a special // case first. if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { - const ConstantInt *CI = cast<ConstantInt>(C); unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); @@ -2105,12 +2093,17 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, &PPC::GPRCRegClass); // If the constant is in range, use a load-immediate. - const ConstantInt *CI = cast<ConstantInt>(C); - if (isInt<16>(CI->getSExtValue())) { + if (UseSExt && isInt<16>(CI->getSExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getSExtValue()); + return ImmReg; + } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() ); + .addImm(CI->getZExtValue()); return ImmReg; } @@ -2138,8 +2131,8 @@ unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { return PPCMaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) return PPCMaterializeGV(GV, VT); - else if (isa<ConstantInt>(C)) - return PPCMaterializeInt(C, VT, VT != MVT::i1); + else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) + return PPCMaterializeInt(CI, VT, VT != MVT::i1); return 0; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 08ae717..beab844 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -30,7 +30,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. 
/// -static const uint16_t VRRegNo[] = { +static const MCPhysReg VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, @@ -270,7 +270,7 @@ static void RemoveVRSaveCode(MachineInstr *MI) { // epilog blocks. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) { + if (I->isReturnBlock()) { bool FoundIt = false; for (MBBI = I->end(); MBBI != I->begin(); ) { --MBBI; @@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + if (MRI.isPhysRegModified(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother @@ -325,7 +326,7 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); UsedRegMask != 0 && BI != BE; ++BI) { const MachineBasicBlock &MBB = *BI; - if (MBB.empty() || !MBB.back().isReturn()) + if (!MBB.isReturnBlock()) continue; const MachineInstr &Ret = MBB.back(); for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { @@ -555,9 +556,67 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { } } +bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const { + RegScavenger RS; + unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + + if (ScratchRegister) + *ScratchRegister = R0; + + // If MBB is an entry or exit block, use R0 as the scratch register + if ((UseAtEnd && MBB->isReturnBlock()) || + (!UseAtEnd && (&MBB->getParent()->front() == MBB))) + return true; + + RS.enterBasicBlock(MBB); + + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } + + if (!RS.isRegUsed(R0)) + return true; + + unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? 
&PPC::G8RCRegClass + : &PPC::GPRCRegClass); + + // Make sure the register scavenger was able to find an available register + // If not, use R0 but return false to indicate no register was available and + // R0 must be used (as recommended by the ABI) + if (Reg == 0) + return false; + + if (ScratchRegister) + *ScratchRegister = Reg; + + return true; +} + +bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, false, nullptr); +} + +bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, true, nullptr); +} + void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const PPCInstrInfo &TII = @@ -589,7 +648,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move MBBI back to the beginning of the function. + // Move MBBI back to the beginning of the prologue block. MBBI = MBB.begin(); // Work out frame sizes. @@ -613,7 +672,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8 @@ -642,6 +701,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); + findScratchRegister(&MBB, false, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + int LROffset = getReturnSaveOffset(); int FPOffset = 0; @@ -916,27 +978,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } void PPCFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI != MBB.end() && "Returning block has no terminator"); + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl; + + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + const PPCInstrInfo &TII = *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl; - - assert((RetOpcode == PPC::BLR || - RetOpcode == PPC::BLR8 || - RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8) && - "Can only insert epilog into returning blocks"); - // Get alignment info so we know how to restore the SP. const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -959,7 +1012,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? 
PPC::X31 : PPC::R31; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8 : PPC::MTLR ); @@ -973,10 +1026,14 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, : PPC::ADDI ); const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8 : PPC::ADD4 ); - + int LROffset = getReturnSaveOffset(); int FPOffset = 0; + + findScratchRegister(&MBB, true, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + if (HasFP) { if (isSVR4ABI) { MachineFrameInfo *FFI = MF.getFrameInfo(); @@ -1008,25 +1065,30 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, PBPOffset = FFI->getObjectOffset(PBPIndex); } - bool UsesTCRet = RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8; - - if (UsesTCRet) { - int MaxTCRetDelta = FI->getTailCallSPDelta(); - MachineOperand &StackAdjust = MBBI->getOperand(1); - assert(StackAdjust.isImm() && "Expecting immediate value."); - // Adjust stack pointer. - int StackAdj = StackAdjust.getImm(); - int Delta = StackAdj - MaxTCRetDelta; - assert((Delta >= 0) && "Delta must be positive"); - if (MaxTCRetDelta>0) - FrameSize += (StackAdj +Delta); - else - FrameSize += StackAdj; + bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn()); + + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + bool UsesTCRet = RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8; + + if (UsesTCRet) { + int MaxTCRetDelta = FI->getTailCallSPDelta(); + MachineOperand &StackAdjust = MBBI->getOperand(1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int Delta = StackAdj - MaxTCRetDelta; + assert((Delta >= 0) && "Delta must be positive"); + if (MaxTCRetDelta>0) + FrameSize += (StackAdj +Delta); + else + FrameSize += StackAdj; + } } // Frames of 32KB & larger require special handling because they cannot be @@ -1066,7 +1128,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addImm(0) .addReg(SPReg); } - } if (MustSaveLR) @@ -1109,52 +1170,55 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Callee pop calling convention. Pop parameter/linkage area. 
Used for tail // call optimization - if (MF.getTarget().Options.GuaranteedTailCallOpt && - (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && - MF.getFunction()->getCallingConv() == CallingConv::Fast) { - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - unsigned CallerAllocatedAmt = FI->getMinReservedArea(); - - if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { - BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) - .addReg(SPReg).addImm(CallerAllocatedAmt); - } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && + MF.getFunction()->getCallingConv() == CallingConv::Fast) { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned CallerAllocatedAmt = FI->getMinReservedArea(); + + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg).addImm(CallerAllocatedAmt); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) .addImm(CallerAllocatedAmt >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) .addReg(ScratchReg, RegState::Kill) .addImm(CallerAllocatedAmt & 0xFFFF); - BuildMI(MBB, MBBI, dl, AddInst) + BuildMI(MBB, MBBI, dl, AddInst) .addReg(SPReg) .addReg(FPReg) .addReg(ScratchReg); - } - } else if (RetOpcode == PPC::TCRETURNdi) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); - } else if (RetOpcode == PPC::TCRETURNai) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); - } else if (RetOpcode == PPC::TCRETURNdi8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri8) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); - } else if (RetOpcode == PPC::TCRETURNai8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } + } else if (RetOpcode == PPC::TCRETURNdi) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). 
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); + } else if (RetOpcode == PPC::TCRETURNai) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); + } else if (RetOpcode == PPC::TCRETURNdi8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri8) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); + } else if (RetOpcode == PPC::TCRETURNai8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } } } @@ -1200,8 +1264,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Reserve stack space for the PIC Base register (R30). // Only used in SVR4 32-bit. if (FI->usesPICBase()) { - int PBPSI = FI->getPICBasePointerSaveIndex(); - PBPSI = MFI->CreateFixedObject(4, -8, true); + int PBPSI = MFI->CreateFixedObject(4, -8, true); FI->setPICBasePointerSaveIndex(PBPSI); } @@ -1710,3 +1773,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } + +bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() && + MF.getSubtarget<PPCSubtarget>().isPPC64()); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h index d6a389b..bbe1329 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -29,6 +29,30 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned LinkageSize; const unsigned BasePointerSaveOffset; + /** + * \brief Find a register that can be used in function prologue and epilogue + * + * Find a register that can be used as the scratch register in function + * prologue and epilogue to save various registers (Link Register, Base + * Pointer, etc.). Prefer R0, if it is available. If it is not available, + * then choose a different register. + * + * This method will return true if an available register was found (including + * R0). If no available registers are found, the method returns false and sets + * ScratchRegister to R0, as per the recommendation in the ABI. + * + * \param[in] MBB The machine basic block to find an available register for + * \param[in] UseAtEnd Specify whether the scratch register will be used at + * the end of the basic block (i.e., will the scratch + * register kill a register defined in the basic block) + * \param[out] ScratchRegister The scratch register to use + * \return true if a scratch register was found; false if a scratch register + * was not found and R0 is being used as the default. 
+ */ + bool findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const; + public: PPCFrameLowering(const PPCSubtarget &STI); @@ -92,6 +116,13 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + + bool enableShrinkWrapping(const MachineFunction &MF) const override; + + /// Methods used by shrink wrapping to determine if MBB can be used for the + /// function prologue/epilogue. + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9322268..1eaa811 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,8 @@ #include "MCTargetDesc/PPCPredicates.h" #include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -52,6 +54,11 @@ static cl::opt<bool> BPermRewriterNoMasking( "bit permutations"), cl::Hidden); +static cl::opt<bool> EnableBranchHint( + "ppc-use-branch-hint", cl::init(true), + cl::desc("Enable static hinting of branches on ppc"), + cl::Hidden); + namespace llvm { void initializePPCDAGToDAGISelPass(PassRegistry&); } @@ -286,7 +293,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Find all return blocks, outputting a restore in each epilog. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (!BB->empty() && BB->back().isReturn()) { + if (BB->isReturnBlock()) { IP = BB->end(); --IP; // Skip over all terminator instructions, which are part of the return @@ -393,6 +400,55 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } +static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, + const SDValue &DestMBB) { + assert(isa<BasicBlockSDNode>(DestMBB)); + + if (!FuncInfo->BPI) return PPC::BR_NO_HINT; + + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const TerminatorInst *BBTerm = BB->getTerminator(); + + if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; + + const BasicBlock *TBB = BBTerm->getSuccessor(0); + const BasicBlock *FBB = BBTerm->getSuccessor(1); + + auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); + auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); + + // We only want to handle cases that are easy to predict statically, e.g. a + // C++ throw statement, which is very likely not taken, or a call to a + // function that never returns, e.g. stdlib exit(). So we set Threshold to + // filter out the unwanted cases. + // + // Below is the LLVM branch weight table; we only want to handle cases 1 and 2 + // + // Case Taken:Nontaken Example + // 1. Unreachable 1048575:1 C++ throw, stdlib exit(), + // 2. Invoke-terminating 1:1048575 + // 3. Coldblock 4:64 __builtin_expect + // 4. Loop Branch 124:4 For loop + // 5.
PH/ZH/FPH 20:12 + const uint32_t Threshold = 10000; + + if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) + return PPC::BR_NO_HINT; + + DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::" + << BB->getName() << "'\n" + << " -> " << TBB->getName() << ": " << TProb << "\n" + << " -> " << FBB->getName() << ": " << FProb << "\n"); + + const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB); + + // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities, + // because we want 'TProb' to stand for the branch probability to the Dest + // BasicBlock + if (BBDN->getBasicBlock()->getBasicBlock() != TBB) + std::swap(TProb, FProb); + + return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT; +} // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has an immediate integer right operand. @@ -564,7 +620,6 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { // Handle first 32 bits. unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. if (isInt<16>(Imm)) { @@ -586,9 +641,9 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { ++Result; // Add in the last bits as required. - if ((Hi = (Remainder >> 16) & 0xFFFF)) + if ((Remainder >> 16) & 0xFFFF) ++Result; - if ((Lo = Remainder & 0xFFFF)) + if (Remainder & 0xFFFF) ++Result; return Result; @@ -1028,7 +1083,7 @@ class BitPermutationSelector { BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && BitGroups[0].V == BitGroups[BitGroups.size()-1].V && BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { - DEBUG(dbgs() << "\tcombining final bit group with inital one\n"); + DEBUG(dbgs() << "\tcombining final bit group with initial one\n"); BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; BitGroups.erase(BitGroups.begin()); } @@ -1557,10 +1612,7 @@ class BitPermutationSelector { return false; } - if (VRI.RLAmt != EffRLAmt) - return false; - - return true; + return VRI.RLAmt == EffRLAmt; }; for (auto &BG : BitGroups) { @@ -2781,7 +2833,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); - + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); unsigned DM[2]; @@ -2798,7 +2850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); SDValue Base, Offset; - if (LD->isUnindexed() && + if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() && (LD->getMemoryVT() == MVT::f64 || LD->getMemoryVT() == MVT::i64) && SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { @@ -2841,8 +2893,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { // Op #3 is the Dest MBB // Op #4 is the Flag. // Prevent PPC::PRED_* from being selected into LI.
- SDValue Pred = - getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(), dl); + unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); + + SDValue Pred = getI32Imm(PCC, dl); SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), N->getOperand(0), N->getOperand(4) }; return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); @@ -2871,6 +2926,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { BitComp, N->getOperand(4), N->getOperand(0)); } + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); + SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, N->getOperand(4), N->getOperand(0) }; @@ -2903,9 +2961,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { break; // The first source operand is a TargetGlobalAddress or a TargetJumpTable. - // If it is an externally defined symbol, a symbol with common linkage, - // a non-local function address, or a jump table address, or if we are - // generating code for large code model, we generate: + // If it must be toc-referenced according to PPCSubTarget, we generate: // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) // Otherwise we generate: // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) @@ -2920,13 +2976,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { MVT::i64, GA, SDValue(Tmp, 0))); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { - const GlobalValue *GValue = G->getGlobal(); - if ((GValue->getType()->getElementType()->isFunctionTy() && - !GValue->isStrongDefinitionForLinker()) || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage()) + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0))); + } } return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, @@ -3110,7 +3165,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { if (!CurDAG->MaskedValueIsZero(Op0, APInt::getHighBitsSet(Bits, Bits - (b+1)*8))) return false; - + LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); return true; @@ -3305,7 +3360,7 @@ void PPCDAGToDAGISel::PreprocessISelDAG() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; if (N->use_empty()) continue; @@ -3989,7 +4044,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4145,7 +4200,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { ++Position; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4184,16 +4239,24 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } - // If this is a load or store with a zero offset, we may be able to - // fold an add-immediate into the memory operation. 
- if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) || - N->getConstantOperandVal(FirstOp) != 0) + // If this is a load or store with a zero offset, or within the alignment, + // we may be able to fold an add-immediate into the memory operation. + // The check against alignment is below, as it can't occur until we check + // the arguments to N + if (!isa<ConstantSDNode>(N->getOperand(FirstOp))) continue; SDValue Base = N->getOperand(FirstOp + 1); if (!Base.isMachineOpcode()) continue; + // On targets with fusion, we don't want this to fire and remove a fusion + // opportunity, unless a) it results in another fusion opportunity or + // b) optimizing for size. + if (PPCSubTarget->hasFusion() && + (!MF->getFunction()->optForSize() && !Base.hasOneUse())) + continue; + unsigned Flags = 0; bool ReplaceFlags = true; @@ -4237,6 +4300,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } + SDValue ImmOpnd = Base.getOperand(1); + int MaxDisplacement = 0; + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + const GlobalValue *GV = GA->getGlobal(); + MaxDisplacement = GV->getAlignment() - 1; + } + + int Offset = N->getConstantOperandVal(FirstOp); + if (Offset < 0 || Offset > MaxDisplacement) + continue; + // We found an opportunity. Reverse the operands from the add // immediate and substitute them into the load or store. If // needed, update the target flags for the immediate operand to @@ -4247,8 +4321,6 @@ void PPCDAGToDAGISel::PeepholePPC64() { DEBUG(N->dump(CurDAG)); DEBUG(dbgs() << "\n"); - SDValue ImmOpnd = Base.getOperand(1); - // If the relocation information isn't already present on the // immediate operand, add it now. if (ReplaceFlags) { @@ -4259,17 +4331,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { // is insufficient for the instruction encoding. if (GV->getAlignment() < 4 && (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || - StorageOpcode == PPC::LWA)) { + StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); continue; } - ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { const Constant *C = CP->getConstVal(); ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, CP->getAlignment(), - 0, Flags); + Offset, Flags); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 1b8f8fb..af9ad07 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -42,10 +42,6 @@ using namespace llvm; -// FIXME: Remove this once soft-float is supported. -static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic", -cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden); - static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); @@ -72,8 +68,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Set up the register classes. 
addRegisterClass(MVT::i32, &PPC::GPRCRegClass); - addRegisterClass(MVT::f32, &PPC::F4RCRegClass); - addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + if (!Subtarget.useSoftFloat()) { + addRegisterClass(MVT::f32, &PPC::F4RCRegClass); + addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + } // PowerPC has an i16 but no i8 (or i1) SEXTLOAD for (MVT VT : MVT::integer_valuetypes()) { @@ -107,8 +105,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - AddPromotedToType (ISD::UINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); @@ -257,10 +255,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::f32, Expand); - setOperationAction(ISD::BITCAST, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::i64, Expand); - setOperationAction(ISD::BITCAST, MVT::f64, Expand); + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); + } else { + setOperationAction(ISD::BITCAST, MVT::f32, Expand); + setOperationAction(ISD::BITCAST, MVT::i32, Expand); + setOperationAction(ISD::BITCAST, MVT::i64, Expand); + setOperationAction(ISD::BITCAST, MVT::f64, Expand); + } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -329,6 +334,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -403,9 +410,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. 
- setOperationAction(ISD::ADD , VT, Legal); - setOperationAction(ISD::SUB , VT, Legal); - + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); @@ -477,6 +484,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -519,12 +528,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } - - if (Subtarget.hasP8Altivec()) + if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); - + setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -545,6 +553,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + if (Subtarget.hasP8Vector()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + } + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + } + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -813,15 +836,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } - if (isPPC64) { - setStackPointerRegisterToSaveRestore(PPC::X1); - setExceptionPointerRegister(PPC::X3); - setExceptionSelectorRegister(PPC::X4); - } else { - setStackPointerRegisterToSaveRestore(PPC::R1); - setExceptionPointerRegister(PPC::R3); - setExceptionSelectorRegister(PPC::R4); - } + setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); @@ -942,9 +957,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign); + getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == MaxMaxAlign) @@ -969,6 +984,10 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } +bool PPCTargetLowering::useSoftFloat() const { + return Subtarget.useSoftFloat(); +} + const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -992,6 +1011,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -1236,7 +1256,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1261,7 +1281,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1353,7 +1373,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). * \param[in] DAG The current SelectionDAG - * \return true iff this shuffle mask + * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { @@ -1380,7 +1400,7 @@ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -/// The ShuffleKind distinguishes between big-endian operations with two +/// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). 
For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). @@ -1513,8 +1533,8 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. - LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue(); - LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue(); + LeadingZero &= isNullConstant(UniquedVals[i]); + LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. if (LeadingZero) { @@ -1629,7 +1649,6 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } - /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. @@ -1998,10 +2017,10 @@ static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit, DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; - return DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, dl, - DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(), 0, false, true, - false, 0); + return DAG.getMemIntrinsicNode( + PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, + false, 0); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2092,6 +2111,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2480,7 +2502,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, // */ // } va_list[1]; - SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); @@ -2536,7 +2557,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -// Function whose sole purpose is to kill compiler warnings +// Function whose sole purpose is to kill compiler warnings // stemming from unused functions included from PPCGenCallingConv.inc. CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; @@ -2933,8 +2954,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); - if (DisablePPCFloatInVariadic) - NumFPArgRegs = 0; + + if (Subtarget.useSoftFloat()) + NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); @@ -3177,15 +3199,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, - MachinePointerInfo(FuncArg), - ObjType, false, false, 0); + MachinePointerInfo(&*FuncArg), ObjType, + false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. 
- Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), - false, false, 0); + Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg), false, false, 0); } MemOps.push_back(Store); @@ -3212,9 +3234,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; } @@ -3592,7 +3614,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), + MachinePointerInfo(&*FuncArg), ObjType, false, false, 0); MemOps.push_back(Store); ++GPR_idx; @@ -3615,9 +3637,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; @@ -3880,7 +3902,6 @@ struct TailCallArgumentInfo { TailCallArgumentInfo() : FrameIdx(0) {} }; - } /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. @@ -3895,9 +3916,10 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG, SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } } @@ -3922,9 +3944,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); - Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewRetAddr), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldRetAddr, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr), + false, false, 0); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. 
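The hunks above and below all apply one mechanical migration: MachinePointerInfo::getFixedStack() now takes the owning MachineFunction as its first argument. A minimal sketch of the new call shape, assuming the LLVM 3.8-era SelectionDAG API; storeToFixedSlot is a hypothetical helper, not code from this patch:

    #include "llvm/CodeGen/MachineMemOperand.h"
    #include "llvm/CodeGen/SelectionDAG.h"

    using namespace llvm;

    // Hypothetical helper showing the migrated getFixedStack() call shape.
    static SDValue storeToFixedSlot(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                                    SDValue Val, SDValue FIN, int FI) {
      // Before: MachinePointerInfo::getFixedStack(FI)
      // After:  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)
      return DAG.getStore(
          Chain, dl, Val, FIN,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
          /*isVolatile=*/false, /*isNonTemporal=*/false, /*Alignment=*/0);
    }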
@@ -3933,9 +3956,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); - Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, - MachinePointerInfo::getFixedStack(NewFPIdx), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldFP, NewFramePtrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx), + false, false, 0); } } return Chain; } @@ -4812,8 +4836,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. + // When using QPX, this is handled like a FP register, otherwise, it + // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; @@ -5318,9 +5342,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, - MachinePointerInfo::getStack(TOCSaveOffset), - false, false, 0); + Chain = DAG.getStore( + Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset), + false, false, 0); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. @@ -5341,9 +5366,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, - hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, - Callee, SPDiff, NumBytes, Ins, InVals, CS); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest, + DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, + SPDiff, NumBytes, Ins, InVals, CS); } SDValue @@ -5798,6 +5823,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the correct type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. + SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link.
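For context on the new LowerGET_DYNAMIC_AREA_OFFSET hook just above: PPCISD::DYNAREAOFFSET is the target node behind the llvm.get.dynamic.area.offset intrinsic, which yields the offset from the native stack pointer to the most recent dynamic alloca. A hedged IR-level sketch of how that intrinsic is emitted; emitDynAreaOffset is an illustrative helper, not part of the patch:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Illustrative only: emit @llvm.get.dynamic.area.offset.i64, the intrinsic
    // that the DYNAREAOFFSET lowering above ultimately services. The intrinsic
    // is overloaded on its result width and takes no operands.
    static Value *emitDynAreaOffset(Module &M, IRBuilder<> &B) {
      Function *F = Intrinsic::getDeclaration(
          &M, Intrinsic::get_dynamic_area_offset, B.getInt64Ty());
      return B.CreateCall(F);
    }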
@@ -5828,10 +5869,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, false, false, 0); } - - -SDValue -PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { +SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); @@ -5983,6 +6021,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; + // TODO: Propagate flags from the select rather than global settings. + SDNodeFlags Flags; + Flags.setNoInfs(true); + Flags.setNoNaNs(true); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); @@ -6033,7 +6075,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETNE: std::swap(TV, FV); case ISD::SETEQ: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6043,25 +6085,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6101,7 +6143,8 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; @@ -6291,11 +6334,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64); - FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, - FPHalfs, FPHalfs, FPHalfs, FPHalfs); - + FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, + FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) @@ -6421,17 +6464,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = @@ -6472,16 +6516,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } @@ -6506,14 +6552,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Op.getOperand(0)); // STD the extended value into the stack slot. - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); // Load the value as a double. - Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, false, 0); + Ld = DAG.getLoad( + MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, false, 0); } // FCFID it and return it. @@ -6735,7 +6783,6 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } - /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, @@ -6768,7 +6815,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // to a zero vector to get the boolean result. 
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -6794,8 +6842,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (cast<ConstantSDNode>(BVN->getOperand(i))-> - getConstantIntValue()->isZero()) + else if (isNullConstant(BVN->getOperand(i))) continue; else CV[i] = One; @@ -6814,9 +6861,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, ValueVTs.push_back(MVT::Other); // chain SDVTList VTs = DAG.getVTList(ValueVTs); - return DAG.getMemIntrinsicNode(PPCISD::QVLFSb, - dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool()); + return DAG.getMemIntrinsicNode( + PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector<SDValue, 4> Stores; @@ -6915,7 +6962,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); - // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: @@ -7304,11 +7350,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, V1, V2, VPermMask); } -/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an -/// altivec comparison. If it is, return true and fill in Opc/isDot with +/// getVectorCompareInfo - Given an intrinsic, return false if it is not a +/// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. 
-static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, - bool &isDot, const PPCSubtarget &Subtarget) { +static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, + bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; @@ -7321,12 +7367,11 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequd_p: + case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 1; - } - else + CompareOpc = 199; + isDot = 1; + } else return false; break; @@ -7335,28 +7380,48 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsd_p: + case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 1; - } - else + CompareOpc = 967; + isDot = 1; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtud_p: + case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 1; + CompareOpc = 711; + isDot = 1; + } else + return false; + + break; + // VSX predicate comparisons use the same infrastructure + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + case Intrinsic::ppc_vsx_xvcmpgedp_p: + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + case Intrinsic::ppc_vsx_xvcmpgesp_p: + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + if (Subtarget.hasVSX()) { + switch (IntrinsicID) { + case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + } + isDot = 1; } - else + else return false; break; - + // Normal Comparisons. 
case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; @@ -7365,10 +7430,9 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 0; - } - else + CompareOpc = 199; + isDot = 0; + } else return false; break; @@ -7377,24 +7441,22 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsd: + case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 0; - } - else + CompareOpc = 967; + isDot = 0; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtud: + case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 0; - } - else + CompareOpc = 711; + isDot = 0; + } else return false; break; @@ -7411,7 +7473,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); int CompareOpc; bool isDot; - if (!getAltivecCompareInfo(Op, CompareOpc, isDot, Subtarget)) + if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. @@ -7536,7 +7598,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7545,7 +7607,8 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7752,7 +7815,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7761,7 +7824,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7798,11 +7862,10 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - Stores.push_back(DAG.getTruncStore(StoreChain, dl, Loads[i], Idx, - SN->getPointerInfo().getWithOffset(i), - MVT::i8 /* memory type */, - SN->isNonTemporal(), SN->isVolatile(), - 1 /* alignment */, SN->getAAInfo())); + Stores.push_back(DAG.getTruncStore( + StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), + MVT::i8 /* memory type */, SN->isNonTemporal(), SN->isVolatile(), + 1 /* alignment */, SN->getAAInfo())); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); @@ -7906,6 +7969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -7971,7 +8035,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), - N->getOperand(1)); + N->getOperand(1)); Results.push_back(NewInt); Results.push_back(NewInt.getValue(1)); @@ -8020,7 +8084,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, } } - //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// @@ -8089,8 +8152,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8160,8 +8222,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8283,8 +8344,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); @@ -8384,8 +8444,8 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, 
TII->get(PPC::B)).addMBB(sinkMBB); - thisMBB->addSuccessor(mainMBB, /* weight */ 0); - thisMBB->addSuccessor(sinkMBB, /* weight */ 1); + thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); + thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 @@ -8562,8 +8622,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // To "insert" these instructions we actually have to insert their // control-flow patterns. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); @@ -8675,7 +8734,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU - // cmpw crX,Rx,Rz # check if ‘old’=’new’ + // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... @@ -9137,7 +9196,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } -bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default @@ -9150,12 +9209,26 @@ bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { // one FP pipeline) for three or more FDIVs (for generic OOO cores). switch (Subtarget.getDarwinDirective()) { default: - return NumUsers > 2; + return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: - return NumUsers > 1; + return 2; + } +} + +// isConsecutiveLSLoc needs to work even if all adds have not yet been +// collapsed, and so we need to look through chains of them. +static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, + int64_t& Offset, SelectionDAG &DAG) { + if (DAG.isBaseWithConstantOffset(Loc)) { + Base = Loc.getOperand(0); + Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); + + // The base might itself be a base plus an offset, and if so, accumulate + // that as well. 
+ getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } @@ -9178,16 +9251,18 @@ static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } - // Handle X+C - if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc && - cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) + SDValue Base1 = Loc, Base2 = BaseLoc; + int64_t Offset1 = 0, Offset2 = 0; + getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); + getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); + if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; - int64_t Offset1 = 0; - int64_t Offset2 = 0; + Offset1 = 0; + Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) @@ -9343,7 +9418,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); - + while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) @@ -9470,7 +9545,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by extensions. + // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9492,7 +9567,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9572,7 +9647,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, if (isa<ConstantSDNode>(Inputs[i])) continue; else - DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } // Replace all operations (these are all the same, but have a different @@ -9682,7 +9757,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, SmallPtrSet<SDNode *, 16> Visited; // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by truncations. + // select) that are all fed by truncations. 
while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9701,7 +9776,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9915,10 +9990,11 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, "Invalid extension type"); EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = - DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); - return DAG.getNode(ISD::SRA, dl, N->getValueType(0), - DAG.getNode(ISD::SHL, dl, N->getValueType(0), - N->getOperand(0), ShiftCst), ShiftCst); + DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); + return DAG.getNode( + ISD::SRA, dl, N->getValueType(0), + DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), + ShiftCst); } SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, @@ -10102,16 +10178,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case PPCISD::SHL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 << V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); - } break; case PPCISD::SRL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 >>u V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); - } break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { @@ -10122,7 +10194,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: @@ -10277,7 +10349,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = - MF.getMachineMemOperand(LD->getMemOperand(), -MemVT.getStoreSize()+1, + MF.getMachineMemOperand(LD->getMemOperand(), + -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. 
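The "-(long)MemVT.getStoreSize()+1" change in the hunk above fixes a signedness trap: getStoreSize() returns an unsigned value, so negating it wraps around instead of producing the intended negative MMO offset. A small standalone C++ demonstration (assumes 64-bit long, as on the LP64 hosts this code targets):

    #include <cstdint>
    #include <iostream>

    int main() {
      unsigned StoreSize = 16;              // stand-in for MemVT.getStoreSize()
      int64_t Wrong = -StoreSize + 1;       // unsigned negation wraps first
      int64_t Right = -(long)StoreSize + 1; // cast to signed, then negate
      std::cout << Wrong << " vs " << Right << "\n"; // 4294967281 vs -15
      return 0;
    }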
@@ -10527,7 +10600,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); - + if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { @@ -10558,8 +10631,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa<ConstantSDNode>(LHS.getOperand(1)) && - !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()-> - isZero()) + !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && @@ -10588,7 +10660,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && - getAltivecCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { + getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know @@ -10739,8 +10811,11 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) - for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->GetInstSizeInBytes(J); + if (LoopSize > 32) + break; + } if (LoopSize > 16 && LoopSize <= 32) return 5; @@ -10868,17 +10943,19 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); - return std::make_pair(0U, &PPC::VRRCRegClass); + if (Subtarget.hasAltivec()) + return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } - } else if (Constraint == "wc") { // an individual CR bit. + } else if (Constraint == "wc" && Subtarget.useCRBits()) { + // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); - } else if (Constraint == "wa" || Constraint == "wd" || - Constraint == "wf") { + } else if ((Constraint == "wa" || Constraint == "wd" || + Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); - } else if (Constraint == "ws") { - if (VT == MVT::f32) + } else if (Constraint == "ws" && Subtarget.hasVSX()) { + if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); @@ -10908,7 +10985,6 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return R; } - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. 
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, @@ -11358,9 +11434,7 @@ bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0 || BitSize > 64) - return false; - return true; + return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { @@ -11477,11 +11551,21 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { return ScratchRegs; } +unsigned PPCTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; +} + +unsigned PPCTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; +} + bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) - return false; + return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasQPX()) { if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 6e13533..44bcb89 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -423,6 +428,8 @@ namespace llvm { /// DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool useSoftFloat() const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } @@ -655,8 +662,17 @@ namespace llvm { return Ty->isArrayTy(); } - private: + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + private: struct ReuseLoadInfo { SDValue Ptr; SDValue Chain; @@ -719,6 +735,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -853,7 +871,7 @@ namespace llvm { bool &UseOneConstNR) const override; SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; CCAssignFn *useFastISelCCs(unsigned Flag) const; }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index d628330..79e4fe3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -299,22 +299,35 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), // 64-bit CR instructions let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. 
def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { @@ -369,6 +382,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index d4e666c..dcff6ad 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -144,6 +144,9 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (!DefMI->getParent()) + return Latency; + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); unsigned Reg = DefMO.getReg(); @@ -186,6 +189,60 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } +// This function does not list all associative and commutative operations, but +// only those worth feeding through the machine combiner in an attempt to +// reduce the critical path. Mostly, this means floating-point operations, +// because they have high latencies (compared to other operations, such as +// and/or, which are also associative and commutative, but have low latencies). +bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + // FP Add: + case PPC::FADD: + case PPC::FADDS: + // FP Multiply: + case PPC::FMUL: + case PPC::FMULS: + // Altivec Add: + case PPC::VADDFP: + // VSX Add: + case PPC::XSADDDP: + case PPC::XVADDDP: + case PPC::XVADDSP: + case PPC::XSADDSP: + // VSX Multiply: + case PPC::XSMULDP: + case PPC::XVMULDP: + case PPC::XVMULSP: + case PPC::XSMULSP: + // QPX Add: + case PPC::QVFADD: + case PPC::QVFADDS: + case PPC::QVFADDSs: + // QPX Multiply: + case PPC::QVFMUL: + case PPC::QVFMULS: + case PPC::QVFMULSs: + return true; + default: + return false; + } +} + +bool PPCInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Using the machine combiner in this way is potentially expensive, so + // restrict to when aggressive optimizations are desired. + if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) + return false; + + // FP reassociation is only legal when we don't need strict IEEE semantics. + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + // Detect 32 -> 64-bit extensions where we may reuse the low sub-register. bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, @@ -259,16 +316,16 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -// commuteInstruction - We can commute rlwimi instructions, but only if the -// rotate amt is zero. We also have to munge the immediates a bit.
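The comment on isAssociativeAndCommutative above is about critical-path length: reassociating a chain of high-latency FP operations lets independent halves of the expression execute in parallel, which is exactly what the machine combiner exploits. A scalar illustration of the idea (not the combiner's actual rewrite; it is only legal under UnsafeFPMath because it changes rounding):

// Chained form: each add waits on the previous result, so the expression
// costs roughly 3x the FP-add latency.
double chained(double a, double b, double c, double d) {
  return ((a + b) + c) + d;
}

// Reassociated form: (a + b) and (c + d) are independent and can issue in
// parallel, cutting the critical path to roughly 2x the FP-add latency.
double reassociated(double a, double b, double c, double d) {
  return (a + b) + (c + d);
}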
-MachineInstr * -PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { MachineFunction &MF = *MI->getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI->getOpcode() != PPC::RLWIMI && MI->getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens @@ -286,6 +343,8 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Op0 = (Op2 & ~M) | (Op1 & M) // Swap op1/op2 + assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) && + "Only operands 1 and 2 can be swapped in RLWIMI/RLWIMIo."); unsigned Reg0 = MI->getOperand(0).getReg(); unsigned Reg1 = MI->getOperand(1).getReg(); unsigned Reg2 = MI->getOperand(2).getReg(); @@ -353,9 +412,9 @@ bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, if (AltOpc == -1) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; + // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1 + // and SrcOpIdx2. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); } void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -685,20 +744,43 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, "isel is for regular integer GPRs only"); unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; - unsigned SelectPred = Cond[0].getImm(); + auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm()); unsigned SubIdx; bool SwapOps; switch (SelectPred) { - default: llvm_unreachable("invalid predicate for isel"); - case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; - case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; - case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; - case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; - case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; - case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; - case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; - case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_EQ: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + SubIdx = PPC::sub_un; SwapOps = true; break; case PPC::PRED_BIT_SET: SubIdx = 0;
SwapOps = false; break; case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; } @@ -996,11 +1078,10 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1109,11 +1190,10 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1214,7 +1294,7 @@ bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB)); } @@ -1691,13 +1771,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, MI->setDesc(NewDesc); if (NewDesc.ImplicitDefs) - for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); *ImpDefs; ++ImpDefs) if (!MI->definesRegister(*ImpDefs)) MI->addOperand(*MI->getParent()->getParent(), MachineOperand::CreateReg(*ImpDefs, true, true)); if (NewDesc.ImplicitUses) - for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); *ImpUses; ++ImpUses) if (!MI->readsRegister(*ImpUses)) MI->addOperand(*MI->getParent()->getParent(), @@ -1737,3 +1817,35 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { } } +std::pair<unsigned, unsigned> +PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = PPCII::MO_ACCESS_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_LO, "ppc-lo"}, + {MO_HA, "ppc-ha"}, + {MO_TPREL_LO, "ppc-tprel-lo"}, + {MO_TPREL_HA, "ppc-tprel-ha"}, + {MO_DTPREL_LO, "ppc-dtprel-lo"}, + {MO_TLSLD_LO, "ppc-tlsld-lo"}, + {MO_TOC_LO, "ppc-toc-lo"}, + {MO_TLS, "ppc-tls"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PLT_OR_STUB, "ppc-plt-or-stub"}, + {MO_PIC_FLAG, "ppc-pic"}, + {MO_NLP_FLAG, "ppc-nlp"}, + {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}}; + return makeArrayRef(TargetFlags); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 
40badae..c3c3a48 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -79,6 +79,23 @@ class PPCInstrInfo : public PPCGenInstrInfo { SmallVectorImpl<MachineInstr*> &NewMIs, bool &NonRI, bool &SpillsVRS) const; virtual void anchor(); + +protected: + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for a + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even if the instruction is commutable, the method may still + /// fail to commute the operands; a null pointer is returned in such cases. + /// + /// For example, we can commute rlwimi instructions, but only if the + /// rotate amt is zero. We also have to munge the immediates a bit. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: explicit PPCInstrInfo(PPCSubtarget &STI); @@ -119,6 +136,19 @@ public: return false; } + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// output in the <Pattern> array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &P) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; @@ -127,10 +157,6 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - // commuteInstruction - We can commute rlwimi instructions, but only if the - // rotate amt is zero. We also have to munge the immediates a bit. - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; @@ -183,7 +209,7 @@ public: // profitable to use the predicated branches.
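The doc comment above spells out the contract callers must honor: commutation can fail even for a commutable instruction, and failure is signaled by a null return. A hedged sketch of the resulting call-site pattern (the public commuteInstruction wrapper dispatches to commuteInstructionImpl; MI and TII are assumed to be in scope):

// NewMI == false requests in-place commutation; a null result means the
// target refused (for RLWIMI, any nonzero rotate amount).
if (MachineInstr *Commuted = TII->commuteInstruction(MI, /*NewMI=*/false)) {
  // Commutation succeeded; for RLWIMI the mask immediates were re-munged
  // so semantics are preserved with the operands swapped.
  (void)Commuted;
}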
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return true; } @@ -191,12 +217,10 @@ public: unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - const BranchProbability - &Probability) const override { + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override { return true; } @@ -239,6 +263,15 @@ public: unsigned GetInstSizeInBytes(const MachineInstr *MI) const; void getNoopForMachoTarget(MCInst &NopInst) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; }; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 24fd9bd..ce0f9e6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. @@ -2295,22 +2299,35 @@ def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), "#RESTORE_VRSAVE", []>; let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. 
+let hasExtraSrcRegAllocReq = 1 in { def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 // Pseudo instruction to perform FADD in round-to-zero mode. @@ -3883,8 +3900,11 @@ def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; -def : InstAlias<"cntlz $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; -def : InstAlias<"cntlz. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +// The POWER variant +def : MnemonicAlias<"cntlz", "cntlzw">; +def : MnemonicAlias<"cntlz.", "cntlzw.">; def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td index 0a044c5..4312007 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td @@ -839,31 +839,31 @@ def : Pat<(v4f64 (scalar_to_vector f64:$A)), def : Pat<(v4f32 (scalar_to_vector f32:$A)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 0)), +def : Pat<(f64 (extractelt v4f64:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 0)), +def : Pat<(f32 (extractelt v4f32:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 1)), +def : Pat<(f64 (extractelt v4f64:$S, 1)), (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 2)), +def : Pat<(f64 (extractelt v4f64:$S, 2)), (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 3)), +def : Pat<(f64 (extractelt v4f64:$S, 3)), (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 1)), +def : Pat<(f32 (extractelt v4f32:$S, 1)), (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 2)), +def : Pat<(f32 (extractelt v4f32:$S, 2)), (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 3)), +def : Pat<(f32 (extractelt v4f32:$S, 3)), (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, i64:$F)), +def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERM $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, i64:$F)), +def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERMs $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td index ce63c22..df1142c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -67,17 +67,19 @@ def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; 
-multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, - string asmbase, string asmstr, InstrItinClass itin, - list<dag> pattern> { +multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, + string asmstr, InstrItinClass itin, Intrinsic Int, + ValueType OutTy, ValueType InTy> { let BaseName = asmbase in { - def NAME : XX3Form_Rc<opcode, xo, OOL, IOL, + def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(" ", asmstr)), itin, - pattern>; + [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; let Defs = [CR6] in - def o : XX3Form_Rc<opcode, xo, OOL, IOL, + def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(". ", asmstr)), itin, - []>, isDOT; + [(set InTy:$XT, + (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, + isDOT; } } @@ -456,35 +458,23 @@ let Uses = [RM] in { "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; defm XVCMPEQDP : XX3Form_Rcr<60, 99, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpeqdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; defm XVCMPEQSP : XX3Form_Rcr<60, 67, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpeqsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; defm XVCMPGEDP : XX3Form_Rcr<60, 115, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgedp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgedp, v2i64, v2f64>; defm XVCMPGESP : XX3Form_Rcr<60, 83, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgesp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgesp, v4i32, v4f32>; defm XVCMPGTDP : XX3Form_Rcr<60, 107, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgtdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>; defm XVCMPGTSP : XX3Form_Rcr<60, 75, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgtsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; // Move Instructions def XSABSDP : XX2Form<60, 345, @@ -845,9 +835,9 @@ let Predicates = [IsBigEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG $S, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; } @@ -856,9 +846,9 @@ def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64), (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } @@ -1206,6 +1196,23 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } + + // Single Precision Conversions (FP <-> INT) + def XSCVSXDSP : XX2Form<60, 312, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvsxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfids f64:$XB))]>; + def XSCVUXDSP : XX2Form<60, 296, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvuxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfidus f64:$XB))]>; + + // Conversions between vector and scalar single precision + def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), + "xscvdpspn $XT, $XB", IIC_VecFP, []>; + def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), + "xscvspdpn $XT, $XB", IIC_VecFP, []>; + } // AddedComplexity = 400 } // HasP8Vector @@ -1229,3 +1236,550 @@ let Predicates = [HasDirectMove, HasVSX] in { "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; } // HasDirectMove, HasVSX + +/* Direct moves of various widths from GPR's into VSR's. Each move lines + the value up into element 0 (both BE and LE). Namely, entities smaller than + a doubleword are shifted left and moved for BE. For LE, they're moved, then + swapped to go into the least significant element of the VSR. +*/ +def MovesToVSR { + dag BE_BYTE_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7)); + dag BE_HALF_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15)); + dag BE_WORD_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31)); + dag BE_DWORD_0 = (MTVSRD $A); + + dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32)); + dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + LE_MTVSRW, sub_64)); + dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2); + dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + BE_DWORD_0, sub_64)); + dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); +} + +/* Patterns for extracting elements out of vectors. Integer elements are + extracted using direct move operations. Patterns for extracting elements + whose indices are not available at compile time are also provided with + various _VARIABLE_ patterns. + The numbering for the DAG's is for LE, but when used on BE, the correct + LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). 
+*/ +def VectorExtractions { + // Doubleword extraction + dag LE_DWORD_0 = + (MFVSRD + (EXTRACT_SUBREG + (XXPERMDI (COPY_TO_REGCLASS $S, VSRC), + (COPY_TO_REGCLASS $S, VSRC), 2), sub_64)); + dag LE_DWORD_1 = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + + // Word extraction + dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64)); + dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64)); + dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64)); + + // Halfword extraction + dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32)); + dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32)); + dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32)); + dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32)); + dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32)); + dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32)); + dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32)); + dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32)); + + // Byte extraction + dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32)); + dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32)); + dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32)); + dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32)); + dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32)); + dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32)); + dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32)); + dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32)); + dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32)); + dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32)); + dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32)); + dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32)); + dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32)); + dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32)); + dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32)); + dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32)); + + /* Variable element number (BE and LE patterns must be specified separately) + This is a rather involved process. + + Conceptually, this is how the move is accomplished: + 1. Identify which doubleword contains the element + 2. Shift in the VMX register so that the correct doubleword is correctly + lined up for the MFVSRD + 3. Perform the move so that the element (along with some extra stuff) + is in the GPR + 4. Right shift within the GPR so that the element is right-justified + + Of course, the index is an element number which has a different meaning + on LE/BE so the patterns have to be specified separately. + + Note: The final result will be the element right-justified with high + order bits being arbitrarily defined (namely, whatever was in the + vector register to the left of the value originally). + */ + + /* LE variable byte + Number 1. 
above: + - For elements 0-7, we shift left by 8 bytes since they're on the right + - For elements 8-15, we need not shift (shift left by zero bytes) + This is accomplished by inverting the bits of the index and AND-ing + with 0x8 (i.e. clearing all bits of the index and inverting bit 60). + */ + dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-7 (8-15 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 8 as we need to shift right by the number of bits, not bytes + - Shift right in the GPR by the calculated value + */ + dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60), + sub_32); + dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), + sub_32); + + /* LE variable halfword + Number 1. above: + - For elements 0-3, we shift left by 8 since they're on the right + - For elements 4-7, we need not shift (shift left by zero bytes) + Similarly to the byte pattern, we invert the bits of the index, but we + AND with 0x4 (i.e. clear all bits of the index and invert bit 61). + Of course, the shift is still by 8 bytes, so we must multiply by 2. + */ + dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-3 (4-7 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 16 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59), + sub_32); + dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), + sub_32); + + /* LE variable word + Number 1. above: + - For elements 0-1, we shift left by 8 since they're on the right + - For elements 2-3, we need not shift + */ + dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-1 (2-3 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 32 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), + sub_32); + dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), + sub_32); + + /* LE variable doubleword + Number 1. 
above: + - For element 0, we shift left by 8 since it's on the right + - For element 1, we need not shift + */ + dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + // - Number 4. is not needed for the doubleword as the value is 64-bits + dag LE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* LE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)); + dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); + dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); + + /* LE variable double + Same as the LE doubleword except there is no move. + */ + dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + LE_VDWORD_PERM_VEC); + dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); + + /* BE variable byte + The algorithm here is the same as the LE variable byte except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x8 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-7 + */ + dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); + dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); + dag BE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), + sub_64)); + dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), + sub_32); + dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), + sub_32); + + /* BE variable halfword + The algorithm here is the same as the LE variable halfword except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x4 and multiply by 2 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-3 + */ + dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62)); + dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC); + dag BE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), + sub_64)); + dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), + sub_32); + dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), + sub_32); + + /* BE variable word + The algorithm is the same as the LE variable word except: + - The shift in the VMX register happens for opposite element numbers + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-1 + */ + dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61)); + dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC); + dag BE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), + sub_64)); + dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), + sub_32); + dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), + sub_32); + + /* BE variable doubleword + 
Same as the LE doubleword except we shift in the VMX register for opposite + element indices. + */ + dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60)); + dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* BE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61)); + dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); + dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); + + /* BE variable double + Same as the BE doubleword except there is no move. + */ + dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); +} + +// v4f32 scalar <-> vector conversions (BE) +let Predicates = [IsBigEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XSCVDPSPN $A))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; +} // IsBigEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsBigEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; + +let Predicates = [IsBigEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (BE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 
13)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_1)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +// v4f32 scalar <-> vector conversions (LE) +let Predicates = [IsLittleEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; +} // IsLittleEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsLittleEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; + +let Predicates = [IsLittleEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (LE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 MovesToVSR.LE_WORD_0)>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 MovesToVSR.LE_DWORD_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_1)>; + def 
: Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [HasDirectMove, HasVSX] in { +// bitconvert f32 -> i32 +// (convert to 32-bit fp single, shift right 1 word, move to GPR) +def : Pat<(i32 (bitconvert f32:$S)), + (i32 (MFVSRWZ (EXTRACT_SUBREG + (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3), + sub_64)))>; +// bitconvert i32 -> f32 +// (move to FPR, shift left 1 word, convert to 64-bit fp single) +def : Pat<(f32 (bitconvert i32:$A)), + (f32 (XSCVSPDPN + (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>; + +// bitconvert f64 -> i64 +// (move to GPR, nothing else needed) +def : 
Pat<(i64 (bitconvert f64:$S)), + (i64 (MFVSRD $S))>; + +// bitconvert i64 -> f64 +// (move to FPR, nothing else needed) +def : Pat<(f64 (bitconvert i64:$S)), + (f64 (MTVSRD $S))>; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp index b4e1c09..e3a35d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -71,10 +72,10 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); // FIXME: For some reason, preserving SE here breaks LSR (even if // this pass changes nothing). - // AU.addPreserved<ScalarEvolution>(); + // AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -96,7 +97,7 @@ INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", "PPC Loop Data Prefetch", false, false) @@ -104,7 +105,7 @@ FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPref bool PPCLoopDataPrefetch::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DL = &F.getParent()->getDataLayout(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index b6e7799..5e18826 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -73,7 +73,7 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } bool runOnFunction(Function &F) override; @@ -84,8 +84,10 @@ namespace { private: PPCTargetMachine *TM; + DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; + bool PreserveLCSSA; }; } @@ -93,7 +95,7 @@ char PPCLoopPreIncPrep::ID = 0; static const char *name = "Prepare loop for pre-inc. 
addressing modes"; INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { @@ -101,17 +103,20 @@ FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { } namespace { - struct SCEVLess : std::binary_function<const SCEV *, const SCEV *, bool> - { - SCEVLess(ScalarEvolution *SE) : SE(SE) {} + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - bool operator() (const SCEV *X, const SCEV *Y) const { - const SCEV *Diff = SE->getMinusSCEV(X, Y); - return cast<SCEVConstant>(Diff)->getValue()->getSExtValue() < 0; - } + const SCEVConstant *Offset; + Instruction *Instr; + }; - protected: - ScalarEvolution *SE; + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector<BucketElement, 16> Elements; }; } @@ -140,7 +145,10 @@ static Value *GetPointerOperand(Value *MemI) { bool PPCLoopPreIncPrep::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -169,7 +177,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { std::distance(pred_begin(Header), pred_end(Header)); // Collect buckets of comparable addresses used by loads and stores. - typedef std::multimap<const SCEV *, Instruction *, SCEVLess> Bucket; SmallVector<Bucket, 16> Buckets; for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) { @@ -212,25 +219,24 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { } bool FoundBucket = false; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) - for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end(); - K != KE; ++K) { - const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV); - if (isa<SCEVConstant>(Diff)) { - Buckets[i].insert(std::make_pair(LSCEV, MemI)); - FoundBucket = true; - break; - } + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; } + } if (!FoundBucket) { - Buckets.push_back(Bucket(SCEVLess(SE))); - Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI)); + if (Buckets.size() == MaxVars) + return MadeChange; + Buckets.push_back(Bucket(LSCEV, MemI)); } } } - if (Buckets.empty() || Buckets.size() > MaxVars) + if (Buckets.empty()) return MadeChange; BasicBlock *LoopPredecessor = L->getLoopPredecessor(); @@ -239,7 +245,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // iteration space), insert a new preheader for the loop. 
if (!LoopPredecessor || !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, this); + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (LoopPredecessor) MadeChange = true; } @@ -253,8 +259,45 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // The base address of each bucket is transformed into a phi and the others // are rewritten as offsets of that variable. + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!Buckets[i].Elements[j].Offset || + Buckets[i].Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = Buckets[i].Elements[j].Offset; + Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); + for (auto &E : Buckets[i].Elements) { + if (E.Offset) + E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); + } + + std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); + break; + } + const SCEVAddRecExpr *BasePtrSCEV = - cast<SCEVAddRecExpr>(Buckets[i].begin()->first); + cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); if (!BasePtrSCEV->isAffine()) continue; @@ -262,7 +305,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - Instruction *MemI = Buckets[i].begin()->second; + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = Buckets[i].Elements.begin()->Instr; Value *BasePtr = GetPointerOperand(MemI); assert(BasePtr && "No pointer operand"); @@ -302,7 +347,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { NewPHI->addIncoming(BasePtrStart, LoopPredecessor); } - Instruction *InsPoint = Header->getFirstInsertionPt(); + Instruction *InsPoint = &*Header->getFirstInsertionPt(); GetElementPtrInst *PtrInc = GetElementPtrInst::Create( I8Ty, NewPHI, BasePtrIncSCEV->getValue(), MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); @@ -327,18 +372,20 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { BasePtr->replaceAllUsesWith(NewBasePtr); RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - Value *LastNewPtr = NewBasePtr; - for (Bucket::iterator I = std::next(Buckets[i].begin()), - IE = Buckets[i].end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->second); + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. 
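The base-swap logic above reduces to simple offset arithmetic: when the element at offset D becomes the new base, the base advances by D and every member offset decreases by D. A standalone sketch (rebaseOnElement and the plain integer offsets are illustrative stand-ins for the SCEV manipulation):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct Bucket {
  long Base;                 // stand-in for BaseSCEV
  std::vector<long> Offsets; // Offsets[0] belongs to the base element
};

// Promote the element at index J to be the new base, as the loop above does:
// fold its offset into the base, re-express all offsets relative to it, and
// move it to the front.
void rebaseOnElement(Bucket &B, std::size_t J) {
  long D = B.Offsets[J];
  B.Base += D;
  for (long &Off : B.Offsets)
    Off -= D;
  std::swap(B.Offsets[J], B.Offsets[0]);
}

int main() {
  Bucket B{100, {0, 8, 16}};
  rebaseOnElement(B, 2);     // make the +16 element the new base
  assert(B.Base == 116);
  assert(B.Offsets[0] == 0); // the new base element has offset zero
  assert(B.Offsets[1] == -8 && B.Offsets[2] == -16);
}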
+ SmallPtrSet<Value *, 16> NewPtrs; + NewPtrs.insert( NewBasePtr); + + for (auto I = std::next(Buckets[i].Elements.begin()), + IE = Buckets[i].Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); assert(Ptr && "No pointer operand"); - if (Ptr == LastNewPtr) + if (NewPtrs.count(Ptr)) continue; Instruction *RealNewPtr; - const SCEVConstant *Diff = - cast<SCEVConstant>(SE->getMinusSCEV(I->first, BasePtrSCEV)); - if (Diff->isZero()) { + if (!I->Offset || I->Offset->getValue()->isZero()) { RealNewPtr = NewBasePtr; } else { Instruction *PtrIP = dyn_cast<Instruction>(Ptr); @@ -346,13 +393,13 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) PtrIP = 0; else if (isa<PHINode>(PtrIP)) - PtrIP = PtrIP->getParent()->getFirstInsertionPt(); + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); else if (!PtrIP) - PtrIP = I->second; + PtrIP = I->Instr; GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, Diff->getValue(), - I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP); + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); if (!PtrIP) NewPtr->insertAfter(cast<Instruction>(PtrInc)); NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); @@ -373,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { Ptr->replaceAllUsesWith(ReplNewPtr); RecursivelyDeleteTriviallyDeadInstructions(Ptr); - LastNewPtr = RealNewPtr; + NewPtrs.insert(RealNewPtr); } MadeChange = true; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 76837ec..44a692d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -38,7 +38,7 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ const TargetMachine &TM = AP.TM; Mangler *Mang = AP.Mang; - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); MCContext &Ctx = AP.OutContext; bool isDarwin = TM.getTargetTriple().isOSDarwin(); @@ -51,13 +51,13 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ Suffix = "$non_lazy_ptr"; if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); if (!MO.isGlobal()) { assert(MO.isSymbol() && "Isn't a symbol reference"); - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else { const GlobalValue *GV = MO.getGlobal(); TM.getNameWithPrefix(Name, GV, *Mang); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp new file mode 100644 index 0000000..fe339d7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -0,0 +1,230 @@ +//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass performs peephole optimizations to clean up ugly code +// sequences at the MachineInstruction layer. It runs at the end of +// the SSA phases, following VSX swap removal. 
A pass of dead code +// elimination follows this one for quick clean-up of any dead +// instructions introduced here. Although we could do this as callbacks +// from the generic peephole pass, this would have a couple of bad +// effects: it might remove optimization opportunities for VSX swap +// removal, and it would miss cleanups made possible following VSX +// swap removal. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-mi-peepholes" + +namespace llvm { + void initializePPCMIPeepholePass(PassRegistry&); +} + +namespace { + +struct PPCMIPeephole : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + PPCMIPeephole() : MachineFunctionPass(ID) { + initializePPCMIPeepholePass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool simplifyCode(void); + + // Find the "true" register represented by SrcReg (following chains + // of copies and subreg_to_reg operations). + unsigned lookThruCopyLike(unsigned SrcReg); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + initialize(MF); + return simplifyCode(); + } +}; + +// Initialize class variables. +void PPCMIPeephole::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); + DEBUG(MF->dump()); +} + +// Perform peephole optimizations. +bool PPCMIPeephole::simplifyCode(void) { + bool Simplified = false; + MachineInstr* ToErase = nullptr; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + // If the previous instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // Ignore debug instructions. + if (MI.isDebugValue()) + continue; + + // Per-opcode peepholes. + switch (MI.getOpcode()) { + + default: + break; + + case PPC::XXPERMDI: { + // Perform simplifications of 2x64 vector swaps and splats. + // A swap is identified by an immediate value of 2, and a splat + // is identified by an immediate value of 0 or 3. + int Immed = MI.getOperand(3).getImm(); + + if (Immed != 1) { + + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + + if (TrueReg1 == TrueReg2 + && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. 
+ if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { + unsigned FeedImmed = DefMI->getOperand(3).getImm(); + unsigned FeedReg1 + = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned FeedReg2 + = lookThruCopyLike(DefMI->getOperand(2).getReg()); + + if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { + DEBUG(dbgs() + << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat fed by a swap, we can modify + // the splat to splat the other value from the swap's input + // parameter. + else if ((Immed == 0 || Immed == 3) + && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); + MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } + } + } + break; + } + } + } + + // If the last instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + } + + return Simplified; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search.
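The three XXPERMDI rewrites above follow from the instruction's selector algebra when both sources are the same register: with the two doublewords modeled as array elements, the result is { a[(imm >> 1) & 1], a[imm & 1] }, so imm 2 swaps the doublewords and imm 0/3 splat one of them, as the comments state. A standalone check of the rules (xxpermdi here is a two-element model, not the LLVM API):

#include <array>
#include <cassert>

// Model of XXPERMDI with identical source registers.
static std::array<int, 2> xxpermdi(std::array<int, 2> A, unsigned Imm) {
  return {A[(Imm >> 1) & 1], A[Imm & 1]};
}

int main() {
  std::array<int, 2> A = {10, 20};

  // Rule 1: a splat or swap fed by a splat is a copy of the splat.
  for (unsigned Outer : {0u, 2u, 3u})
    assert(xxpermdi(xxpermdi(A, 0), Outer) == xxpermdi(A, 0));

  // Rule 2: a splat fed by a swap splats the other element: imm' = 3 - imm.
  assert(xxpermdi(xxpermdi(A, 2), 0) == xxpermdi(A, 3 - 0));
  assert(xxpermdi(xxpermdi(A, 2), 3) == xxpermdi(A, 3 - 3));

  // Rule 3: a swap fed by a swap is a copy of the original value.
  assert(xxpermdi(xxpermdi(A, 2), 2) == A);
}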
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { + + while (true) { + + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) + +char PPCMIPeephole::ID = 0; +FunctionPass* +llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index ec4e0a5..9d91e31 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -18,8 +18,29 @@ using namespace llvm; void PPCFunctionInfo::anchor() { } MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { - const DataLayout *DL = MF.getTarget().getDataLayout(); - return MF.getContext().getOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) + + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + Twine(MF.getFunctionNumber()) + "$poff"); } + +MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_gep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_lep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_toc" + + Twine(MF.getFunctionNumber())); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 607cdf6..10a8ce0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -197,6 +197,10 @@ public: bool usesPICBase() const { return UsesPICBase; } MCSymbol *getPICOffsetSymbol() const; + + MCSymbol *getGlobalEPSymbol() const; + MCSymbol *getLocalEPSymbol() const; + MCSymbol *getTOCOffsetSymbol() const; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2b09b2f..934bdf6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -200,7 +200,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::R2); // System-reserved register Reserved.set(PPC::R13); // Small Data Area pointer register } - + // On PPC64, r13 is the thread pointer. Never allocate this register.
if (TM.isPPC64()) { Reserved.set(PPC::R13); @@ -262,7 +262,7 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, default: return 0; case PPC::G8RC_NOX0RegClassID: - case PPC::GPRC_NOR0RegClassID: + case PPC::GPRC_NOR0RegClassID: case PPC::G8RCRegClassID: case PPC::GPRCRegClassID: { unsigned FP = TFI->hasFP(MF) ? 1 : 0; @@ -311,7 +311,7 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, //===----------------------------------------------------------------------===// /// lowerDynamicAlloc - Generate the code for allocating an object in the -/// current frame. The sequence of code with be in the general form +/// current frame. The sequence of code will be in the general form /// /// addi R0, SP, \#frameSize ; get the address of the previous frame /// stwxu R0, SP, Rnegsize ; add and update the SP with the negated size @@ -337,7 +337,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); // Get the total frame size. unsigned FrameSize = MFI->getStackSize(); - + // Get stack alignments. const PPCFrameLowering *TFI = getFrameLowering(MF); unsigned TargetAlign = TFI->getStackAlignment(); @@ -347,14 +347,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Determine the previous frame's address. If FrameSize can't be // represented as 16 bits or we need special alignment, then we load the - // previous frame's address from 0(SP). Why not do an addis of the hi? - // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. - // Constructing the constant and adding would take 3 instructions. + // previous frame's address from 0(SP). Why not do an addis of the hi? + // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. + // Constructing the constant and adding would take 3 instructions. // Fortunately, a frame greater than 32K is rare. const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) @@ -425,11 +425,32 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { .addReg(PPC::R1) .addImm(maxCallFrameSize); } - + // Discard the DYNALLOC instruction. MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. 
This generates /// code like this: @@ -459,8 +480,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + // If the saved register wasn't CR0, shift the bits left so that they are in // CR0's slot. if (SrcReg != PPC::CR0) { @@ -549,8 +570,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(getCRFromCRBit(SrcReg)); - + .addReg(getCRFromCRBit(SrcReg)); + // If the saved register wasn't CR0LT, shift the bits left so that the bit to // store is the first one. Mask all but that bit. unsigned Reg1 = Reg; @@ -602,17 +623,19 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, unsigned ShiftBits = getEncodingValue(DestReg); // rlwimi r11, r10, 32-ShiftBits, ..., ... BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO) - .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill) - .addImm(ShiftBits ? 32-ShiftBits : 0) - .addImm(ShiftBits).addImm(ShiftBits); - + .addReg(RegO, RegState::Kill) + .addReg(Reg, RegState::Kill) + .addImm(ShiftBits ? 32 - ShiftBits : 0) + .addImm(ShiftBits) + .addImm(ShiftBits); + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), getCRFromCRBit(DestReg)) - .addReg(RegO, RegState::Kill) - // Make sure we have a use dependency all the way through this - // sequence of instructions. We can't have the other bits in the CR - // modified in between the mfocrf and the mtocrf. - .addReg(getCRFromCRBit(DestReg), RegState::Implicit); + .addReg(RegO, RegState::Kill) + // Make sure we have a use dependency all the way through this + // sequence of instructions. We can't have the other bits in the CR + // modified in between the mfocrf and the mtocrf. + .addReg(getCRFromCRBit(DestReg), RegState::Implicit); // Discard the pseudo instruction. MBB.erase(II); @@ -634,11 +657,11 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, unsigned SrcReg = MI.getOperand(0).getReg(); BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - - addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW)) - .addReg(Reg, RegState::Kill), - FrameIndex); + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + addFrameReference( + BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill), + FrameIndex); // Discard the pseudo instruction. MBB.erase(II); @@ -671,9 +694,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, MBB.erase(II); } -bool -PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, - unsigned Reg, int &FrameIdx) const { +bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 // ABI, return true to prevent allocating an additional frame slot. 
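The 32 - ShiftBits rotate amounts in the CR spill and restore lowerings above are inverse rotations: rotating left by 32 - N undoes a left rotation by N, and the ShiftBits ? ... : 0 guard keeps the amount in the valid 0..31 range when N is 0. A standalone check (rotl32 is a local helper, not an LLVM API):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31; // rlwinm/rlwimi rotate amounts are 0..31; 32 behaves as 0
  return N ? (X << N) | (X >> (32 - N)) : X;
}

int main() {
  uint32_t X = 0x12345678u;
  for (unsigned N = 0; N < 32; ++N) {
    uint32_t Saved = rotl32(X, N);             // spill: rotate into slot 0
    assert(rotl32(Saved, (32 - N) & 31) == X); // restore: rotate back
  }
}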
@@ -752,7 +774,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FPSI = FI->getFramePointerSaveIndex(); // Get the instruction opcode. unsigned OpC = MI.getOpcode(); - + + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { @@ -800,8 +827,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we're not using a Frame Pointer that has been set to the value of the // SP before having the stack size subtracted from it, then add the stack size // to Offset to get the correct offset. - // Naked functions have stack size 0, although getStackSize may not reflect that - // because we didn't call all the pieces that compute it for naked functions. + // Naked functions have stack size 0, although getStackSize may not reflect + // that because we didn't call all the pieces that compute it for naked + // functions. if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) { if (!(hasBasePointer(MF) && FrameIndex < 0)) Offset += MFI->getStackSize(); @@ -840,7 +868,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, .addImm(Offset); // Convert into indexed form of the instruction: - // + // // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0 // addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0 unsigned OperandBase; @@ -898,24 +926,6 @@ bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const { return needsStackRealignment(MF); } -bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const PPCFrameLowering *TFI = getFrameLowering(MF); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - /// Returns true if the instruction's frame index /// reference would be better served by a base register other than FP /// or SP. Used by LocalStackFrameAllocation to determine which frame index diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index d304e1d..b15fde8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -54,13 +54,13 @@ inline static unsigned getCRFromCRBit(unsigned SrcReg) { return Reg; } - class PPCRegisterInfo : public PPCGenRegisterInfo { DenseMap<unsigned, unsigned> ImmToIdxMap; const PPCTargetMachine &TM; + public: PPCRegisterInfo(const PPCTargetMachine &TM); - + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. 
const TargetRegisterClass * @@ -77,7 +77,7 @@ public: const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; void adjustStackMapLiveOutMask(uint32_t *Mask) const override; @@ -101,6 +101,7 @@ public: } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, @@ -115,9 +116,9 @@ public: unsigned FrameIndex) const; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, - int &FrameIdx) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, + int &FrameIdx) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; // Support for virtual base registers. @@ -136,8 +137,6 @@ public: // Base pointer (stack realignment) support. unsigned getBaseRegister(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 58dacca..c0fcb6c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -62,6 +62,7 @@ void PPCSubtarget::initializeEnvironment() { Has64BitSupport = false; Use64BitRegs = false; UseCRBits = false; + UseSoftFloat = false; HasAltivec = false; HasSPE = false; HasQPX = false; @@ -100,6 +101,8 @@ void PPCSubtarget::initializeEnvironment() { HasDirectMove = false; IsQPXStackUnaligned = false; HasHTM = false; + HasFusion = false; + HasFloat128 = false; } void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -210,5 +213,33 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } +unsigned char PPCSubtarget::classifyGlobalReference( + const GlobalValue *GV) const { + // Note that currently we don't generate non-pic references. + // If a caller wants that, this will have to be updated. + + // Large code model always uses the TOC even for local symbols. + if (TM.getCodeModel() == CodeModel::Large) + return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + + unsigned char flags = PPCII::MO_PIC_FLAG; + + // Only if the relocation mode is PIC do we have to worry about + // interposition. In all other cases we can use a slightly looser standard to + // decide how to access the symbol. + if (TM.getRelocationModel() == Reloc::PIC_) { + // If it's local, or it's non-default, it can't be interposed. 
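The classification being built here (the non-PIC cases follow just below) amounts to a small decision table. A sketch with simplified inputs; Flags, GVInfo, and classify are illustrative stand-ins, not the patch's types:

#include <cassert>

enum Flags { PIC = 1, NLP = 2 }; // stand-ins for MO_PIC_FLAG / MO_NLP_FLAG
struct GVInfo { bool LocalLinkage, DefaultVisibility, StrongDef; };

// Large code model: always indirect through the TOC. PIC relocation: only a
// preemptible (non-local, default-visibility) symbol needs the TOC. Static:
// anything that is not a strong definition for the linker needs the TOC.
unsigned classify(bool LargeModel, bool PICReloc, GVInfo GV) {
  if (LargeModel)
    return PIC | NLP;
  if (PICReloc)
    return (!GV.LocalLinkage && GV.DefaultVisibility) ? (PIC | NLP) : PIC;
  return GV.StrongDef ? PIC : (PIC | NLP);
}

int main() {
  // A local or hidden symbol under PIC can be referenced directly.
  assert(classify(false, true, {true, false, true}) == PIC);
  // A preemptible symbol always takes the indirect (TOC) path.
  assert(classify(false, true, {false, true, true}) == (PIC | NLP));
  // Without PIC, a strong definition can still be referenced directly.
  assert(classify(false, false, {false, true, true}) == PIC);
}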
+ if (!GV->hasLocalLinkage() && + GV->hasDefaultVisibility()) { + flags |= PPCII::MO_NLP_FLAG; + } + return flags; + } + + if (GV->isStrongDefinitionForLinker()) + return flags; + return flags | PPCII::MO_NLP_FLAG; +} + bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h index 0616c1f..4f5c95c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -83,6 +83,7 @@ protected: bool Has64BitSupport; bool Use64BitRegs; bool UseCRBits; + bool UseSoftFloat; bool IsPPC64; bool HasAltivec; bool HasSPE; @@ -119,6 +120,8 @@ protected: bool HasPartwordAtomics; bool HasDirectMove; bool HasHTM; + bool HasFusion; + bool HasFloat128; /// When targeting QPX running a stock PPC64 Linux kernel where the stack /// alignment has not been changed, we need to keep the 16-byte alignment @@ -188,6 +191,8 @@ public: /// has64BitSupport - Return true if the selected CPU supports 64-bit /// instructions, regardless of whether we are in 32-bit or 64-bit mode. bool has64BitSupport() const { return Has64BitSupport; } + // useSoftFloat - Return true if soft-float option is turned on. + bool useSoftFloat() const { return UseSoftFloat; } /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit /// registers in 32-bit mode when possible. This can only be true if @@ -254,6 +259,8 @@ public: return 16; } bool hasHTM() const { return HasHTM; } + bool hasFusion() const { return HasFusion; } + bool hasFloat128() const { return HasFloat128; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -285,6 +292,10 @@ public: bool useAA() const override; bool enableSubRegLiveness() const override; + + /// classifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it. + unsigned char classifyGlobalReference(const GlobalValue *GV) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 2dc0d82..a9d2e88 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,6 +99,11 @@ protected: break; } + // Don't really need to save data to the stack - the clobbered + // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) + // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + // Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) .addReg(InReg); @@ -113,6 +118,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI->getOperand(3)); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 1daf244..d24b590 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -42,6 +42,10 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); +static cl:: +opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, + cl::desc("Disable machine peepholes for PPC")); + static cl::opt<bool> EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -57,11 +61,19 @@ EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps", cl::desc("Add extra TOC register dependencies"), cl::init(true), cl::Hidden); +static cl::opt<bool> +EnableMachineCombinerPass("ppc-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target); RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializePPCBoolRetToIntPass(PR); } /// Return the datalayout string of a subtarget. @@ -118,7 +130,7 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, } if (OL != CodeGenOpt::None) { - if (!FullFS.empty()) + if (!FullFS.empty()) FullFS = "+invariant-function-descriptors," + FullFS; else FullFS = "+invariant-function-descriptors"; @@ -144,7 +156,7 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_ELFv2; assert(Options.MCOptions.getABIName().empty() && - "Unknown target-abi option!"); + "Unknown target-abi option!"); if (!TT.isMacOSX()) { switch (TT.getArch()) { @@ -160,9 +172,9 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_UNKNOWN; } -// The FeatureString here is a little subtle. We are modifying the feature string -// with what are (currently) non-function specific overrides as it goes into the -// LLVMTargetMachine constructor and then using the stored value in the +// The FeatureString here is a little subtle. We are modifying the feature +// string with what are (currently) non-function specific overrides as it goes +// into the LLVMTargetMachine constructor and then using the stored value in the // Subtarget constructor below it. PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -227,6 +239,19 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { ? FSAttr.getValueAsString().str() : TargetFS; + // FIXME: This is related to the code below to reset the target options; + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions.
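A sketch of the caching scheme this FIXME describes: the function's soft-float attribute is folded into the feature string, which then forms part of the subtarget cache key, so two functions differing only in that attribute get distinct subtargets. Subtarget and getSubtarget are illustrative stand-ins:

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Subtarget { std::string CPU, FS; }; // stand-in for PPCSubtarget

std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

Subtarget *getSubtarget(std::string CPU, std::string FS, bool SoftFloat) {
  if (SoftFloat) // mirror of the attribute handling below
    FS += FS.empty() ? "+soft-float" : ",+soft-float";
  auto &I = SubtargetMap[CPU + FS]; // key includes the derived feature
  if (!I)
    I = std::make_unique<Subtarget>(Subtarget{CPU, FS});
  return I.get();
}

int main() {
  // Same CPU and base features, but the soft-float function gets its own
  // subtarget because the derived feature changes the key.
  assert(getSubtarget("pwr8", "", false) != getSubtarget("pwr8", "", true));
}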
+ bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + auto &I = SubtargetMap[CPU + FS]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -277,6 +302,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { } void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch @@ -316,6 +343,10 @@ bool PPCPassConfig::addPreISel() { bool PPCPassConfig::addILPOpts() { addPass(&EarlyIfConverterID); + + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + return true; } @@ -339,6 +370,12 @@ void PPCPassConfig::addMachineSSAOptimization() { if (TM->getTargetTriple().getArch() == Triple::ppc64le && !DisableVSXSwapRemoval) addPass(createPPCVSXSwapRemovalPass()); + // Target-specific peephole cleanups performed after instruction + // selection. + if (!DisableMIPeephole) { + addPass(createPPCMIPeepholePass()); + addPass(&DeadMachineInstructionElimID); + } } void PPCPassConfig::addPreRegAlloc() { @@ -364,6 +401,7 @@ void PPCPassConfig::addPreEmitPass() { } TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); + }); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp index 9ee5db9..798bb9d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -42,9 +42,7 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( if (Kind.isReadOnly()) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); - if (GVar && GVar->isConstant() && - (GVar->getInitializer()->getRelocationInfo() == - Constant::GlobalRelocations)) + if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation()) Kind = SectionKind::getReadOnlyWithRel(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index e21c2b7..cd86dab 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -35,7 +35,7 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Imm, Ty); @@ -64,8 +64,8 @@ unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(IID, Idx, Imm, Ty); @@ -98,8 +98,8 @@ unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -unsigned 
PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); @@ -197,9 +197,20 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { + // On the A2, always unroll aggressively. For QPX unaligned loads, we depend + // on combining the loads generated for consecutive accesses, and failure to + // do so is particularly expensive. Aggressive interleaving makes such + // combining much more likely (compared to only using concatenation + // unrolling). + if (ST->getDarwinDirective() == PPC::DIR_A2) + return true; + return LoopHasReductions; } +bool PPCTTIImpl::enableInterleavedAccessVectorization() { + return true; +} + unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasAltivec() && !ST->hasQPX()) return 0; @@ -246,7 +257,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned PPCTTIImpl::getArithmeticInstrCost( +int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { @@ -257,24 +268,30 @@ unsigned PPCTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). We need one such shuffle instruction for each actual + // register (this is not true for arbitrary shuffles, but is true for the + // structured types of shuffles covered by TTI::ShuffleKind). + return LT.first; } -unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -313,41 +330,83 @@ unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Legalize the type.
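The shuffle-cost rationale above boils down to one permute per legalized register. A rough standalone rendering (shuffleCost and the fixed 128-bit register width are illustrative; real legalization may promote or split types differently, which is what LT.first captures):

#include <algorithm>
#include <cassert>

// Stand-in for LT.first: how many 128-bit registers the vector occupies.
unsigned shuffleCost(unsigned VecBits, unsigned RegBits = 128) {
  return std::max(1u, VecBits / RegBits);
}

int main() {
  assert(shuffleCost(128) == 1); // v4i32: a single vperm/xxpermdi
  assert(shuffleCost(256) == 2); // v8i32 splits into two registers
}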
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); - unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - - // VSX loads/stores support unaligned access. - if (ST->hasVSX()) { - if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) - return Cost; - } + int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - bool UnalignedAltivec = - Src->isVectorTy() && - Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && - LT.second.getSizeInBits() == 128 && - Opcode == Instruction::Load; + // Aligned loads and stores are easy. + unsigned SrcBytes = LT.second.getStoreSize(); + if (!SrcBytes || !Alignment || Alignment >= SrcBytes) + return Cost; + + bool IsAltivecType = ST->hasAltivec() && + (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || + LT.second == MVT::v4i32 || LT.second == MVT::v4f32); + bool IsVSXType = ST->hasVSX() && + (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); + bool IsQPXType = ST->hasQPX() && + (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); + + // If we can use the permutation-based load sequence, then this is also + // relatively cheap (not counting loop-invariant instructions): one load plus + // one permute (the last load in a series has extra cost, but we're + // neglecting that here). Note that on the P7, we should do unaligned loads + // for Altivec types using the VSX instructions, but that's more expensive + // than using the permutation-based load sequence. On the P8, that's no + // longer true. + if (Opcode == Instruction::Load && + ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + Alignment >= LT.second.getScalarType().getStoreSize()) + return Cost + LT.first; // Add the cost of the permutations. + + // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the + // P7, unaligned vector loads are more expensive than the permutation-based + // load sequence, so that might be used instead, but regardless, the net cost + // is about the same (not counting loop-invariant instructions). + if (IsVSXType || (ST->hasVSX() && IsAltivecType)) + return Cost; // PPC in general does not support unaligned loads and stores. They'll need // to be decomposed based on the alignment factor. - unsigned SrcBytes = LT.second.getStoreSize(); - if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { - Cost += LT.first*(SrcBytes/Alignment-1); - - // For a vector type, there is also scalarization overhead (only for - // stores, loads are expanded using the vector-load + permutation sequence, - // which is much less expensive). - if (Src->isVectorTy() && Opcode == Instruction::Store) - for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); - } + + // Add the cost of each scalar load or store. + Cost += LT.first*(SrcBytes/Alignment-1); + + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). 
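The decomposition just described is a simple count: an access of SrcBytes bytes with only Alignment-byte alignment is modeled as SrcBytes / Alignment pieces, and a misaligned vector store additionally pays one extract per element. A sketch treating each piece and each extract as unit cost (unalignedExtraOps is illustrative; the real code also scales by the legalized register count):

#include <cassert>

unsigned unalignedExtraOps(unsigned SrcBytes, unsigned Alignment,
                           bool IsStore, unsigned NumElts) {
  unsigned Extra = SrcBytes / Alignment - 1; // pieces beyond the baseline
  if (IsStore)
    Extra += NumElts; // scalarization overhead applies to stores only
  return Extra;
}

int main() {
  assert(unalignedExtraOps(16, 4, false, 4) == 3);  // 16B load, 4B aligned
  assert(unalignedExtraOps(16, 4, true, 4) == 7);   // same store: +4 extracts
  assert(unalignedExtraOps(16, 16, false, 4) == 0); // aligned: no penalty
}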
+ if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + + return Cost; +} + +int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(isa<VectorType>(VecTy) && + "Expect a vector type for interleaved memory op"); + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy); + + // First, the cost of the load/store operation. + int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). For each result vector, we need one shuffle per incoming + // vector (except that the first shuffle can take two incoming vectors + // because it does not need to take itself). + Cost += Factor*(LT.first-1); return Cost; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 368bef9..04c1b02 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -37,7 +37,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { const PPCTargetLowering *getTLI() const { return TLI; } public: - explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F) + explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,12 +52,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -68,22 +67,27 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type
*ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp index 5e3ae2a..782583c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -77,6 +77,14 @@ namespace { return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); } + bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI); + } + + bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI); + } + protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -100,7 +108,9 @@ protected: IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(SrcMO.getReg(), MRI) || - IsVRReg(SrcMO.getReg(), MRI)) && + IsVRReg(SrcMO.getReg(), MRI) || + IsVSSReg(SrcMO.getReg(), MRI) || + IsVSFReg(SrcMO.getReg(), MRI)) && "Unknown source for a VSX copy"); unsigned NewVReg = MRI.createVirtualRegister(SrcRC); @@ -123,6 +133,8 @@ protected: IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(DstMO.getReg(), MRI) || + IsVSFReg(DstMO.getReg(), MRI) || + IsVSSReg(DstMO.getReg(), MRI) || IsVRReg(DstMO.getReg(), MRI)) && "Unknown destination for a VSX copy"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 46b8d13..6b19a2f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -103,10 +103,10 @@ protected: VNInfo *AddendValNo = LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn(); - if (!AddendValNo) { - // This can be null if the register is undef. + + // This can be null if the register is undef. + if (!AddendValNo) continue; - } MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def); @@ -186,18 +186,17 @@ protected: if (!KilledProdOp) continue; - // If the addend copy is used only by this MI, then the addend source - // register is likely not live here. This could be fixed (based on the - // legality checks above, the live range for the addend source register - // could be extended), but it seems likely that such a trivial copy can - // be coalesced away later, and thus is not worth the effort. - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + // If the addend copy is used only by this MI, then the addend source + // register is likely not live here. This could be fixed (based on the + // legality checks above, the live range for the addend source register + // could be extended), but it seems likely that such a trivial copy can + // be coalesced away later, and thus is not worth the effort. + if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) continue; // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. 
- unsigned AddReg = AddendMI->getOperand(1).getReg(); unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg(); unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg(); @@ -221,6 +220,14 @@ protected: if (OldFMAReg == KilledProdReg) continue; + // If there isn't a class that fits, we can't perform the transform. + // This is needed for correctness with a mixture of VSX and Altivec + // instructions to make sure that a low VSX register is not assigned to + // the Altivec instruction. + if (!MRI.constrainRegClass(KilledProdReg, + MRI.getRegClass(OldFMAReg))) + continue; + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && "Addend copy not tied to old FMA output!"); @@ -228,7 +235,7 @@ protected: MI->getOperand(0).setReg(KilledProdReg); MI->getOperand(1).setReg(KilledProdReg); - MI->getOperand(3).setReg(AddReg); + MI->getOperand(3).setReg(AddendSrcReg); MI->getOperand(2).setReg(OtherProdReg); MI->getOperand(0).setSubReg(KilledProdSubReg); @@ -263,8 +270,7 @@ protected: if (UseMI == AddendMI) continue; - UseMO.setReg(KilledProdReg); - UseMO.setSubReg(KilledProdSubReg); + UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI); } // Extend the live intervals of the killed product operand to hold the @@ -286,6 +292,20 @@ protected: } DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); + // Extend the live interval of the addend source (it might end at the + // copy to be removed, or somewhere in between there and here). This + // is necessary only if it is a physical register. + if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); + ++Units) { + unsigned Unit = *Units; + + LiveRange &AddendSrcRange = LIS->getRegUnit(Unit); + AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB), + FMAIdx.getRegSlot()); + DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n'); + } + FMAInt.removeValNo(FMAValNo); DEBUG(dbgs() << " trimmed: " << FMAInt << '\n'); @@ -347,7 +367,6 @@ INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID; char PPCVSXFMAMutate::ID = 0; -FunctionPass* -llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); } - - +FunctionPass *llvm::createPPCVSXFMAMutatePass() { + return new PPCVSXFMAMutate(); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index d7132d5..27c540f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -94,7 +94,7 @@ enum SHValues { SH_NOSWAP_ST, SH_SPLAT, SH_XXPERMDI, - SH_COPYSCALAR + SH_COPYWIDEN }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -149,6 +149,11 @@ private: // handling. Return true iff any changes are made. bool removeSwaps(); + // Insert a swap instruction from SrcReg to DstReg at the given + // InsertPoint. + void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg); + // Update instructions requiring special handling. void handleSpecialSwappables(int EntryIdx); @@ -159,9 +164,7 @@ private: bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { if (TargetRegisterInfo::isVirtualRegister(Reg)) return RC->hasSubClassEq(MRI->getRegClass(Reg)); - if (RC->contains(Reg)) - return true; - return false; + return RC->contains(Reg); } // Return true iff the given register is a full vector register. 
@@ -215,7 +218,7 @@ public: void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); - TII = static_cast<const PPCInstrInfo*>(MF->getSubtarget().getInstrInfo()); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); // An initial vector size of 256 appears to work well in practice. // Small/medium functions with vector content tend not to incur a @@ -343,6 +346,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { SwapVector[VecIdx].IsLoad = 1; SwapVector[VecIdx].IsSwap = 1; break; + case PPC::LXSDX: + case PPC::LXSSPX: + // A load of a floating-point value into the high-order half of + // a vector register is safe, provided that we introduce a swap + // following the load, which will be done by the SUBREG_TO_REG + // support. So just mark these as safe. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwappable = 1; + break; case PPC::STVX: // Non-permuting stores are currently unsafe. We can use special // handling for this in the future. By not marking these as @@ -385,7 +397,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { else if (isVecReg(MI.getOperand(0).getReg()) && isScalarVecReg(MI.getOperand(2).getReg())) { SwapVector[VecIdx].IsSwappable = 1; - SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN; } break; } @@ -420,7 +432,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::STVEHX: case PPC::STVEWX: case PPC::STVXL: + // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, + // by adding special handling for narrowing copies as well as + // widening ones. However, I've experimented with this, and in + // practice we currently do not appear to use STXSDX fed by + // a narrowing copy from a full vector register. Since I can't + // generate any useful test cases, I've left this alone for now. case PPC::STXSDX: + case PPC::STXSSPX: case PPC::VCIPHER: case PPC::VCIPHERLAST: case PPC::VMRGHB: @@ -543,7 +562,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, } if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { - SwapVector[VecIdx].MentionsPhysVR = 1; + if (!isScalarVecReg(CopySrcReg)) + SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; } @@ -629,8 +649,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", - Repr)); + format("Web %d rejected for physreg, partial reg, or not " + "swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); DEBUG(dbgs() << "\n"); @@ -743,6 +763,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { } } +// Create an xxswapd instruction and insert it prior to the given point. +// MI is used to determine basic block and debug loc information. +// FIXME: When inserting a swap, we should check whether SrcReg is +// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so, +// then instead we should generate a copy from Reg to DstReg. +void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI, + MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg) { + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(2); +} + // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. 
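One of the special cases handled below re-targets lane-indexed splats: removing the swaps exchanges the two doubleword halves of the register, so the element that was at index i is found at (i + N/2) mod N for an N-element vector. A quick check of that index arithmetic:

#include <cassert>

unsigned swappedSplatIndex(unsigned Index, unsigned NElts) {
  return (Index + NElts / 2) % NElts; // add N/2 modulo N, as described below
}

int main() {
  assert(swappedSplatIndex(0, 4) == 2);  // v4i32 (vspltw): lane 0 -> lane 2
  assert(swappedSplatIndex(3, 4) == 1);
  assert(swappedSplatIndex(5, 8) == 1);  // v8i16 (vsplth)
  assert(swappedSplatIndex(1, 16) == 9); // v16i8 (vspltb)
}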
@@ -752,8 +787,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (SwapVector[EntryIdx].SpecialHandling) { default: - assert(false && "Unexpected special handling type"); - break; + llvm_unreachable("Unexpected special handling type"); // For splats based on an index into a vector, add N/2 modulo N // to the index, where N is the number of vector elements. @@ -766,7 +800,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (MI->getOpcode()) { default: - assert(false && "Unexpected splat opcode"); + llvm_unreachable("Unexpected splat opcode"); case PPC::VSPLTB: NElts = 16; break; case PPC::VSPLTH: NElts = 8; break; case PPC::VSPLTW: NElts = 4; break; @@ -811,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // For a copy from a scalar floating-point register to a vector // register, removing swaps will leave the copied value in the // wrong lane. Insert a swap following the copy to fix this. - case SHValues::SH_COPYSCALAR: { + case SHValues::SH_COPYWIDEN: { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); @@ -825,14 +859,13 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { DEBUG(dbgs() << " Into: "); DEBUG(MI->dump()); - MachineBasicBlock::iterator InsertPoint = MI->getNextNode(); + auto InsertPoint = ++MachineBasicBlock::iterator(MI); // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG // is copying to a VRRC, we need to be careful to avoid a register // assignment problem. In this case we must copy from VRRC to VSRC // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. - if (DstRC == &PPC::VRRCRegClass) { unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); @@ -840,29 +873,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), VSRCTmp1) .addReg(NewVReg); - DEBUG(MI->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), VSRCTmp2) - .addReg(VSRCTmp1) - .addReg(VSRCTmp1) - .addImm(2); - DEBUG(MI->getNextNode()->getNextNode()->dump()); + insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1); + DEBUG(std::prev(InsertPoint)->dump()); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), DstReg) .addReg(VSRCTmp2); - DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); } else { - - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), DstReg) - .addReg(NewVReg) - .addReg(NewVReg) - .addImm(2); - - DEBUG(MI->getNextNode()->dump()); + insertSwap(MI, InsertPoint, DstReg, NewVReg); + DEBUG(std::prev(InsertPoint)->dump()); } break; } @@ -947,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_XXPERMDI: DEBUG(dbgs() << "special:xxpermdi "); break; - case SH_COPYSCALAR: - DEBUG(dbgs() << "special:copyscalar "); + case SH_COPYWIDEN: + DEBUG(dbgs() << "special:copywiden "); break; } } diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 1c4e486..a552747 100644 --- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCInst.h" 
#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -34,7 +35,6 @@ namespace { class SparcOperand; class SparcAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; MCAsmParser &Parser; /// @name Auto-generated Match Functions @@ -69,6 +69,10 @@ class SparcAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseBranchModifiers(OperandVector &Operands); + // Helper function for dealing with %lo / %hi in PIC mode. + const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr); + // returns true if Tok is matched to a register and returns register in RegNo. bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo, unsigned &RegKind); @@ -77,24 +81,24 @@ class SparcAsmParser : public MCTargetAsmParser { bool parseDirectiveWord(unsigned Size, SMLoc L); bool is64Bit() const { - return STI.getTargetTriple().getArch() == Triple::sparcv9; + return getSTI().getTargetTriple().getArch() == Triple::sparcv9; } void expandSET(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); public: - SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } }; - static unsigned IntRegs[32] = { + static const MCPhysReg IntRegs[32] = { Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3, Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7, Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3, @@ -104,7 +108,7 @@ public: Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3, Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 }; - static unsigned FloatRegs[32] = { + static const MCPhysReg FloatRegs[32] = { Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3, Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7, Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11, @@ -114,7 +118,7 @@ public: Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27, Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 }; - static unsigned DoubleRegs[32] = { + static const MCPhysReg DoubleRegs[32] = { Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3, Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7, Sparc::D8, Sparc::D7, Sparc::D8, Sparc::D9, @@ -124,13 +128,13 @@ public: Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27, Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 }; - static unsigned QuadFPRegs[32] = { + static const MCPhysReg QuadFPRegs[32] = { Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3, Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7, Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11, Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 }; - static unsigned ASRRegs[32] = { + static const MCPhysReg ASRRegs[32] = { SP::Y, SP::ASR1, SP::ASR2, SP::ASR3, SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7, SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11, @@ -140,6 +144,12 @@ public: SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; + static const MCPhysReg IntPairRegs[] = { + Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7, + Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7, + Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7, + Sparc::I0_I1, Sparc::I2_I3, 
Sparc::I4_I5, Sparc::I6_I7}; + /// SparcOperand - Instances of this class represent a parsed Sparc machine /// instruction. class SparcOperand : public MCParsedAsmOperand { @@ -147,6 +157,7 @@ public: enum RegisterKind { rk_None, rk_IntReg, + rk_IntPairReg, rk_FloatReg, rk_DoubleReg, rk_QuadReg, @@ -200,6 +211,10 @@ public: bool isMEMrr() const { return Kind == k_MemoryReg; } bool isMEMri() const { return Kind == k_MemoryImm; } + bool isIntReg() const { + return (Kind == k_Register && Reg.Kind == rk_IntReg); + } + bool isFloatReg() const { return (Kind == k_Register && Reg.Kind == rk_FloatReg); } @@ -330,6 +345,25 @@ public: return Op; } + static bool MorphToIntPairReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); + assert(Op.Reg.Kind == rk_IntReg); + unsigned regIdx = 32; + if (Reg >= Sparc::G0 && Reg <= Sparc::G7) + regIdx = Reg - Sparc::G0; + else if (Reg >= Sparc::O0 && Reg <= Sparc::O7) + regIdx = Reg - Sparc::O0 + 8; + else if (Reg >= Sparc::L0 && Reg <= Sparc::L7) + regIdx = Reg - Sparc::L0 + 16; + else if (Reg >= Sparc::I0 && Reg <= Sparc::I7) + regIdx = Reg - Sparc::I0 + 24; + if (regIdx % 2 || regIdx > 31) + return false; + Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Kind = rk_IntPairReg; + return true; + } + static bool MorphToDoubleReg(SparcOperand &Op) { unsigned Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); @@ -407,7 +441,22 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, // the imm operand can be either an expression or an immediate. bool IsImm = Inst.getOperand(1).isImm(); - uint64_t ImmValue = IsImm ? MCValOp.getImm() : 0; + int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0; + + // Allow either a signed or unsigned 32-bit immediate. + if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) { + Error(IDLoc, "set: argument must be between -2147483648 and 4294967295"); + return; + } + + // If the value was expressed as a large unsigned number, that's ok. + // We want to see if it "looks like" a small signed number. + int32_t ImmValue = RawImmValue; + // For 'set' you can't use 'or' with a negative operand on V9 because + // that would splat the sign bit across the upper half of the destination + // register, whereas 'set' is defined to zero the high 32 bits. + bool IsEffectivelyImm13 = + IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096); const MCExpr *ValExpr; if (IsImm) ValExpr = MCConstantExpr::create(ImmValue, getContext()); @@ -416,10 +465,12 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, MCOperand PrevReg = MCOperand::createReg(Sparc::G0); - if (!IsImm || (ImmValue & ~0x1fff)) { + // If not just a signed imm13 value, then either we use a 'sethi' with a + // following 'or', or a 'sethi' by itself if there are no more 1 bits. + // In either case, start with the 'sethi'. + if (!IsEffectivelyImm13) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext()); + const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::SETHIi); TmpInst.addOperand(MCRegOp); @@ -428,10 +479,23 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, PrevReg = MCRegOp; } - if (!IsImm || ((ImmValue & 0x1fff) != 0 || ImmValue == 0)) { + // The low bits require touching in 3 cases: + // * A non-immediate value will always require both instructions. + // * An effectively imm13 value needs only an 'or' instruction. 
+ // * Otherwise, an immediate that is not effectively imm13 requires the + // 'or' only if bits remain after clearing the 22 bits that 'sethi' set. + // If the low bits are known zeros, there's nothing to do. + // In the second case, and only in that case, must we NOT clear + // bits of the immediate value via the %lo() assembler function. + // Note also, the 'or' instruction doesn't mind a large value in the case + // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean. + if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext()); + const MCExpr *Expr; + if (IsEffectivelyImm13) + Expr = ValExpr; + else + Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::ORri); TmpInst.addOperand(MCRegOp); @@ -463,7 +527,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } for (const MCInst &I : Instructions) { - Out.EmitInstruction(I, STI); + Out.EmitInstruction(I, getSTI()); } return false; } @@ -742,6 +806,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case Sparc::PSR: Op = SparcOperand::CreateToken("%psr", S); break; + case Sparc::FSR: + Op = SparcOperand::CreateToken("%fsr", S); + break; case Sparc::WIM: Op = SparcOperand::CreateToken("%wim", S); break; @@ -766,6 +833,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case AsmToken::Minus: case AsmToken::Integer: case AsmToken::LParen: + case AsmToken::Dot: if (!getParser().parseExpression(EVal, E)) Op = SparcOperand::CreateImm(EVal, S, E); break; @@ -848,6 +916,13 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + // %fprs is an alias of %asr6. 
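A worked illustration of the expandSET rules spelled out in the comments above (a hypothetical standalone helper that only prints assembler text; not part of the parser): a signed 13-bit immediate needs a single 'or' against %g0 (a mov), anything else starts with sethi %hi(v) to materialize the high 22 bits, and the trailing 'or' with %lo(v) is emitted only when the low 10 bits are non-zero.

#include <cstdint>
#include <cstdio>

void expandSet(int64_t v, bool is64Bit) {
  // On V9, 'or' with a negative imm13 would sign-extend into the upper
  // half, so only 0..4095 counts as effectively imm13 there.
  bool isImm13 = (is64Bit ? 0 : -4096) <= v && v < 4096;
  if (isImm13) {
    std::printf("or %%g0, %lld, %%o0\n", (long long)v); // plain mov
    return;
  }
  uint32_t u = (uint32_t)v;
  std::printf("sethi %%hi(0x%x), %%o0\n", (unsigned)u); // bits 31..10
  if (u & 0x3ff)                                        // low 10 bits left?
    std::printf("or %%o0, %%lo(0x%x), %%o0\n", (unsigned)u);
}

int main() {
  expandSet(123, false);        // or %g0, 123, %o0
  expandSet(0x12345678, false); // sethi + or
  expandSet(0x12345400, false); // sethi only: low 10 bits are zero
}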
+ if (name.equals("fprs")) { + RegNo = ASRRegs[6]; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("icc")) { RegNo = Sparc::ICC; RegKind = SparcOperand::rk_Special; @@ -860,6 +935,12 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + if (name.equals("fsr")) { + RegNo = Sparc::FSR; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wim")) { RegNo = Sparc::WIM; RegKind = SparcOperand::rk_Special; @@ -943,6 +1024,82 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, RegKind = SparcOperand::rk_IntReg; return true; } + + if (name.equals("tpc")) { + RegNo = Sparc::TPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tnpc")) { + RegNo = Sparc::TNPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tstate")) { + RegNo = Sparc::TSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tt")) { + RegNo = Sparc::TT; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tick")) { + RegNo = Sparc::TICK; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tba")) { + RegNo = Sparc::TBA; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pstate")) { + RegNo = Sparc::PSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tl")) { + RegNo = Sparc::TL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pil")) { + RegNo = Sparc::PIL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cwp")) { + RegNo = Sparc::CWP; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cansave")) { + RegNo = Sparc::CANSAVE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("canrestore")) { + RegNo = Sparc::CANRESTORE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cleanwin")) { + RegNo = Sparc::CLEANWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("otherwin")) { + RegNo = Sparc::OTHERWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wstate")) { + RegNo = Sparc::WSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } } return false; } @@ -975,6 +1132,32 @@ static bool hasGOTReference(const MCExpr *Expr) { return false; } +const SparcMCExpr * +SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr) +{ + // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently. + // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is + // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted + // as %got10 or %got22 relocation. + + if (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) { + switch(VK) { + default: break; + case SparcMCExpr::VK_Sparc_LO: + VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC10 + : SparcMCExpr::VK_Sparc_GOT10); + break; + case SparcMCExpr::VK_Sparc_HI: + VK = (hasGOTReference(subExpr) ? 
SparcMCExpr::VK_Sparc_PC22 + : SparcMCExpr::VK_Sparc_GOT22); + break; + } + } + + return SparcMCExpr::create(VK, subExpr, getContext()); +} + bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc) { @@ -998,30 +1181,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, if (Parser.parseParenExpression(subExpr, EndLoc)) return false; - bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_; - - // Ugly: if a sparc assembly expression says "%hi(...)" but the - // expression within contains _GLOBAL_OFFSET_TABLE_, it REALLY means - // %pc22. Same with %lo -> %pc10. Worse, if it doesn't contain that, - // the meaning depends on whether the assembler was invoked with - // -KPIC or not: if so, it really means %got22/%got10; if not, it - // actually means what it said! Sigh, historical mistakes... - - switch(VK) { - default: break; - case SparcMCExpr::VK_Sparc_LO: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC10 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK)); - break; - case SparcMCExpr::VK_Sparc_HI: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC22 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK)); - break; - } - - EVal = SparcMCExpr::create(VK, subExpr, getContext()); + EVal = adjustPICRelocation(VK, subExpr); return true; } @@ -1051,5 +1211,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp, break; } } + if (Op.isIntReg() && Kind == MCK_IntPair) { + if (SparcOperand::MorphToIntPairReg(Op)) + return MCTargetAsmParser::Match_Success; + } return Match_InvalidOperand; } diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 38bff44..c689b7f 100644 --- a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { continue; } + // TODO: If we ever want to support v7, this needs to be extended + // to cover all floating point operations. 
if (!Subtarget->isV9() && (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD || MI->getOpcode() == SP::FCMPQ)) { diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 3e56b9e..51751ec 100644 --- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -117,6 +117,19 @@ static const unsigned ASRRegDecoderTable[] = { SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; +static const unsigned PRRegDecoderTable[] = { + SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE, + SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN, + SP::OTHERWIN, SP::WSTATE +}; + +static const uint16_t IntPairDecoderTable[] = { + SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7, + SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7, + SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7, + SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7, +}; + static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -196,9 +209,34 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo >= array_lengthof(PRRegDecoderTable)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo > 31) + return MCDisassembler::Fail; + + if ((RegNo & 1)) + S = MCDisassembler::SoftFail; + + unsigned RegisterPair = IntPairDecoderTable[RegNo/2]; + Inst.addOperand(MCOperand::createReg(RegisterPair)); + return S; +} static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder); static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address, @@ -207,6 +245,8 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn, @@ -326,6 +366,12 @@ static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, true, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, @@ -350,6 +396,12 @@ static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, 
unsigned insn, + uint64_t Address, const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, false, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h index 0b01b88..6f06d1d 100644 --- a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class SparcInstPrinter : public MCInstPrinter { public: SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 12386f1..ad44122 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -21,6 +21,7 @@ class Triple; class SparcELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; + public: explicit SparcELFMCAsmInfo(const Triple &TheTriple); const MCExpr* diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index d08ad86..13f0819 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -90,8 +90,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index c5f046b..e3b0f52 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -267,11 +267,11 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI) LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo()); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { MCInst TmpInst; - LowerSparcMachineInstrToMCInst(I, TmpInst, *this); + LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this); EmitToStreamer(*OutStreamer, TmpInst); } while ((++I != E) && I->isInsideBundle()); // Delay slot check. 
} @@ -296,7 +296,7 @@ void SparcAsmPrinter::EmitFunctionBodyStart() { void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand (opNum); SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags(); @@ -373,7 +373,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << MO.getSymbolName(); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); break; default: diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td index dfaaabf..0aa29d1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td +++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td @@ -21,7 +21,11 @@ def CC_Sparc32 : CallingConv<[ // i32 f32 arguments get passed in integer registers if there is space. CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, // f64 arguments are split and passed through registers or through stack. - CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>, + CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>, + // As are v2i32 arguments (this would be the default behavior for + // v2i32 if it wasn't allocated to the IntPair register-class) + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>, + // Alternatively, they are assigned to the stack in 4-byte aligned units. CCAssignToStack<4, 4> @@ -30,7 +34,8 @@ def CC_Sparc32 : CallingConv<[ def RetCC_Sparc32 : CallingConv<[ CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>, - CCIfType<[f64], CCAssignToReg<[D0, D1]>> + CCIfType<[f64], CCAssignToReg<[D0, D1]>>, + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">> ]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index c0279da..39b5e80 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -44,7 +44,7 @@ void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, unsigned ADDrr, unsigned ADDri) const { - DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); @@ -90,8 +90,23 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo *MFI = MF.getFrameInfo(); const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SparcRegisterInfo &RegInfo = + *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF); + + // FIXME: unfortunately, returning false from canRealignStack + // actually just causes needsStackRealignment to return false, + // rather than reporting an error, as would be sensible. This is + // poor, but fixing that bogosity is going to be a large project. + // For now, just see if it's lied, and report an error here. 
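Restating the FIXME's point as a tiny standalone helper (illustrative names, with a thrown exception standing in for report_fatal_error): when realignment was refused but some stack object demands more than the ABI stack alignment, the only safe response is a hard error, which is exactly what the check that follows does.

#include <stdexcept>
#include <string>

// NeedsRealign is what needsStackRealignment() reported; MaxObjAlign is
// the largest alignment requested by any stack object; StackAlign is the
// ABI stack alignment.
void checkStackRealignment(bool NeedsRealign, unsigned MaxObjAlign,
                           unsigned StackAlign, const std::string &Fn) {
  if (!NeedsRealign && MaxObjAlign > StackAlign)
    throw std::runtime_error("Function \"" + Fn + "\" required stack "
                             "re-alignment, but it could not be handled");
}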
+ if (!NeedsStackRealignment && MFI->getMaxAlignment() > getStackAlignment()) + report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required " + "stack re-alignment, but LLVM couldn't handle it " + "(probably because it has a dynamic alloca)."); // Get the number of bytes to allocate from the FrameInfo int NumBytes = (int) MFI->getStackSize(); @@ -104,12 +119,43 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, SAVEri = SP::ADDri; SAVErr = SP::ADDrr; } - NumBytes = -MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); - emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri); + + // The SPARC ABI is a bit odd in that it requires a reserved 92-byte + // (128 in v9) area in the user's stack, starting at %sp. Thus, the + // first part of the stack that can actually be used is located at + // %sp + 92. + // + // We therefore need to add that offset to the total stack size + // after all the stack objects are placed by + // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack needs to be + // aligned *after* the extra size is added, we need to disable + // calculateFrameObjectOffsets's built-in stack alignment, by having + // targetHandlesStackFrameRounding return true. + + + // Add the extra call frame stack size, if needed. (This is the same + // code as in PrologEpilogInserter, but also gets disabled by + // targetHandlesStackFrameRounding) + if (MFI->adjustsStack() && hasReservedCallFrame(MF)) + NumBytes += MFI->getMaxCallFrameSize(); + + // Adds the SPARC subtarget-specific spill area to the stack + // size. Also ensures target-required alignment. + NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); + + // Finally, ensure that the size is sufficiently aligned for the + // data on the stack. + if (MFI->getMaxAlignment() > 0) { + NumBytes = RoundUpToAlignment(NumBytes, MFI->getMaxAlignment()); + } + + // Update stack size with corrected value. + MFI->setStackSize(NumBytes); + + emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri); MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - unsigned regFP = MRI->getDwarfRegNum(SP::I6, true); + unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true); // Emit ".cfi_def_cfa_register 30". unsigned CFIIndex = @@ -122,13 +168,19 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); - unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true); - unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true); + unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true); + unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true); // Emit ".cfi_register 15, 31". CFIIndex = MMI.addFrameInst( MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + + if (NeedsStackRealignment) { + // andn %o6, MaxAlign-1, %o6 + int MaxAlign = MFI->getMaxAlignment(); + BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1); + } } void SparcFrameLowering:: @@ -167,7 +219,6 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes == 0) return; - NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); } @@ -180,21 +231,69 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { // pointer register. 
This is true if the function has variable sized allocas or // if frame pointer elimination is disabled. bool SparcFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); return MF.getTarget().Options.DisableFramePointerElim(MF) || - MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken(); } +int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Addressable stack objects are accessed using neg. offsets from + // %fp, or positive offsets from %sp. + bool UseFP; + + // Sparc uses FP-based references in general, even when "hasFP" is + // false. That function is rather a misnomer, because %fp is + // actually always available, unless isLeafProc. + if (FuncInfo->isLeafProc()) { + // If there's a leaf proc, all offsets need to be %sp-based, + // because we haven't caused %fp to actually point to our frame. + UseFP = false; + } else if (isFixed) { + // Otherwise, argument access should always use %fp. + UseFP = true; + } else if (RegInfo->needsStackRealignment(MF)) { + // If there is dynamic stack realignment, all local object + // references need to be via %sp, to take account of the + // re-alignment. + UseFP = false; + } else { + // Finally, default to using %fp. + UseFP = true; + } + + int64_t FrameOffset = MF.getFrameInfo()->getObjectOffset(FI) + + Subtarget.getStackPointerBias(); + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FrameOffset; + } else { + FrameReg = SP::O6; // %sp + return FrameOffset + MF.getFrameInfo()->getStackSize(); + } +} + static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; return true; @@ -206,33 +305,42 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed + || !MRI.reg_nodbg_empty(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MRI.isPhysRegUsed(reg)) + if (MRI.reg_nodbg_empty(reg)) continue; - unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(!MRI.isPhysRegUsed(mapped_reg)); + + unsigned mapped_reg = reg - SP::I0 + SP::O0; + assert(MRI.reg_nodbg_empty(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); - // Mark the reg unused. 
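The %fp-versus-%sp choice coded in getFrameIndexReference above condenses to a small decision table. This standalone sketch (illustrative signature and types, not LLVM's) keeps the same order of tests and the same offset arithmetic: leaf procedures never establish %fp, fixed (argument) objects are addressed off %fp, and realigned frames force locals onto %sp because only %sp reflects the realignment.

#include <cstdint>

struct FrameRef { bool UseFP; int64_t Offset; }; // base register + offset

FrameRef frameIndexReference(bool IsLeafProc, bool IsFixedObject,
                             bool NeedsRealignment, int64_t ObjectOffset,
                             int64_t StackPointerBias, int64_t StackSize) {
  bool UseFP;
  if (IsLeafProc)
    UseFP = false;          // leaf procs never establish %fp
  else if (IsFixedObject)
    UseFP = true;           // incoming arguments live relative to %fp
  else if (NeedsRealignment)
    UseFP = false;          // only %sp accounts for the realignment
  else
    UseFP = true;           // the default case
  int64_t Offset = ObjectOffset + StackPointerBias;
  if (UseFP)
    return {true, Offset};            // negative offsets from %fp
  return {false, Offset + StackSize}; // positive offsets from %sp
}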
- MRI.setPhysRegUnused(reg); + // Also replace register pair super-registers. + if ((reg - SP::I0) % 2 == 0) { + unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1; + unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1; + MRI.replaceRegWith(preg, mapped_preg); + } } // Rewrite MBB's Live-ins. for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { + for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) { + if (!MBB->isLiveIn(reg)) + continue; + MBB->removeLiveIn(reg); + MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1); + } for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { if (!MBB->isLiveIn(reg)) continue; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h index 29fc7b7..cbb4dc0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -39,6 +39,14 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + bool targetHandlesStackFrameRounding() const override { return true; } + private: // Remap input registers to output registers for leaf procedure. void remapRegsForLeafProc(MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 340b72e..c4c6416 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "SparcTargetMachine.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Compiler.h" @@ -62,6 +63,7 @@ public: private: SDNode* getGlobalBaseReg(); + SDNode *SelectInlineAsm(SDNode *N); }; } // end anonymous namespace @@ -141,6 +143,181 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) { return true; } + +// Re-assemble i64 arguments split up in SelectionDAGBuilder's +// visitInlineAsm / GetRegistersForValue functions. +// +// Note: This function was copied from, and is essentially identical +// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that +// such hacking-up is necessary; a rethink of how inline asm operands +// are handled may be in order to make doing this more sane. +// +// TODO: fix inline asm support so I can simply tell it that 'i64' +// inputs to asm need to be allocated to the IntPair register type, +// and have that work. Then, delete this function. +SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){ + std::vector<SDValue> AsmNodeOperands; + unsigned Flag, Kind; + bool Changed = false; + unsigned NumOps = N->getNumOperands(); + + // Normally, i64 data is bound to two arbitrary GPRs for "%r" + // constraint. However, some instructions (e.g. ldd/std) require + // (even/even+1) GPRs. + + // So, here, we check for this case, and mutate the inline asm to use + // a single IntPair register instead, which guarantees such even/odd + // placement. + + SDLoc dl(N); + SDValue Glue = N->getGluedNode() ?
N->getOperand(NumOps-1) + : SDValue(nullptr,0); + + SmallVector<bool, 8> OpChanged; + // Glue node will be appended late. + for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) { + SDValue op = N->getOperand(i); + AsmNodeOperands.push_back(op); + + if (i < InlineAsm::Op_FirstOperand) + continue; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) { + Flag = C->getZExtValue(); + Kind = InlineAsm::getKind(Flag); + } + else + continue; + + // Immediate operands to inline asm in the SelectionDAG are modeled with + // two operands. The first is a constant of value InlineAsm::Kind_Imm, and + // the second is a constant with the value of the immediate. If we get here + // and we have a Kind_Imm, skip the next operand, and continue. + if (Kind == InlineAsm::Kind_Imm) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + + unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag); + if (NumRegs) + OpChanged.push_back(false); + + unsigned DefIdx = 0; + bool IsTiedToChangedOp = false; + // If it's a use that is tied with a previous def, it has no + // reg class constraint. + if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) + IsTiedToChangedOp = OpChanged[DefIdx]; + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef + && Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID)) + || NumRegs != 2) + continue; + + assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i+1); + SDValue V1 = N->getOperand(i+2); + unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); + unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + SDValue Chain = SDValue(N,0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32, + Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32, + RegCopy); + SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32, + RegCopy); + SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, + RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, Ops); + } + else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. 
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, + Chain.getValue(1)); + SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, + T0.getValue(1)); + SDValue Pair = SDValue( + CurDAG->getMachineNode( + TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32, + { + CurDAG->getTargetConstant(SP::IntPairRegClassID, dl, + MVT::i32), + T0, + CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32), + T1, + CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32), + }), + 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if(PairedReg.getNode()) { + OpChanged[OpChanged.size() -1 ] = true; + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( + Flag, dl, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. + i += 2; + } + } + + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); + if (!Changed) + return nullptr; + + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), + CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); + New->setNodeId(-1); + return New.getNode(); +} + SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { @@ -150,6 +327,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; + case ISD::INLINEASM: { + SDNode *ResNode = SelectInlineAsm(N); + if (ResNode) + return ResNode; + break; + } case SPISD::GLOBAL_BASE_REG: return getGlobalBaseReg(); diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 4879d4e..5e70ffe 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -49,9 +49,9 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT, return true; } -static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) +static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg RegList[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 @@ -77,6 +77,29 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, return true; } +static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) +{ + static const MCPhysReg RegList[] = { + SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 + }; + + // Try to get first reg. + if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + // Try to get second reg. 
+ if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + return true; +} + // Allocate a full-sized argument for the 64-bit ABI. static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -202,12 +225,34 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, RetOps.push_back(SDValue()); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), - OutVals[i], Flag); + SDValue Arg = OutVals[realRVLocIdx]; + + if (VA.needsCustom()) { + assert(VA.getLocVT() == MVT::v2i32); + // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would + // happen by default if this wasn't a legal type) + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout()))); + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1, + Flag); + } else + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); @@ -355,6 +400,7 @@ LowerFormalArguments_32(SDValue Chain, CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32); const unsigned StackOffset = 92; + bool IsLittleEndian = DAG.getDataLayout().isLittleEndian(); unsigned InIdx = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) { @@ -375,7 +421,8 @@ LowerFormalArguments_32(SDValue Chain, if (VA.isRegLoc()) { if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); + unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi); SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); @@ -396,9 +443,13 @@ LowerFormalArguments_32(SDValue Chain, &SP::IntRegsRegClass); LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32); } + + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -422,7 +473,7 @@ LowerFormalArguments_32(SDValue Chain, auto PtrVT = getPointerTy(DAG.getDataLayout()); if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::f64); + assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32); // If it is double-word aligned, just load.
if (Offset % 8 == 0) { int FI = MF.getFrameInfo()->CreateFixedObject(8, @@ -452,9 +503,12 @@ LowerFormalArguments_32(SDValue Chain, MachinePointerInfo(), false, false, false, 0); + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -468,16 +522,12 @@ LowerFormalArguments_32(SDValue Chain, Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo(), false, false, false, 0); + } else if (VA.getValVT() == MVT::f128) { + report_fatal_error("SPARCv8 does not handle f128 in calls; " + "pass indirectly"); } else { - ISD::LoadExtType LoadOp = ISD::SEXTLOAD; - // Sparc is big endian, so add an offset based on the ObjectVT. - unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8); - FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr, - DAG.getConstant(Offset, dl, MVT::i32)); - Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr, - MachinePointerInfo(), - VA.getValVT(), false, false, false,0); - Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load); + // We shouldn't see any other value types here. + llvm_unreachable("Unexpected ValVT encountered in frame lowering."); } InVals.push_back(Load); } @@ -612,7 +662,7 @@ LowerFormalArguments_64(SDValue Chain, InVals.push_back(DAG.getLoad( VA.getValVT(), DL, Chain, DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), - MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0)); } if (!IsVarArg) @@ -640,9 +690,9 @@ LowerFormalArguments_64(SDValue Chain, SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true); auto PtrVT = getPointerTy(MF.getDataLayout()); - OutChains.push_back( - DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), - MachinePointerInfo::getFixedStack(FI), false, false, 0)); + OutChains.push_back(DAG.getStore( + Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); } if (!OutChains.empty()) @@ -788,7 +838,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); if (VA.isMemLoc()) { unsigned Offset = VA.getLocMemOffset() + StackOffset; @@ -804,49 +854,53 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, - Arg, StackPtr, MachinePointerInfo(), - false, false, 0); - // Sparc is big-endian, so the high part comes first. - SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); - // Increment the pointer to the other half. - StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, - DAG.getIntPtrConstant(4, dl)); - // Load the low part. - SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); + if (VA.getLocVT() == MVT::f64) { + // Move the float value from the float registers into the + // integer registers.
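The EXTRACT_VECTOR_ELT pair that follows hands the two 32-bit halves of the f64 to integer argument locations, most-significant word first on this big-endian target. A standalone bit-level sketch of that split (illustrative only; the uint64_t view makes it endian-neutral, so it runs on any host):

#include <cassert>
#include <cstdint>
#include <cstring>

// Split an f64 into the two i32 parts passed in integer registers or
// stack slots: Part0 is the most-significant word, as on big-endian SPARC.
void splitF64(double D, uint32_t &Part0, uint32_t &Part1) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof Bits); // the ISD::BITCAST step
  Part0 = (uint32_t)(Bits >> 32);      // high word -> first location
  Part1 = (uint32_t)Bits;              // low word  -> second location
}

int main() {
  uint32_t Hi, Lo;
  splitF64(1.0, Hi, Lo); // IEEE-754 double 1.0 is 0x3FF0000000000000
  assert(Hi == 0x3FF00000u && Lo == 0u);
}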
+ + // TODO: The f64 -> v2i32 conversion is super-inefficient for + // constants: it sticks them in the constant pool, then loads + // to a fp register, then stores to temp memory, then loads to + // integer registers. + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg); + } + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout()))); if (VA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0)); assert(i+1 != e); CCValAssign &NextVA = ArgLocs[++i]; if (NextVA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo)); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1)); } else { - // Store the low part in stack. + // Store the second part in stack. unsigned Offset = NextVA.getLocMemOffset() + StackOffset; SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } } else { unsigned Offset = VA.getLocMemOffset() + StackOffset; - // Store the high part. + // Store the first part. SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo(), false, false, 0)); - // Store the low part. + // Store the second part. PtrOff = DAG.getIntPtrConstant(Offset + 4, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } @@ -990,8 +1044,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const if (!CalleeFn) return 0; - assert(CalleeFn->hasStructRetAttr() && - "Callee does not have the StructRet attribute."); + // It would be nice to check for the sret attribute on CalleeFn here, + // but since it is not part of the function type, any check will misfire. PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); Type *ElementTy = Ty->getElementType(); @@ -1370,15 +1424,60 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - auto &DL = *TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); + + // Instructions which use registers as conditionals examine all the + // bits (as does the pseudo SELECT_CC expansion). I don't think it + // matters much whether it's ZeroOrOneBooleanContent, or + // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the + // former. + setBooleanContents(ZeroOrOneBooleanContent); + setBooleanVectorContents(ZeroOrOneBooleanContent); // Set up the register classes. 
addRegisterClass(MVT::i32, &SP::IntRegsRegClass); addRegisterClass(MVT::f32, &SP::FPRegsRegClass); addRegisterClass(MVT::f64, &SP::DFPRegsRegClass); addRegisterClass(MVT::f128, &SP::QFPRegsRegClass); - if (Subtarget->is64Bit()) + if (Subtarget->is64Bit()) { addRegisterClass(MVT::i64, &SP::I64RegsRegClass); + } else { + // On 32bit sparc, we define a double-register 32bit register + // class, as well. This is modeled in LLVM as a 2-vector of i32. + addRegisterClass(MVT::v2i32, &SP::IntPairRegClass); + + // ...but almost all operations must be expanded, so set that as + // the default. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + setOperationAction(Op, MVT::v2i32, Expand); + } + // Truncating/extending stores/loads are also not supported. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand); + + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand); + + setTruncStoreAction(VT, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v2i32, VT, Expand); + } + // However, load and store *are* legal. + setOperationAction(ISD::LOAD, MVT::v2i32, Legal); + setOperationAction(ISD::STORE, MVT::v2i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal); + + // And we need to promote i64 loads/stores into vector load/store + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + + // Sadly, this doesn't work: + // AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + } // Turn FP extload into load/fextend for (MVT VT : MVT::fp_valuetypes()) { @@ -1396,10 +1495,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f64, Expand); // Custom legalize GlobalAddress nodes into LO/HI parts. - setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom); - setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::GlobalAddress, PtrVT, Custom); + setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom); + setOperationAction(ISD::ConstantPool, PtrVT, Custom); + setOperationAction(ISD::BlockAddress, PtrVT, Custom); // Sparc doesn't have sext_inreg, replace them with shl/sra setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -1579,9 +1678,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); - setExceptionPointerRegister(SP::I0); - setExceptionSelectorRegister(SP::I1); - setStackPointerRegisterToSaveRestore(SP::O6); setOperationAction(ISD::CTPOP, MVT::i32, @@ -1744,18 +1840,15 @@ void SparcTargetLowering::computeKnownBitsForTargetNode // set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition. 
static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, unsigned &SPCC) { - if (isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isNullValue() && + if (isNullConstant(RHS) && CC == ISD::SETNE && (((LHS.getOpcode() == SPISD::SELECT_ICC || LHS.getOpcode() == SPISD::SELECT_XCC) && LHS.getOperand(3).getOpcode() == SPISD::CMPICC) || (LHS.getOpcode() == SPISD::SELECT_FCC && LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) && - isa<ConstantSDNode>(LHS.getOperand(0)) && - isa<ConstantSDNode>(LHS.getOperand(1)) && - cast<ConstantSDNode>(LHS.getOperand(0))->isOne() && - cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) { + isOneConstant(LHS.getOperand(0)) && + isNullConstant(LHS.getOperand(1))) { SDValue CMPCC = LHS.getOperand(3); SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue(); LHS = CMPCC.getOperand(0); @@ -1821,7 +1914,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setHasCalls(true); return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // This is one of the absolute code models. @@ -1872,6 +1966,9 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2601,6 +2698,17 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG) return DAG.getMergeValues(Ops, dl); } +static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) +{ + LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode()); + + EVT MemVT = LdNode->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Load(Op, DAG); + + return Op; +} + // Lower a f128 store into two f64 stores. static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -2645,6 +2753,29 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) +{ + SDLoc dl(Op); + StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); + + EVT MemVT = St->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Store(Op, DAG); + + if (MemVT == MVT::i64) { + // Custom handling for i64 stores: turn it into a bitcast and a + // v2i32 store. 
+ SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue()); + SDValue Chain = DAG.getStore( + St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment(), + St->getAAInfo()); + return Chain; + } + + return SDValue(); +} + static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) { assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS) && "invalid opcode"); @@ -2752,7 +2883,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, SDValue MulResult = TLI.makeLibCall(DAG, RTLIB::MUL_I128, WideVT, - Args, 4, isSigned, dl).first; + Args, isSigned, dl).first; SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, MulResult, DAG.getIntPtrConstant(0, dl)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, @@ -2783,7 +2914,6 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { return SDValue(); } - SDValue SparcTargetLowering:: LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -2818,8 +2948,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); - case ISD::LOAD: return LowerF128Load(Op, DAG); - case ISD::STORE: return LowerF128Store(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::FADD: return LowerF128Op(Op, DAG, getLibcallName(RTLIB::ADD_F128), 2); case ISD::FSUB: return LowerF128Op(Op, DAG, @@ -2921,8 +3051,7 @@ SparcTargetLowering::expandSelectCC(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -3007,7 +3136,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI, .addReg(AddrReg).addImm(0); // Split the basic block MBB before MI and insert the loop block in the hole. - MachineFunction::iterator MFI = MBB; + MachineFunction::iterator MFI = MBB->getIterator(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -3149,9 +3278,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, &SP::IntRegsRegClass); + if (VT == MVT::v2i32) + return std::make_pair(0U, &SP::IntPairRegClass); + else + return std::make_pair(0U, &SP::IntRegsRegClass); } - } else if (!Constraint.empty() && Constraint.size() <= 5 + } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { // constraint = '{r<d>}' // Remove the braces from around the name. @@ -3227,5 +3359,24 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, getLibcallName(libCall), 1)); return; + case ISD::LOAD: { + LoadSDNode *Ld = cast<LoadSDNode>(N); + // Custom handling only for i64: turn i64 load into a v2i32 load, + // and a bitcast. 
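The bitcast-and-store rewrite in LowerSTORE above is behavior-preserving only because SPARC is big-endian: storing the two v2i32 words in order, word 0 at the lower address, reproduces the byte image of a direct 64-bit store. A quick standalone check of that equivalence (plain C++):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t V = 0x0123456789abcdefULL;

  // Byte image of a direct big-endian 64-bit store.
  unsigned char Direct[8];
  for (int I = 0; I < 8; ++I)
    Direct[I] = (unsigned char)(V >> (56 - 8 * I));

  // Byte image of storing the two 32-bit words in order, word 0
  // (the most-significant word) at the lower address.
  uint32_t W0 = uint32_t(V >> 32), W1 = uint32_t(V);
  unsigned char Split[8];
  for (int I = 0; I < 4; ++I) {
    Split[I]     = (unsigned char)(W0 >> (24 - 8 * I));
    Split[4 + I] = (unsigned char)(W1 >> (24 - 8 * I));
  }

  assert(std::memcmp(Direct, Split, sizeof Direct) == 0);
  return 0;
}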
+ if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64) + return; + + SDLoc dl(N); + SDValue LoadRes = DAG.getExtLoad( + Ld->getExtensionType(), dl, MVT::v2i32, + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo()); + + SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes); + Results.push_back(Res); + Results.push_back(LoadRes.getValue(1)); + return; + } } } diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h index bbc91a4..4e46709 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -89,6 +89,20 @@ namespace llvm { return MVT::i32; } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SP::I0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SP::I1; + } + /// getSetCCResultType - Return the ISD::SETCC ValueType EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -167,8 +181,8 @@ namespace llvm { } void ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>& Results, - SelectionDAG &DAG) const override; + SmallVectorImpl<SDValue>& Results, + SelectionDAG &DAG) const override; MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB, unsigned BROpcode) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td index 25cc652..d51e2cc 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td @@ -250,6 +250,7 @@ defm : int_cond_alias<"n", 0b0000>; defm : int_cond_alias<"ne", 0b1001>; defm : int_cond_alias<"nz", 0b1001>; // same as ne defm : int_cond_alias<"e", 0b0001>; +defm : int_cond_alias<"eq", 0b0001>; // same as e defm : int_cond_alias<"z", 0b0001>; // same as e defm : int_cond_alias<"g", 0b1010>; defm : int_cond_alias<"le", 0b0010>; @@ -429,6 +430,9 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; def : InstAlias<"flush", (FLUSH), 0>; +def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>; +def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>; + def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>; def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>; @@ -450,3 +454,8 @@ def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>, Requires<[HasHardQuad]>; +// signx rd -> sra rd, %g0, rd +def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>; + +// signx reg, rd -> sra reg, %g0, rd +def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 6167c53..05006ac 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -83,7 +83,6 @@ static bool IsIntegerCC(unsigned CC) return (CC <= SPCC::ICC_VC); } - static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) { switch(CC) { @@ -124,106 
+123,103 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) llvm_unreachable("Invalid cond code"); } -bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const -{ +static bool isUncondBranchOpcode(int Opc) { return Opc == SP::BA; } - MachineBasicBlock::iterator I = MBB.end(); - MachineBasicBlock::iterator UnCondBrIter = MBB.end(); - while (I != MBB.begin()) { - --I; - - if (I->isDebugValue()) - continue; +static bool isCondBranchOpcode(int Opc) { + return Opc == SP::FBCOND || Opc == SP::BCOND; +} - // When we see a non-terminator, we are done. - if (!isUnpredicatedTerminator(I)) - break; +static bool isIndirectBranchOpcode(int Opc) { + return Opc == SP::BINDrr || Opc == SP::BINDri; +} - // Terminator is not a branch. - if (!I->isBranch()) - return true; +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(1).getImm())); + Target = LastInst->getOperand(0).getMBB(); +} - // Handle Unconditional branches. - if (I->getOpcode() == SP::BA) { - UnCondBrIter = I; +bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + unsigned LastOpc = LastInst->getOpcode(); + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } - if (!AllowModify) { - TBB = I->getOperand(0).getMBB(); - continue; + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); } + } + } - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); - - Cond.clear(); - FBB = nullptr; + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; - if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = nullptr; - I->eraseFromParent(); - I = MBB.end(); - UnCondBrIter = MBB.end(); - continue; - } + // If the block ends with a B and a Bcc, handle it. 
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } - TBB = I->getOperand(0).getMBB(); - continue; - } + // If the block ends with two unconditional branches, handle it. The second + // one is not executed. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + return false; + } - unsigned Opcode = I->getOpcode(); - if (Opcode != SP::BCOND && Opcode != SP::FBCOND) - return true; // Unknown Opcode. - - SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm(); - - if (Cond.empty()) { - MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); - if (AllowModify && UnCondBrIter != MBB.end() && - MBB.isLayoutSuccessor(TargetBB)) { - - // Transform the code - // - // brCC L1 - // ba L2 - // L1: - // .. - // L2: - // - // into - // - // brnCC L2 - // L1: - // ... - // L2: - // - BranchCode = GetOppositeBranchCondition(BranchCode); - MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA)) - .addMBB(TargetBB); - - OldInst->eraseFromParent(); - UnCondBrIter->eraseFromParent(); - - UnCondBrIter = MBB.end(); - I = MBB.end(); - continue; - } - FBB = TBB; - TBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - continue; - } - // FIXME: Handle subsequent conditional branches. - // For now, we can't handle multiple conditional branches. + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); return true; } - return false; + + // Otherwise, can't handle this. 
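The rewritten AnalyzeBranch reduces to a small decision table over the last one or two terminators. A toy model of that contract (plain C++; the opcode names and int-valued block handles are illustrative, and returning true means the pattern is not understood and the block is left alone):

#include <vector>

enum Opc { BA, BCOND, BIND, OTHER };
struct Term { Opc Op; int Target; int Cond; };

// TBB/FBB/Cond mirror the AnalyzeBranch outputs; -1 means "not set".
static bool analyze(const std::vector<Term> &Terms,
                    int &TBB, int &FBB, int &Cond) {
  TBB = FBB = Cond = -1;
  if (Terms.empty())
    return false;                             // block falls through
  const Term &Last = Terms.back();
  if (Terms.size() == 1) {
    if (Last.Op == BA)    { TBB = Last.Target; return false; }
    if (Last.Op == BCOND) { TBB = Last.Target; Cond = Last.Cond;
                            return false; }   // condbranch + fall-through
    return true;                              // indirect branch: give up
  }
  const Term &Prev = Terms[Terms.size() - 2];
  if (Prev.Op == BCOND && Last.Op == BA) {    // Bcc TBB; BA FBB
    TBB = Prev.Target; Cond = Prev.Cond; FBB = Last.Target;
    return false;
  }
  if (Prev.Op == BA && Last.Op == BA) {       // second BA never executes
    TBB = Prev.Target;
    return false;
  }
  return true;                                // anything else: unanalyzable
}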
+ return true; } unsigned @@ -277,6 +273,14 @@ unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const return Count; } +bool SparcInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1); + SPCC::CondCodes CC = static_cast<SPCC::CondCodes>(Cond[0].getImm()); + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -284,7 +288,9 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned numSubRegs = 0; unsigned movOpc = 0; const unsigned *subRegIdx = nullptr; + bool ExtraG0 = false; + const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 }; const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd, @@ -294,7 +300,12 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SP::IntRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) + else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) { + subRegIdx = DW_SubRegsIdx; + numSubRegs = 2; + movOpc = SP::ORrr; + ExtraG0 = true; + } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) { @@ -347,7 +358,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]); assert(Dst && Src && "Bad sub-register"); - MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst); + if (ExtraG0) + MIB.addReg(SP::G0); + MIB.addReg(Src); + MovMI = MIB.getInstr(); } // Add implicit super-register defs and kills to the last MovMI. MovMI->addRegisterDefined(DestReg, TRI); @@ -365,19 +380,20 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // On the order of operands here: think "[FrameIdx + 0] = SrcReg". 
- if (RC == &SP::I64RegsRegClass) + if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); @@ -403,11 +419,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0) @@ -415,6 +429,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0) + .addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h index 15673f1..9de624c 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -76,6 +76,9 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td index 3b9e048..ec37c22 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -283,17 +283,32 @@ multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(set Ty:$dst, (OpNode ADDRri:$addr))]>; } +// TODO: Instructions of the LoadASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, + RegisterClass RC, ValueType Ty> : + F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), + !strconcat(OpcStr, "a [$addr] $asi, $dst"), + []>; + // LoadA multiclass - As above, but also define alternate address space variant multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : Load<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The LD*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. 
- def Arr : F3_1_asi<3, LoadAOp3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), - !strconcat(OpcStr, "a [$addr] $asi, $dst"), - []>; + def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>; } +// The LDSTUB instruction is supported for asm only. +// It is unlikely that general-purpose code could make use of it. +// CAS is preferred for sparc v9. +def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst), + (ins MEMrr:$addr, i8imm:$asi), + "ldstuba [$addr] $asi, $dst", []>; + // Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot. multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> { @@ -307,14 +322,18 @@ multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(OpNode Ty:$rd, ADDRri:$addr)]>; } -multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, +// TODO: Instructions of the StoreASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class StoreASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : - Store<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The ST*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. - def Arr : F3_1_asi<3, StoreAOp3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), + F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), !strconcat(OpcStr, "a $rd, [$addr] $asi"), []>; + +multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, + SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : + Store<OpcStr, Op3Val, OpNode, RC, Ty> { + def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty>; } //===----------------------------------------------------------------------===// @@ -408,15 +427,40 @@ let DecoderMethod = "DecodeLoadInt" in { defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>; } +let DecoderMethod = "DecodeLoadIntPair" in + defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>; + // Section B.2 - Load Floating-point Instructions, p. 
92 -let DecoderMethod = "DecodeLoadFP" in - defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; -let DecoderMethod = "DecodeLoadDFP" in - defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; +let DecoderMethod = "DecodeLoadFP" in { + defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; + def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeLoadDFP" in { + defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; + def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeLoadQFP" in - defm LDQF : Load<"ldq", 0b100010, load, QFPRegs, f128>, + defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeLoadFP" in + let Defs = [FSR] in { + let rd = 0 in { + def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ld [$addr], %fsr", []>; + def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ld [$addr], %fsr", []>; + } + let rd = 1 in { + def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + } + } + // Section B.4 - Store Integer Instructions, p. 95 let DecoderMethod = "DecodeStoreInt" in { defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>; @@ -424,15 +468,40 @@ let DecoderMethod = "DecodeStoreInt" in { defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>; } +let DecoderMethod = "DecodeStoreIntPair" in + defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>; + // Section B.5 - Store Floating-point Instructions, p. 97 -let DecoderMethod = "DecodeStoreFP" in +let DecoderMethod = "DecodeStoreFP" in { defm STF : Store<"st", 0b100100, store, FPRegs, f32>; -let DecoderMethod = "DecodeStoreDFP" in - defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeStoreDFP" in { + defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeStoreQFP" in - defm STQF : Store<"stq", 0b100110, store, QFPRegs, f128>, + defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeStoreFP" in + let Defs = [FSR] in { + let rd = 0 in { + def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "st %fsr, [$addr]", []>; + def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "st %fsr, [$addr]", []>; + } + let rd = 1 in { + def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + } + } + // Section B.8 - SWAP Register with Memory Instruction // (Atomic swap) let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in { @@ -559,6 +628,10 @@ let Defs = [Y, ICC] in { defm SMULCC : F3_12np<"smulcc", 0b011011>; } +let Defs = [Y, ICC], Uses = [Y, ICC] in { + defm MULSCC : F3_12np<"mulscc", 0b100100>; +} + // Section B.19 - Divide Instructions, p. 115 let Uses = [Y], Defs = [Y] in { defm UDIV : F3_12np<"udiv", 0b001110>; @@ -1221,8 +1294,8 @@ let Predicates = [HasV9] in { // the top 32-bits before using it. To do this clearing, we use a SRLri X,0. 
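A standalone illustration of why that SRLri $src, 0 is needed (plain C++; popc64 stands in for the 64-bit POPC instruction): a negative i32 sitting sign-extended in a 64-bit register carries 32 extra copies of the sign bit, and a V9 32-bit logical shift by zero is the cheapest way to clear them.

#include <cassert>
#include <cstdint>

static int popc64(uint64_t X) {            // stands in for POPC
  int N = 0;
  for (; X; X &= X - 1)
    ++N;
  return N;
}

static int ctpop32(int32_t Src) {
  uint64_t Wide = (uint64_t)(int64_t)Src;  // sign-extended register image
  uint64_t Zext = Wide & 0xffffffffULL;    // effect of srl %src, 0 on V9
  return popc64(Zext);
}

int main() {
  assert(ctpop32(-1) == 32);  // without the clearing it would count 64
  return 0;
}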
let rs1 = 0 in def POPCrr : F3_1<2, 0b101110, - (outs IntRegs:$dst), (ins IntRegs:$src), - "popc $src, $dst", []>, Requires<[HasV9]>; + (outs IntRegs:$rd), (ins IntRegs:$rs2), + "popc $rs2, $rd", []>, Requires<[HasV9]>; def : Pat<(ctpop i32:$src), (POPCrr (SRLri $src, 0))>; @@ -1254,6 +1327,25 @@ let hasSideEffects = 1 in { } } + +// Section A.43 - Read Privileged Register Instructions +let Predicates = [HasV9] in { +let rs2 = 0 in + def RDPR : F3_1<2, 0b101010, + (outs IntRegs:$rd), (ins PRRegs:$rs1), + "rdpr $rs1, $rd", []>; +} + +// Section A.62 - Write Privileged Register Instructions +let Predicates = [HasV9] in { + def WRPRrr : F3_1<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2), + "wrpr $rs1, $rs2, $rd", []>; + def WRPRri : F3_2<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13), + "wrpr $rs1, $simm13, $rd", []>; +} + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// @@ -1327,6 +1419,18 @@ def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>; def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>; def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>; +// extract_vector +def : Pat<(extractelt (v2i32 IntPair:$Rn), 0), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>; +def : Pat<(extractelt (v2i32 IntPair:$Rn), 1), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>; + +// build_vector +def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)), + (INSERT_SUBREG + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even), + (i32 IntRegs:$a2), sub_odd)>; + include "SparcInstr64Bit.td" include "SparcInstrVIS.td" diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index 9667bc0..da31783 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -75,6 +75,18 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(SP::G6); Reserved.set(SP::G7); + // Also reserve the register pair aliases covering the above + // registers, with the same conditions. + Reserved.set(SP::G0_G1); + if (ReserveAppRegisters) + Reserved.set(SP::G2_G3); + if (ReserveAppRegisters || !Subtarget.is64Bit()) + Reserved.set(SP::G4_G5); + + Reserved.set(SP::O6_O7); + Reserved.set(SP::I6_I7); + Reserved.set(SP::G6_G7); + // Unaliased double registers are not available in non-V9 targets. if (!Subtarget.isV9()) { for (unsigned n = 0; n != 16; ++n) { @@ -158,21 +170,15 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - - // Addressable stack objects are accessed using neg. offsets from %fp MachineFunction &MF = *MI.getParent()->getParent(); const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); - int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MI.getOperand(FIOperandNum + 1).getImm() + - Subtarget.getStackPointerBias(); - SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); - unsigned FramePtr = SP::I6; - if (FuncInfo->isLeafProc()) { - // Use %sp and adjust offset if needed. - FramePtr = SP::O6; - int stackSize = MF.getFrameInfo()->getStackSize(); - Offset += (stackSize) ? 
Subtarget.getAdjustedFrameSize(stackSize) : 0 ; - } + const SparcFrameLowering *TFI = getFrameLowering(MF); + + unsigned FrameReg; + int Offset; + Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg); + + Offset += MI.getOperand(FIOperandNum + 1).getImm(); if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { @@ -182,8 +188,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) - .addReg(FramePtr).addImm(0).addReg(SrcEvenReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr); + .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); + replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); MI.setDesc(TII.get(SP::STDFri)); MI.getOperand(2).setReg(SrcOddReg); Offset += 8; @@ -194,8 +200,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) - .addReg(FramePtr).addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr); + .addReg(FrameReg).addImm(0); + replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); MI.setDesc(TII.get(SP::LDDFri)); MI.getOperand(0).setReg(DestOddReg); @@ -203,7 +209,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } - replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr); + replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg); } @@ -211,3 +217,25 @@ unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return SP::I6; } +// Sparc has no architectural need for stack realignment support, +// except that LLVM unfortunately currently implements overaligned +// stack objects by depending upon stack realignment support. +// If that ever changes, this can probably be deleted. +bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + // Sparc always has a fixed frame pointer register, so don't need to + // worry about needing to reserve it. [even if we don't have a frame + // pointer for our frame, it still cannot be used for other things, + // or register window traps will be SADNESS.] + + // If there's a reserved call frame, we can use SP to access locals. + if (getFrameLowering(MF)->hasReservedCallFrame(MF)) + return true; + + // Otherwise, we'd need a base pointer, but those aren't implemented + // for SPARC at the moment. + + return false; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 764a894..32075b1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -42,8 +42,10 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const; - // Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; + + bool canRealignStack(const MachineFunction &MF) const override; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td index db8a7e8..cca9463 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -32,6 +32,12 @@ def sub_odd64 : SubRegIndex<64, 64>; // Ri - 32-bit integer registers class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>; +// Rdi - pairs of 32-bit integer registers +class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> { + let SubRegs = subregs; + let SubRegIndices = [sub_even, sub_odd]; + let CoveredBySubRegs = 1; +} // Rf - 32-bit floating-point registers class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>; @@ -54,6 +60,8 @@ def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code. foreach I = 0-3 in def FCC#I : SparcCtrlReg<I, "FCC"#I>; +def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register. + // Y register def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>; // Ancillary state registers (implementation defined) @@ -94,6 +102,22 @@ def PSR : SparcCtrlReg<0, "PSR">; def WIM : SparcCtrlReg<0, "WIM">; def TBR : SparcCtrlReg<0, "TBR">; +def TPC : SparcCtrlReg<0, "TPC">; +def TNPC : SparcCtrlReg<1, "TNPC">; +def TSTATE : SparcCtrlReg<2, "TSTATE">; +def TT : SparcCtrlReg<3, "TT">; +def TICK : SparcCtrlReg<4, "TICK">; +def TBA : SparcCtrlReg<5, "TBA">; +def PSTATE : SparcCtrlReg<6, "PSTATE">; +def TL : SparcCtrlReg<7, "TL">; +def PIL : SparcCtrlReg<8, "PIL">; +def CWP : SparcCtrlReg<9, "CWP">; +def CANSAVE : SparcCtrlReg<10, "CANSAVE">; +def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">; +def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">; +def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">; +def WSTATE : SparcCtrlReg<14, "WSTATE">; + // Integer registers def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>; def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>; @@ -217,6 +241,24 @@ def Q13 : Rq<21, "F52", [D26, D27]>; def Q14 : Rq<25, "F56", [D28, D29]>; def Q15 : Rq<29, "F60", [D30, D31]>; +// Aliases of the integer registers used for LDD/STD double-word operations +def G0_G1 : Rdi<0, "G0", [G0, G1]>; +def G2_G3 : Rdi<2, "G2", [G2, G3]>; +def G4_G5 : Rdi<4, "G4", [G4, G5]>; +def G6_G7 : Rdi<6, "G6", [G6, G7]>; +def O0_O1 : Rdi<8, "O0", [O0, O1]>; +def O2_O3 : Rdi<10, "O2", [O2, O3]>; +def O4_O5 : Rdi<12, "O4", [O4, O5]>; +def O6_O7 : Rdi<14, "O6", [O6, O7]>; +def L0_L1 : Rdi<16, "L0", [L0, L1]>; +def L2_L3 : Rdi<18, "L2", [L2, L3]>; +def L4_L5 : Rdi<20, "L4", [L4, L5]>; +def L6_L7 : Rdi<22, "L6", [L6, L7]>; +def I0_I1 : Rdi<24, "I0", [I0, I1]>; +def I2_I3 : Rdi<26, "I2", [I2, I3]>; +def I4_I5 : Rdi<28, "I4", [I4, I5]>; +def I6_I7 : Rdi<30, "I6", [I6, I7]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -231,6 +273,13 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32, (sequence "L%u", 0, 7), (sequence "O%u", 0, 7))>; +// Should be in the same order as IntRegs. +def IntPair : RegisterClass<"SP", [v2i32], 64, + (add I0_I1, I2_I3, I4_I5, I6_I7, + G0_G1, G2_G3, G4_G5, G6_G7, + L0_L1, L2_L3, L4_L5, L6_L7, + O0_O1, O2_O3, O4_O5, O6_O7)>; + // Register class for 64-bit mode, with a 64-bit spill slot size. // These are the same as the 32-bit registers, so TableGen will consider this // to be a sub-class of IntRegs. 
That works out because requiring a 64-bit @@ -252,3 +301,8 @@ def ASRRegs : RegisterClass<"SP", [i32], 32, (add Y, (sequence "ASR%u", 1, 31))> { let isAllocatable = 0; } + +// Privileged Registers +def PRRegs : RegisterClass<"SP", [i64], 64, + (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP, + CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp index d69da40..d701594 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -64,7 +64,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { frameSize += 128; // Frames with calls must also reserve space for 6 outgoing arguments // whether they are used or not. LowerCall_64 takes care of that. - assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned"); + frameSize = RoundUpToAlignment(frameSize, 16); } else { // Emit the correct save instruction based on the number of bytes in // the frame. Minimum stack frame size according to V8 ABI is: @@ -81,3 +81,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { } return frameSize; } + +bool SparcSubtarget::enableMachineScheduler() const { + return true; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h index 9d21911..e2fd2f0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -60,6 +60,8 @@ public: return &TSInfo; } + bool enableMachineScheduler() const override; + bool isV9() const { return IsV9; } bool isVIS() const { return IsVIS; } bool isVIS2() const { return IsVIS2; } @@ -85,7 +87,6 @@ public: /// returns adjusted framesize which includes space for register window /// spills and arguments. int getAdjustedFrameSize(int stackSize) const; - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3aa4c6b..9c995bf 100644 --- a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -349,7 +349,6 @@ class SystemZAsmParser : public MCTargetAsmParser { #include "SystemZGenAsmMatcher.inc" private: - MCSubtargetInfo &STI; MCAsmParser &Parser; enum RegisterGroup { RegGR, @@ -386,14 +385,14 @@ private: bool parseOperand(OperandVector &Operands, StringRef Mnemonic); public: - SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { MCAsmParserExtension::Initialize(Parser); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } // Override MCTargetAsmParser. @@ -533,14 +532,16 @@ bool SystemZAsmParser::parseRegister(Register &Reg) { } // Parse a register of group Group. If Regs is nonnull, use it to map -// the raw register number to LLVM numbering, with zero entries indicating -// an invalid register. IsAddress says whether the register appears in an -// address context. +// the raw register number to LLVM numbering, with zero entries +// indicating an invalid register. 
IsAddress says whether the +// register appears in an address context. Allow FP Group if expecting +// RegV Group, since the f-prefix yields the FP group even while used +// with vector instructions. bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs, bool IsAddress) { if (parseRegister(Reg)) return true; - if (Reg.Group != Group) + if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV)) return Error(Reg.StartLoc, "invalid operand for instruction"); if (Regs && Regs[Reg.Num] == 0) return Error(Reg.StartLoc, "invalid register pair"); @@ -791,7 +792,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchResult) { case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp index 059ae3f..6444cf8 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp @@ -60,15 +60,15 @@ void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { O << '%' << getRegisterName(RegNo); } -template<unsigned N> -void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isUInt<N>(Value) && "Invalid uimm argument"); O << Value; } -template<unsigned N> -void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isInt<N>(Value) && "Invalid simm argument"); O << Value; diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h index ba55e68..7ca386f 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/Compiler.h" namespace llvm { class MCOperand; diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 5fefa31..2115d44 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -226,7 +226,7 @@ extern "C" void LLVMInitializeSystemZTargetMC() { // Register the MCCodeEmitter. TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget, - createSystemZMCCodeEmitter); + createSystemZMCCodeEmitter); // Register the MCInstrInfo. TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget, diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt index e089047..cd367d6 100644 --- a/contrib/llvm/lib/Target/SystemZ/README.txt +++ b/contrib/llvm/lib/Target/SystemZ/README.txt @@ -52,12 +52,6 @@ We don't use the TEST DATA CLASS instructions. -- -We could use the generic floating-point forms of LOAD COMPLEMENT, -LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the -condition codes. 
For example, we could use LCDFR instead of LCDBR. - --- - We only use MVC, XC and CLC for constant-length block operations. We could extend them to variable-length operations too, using EXECUTE RELATIVE LONG. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 3dca7bd..7527311 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -288,7 +288,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()), getModifierVariantKind(ZCPV->getModifier()), OutContext); - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType()); + uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType()); OutStreamer->EmitValue(Expr, Size); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index 44ea1d2..4a6beb6 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -26,21 +26,6 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV, return new SystemZConstantPoolValue(GV, Modifier); } -unsigned SystemZConstantPoolValue::getRelocationInfo() const { - switch (Modifier) { - case SystemZCP::TLSGD: - case SystemZCP::TLSLDM: - case SystemZCP::DTPOFF: - // May require a dynamic relocation. - return 2; - case SystemZCP::NTPOFF: - // May require a relocation, but the relocations are always resolved - // by the static linker. - return 1; - } - llvm_unreachable("Unknown modifier"); -} - int SystemZConstantPoolValue:: getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { unsigned AlignMask = Alignment - 1; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h index e5f1bb1..a71b595 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h @@ -43,7 +43,6 @@ public: Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier); // Override MachineConstantPoolValue. - unsigned getRelocationInfo() const override; int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 16f9adc..4818ed0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -37,13 +37,11 @@ namespace { // instructions. struct Reference { Reference() - : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {} + : Def(false), Use(false) {} Reference &operator|=(const Reference &Other) { Def |= Other.Def; - IndirectDef |= Other.IndirectDef; Use |= Other.Use; - IndirectUse |= Other.IndirectUse; return *this; } @@ -53,11 +51,6 @@ struct Reference { // via a sub- or super-register. bool Def; bool Use; - - // True if the register is defined or used indirectly, by a sub- or - // super-register. - bool IndirectDef; - bool IndirectUse; }; class SystemZElimCompare : public MachineFunctionPass { @@ -104,14 +97,12 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) { return false; } -// Return true if any CC result of MI would reflect the value of subreg -// SubReg of Reg. 
-static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { +// Return true if any CC result of MI would reflect the value of Reg. +static bool resultTests(MachineInstr *MI, unsigned Reg) { if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && MI->getOperand(0).isDef() && - MI->getOperand(0).getReg() == Reg && - MI->getOperand(0).getSubReg() == SubReg) + MI->getOperand(0).getReg() == Reg) return true; switch (MI->getOpcode()) { @@ -127,30 +118,25 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { case SystemZ::LTEBR: case SystemZ::LTDBR: case SystemZ::LTXBR: - if (MI->getOperand(1).getReg() == Reg && - MI->getOperand(1).getSubReg() == SubReg) + if (MI->getOperand(1).getReg() == Reg) return true; } return false; } -// Describe the references to Reg in MI, including sub- and super-registers. +// Describe the references to Reg or any of its aliases in MI. Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { Reference Ref; for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI->getOperand(I); if (MO.isReg()) { if (unsigned MOReg = MO.getReg()) { - if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) { - if (MO.isUse()) { + if (TRI->regsOverlap(MOReg, Reg)) { + if (MO.isUse()) Ref.Use = true; - Ref.IndirectUse |= (MOReg != Reg); - } - if (MO.isDef()) { + else if (MO.isDef()) Ref.Def = true; - Ref.IndirectDef |= (MOReg != Reg); - } } } } @@ -158,6 +144,30 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { return Ref; } +// Return true if this is a load and test which can be optimized the +// same way as compare instruction. +static bool isLoadAndTestAsCmp(MachineInstr *MI) { + // If we during isel used a load-and-test as a compare with 0, the + // def operand is dead. + return ((MI->getOpcode() == SystemZ::LTEBR || + MI->getOpcode() == SystemZ::LTDBR || + MI->getOpcode() == SystemZ::LTXBR) && + MI->getOperand(0).isDead()); +} + +// Return the source register of Compare, which is the unknown value +// being tested. +static unsigned getCompareSourceReg(MachineInstr *Compare) { + unsigned reg = 0; + if (Compare->isCompare()) + reg = Compare->getOperand(0).getReg(); + else if (isLoadAndTestAsCmp(Compare)) + reg = Compare->getOperand(1).getReg(); + assert (reg); + + return reg; +} + // Compare compares the result of MI against zero. If MI is an addition // of -1 and if CCUsers is a single branch on nonzero, eliminate the addition // and convert the branch to a BRCT(G). Return true on success. @@ -188,7 +198,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // We already know that there are no references to the register between // MI and Compare. Make sure that there are also no references between // Compare and Branch. - unsigned SrcReg = Compare->getOperand(0).getReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (getRegReferences(MBBI, SrcReg)) @@ -196,16 +206,15 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // The transformation is OK. Rebuild Branch as a BRCT(G). 
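For context on the BRCT(G) rebuild below (a sketch; the assembly in the comments is illustrative): BRCT is SystemZ's branch-relative-on-count, which decrements a register and branches while the result is nonzero, so the add of -1, the compare with zero and the branch-on-nonzero fuse into a single instruction.

// Roughly:
//
//   before:  ahi  %r2, -1        after:  brct %r2, loop
//            chi  %r2, 0
//            jne  loop
//
// C++ model of the fused semantics:
static bool brct(int &R) {
  R -= 1;          // the addition of -1 that convertToBRCT absorbs
  return R != 0;   // the branch is taken while the new value is nonzero
}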
MachineOperand Target(Branch->getOperand(2)); - Branch->RemoveOperand(2); - Branch->RemoveOperand(1); - Branch->RemoveOperand(0); + while (Branch->getNumOperands()) + Branch->RemoveOperand(0); Branch->setDesc(TII->get(BRCT)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) .addOperand(Target) .addReg(SystemZ::CC, RegState::ImplicitDefine); - MI->removeFromParent(); + MI->eraseFromParent(); return true; } @@ -308,6 +317,10 @@ static bool isCompareZero(MachineInstr *Compare) { return true; default: + + if (isLoadAndTestAsCmp(Compare)) + return true; + return (Compare->getNumExplicitOperands() == 2 && Compare->getOperand(1).isImm() && Compare->getOperand(1).getImm() == 0); @@ -325,8 +338,7 @@ optimizeCompareZero(MachineInstr *Compare, return false; // Search back for CC results that are based on the first operand. - unsigned SrcReg = Compare->getOperand(0).getReg(); - unsigned SrcSubReg = Compare->getOperand(0).getSubReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock &MBB = *Compare->getParent(); MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin(); Reference CCRefs; @@ -334,7 +346,7 @@ optimizeCompareZero(MachineInstr *Compare, while (MBBI != MBBE) { --MBBI; MachineInstr *MI = MBBI; - if (resultTests(MI, SrcReg, SrcSubReg)) { + if (resultTests(MI, SrcReg)) { // Try to remove both MI and Compare by converting a branch to BRCT(G). // We don't care in this case whether CC is modified between MI and // Compare. @@ -435,23 +447,21 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) { while (MBBI != MBB.begin()) { MachineInstr *MI = --MBBI; if (CompleteCCUsers && - MI->isCompare() && + (MI->isCompare() || isLoadAndTestAsCmp(MI)) && (optimizeCompareZero(MI, CCUsers) || fuseCompareAndBranch(MI, CCUsers))) { ++MBBI; - MI->removeFromParent(); + MI->eraseFromParent(); Changed = true; CCUsers.clear(); - CompleteCCUsers = true; continue; } - Reference CCRefs(getRegReferences(MI, SystemZ::CC)); - if (CCRefs.Def) { + if (MI->definesRegister(SystemZ::CC)) { CCUsers.clear(); - CompleteCCUsers = !CCRefs.IndirectDef; + CompleteCCUsers = true; } - if (CompleteCCUsers && CCRefs.Use) + if (MI->readsRegister(SystemZ::CC) && CompleteCCUsers) CCUsers.push_back(MI); } return Changed; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 397de47..e1b20d0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -48,7 +48,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { SystemZFrameLowering::SystemZFrameLowering() : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8) { + -SystemZMC::CallFrameSize, 8, + false /* StackRealignable */) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I) @@ -133,7 +134,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); bool IsVarArg = MF.getFunction()->isVarArg(); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc DL; // Scan the call-saved GPRs and find the bounds of the register spill area. 
unsigned LowGPR = 0; @@ -322,7 +323,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo(); bool HasFP = hasFP(MF); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // The current offset of the stack pointer from the CFA. int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP; @@ -394,7 +398,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, // Add CFI for the this save. unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx()); + unsigned IgnoredFrameReg; + int64_t Offset = + getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, DwarfReg, SPOffsetFromCFA + Offset)); CFIIndexes.push_back(CFIIndex); @@ -455,9 +462,14 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const { MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP()); } -int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFFrame = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); // Start with the offset of FI from the top of the caller-allocated frame // (i.e. the top of the 160 bytes allocated by the caller). This initial diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 5ade757..46bb6b7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -43,7 +43,8 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 75fd37f..a909309 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -585,7 +585,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr, static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos->getNodeId()) { - DAG->RepositionNode(Pos, N.getNode()); + DAG->RepositionNode(Pos->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos->getNodeId()); } } @@ -801,7 +801,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { RxSBG.Input = N.getOperand(0); return true; } - + case ISD::ANY_EXTEND: // Bits above the extended operand are don't-care. 
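Context for the tryRISBGZero change just below (a sketch; risbgZero models only the zeroing form of RISBG): with a zero rotate, RISBG degenerates to a plain AND, and masks of 0xff or 0xffff are exactly the byte/halfword zero-extensions the LLC/LLH family already provides.

#include <cassert>
#include <cstdint>

// Zeroing form of RISBG: rotate left, then clear the bits outside Mask.
static uint64_t risbgZero(uint64_t In, unsigned Rotate, uint64_t Mask) {
  uint64_t Rot = Rotate ? (In << Rotate) | (In >> (64 - Rotate)) : In;
  return Rot & Mask;
}

int main() {
  uint64_t X = 0x1234567890abcdefULL;
  assert(risbgZero(X, 0, 0xff)   == (uint8_t)X);   // -> LLC/LLGCR
  assert(risbgZero(X, 0, 0xffff) == (uint16_t)X);  // -> LLH/LLGHR
  return 0;
}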
RxSBG.Input = N.getOperand(0); @@ -818,7 +818,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { return true; } // Fall through. - + case ISD::SIGN_EXTEND: { // Check that the extension bits are don't-care (i.e. are masked out // by the final mask). @@ -938,7 +938,23 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { } return nullptr; } - } + } + + // If the RISBG operands require no rotation and just masks the bottom + // 8/16 bits, attempt to convert this to a LLC zero extension. + if (RISBG.Rotate == 0 && (RISBG.Mask == 0xff || RISBG.Mask == 0xffff)) { + unsigned OpCode = (RISBG.Mask == 0xff ? SystemZ::LLGCR : SystemZ::LLGHR); + if (VT == MVT::i32) { + if (Subtarget->hasHighWord()) + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCRMux : SystemZ::LLHRMux); + else + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCR : SystemZ::LLHR); + } + + SDValue In = convertTo(DL, VT, RISBG.Input); + N = CurDAG->getMachineNode(OpCode, DL, VT, In); + return convertTo(DL, VT, SDValue(N, 0)).getNode(); + } unsigned Opcode = SystemZ::RISBG; // Prefer RISBGN if available, since it does not clobber CC. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 9a753c8..ee73267 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -84,8 +84,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - auto &DL = *TM.getDataLayout(); - MVT PtrVT = getPointerTy(DL); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the register classes. if (Subtarget.hasHighWord()) @@ -115,8 +114,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, computeRegisterProperties(Subtarget.getRegisterInfo()); // Set up special registers. - setExceptionPointerRegister(SystemZ::R6D); - setExceptionSelectorRegister(SystemZ::R7D); setStackPointerRegisterToSaveRestore(SystemZ::R15D); // TODO: It may be better to default to latency-oriented scheduling, however @@ -370,7 +367,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // No special instructions for these. setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); } } @@ -776,9 +775,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType, } bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - return true; + return CI->isTailCall(); } // We do not yet support 128-bit single-element vector types. 
If the user @@ -939,8 +936,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } // Convert the value of the argument register into the value that's @@ -976,9 +973,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN, - MachinePointerInfo::getFixedStack(FI), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0); - } // Join the stores, which are independent of one another. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -1060,9 +1056,9 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Store the argument in a stack slot and pass its address. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, DL, ArgValue, SpillSlot, + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); ArgValue = SpillSlot; } else ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue); @@ -1607,8 +1603,8 @@ static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) { } else if (Load->getExtensionType() == ISD::ZEXTLOAD) { if (Value > Mask) return; - assert(C.ICmpType == SystemZICMP::Any && - "Signedness shouldn't matter here."); + // If the constant is in range, we can use any comparison. + C.ICmpType = SystemZICMP::Any; } else return; @@ -2439,7 +2435,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT); Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // If there was a non-zero offset that we didn't fold, create an explicit @@ -2499,7 +2496,9 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, } SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(Node, DAG); SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2529,9 +2528,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the offset. 
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset); @@ -2544,9 +2544,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the module base offset. Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset); @@ -2562,9 +2563,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF); SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8); - DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - DTPOffset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + DTPOffset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), DTPOffset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset); break; @@ -2575,8 +2577,8 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_INDNTPOFF); Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getGOT(), + Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); break; } @@ -2587,9 +2589,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); break; } } @@ -2628,10 +2631,10 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue Result; if (CP->isMachineConstantPoolEntry()) Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + CP->getAlignment()); else Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment(), CP->getOffset()); + CP->getAlignment(), CP->getOffset()); // Use LARL to load the address of the constant pool entry. return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -2736,17 +2739,37 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, SDValue SystemZTargetLowering:: lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + bool RealignOpt = !DAG.getMachineFunction().getFunction()-> + hasFnAttribute("no-realign-stack"); + SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); SDLoc DL(Op); + // If user has set the no alignment function attribute, ignore + // alloca alignments. + uint64_t AlignVal = (RealignOpt ? 
+ dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0); + + uint64_t StackAlign = TFI->getStackAlignment(); + uint64_t RequiredAlign = std::max(AlignVal, StackAlign); + uint64_t ExtraAlignSpace = RequiredAlign - StackAlign; + unsigned SPReg = getStackPointerRegisterToSaveRestore(); + SDValue NeededSpace = Size; // Get a reference to the stack pointer. SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64); + // Add extra space for alignment if needed. + if (ExtraAlignSpace) + NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + // Get the new stack pointer value. - SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size); + SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace); // Copy the new stack pointer back. Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP); @@ -2757,6 +2780,16 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64); SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust); + // Dynamically realign if needed. + if (RequiredAlign > StackAlign) { + Result = + DAG.getNode(ISD::ADD, DL, MVT::i64, Result, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + Result = + DAG.getNode(ISD::AND, DL, MVT::i64, Result, + DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64)); + } + SDValue Ops[2] = { Result, Chain }; return DAG.getMergeValues(Ops, DL); } @@ -2837,7 +2870,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, } else if (DAG.ComputeNumSignBits(Op1) > 32) { Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1); Opcode = SystemZISD::SDIVREM32; - } else + } else Opcode = SystemZISD::SDIVREM64; // DSG(F) takes a 64-bit dividend, so the even register in the GR128 @@ -3247,8 +3280,8 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op->getNumValues() == 1) return CC; assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result"); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), - Glued, CC); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued, + CC); } unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -3890,7 +3923,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, GS.addUndef(); } else { GS.add(SDValue(), ResidueOps.size()); - ResidueOps.push_back(Op); + ResidueOps.push_back(BVN->getOperand(I)); } } @@ -3901,7 +3934,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, // Create the BUILD_VECTOR for the remaining elements, if any. 
if (!ResidueOps.empty()) { while (ResidueOps.size() < NumElements) - ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType())); + ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType())); for (auto &Op : GS.Ops) { if (!Op.getNode()) { Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps); @@ -4204,7 +4237,7 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const { + unsigned UnpackHigh) const { SDValue PackedOp = Op.getOperand(0); EVT OutVT = Op.getValueType(); EVT InVT = PackedOp.getValueType(); @@ -4566,9 +4599,9 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, } return Op; } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || - Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || - Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && - canTreatAsByteVector(Op.getValueType()) && + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || + Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && + canTreatAsByteVector(Op.getValueType()) && canTreatAsByteVector(Op.getOperand(0).getValueType())) { // Make sure that only the unextended bits are significant. EVT ExtVT = Op.getValueType(); @@ -4579,14 +4612,14 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, unsigned SubByte = Byte % ExtBytesPerElement; unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement; if (SubByte < MinSubByte || - SubByte + BytesPerElement > ExtBytesPerElement) - break; + SubByte + BytesPerElement > ExtBytesPerElement) + break; // Get the byte offset of the unextended element Byte = Byte / ExtBytesPerElement * OpBytesPerElement; // ...then add the byte offset relative to that element. Byte += SubByte - MinSubByte; if (Byte % BytesPerElement != 0) - break; + break; Op = Op.getOperand(0); Index = Byte / BytesPerElement; Force = true; @@ -5611,6 +5644,31 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI, return MBB; } +MachineBasicBlock * +SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const { + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + unsigned SrcReg = MI->getOperand(0).getReg(); + + // Create new virtual register of the same class as source. + const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); + unsigned DstReg = MRI->createVirtualRegister(RC); + + // Replace pseudo with a normal load-and-test that models the def as + // well. 
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg) + .addReg(SrcReg); + MI->eraseFromParent(); + + return MBB; +} + MachineBasicBlock *SystemZTargetLowering:: EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { switch (MI->getOpcode()) { @@ -5858,6 +5916,13 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true); case SystemZ::TBEGINC: return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true); + case SystemZ::LTEBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR); + case SystemZ::LTDBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR); + case SystemZ::LTXBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR); + default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 07ff251..391636e 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -409,6 +409,20 @@ public: return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SystemZ::R6D; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SystemZ::R7D; + } + MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; @@ -481,7 +495,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const; + unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, @@ -530,6 +544,10 @@ private: MachineBasicBlock *MBB, unsigned Opcode, bool NoFloat) const; + MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h index 464f79a..5a1c874 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h @@ -35,11 +35,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; int64_t Offset = 0; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo( - PseudoSourceValue::getFixedStack(FI), Offset), - Flags, MFFrame->getObjectSize(FI), - MFFrame->getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFFrame->getObjectSize(FI), MFFrame->getObjectAlignment(FI)); return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td index 27fbd7d..0cb2672 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -46,15 +46,28 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>; defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>; } -// Note that the comparison against zero operation is not available if we -// have vector support, since load-and-test instructions will partially -// clobber the target (vector) register. +// Note that LTxBRCompare is not available if we have vector support, +// since load-and-test instructions will partially clobber the target +// (vector) register. let Predicates = [FeatureNoVector] in { defm : CompareZeroFP<LTEBRCompare, FP32>; defm : CompareZeroFP<LTDBRCompare, FP64>; defm : CompareZeroFP<LTXBRCompare, FP128>; } +// Use a normal load-and-test for compare against zero in case of +// vector support (via a pseudo to simplify instruction selection). +let Defs = [CC], usesCustomInserter = 1 in { + def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>; + def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>; + def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>; +} +let Predicates = [FeatureVector] in { + defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>; + defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>; + defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; +} + // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>; def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>; @@ -238,26 +251,46 @@ let Predicates = [FeatureFPExtension] in { // Unary arithmetic //===----------------------------------------------------------------------===// +// We prefer generic instructions during isel, because they do not +// clobber CC and therefore give the scheduler more freedom. In cases +// the CC is actually useful, the SystemZElimCompare pass will try to +// convert generic instructions into opcodes that also set CC. Note +// that lcdf / lpdf / lndf only affect the sign bit, and can therefore +// be used with fp32 as well. This could be done for fp128, in which +// case the operands would have to be tied. + // Negation (Load Complement). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32, FP32>; - def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64, FP64>; + def LCEBR : UnaryRRE<"lceb", 0xB303, null_frag, FP32, FP32>; + def LCDBR : UnaryRRE<"lcdb", 0xB313, null_frag, FP64, FP64>; def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>; } +// Generic form, which does not set CC. +def LCDFR : UnaryRRE<"lcdf", 0xB373, fneg, FP64, FP64>; +let isCodeGenOnly = 1 in + def LCDFR_32 : UnaryRRE<"lcdf", 0xB373, fneg, FP32, FP32>; // Absolute value (Load Positive). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32, FP32>; - def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64, FP64>; + def LPEBR : UnaryRRE<"lpeb", 0xB300, null_frag, FP32, FP32>; + def LPDBR : UnaryRRE<"lpdb", 0xB310, null_frag, FP64, FP64>; def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LPDFR : UnaryRRE<"lpdf", 0xB370, fabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LPDFR_32 : UnaryRRE<"lpdf", 0xB370, fabs, FP32, FP32>; // Negative absolute value (Load Negative). 
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32, FP32>; - def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64, FP64>; + def LNEBR : UnaryRRE<"lneb", 0xB301, null_frag, FP32, FP32>; + def LNDBR : UnaryRRE<"lndb", 0xB311, null_frag, FP64, FP64>; def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LNDFR : UnaryRRE<"lndf", 0xB371, fnabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LNDFR_32 : UnaryRRE<"lndf", 0xB371, fnabs, FP32, FP32>; // Square root. def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32, FP32>; @@ -414,6 +447,6 @@ let Defs = [CC], CCValues = 0xF in { // Peepholes //===----------------------------------------------------------------------===// -def : Pat<(f32 fpimmneg0), (LCEBR (LZER))>; -def : Pat<(f64 fpimmneg0), (LCDBR (LZDR))>; +def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>; +def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>; def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 71eb998..01f4cde 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2381,6 +2381,7 @@ multiclass StringRRE<string mnemonic, bits<16> opcode, def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2), (ins GR64:$R1src, GR64:$R2src), mnemonic#"\t$R1, $R2", []> { + let Uses = [R0L]; let Constraints = "$R1 = $R1src, $R2 = $R2src"; let DisableEncoding = "$R1src, $R2src"; } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5d4a34f..e6b5fc8 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -69,6 +69,11 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); + // Clear the kill flags for the base and index registers in the first + // instruction. + EarlierMI->getOperand(1).setIsKill(false); + EarlierMI->getOperand(3).setIsKill(false); + // Set the opcodes. unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm()); unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm()); @@ -111,7 +116,7 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode, } // MI is a three-operand RIE-style pseudo instruction. Replace it with -// LowOpcode3 if the registers are both low GR32s, otherwise use a move +// LowOpcodeK if the registers are both low GR32s, otherwise use a move // followed by HighOpcode or LowOpcode, depending on whether the target // is a high or low GR32. void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, @@ -129,6 +134,7 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, MI->getOperand(1).isKill()); MI->setDesc(get(DestIsHigh ? 
HighOpcode : LowOpcode)); MI->getOperand(1).setReg(DestReg); + MI->tieOperands(0, 1); } } @@ -486,11 +492,8 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, const MachineRegisterInfo *MRI) const { assert(!SrcReg2 && "Only optimizing constant comparisons so far"); bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; - if (Value == 0 && - !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, &RI)) - return true; - return false; + return Value == 0 && !IsLogical && + removeIPMBasedCompare(Compare, SrcReg, MRI, &RI); } // If Opcode is a move that has a conditional variant, return that variant, @@ -505,16 +508,13 @@ static unsigned getConditionalMove(unsigned Opcode) { bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const { unsigned Opcode = MI->getOpcode(); - if (STI.hasLoadStoreOnCond() && - getConditionalMove(Opcode)) - return true; - return false; + return STI.hasLoadStoreOnCond() && getConditionalMove(Opcode); } bool SystemZInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now only convert single instructions. return NumCycles == 1; } @@ -524,7 +524,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now avoid converting mutually-exclusive cases. return false; } @@ -548,11 +548,10 @@ PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { return false; } -void -SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too. if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) { copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64), @@ -590,13 +589,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -void -SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, - int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -604,15 +600,14 @@ SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode); addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode)) - .addReg(SrcReg, getKillRegState(isKill)), FrameIdx); + .addReg(SrcReg, getKillRegState(isKill)), + FrameIdx); } -void -SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -681,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LiveVariables *LV) const { MachineInstr *MI = MBBI; MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Opcode = MI->getOpcode(); unsigned NumOps = MI->getNumOperands(); @@ -708,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); if (ThreeOperandOpcode >= 0) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode)) - .addOperand(Dest); + // Create three address instruction without adding the implicit + // operands. Those will instead be copied over from the original + // instruction by the loop below. + MachineInstrBuilder MIB(*MF, + MF->CreateMachineInstr(get(ThreeOperandOpcode), + MI->getDebugLoc(), /*NoImplicit=*/true)); + MIB.addOperand(Dest); // Keep the kill state, but drop the tied flag. MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) MIB.addOperand(MI->getOperand(I)); + MBB->insert(MI, MIB); return finishConvertToThreeAddress(MI, MIB, LV); } } @@ -1191,6 +1192,12 @@ unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const { case SystemZ::LER: return SystemZ::LTEBR; case SystemZ::LDR: return SystemZ::LTDBR; case SystemZ::LXR: return SystemZ::LTXBR; + case SystemZ::LCDFR: return SystemZ::LCDBR; + case SystemZ::LPDFR: return SystemZ::LPDBR; + case SystemZ::LNDFR: return SystemZ::LNDBR; + case SystemZ::LCDFR_32: return SystemZ::LCEBR; + case SystemZ::LPDFR_32: return SystemZ::LPEBR; + case SystemZ::LNDFR_32: return SystemZ::LNEBR; // On zEC12 we prefer to use RISBGN. But if there is a chance to // actually use the condition code, we may turn it back into RISGB. 
// Note that RISBG is not really a "load-and-test" instruction, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 31c9db2..d9094ba 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -159,12 +159,12 @@ public: bool isPredicable(MachineInstr *MI) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 820f30b..d5dabc2 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -397,7 +397,7 @@ let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; // String moves. -let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, mayStore = 1, Defs = [CC] in defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>; //===----------------------------------------------------------------------===// @@ -424,7 +424,7 @@ let hasSideEffects = 0 in { def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>; } let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in - def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>; + def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR32>; // Match 32-to-64-bit sign extensions in which the source is already // in a 64-bit register. @@ -490,7 +490,7 @@ def : Pat<(and GR64:$src, 0xffffffff), def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>, Requires<[FeatureHighWord]>; def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>; -def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, +def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>, Requires<[FeatureHighWord]>; // 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH, @@ -498,7 +498,7 @@ def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>, Requires<[FeatureHighWord]>; def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>; -def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>, +def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>, Requires<[FeatureHighWord]>; def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>; @@ -1147,7 +1147,7 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>, Requires<[FeatureHighWord]>; def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>; - def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GR32, uimm32>, + def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GRH32, uimm32>, Requires<[FeatureHighWord]>; def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>; @@ -1185,7 +1185,7 @@ let mayLoad = 1, Defs = [CC] in defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; // String comparison. 
-let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>; // Test under mask. @@ -1219,6 +1219,9 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; // Atomic operations //===----------------------------------------------------------------------===// +// A serialization instruction that acts as a barrier for all memory +// accesses, which expands to "bcr 14, 0". +let hasSideEffects = 1 in def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { @@ -1459,9 +1462,29 @@ let usesCustomInserter = 1 in { } // Search a block of memory for a character. -let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; +// Other instructions for inline assembly +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCK : InstS<0xB205, (outs), (ins bdaddr12only:$BD2), + "stck\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKF : InstS<0xB27C, (outs), (ins bdaddr12only:$BD2), + "stckf\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKE : InstS<0xB278, (outs), (ins bdaddr12only:$BD2), + "stcke\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STFLE : InstS<0xB2B0, (outs), (ins bdaddr12only:$BD2), + "stfle\t$BD2", + []>; + + + //===----------------------------------------------------------------------===// // Peepholes. //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp index 00572d0..1a7c0d7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=// +//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index 34fc36d..f4a517b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//==- SystemZMachineFuctionInfo.h - SystemZ machine function info -*- C++ -*-=// +//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp index dc7bd25..6fd24e3 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -69,8 +69,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // Decompose the frame index into a base and offset. int FrameIndex = MI->getOperand(FIOperandNum).getIndex(); - unsigned BasePtr = getFrameRegister(MF); - int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) + + unsigned BasePtr; + int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) + MI->getOperand(FIOperandNum + 1).getImm()); // Special handling of dbg_value instructions. 
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index 85aa0a6..0d8b08b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -282,4 +282,5 @@ def v128any : TypedReg<untyped, VR128>; // The 2-bit condition code field of the PSW. Every register named in an // inline asm needs a class associated with it. def CC : SystemZReg<"cc">; -def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; +let isAllocatable = 0 in + def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index d1a17c5..846edd5 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -16,6 +16,8 @@ #include "SystemZTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -35,19 +37,16 @@ public: bool runOnMachineFunction(MachineFunction &F) override; private: - bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, - unsigned LLIxL, unsigned LLIxH); + bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH); bool shortenOn0(MachineInstr &MI, unsigned Opcode); bool shortenOn01(MachineInstr &MI, unsigned Opcode); bool shortenOn001(MachineInstr &MI, unsigned Opcode); + bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode); bool shortenFPConv(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; - - // LowGPRs[I] has bit N set if LLVM register I includes the low - // word of GPR N. HighGPRs is the same for the high word. - unsigned LowGPRs[SystemZ::NUM_TARGET_REGS]; - unsigned HighGPRs[SystemZ::NUM_TARGET_REGS]; + const TargetRegisterInfo *TRI; + LivePhysRegs LiveRegs; }; char SystemZShortenInst::ID = 0; @@ -58,33 +57,31 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { } SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() { - // Set up LowGPRs and HighGPRs. - for (unsigned I = 0; I < 16; ++I) { - LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I; - LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - if (unsigned GR128 = SystemZMC::GR128Regs[I]) { - LowGPRs[GR128] |= 3 << I; - HighGPRs[GR128] |= 3 << I; - } - } + : MachineFunctionPass(ID), TII(nullptr) {} + +// Tie operands if MI has become a two-address instruction. +static void tieOpsIfNeeded(MachineInstr &MI) { + if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) && + !MI.getOperand(0).isTied()) + MI.tieOperands(0, 1); } // MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH // are the halfword immediate loads for the same word. Try to use one of them -// instead of IIxF. If MI loads the high word, GPRMap[X] is the set of high -// words referenced by LLVM register X while LiveOther is the mask of low -// words that are currently live, and vice versa. -bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap, - unsigned LiveOther, unsigned LLIxL, - unsigned LLIxH) { +// instead of IIxF. 
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, + unsigned LLIxL, unsigned LLIxH) { unsigned Reg = MI.getOperand(0).getReg(); - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - unsigned GPRs = GPRMap[Reg]; - assert(GPRs != 0 && "Register must be a GPR"); - if (GPRs & LiveOther) + // The new opcode will clear the other half of the GR64 reg, so + // cancel if that is live. + unsigned thisSubRegIdx = (SystemZ::GRH32BitRegClass.contains(Reg) ? + SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned otherSubRegIdx = (thisSubRegIdx == SystemZ::subreg_l32 ? + SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx, + &SystemZ::GR64BitRegClass); + unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); + if (LiveRegs.contains(OtherReg)) return false; uint64_t Imm = MI.getOperand(1).getImm(); @@ -123,12 +120,26 @@ bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) { } // Change MI's opcode to Opcode if register operands 0, 1 and 2 have a -// 4-bit encoding and if operands 0 and 1 are tied. +// 4-bit encoding and if operands 0 and 1 are tied. Also ties op 0 +// with op 1, if MI becomes 2-address. bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) { if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && MI.getOperand(1).getReg() == MI.getOperand(0).getReg() && SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) { MI.setDesc(TII->get(Opcode)); + tieOpsIfNeeded(MI); + return true; + } + return false; +} + +// Calls shortenOn001 if CCLive is false. CC def operand is added in +// case of success. +bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, + unsigned Opcode) { + if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) { + MachineInstrBuilder(*MI.getParent()->getParent(), &MI) + .addReg(SystemZ::CC, RegState::ImplicitDefine); return true; } return false; @@ -164,35 +175,24 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; - // Work out which words are live on exit from the block. - unsigned LiveLow = 0; - unsigned LiveHigh = 0; - for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) { - for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end(); - LI != LE; ++LI) { - unsigned Reg = *LI; - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - LiveLow |= LowGPRs[Reg]; - LiveHigh |= HighGPRs[Reg]; - } - } + // Set up the set of live registers at the end of MBB (live out) + LiveRegs.clear(); + LiveRegs.addLiveOuts(&MBB); // Iterate backwards through the block looking for instructions to change. 
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { case SystemZ::IILF: - Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL, - SystemZ::LLILH); + Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH); break; case SystemZ::IIHF: - Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL, - SystemZ::LLIHH); + Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH); break; case SystemZ::WFADB: - Changed |= shortenOn001(MI, SystemZ::ADBR); + Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; case SystemZ::WFDDB: @@ -216,15 +216,15 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFLCDB: - Changed |= shortenOn01(MI, SystemZ::LCDBR); + Changed |= shortenOn01(MI, SystemZ::LCDFR); break; case SystemZ::WFLNDB: - Changed |= shortenOn01(MI, SystemZ::LNDBR); + Changed |= shortenOn01(MI, SystemZ::LNDFR); break; case SystemZ::WFLPDB: - Changed |= shortenOn01(MI, SystemZ::LPDBR); + Changed |= shortenOn01(MI, SystemZ::LPDFR); break; case SystemZ::WFSQDB: @@ -232,7 +232,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFSDB: - Changed |= shortenOn001(MI, SystemZ::SDBR); + Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; case SystemZ::WFCDB: @@ -257,33 +257,17 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; } - unsigned UsedLow = 0; - unsigned UsedHigh = 0; - for (auto MOI = MI.operands_begin(), MOE = MI.operands_end(); - MOI != MOE; ++MOI) { - MachineOperand &MO = *MOI; - if (MO.isReg()) { - if (unsigned Reg = MO.getReg()) { - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - if (MO.isDef()) { - LiveLow &= ~LowGPRs[Reg]; - LiveHigh &= ~HighGPRs[Reg]; - } else if (!MO.isUndef()) { - UsedLow |= LowGPRs[Reg]; - UsedHigh |= HighGPRs[Reg]; - } - } - } - } - LiveLow |= UsedLow; - LiveHigh |= UsedHigh; + LiveRegs.stepBackward(MI); } return Changed; } bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) { - TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo()); + const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + LiveRegs.init(TRI); bool Changed = false; for (auto &MBB : F) diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 00cbbd1..f305e85 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -16,6 +16,7 @@ using namespace llvm; +extern cl::opt<bool> MISchedPostRA; extern "C" void LLVMInitializeSystemZTarget() { // Register the target. RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget); @@ -32,7 +33,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) { VectorABI = false; SmallVector<StringRef, 3> Features; - FS.split(Features, ",", -1, false /* KeepEmpty */); + FS.split(Features, ',', -1, false /* KeepEmpty */); for (auto &Feature : Features) { if (Feature == "vector" || Feature == "+vector") VectorABI = true; @@ -130,6 +131,13 @@ void SystemZPassConfig::addPreSched2() { } void SystemZPassConfig::addPreEmitPass() { + + // Do instruction shortening before compare elimination because some + // vector instructions will be shortened into opcodes that compare + // elimination recognizes. 
+ if (getOptLevel() != CodeGenOpt::None) + addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); + // We eliminate comparisons here rather than earlier because some // transformations can change the set of available CC values and we // generally want those transformations to have priority. This is @@ -155,9 +163,17 @@ void SystemZPassConfig::addPreEmitPass() { // preventing that would be a win or not. if (getOptLevel() != CodeGenOpt::None) addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false); - if (getOptLevel() != CodeGenOpt::None) - addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); addPass(createSystemZLongBranchPass(getSystemZTargetMachine())); + + // Do final scheduling after all other optimizations, to get an + // optimal input for the decoder (branch relaxation must happen + // after block placement). + if (getOptLevel() != CodeGenOpt::None) { + if (MISchedPostRA) + addPass(&PostMachineSchedulerID); + else + addPass(&PostRASchedulerID); + } } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { @@ -165,7 +181,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(SystemZTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h index 0a81e1f..1a8f1f7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h @@ -43,6 +43,9 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool targetSchedulesPostRAScheduling() const override { return true; }; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 5a87df1..5ff5b21 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; // //===----------------------------------------------------------------------===// -unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -63,8 +63,8 @@ unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -181,8 +181,8 @@ unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, return SystemZTTIImpl::getIntImmCost(Imm, Ty); } -unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 4b80973..9ae736d 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -28,7 +28,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { const SystemZTargetLowering *getTLI() const { return TLI; } public: - explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F) + explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -42,12 +42,11 @@ public: /// \name Scalar TTI Implementations /// @{ - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp index 19b5e2a..a0b0d8f 100644 --- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -43,7 +43,6 @@ using namespace llvm; void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { Ctx = &ctx; - DL = TM.getDataLayout(); InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(), TM.getCodeModel(), *Ctx); } @@ -107,7 +106,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase( assert(!Suffix.empty()); SmallString<60> NameStr; - NameStr += DL->getPrivateGlobalPrefix(); + NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix(); TM.getNameWithPrefix(NameStr, GV, Mang); NameStr.append(Suffix.begin(), Suffix.end()); return Ctx->getOrCreateSymbol(NameStr); @@ -120,7 +119,7 @@ MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol( } void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer, - const TargetMachine &TM, + const DataLayout &, const MCSymbol *Sym) const { } @@ -170,14 +169,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // If the initializer for the global contains something that requires a // relocation, then we may have to drop this into a writable data section // even though it is marked const. - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: + if (!C->needsRelocation()) { // If the global is required to have a unique address, it can't be put // into a mergable section: just drop it into the general read-only // section instead. if (!GVar->hasUnnamedAddr()) return SectionKind::getReadOnly(); - + // If initializer is a null-terminated string, put it in a "cstring" // section of the right width. if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) { @@ -200,7 +198,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Otherwise, just drop it into a mergable constant section. If we have // a section for this size, use it, otherwise use the arbitrary sized // mergable section. 
- switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) { + switch (GV->getParent()->getDataLayout().getTypeAllocSize(C->getType())) { case 4: return SectionKind::getMergeableConst4(); case 8: return SectionKind::getMergeableConst8(); case 16: return SectionKind::getMergeableConst16(); @@ -208,20 +206,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, return SectionKind::getReadOnly(); } - case Constant::LocalRelocation: - // In static relocation model, the linker will resolve all addresses, so - // the relocation entries will actually be constants by the time the app - // starts up. However, we can't put this into a mergable section, because - // the linker doesn't take relocations into consideration when it tries to - // merge entries in the section. - if (ReloModel == Reloc::Static) - return SectionKind::getReadOnly(); - - // Otherwise, the dynamic linker needs to fix it up, put it in the - // writable data.rel.local section. - return SectionKind::getReadOnlyWithRelLocal(); - - case Constant::GlobalRelocations: + } else { // In static relocation model, the linker will resolve all addresses, so // the relocation entries will actually be constants by the time the app // starts up. However, we can't put this into a mergable section, because @@ -242,17 +227,11 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // globals together onto fewer pages, improving the locality of the dynamic // linker. if (ReloModel == Reloc::Static) - return SectionKind::getDataNoRel(); - - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: - return SectionKind::getDataNoRel(); - case Constant::LocalRelocation: - return SectionKind::getDataRelLocal(); - case Constant::GlobalRelocations: - return SectionKind::getDataRel(); - } - llvm_unreachable("Invalid relocation"); + return SectionKind::getData(); + + if (C->needsRelocation()) + return SectionKind::getData(); + return SectionKind::getData(); } /// This method computes the appropriate section to emit the specified global @@ -273,7 +252,8 @@ TargetLoweringObjectFile::SectionForGlobal(const GlobalValue *GV, MCSection *TargetLoweringObjectFile::getSectionForJumpTable( const Function &F, Mangler &Mang, const TargetMachine &TM) const { - return getSectionForConstant(SectionKind::getReadOnly(), /*C=*/nullptr); + return getSectionForConstant(F.getParent()->getDataLayout(), + SectionKind::getReadOnly(), /*C=*/nullptr); } bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( @@ -296,9 +276,8 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( /// Given a mergable constant with the specified size and relocation /// information, return a section that it should be placed in. 
-MCSection * -TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; @@ -345,7 +324,7 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol } void TargetLoweringObjectFile::getNameWithPrefix( - SmallVectorImpl<char> &OutName, const GlobalValue *GV, - bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { - Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + Mang.getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false); } diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp index 83174c2..850c93c 100644 --- a/contrib/llvm/lib/Target/TargetMachine.cpp +++ b/contrib/llvm/lib/Target/TargetMachine.cpp @@ -150,24 +150,11 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(F.getParent()->getDataLayout()); }); } -static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, - const MCSection &Section) { - if (!AsmInfo.isSectionAtomizableBySymbols(Section)) - return true; - - // If it is not dead stripped, it is safe to use private labels. - const MCSectionMachO &SMO = cast<MCSectionMachO>(Section); - if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP)) - return true; - - return false; -} - void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, const GlobalValue *GV, Mangler &Mang, bool MayAlwaysUsePrivate) const { @@ -177,11 +164,8 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, Mang.getNameWithPrefix(Name, GV, false); return; } - SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this); const TargetLoweringObjectFile *TLOF = getObjFileLowering(); - const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this); - bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection); - TLOF->getNameWithPrefix(Name, GV, CannotUsePrivateLabel, Mang, *this); + TLOF->getNameWithPrefix(Name, GV, Mang, *this); } MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const { diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp index 7199235..f82566c 100644 --- a/contrib/llvm/lib/Target/TargetMachineC.cpp +++ b/contrib/llvm/lib/Target/TargetMachineC.cpp @@ -32,17 +32,25 @@ using namespace llvm; -inline TargetMachine *unwrap(LLVMTargetMachineRef P) { - return reinterpret_cast<TargetMachine*>(P); +namespace llvm { +// Friend to the TargetMachine, access legacy API that are made private in C++ +struct C_API_PRIVATE_ACCESS { + static const DataLayout &getDataLayout(const TargetMachine &T) { + return T.getDataLayout(); + } +}; +} + +static TargetMachine *unwrap(LLVMTargetMachineRef P) { + return reinterpret_cast<TargetMachine *>(P); } -inline Target *unwrap(LLVMTargetRef P) { +static Target *unwrap(LLVMTargetRef P) { return reinterpret_cast<Target*>(P); } -inline LLVMTargetMachineRef wrap(const TargetMachine *P) { - return - reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P)); +static LLVMTargetMachineRef 
wrap(const TargetMachine *P) { + return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P)); } -inline LLVMTargetRef wrap(const Target * P) { +static LLVMTargetRef wrap(const Target * P) { return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P)); } @@ -69,16 +77,16 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) { LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T, char **ErrorMessage) { std::string Error; - + *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error)); - + if (!*T) { if (ErrorMessage) *ErrorMessage = strdup(Error.c_str()); return 1; } - + return 0; } @@ -145,10 +153,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, CM, OL)); } - -void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { - delete unwrap(T); -} +void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) { const Target* target = &(unwrap(T)->getTarget()); @@ -170,8 +175,9 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) { return strdup(StringRep.c_str()); } +/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) { - return wrap(unwrap(T)->getDataLayout()); + return wrap(&C_API_PRIVATE_ACCESS::getDataLayout(*unwrap(T))); } void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T, @@ -190,14 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, std::string error; - const DataLayout *td = TM->getDataLayout(); - - if (!td) { - error = "No DataLayout in TargetMachine"; - *ErrorMessage = strdup(error.c_str()); - return true; - } - Mod->setDataLayout(*td); + Mod->setDataLayout(TM->createDataLayout()); TargetMachine::CodeGenFileType ft; switch (codegen) { @@ -239,7 +238,6 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, SmallString<0> CodeString; raw_svector_ostream OStream(CodeString); bool Result = LLVMTargetMachineEmit(T, M, OStream, codegen, ErrorMessage); - OStream.flush(); StringRef Data = OStream.str(); *OutMemBuf = diff --git a/contrib/llvm/lib/Target/TargetRecip.cpp b/contrib/llvm/lib/Target/TargetRecip.cpp index 42bc487..d41b643 100644 --- a/contrib/llvm/lib/Target/TargetRecip.cpp +++ b/contrib/llvm/lib/Target/TargetRecip.cpp @@ -26,7 +26,7 @@ using namespace llvm; // the key strings for queries and command-line inputs. // In addition, the command-line interface recognizes the global parameters // "all", "none", and "default". 
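// As an illustration of the accepted syntax (the exact option spelling
// depends on the front end driving this code): a comma-separated list such
// as
//
//   divf:2,vec-divd
//
// enables the scalar float division estimate with two refinement steps and
// the vector double division estimate with the target's default step count;
// parseRefinementStep() below is what splits the optional ":N" suffix off
// each key.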
-static const char *RecipOps[] = { +static const char *const RecipOps[] = { "divd", "divf", "vec-divd", @@ -46,7 +46,7 @@ TargetRecip::TargetRecip() { RecipMap.insert(std::make_pair(RecipOps[i], RecipParams())); } -static bool parseRefinementStep(const StringRef &In, size_t &Position, +static bool parseRefinementStep(StringRef In, size_t &Position, uint8_t &Value) { const char RefStepToken = ':'; Position = In.find(RefStepToken); @@ -175,7 +175,7 @@ TargetRecip::TargetRecip(const std::vector<std::string> &Args) : parseIndividualParams(Args); } -bool TargetRecip::isEnabled(const StringRef &Key) const { +bool TargetRecip::isEnabled(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.Enabled != Uninitialized && @@ -183,7 +183,7 @@ bool TargetRecip::isEnabled(const StringRef &Key) const { return Iter->second.Enabled; } -unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { +unsigned TargetRecip::getRefinementSteps(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.RefinementSteps != Uninitialized && @@ -192,7 +192,7 @@ unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { } /// Custom settings (previously initialized values) override target defaults. -void TargetRecip::setDefaults(const StringRef &Key, bool Enable, +void TargetRecip::setDefaults(StringRef Key, bool Enable, unsigned RefSteps) { if (Key == "all") { for (auto &KV : RecipMap) { @@ -213,7 +213,7 @@ void TargetRecip::setDefaults(const StringRef &Key, bool Enable, bool TargetRecip::operator==(const TargetRecip &Other) const { for (const auto &KV : RecipMap) { - const StringRef &Op = KV.first; + StringRef Op = KV.first; const RecipParams &RP = KV.second; const RecipParams &OtherRP = Other.RecipMap.find(Op)->second; if (RP.RefinementSteps != OtherRP.RefinementSteps) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt new file mode 100644 index 0000000..5e55e29 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMWebAssemblyDisassembler + WebAssemblyDisassembler.cpp + ) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt new file mode 100644 index 0000000..a452ca1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===-- ./lib/Target/WebAssembly/Disassembler/LLVMBuild.txt -----*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = WebAssemblyDisassembler +parent = WebAssembly +required_libraries = MCDisassembler WebAssemblyInfo Support +add_to_library_groups = WebAssembly diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile new file mode 100644 index 0000000..bcd36ba --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMWebAssemblyDisassembler + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp new file mode 100644 index 0000000..0143b10 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -0,0 +1,148 @@ +//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file is part of the WebAssembly Disassembler. +/// +/// It contains code to translate the data produced by the decoder into +/// MCInsts. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-disassembler" + +namespace { +class WebAssemblyDisassembler final : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MCII; + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const override; + +public: + WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MCII) + : MCDisassembler(STI, Ctx), MCII(std::move(MCII)) {} +}; +} // end anonymous namespace + +static MCDisassembler *createWebAssemblyDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MCII(T.createMCInstrInfo()); + return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII)); +} + +extern "C" void LLVMInitializeWebAssemblyDisassembler() { + // Register the disassembler for each target. 
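// Both the wasm32 and wasm64 targets share one factory function, since the
// prototype encoding handled below does not vary with the pointer width.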
+ TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget32, + createWebAssemblyDisassembler); + TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget64, + createWebAssemblyDisassembler); +} + +MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( + MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/, + raw_ostream &OS, raw_ostream &CS) const { + Size = 0; + uint64_t Pos = 0; + + // Read the opcode. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + + if (Opcode >= WebAssembly::INSTRUCTION_LIST_END) + return MCDisassembler::Fail; + + MI.setOpcode(Opcode); + const MCInstrDesc &Desc = MCII->get(Opcode); + unsigned NumFixedOperands = Desc.NumOperands; + + // If it's variadic, read the number of extra operands. + unsigned NumExtraOperands = 0; + if (Desc.isVariadic()) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + NumExtraOperands = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + } + + // Read the fixed operands. These are described by the MCInstrDesc. + for (unsigned i = 0; i < NumFixedOperands; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + switch (Info.OperandType) { + case MCOI::OPERAND_IMMEDIATE: + case WebAssembly::OPERAND_BASIC_BLOCK: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createImm(Imm)); + break; + } + case MCOI::OPERAND_REGISTER: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createReg(Reg)); + break; + } + case WebAssembly::OPERAND_FPIMM: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Bits = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + double Imm; + memcpy(&Imm, &Bits, sizeof(Imm)); + MI.addOperand(MCOperand::createFPImm(Imm)); + break; + } + default: + llvm_unreachable("unimplemented operand kind"); + } + } + + // Read the extra operands. + assert(NumExtraOperands == 0 || Desc.isVariadic()); + for (unsigned i = 0; i < NumExtraOperands; ++i) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) { + // Decode extra immediate operands. + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createImm(Imm)); + } else { + // Decode extra register operands. 
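// As with every other field in this prototype encoding, each extra
// register number occupies a full little-endian uint64 word.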
+ uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createReg(Reg)); + } + Pos += sizeof(uint64_t); + } + + Size = Pos; + return MCDisassembler::Success; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index fbb985a..9a95150 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -13,7 +13,11 @@ //===----------------------------------------------------------------------===// #include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -21,23 +25,164 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include <cctype> +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" +#include "WebAssemblyGenAsmWriter.inc" + WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {} void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - llvm_unreachable("TODO: implement printRegName"); + assert(RegNo != WebAssemblyFunctionInfo::UnusedReg); + // Note that there's an implicit get_local/set_local here! + OS << "$" << RegNo; } void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) { - llvm_unreachable("TODO: implement printInst"); + const MCSubtargetInfo & /*STI*/) { + // Print the instruction (this uses the AsmStrings from the .td files). + printInstruction(MI, OS); + + // Print any additional variadic operands. + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (Desc.isVariadic()) + for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) { + if (i != 0) + OS << ", "; + printOperand(MI, i, OS); + } + + // Print any added annotation. + printAnnotation(OS, Annot); + + if (CommentStream) { + // Observe any effects on the control flow stack, for use in annotating + // control flow label references. + switch (MI->getOpcode()) { + default: + break; + case WebAssembly::LOOP: { + // Grab the TopLabel value first so that labels print in numeric order. + uint64_t TopLabel = ControlFlowCounter++; + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + printAnnotation(OS, "label" + utostr(TopLabel) + ':'); + ControlFlowStack.push_back(std::make_pair(TopLabel, true)); + break; + } + case WebAssembly::BLOCK: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + break; + case WebAssembly::END_LOOP: + ControlFlowStack.pop_back(); + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + case WebAssembly::END_BLOCK: + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + } + + // Annotate any control flow label references. 
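// Branch operands name their targets by relative depth: 0 is the innermost
// enclosing construct, which is why rbegin()[Depth] below indexes
// ControlFlowStack from the top.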
+ unsigned NumFixedOperands = Desc.NumOperands; + SmallSet<uint64_t, 8> Printed; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + if (!(i < NumFixedOperands + ? (Info.OperandType == WebAssembly::OPERAND_BASIC_BLOCK) + : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel))) + continue; + uint64_t Depth = MI->getOperand(i).getImm(); + if (!Printed.insert(Depth).second) + continue; + const auto &Pair = ControlFlowStack.rbegin()[Depth]; + printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") + + " to label" + utostr(Pair.first)); + } + } +} + +static std::string toString(const APFloat &FP) { + static const size_t BufBytes = 128; + char buf[BufBytes]; + if (FP.isNaN()) + assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) || + FP.bitwiseIsEqual( + APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) && + "convertToHexString handles neither SNaN nor NaN payloads"); + // Use C99's hexadecimal floating-point representation. + auto Written = FP.convertToHexString( + buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven); + (void)Written; + assert(Written != 0); + assert(Written < BufBytes); + return buf; +} + +void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops register ops don't use TSFlags"); + unsigned WAReg = Op.getReg(); + if (int(WAReg) >= 0) + printRegName(O, WAReg); + else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs()) + O << "$pop" << (WAReg & INT32_MAX); + else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) + O << "$push" << (WAReg & INT32_MAX); + else + O << "$discard"; + // Add a '=' suffix if this is a def. + if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) + O << '='; + } else if (Op.isImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops immediate ops"); + // TODO: (MII.get(MI->getOpcode()).TSFlags & + // WebAssemblyII::VariableOpImmediateIsLabel) + // can tell us whether this is an immediate referencing a label in the + // control flow stack, and it may be nice to pretty-print. 
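// An illustrative sketch of what such pretty-printing could build on: for a
// branch whose depth immediate is 1, the annotation logic in printInst()
// above already emits a comment of the form
//
//   # 1: up to label0
//
// where "up" means the target is a loop top and "down" a block end.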
+ O << Op.getImm(); + } else if (Op.isFPImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops floating point ops don't use TSFlags"); + O << toString(APFloat(Op.getFPImm())); + } else { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops expr ops"); + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +const char *llvm::WebAssembly::TypeToString(MVT Ty) { + switch (Ty.SimpleTy) { + case MVT::i32: + return "i32"; + case MVT::i64: + return "i64"; + case MVT::f32: + return "f32"; + case MVT::f64: + return "f64"; + default: + llvm_unreachable("unsupported type"); + } } diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 70fcef2..cd6c59a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -16,14 +16,16 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/MachineValueType.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; -class WebAssemblyInstPrinter : public MCInstPrinter { +class WebAssemblyInstPrinter final : public MCInstPrinter { + uint64_t ControlFlowCounter; + SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack; + public: WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI); @@ -31,8 +33,21 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, const MCSubtargetInfo &STI) override; + + // Used by tblegen code. + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); }; +namespace WebAssembly { + +const char *TypeToString(MVT Ty); + +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp new file mode 100644 index 0000000..bba06f6 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -0,0 +1,100 @@ +//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyAsmBackend class. 
+/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class WebAssemblyAsmBackend final : public MCAsmBackend { + bool Is64Bit; + +public: + explicit WebAssemblyAsmBackend(bool Is64Bit) + : MCAsmBackend(), Is64Bit(Is64Bit) {} + ~WebAssemblyAsmBackend() override {} + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; + + // No instruction requires relaxation + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + + unsigned getNumFixupKinds() const override { + // We currently just use the generic fixups in MCFixup.h and don't have any + // target-specific fixups. + return 0; + } + + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {} + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; +}; + +bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, + MCObjectWriter *OW) const { + if (Count == 0) + return true; + + // FIXME: Do something. + return false; +} + +void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); + unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8); + if (!Value) + return; // Doesn't change encoding. + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +MCObjectWriter * +WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { + return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0); +} +} // end anonymous namespace + +MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { + return new WebAssemblyAsmBackend(TT.isArch64Bit()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp new file mode 100644 index 0000000..2bb58b3 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp @@ -0,0 +1,66 @@ +//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file handles ELF-specific object emission, converting LLVM's +/// internal fixups into the appropriate relocations. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +namespace { +class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter { +public: + WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI); + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; +}; +} // end anonymous namespace + +WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit, + uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY, + /*HasRelocationAddend=*/false) {} + +unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + // WebAssembly functions are not allocated in the address space. To resolve a + // pointer to a function, we must use a special relocation type. + if (const MCSymbolRefExpr *SyExp = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION) + return ELF::R_WEBASSEMBLY_FUNCTION; + + switch (Fixup.getKind()) { + case FK_Data_4: + assert(!is64Bit() && "4-byte relocations only supported on wasm32"); + return ELF::R_WEBASSEMBLY_DATA; + case FK_Data_8: + assert(is64Bit() && "8-byte relocations only supported on wasm64"); + return ELF::R_WEBASSEMBLY_DATA; + default: + llvm_unreachable("unimplemented fixup kind"); + } +} + +MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = + new WebAssemblyELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index 55346f7..02c717a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -23,15 +23,16 @@ using namespace llvm; WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {} WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { - PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit(); + PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; // TODO: What should MaxInstLength be? - PrivateGlobalPrefix = ""; - PrivateLabelPrefix = ""; - UseDataRegionDirectives = true; + // Use .skip instead of .zero because .zero is confusing when used with two + // arguments (it doesn't actually zero things out). + ZeroDirective = "\t.skip\t"; + Data8bitsDirective = "\t.int8\t"; Data16bitsDirective = "\t.int16\t"; Data32bitsDirective = "\t.int32\t"; @@ -41,9 +42,6 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { COMMDirectiveAlignmentIsInBytes = false; LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment; - HasDotTypeDotSizeDirective = false; - HasSingleParameterDotFile = false; - SupportsDebugInformation = true; // For now, WebAssembly does not support exceptions. 
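With the directive overrides above, zero fill is emitted as ".skip N" rather than the two-argument-prone ".zero", and sized data uses the explicit .intN spellings, so a data fragment prints along these lines (a sketch; exact output depends on the AsmPrinter):

	.int32	1000000
	.skip	8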
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h index d2b8fb7..2dcf2cd 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h @@ -15,13 +15,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class Triple; -class WebAssemblyMCAsmInfo final : public MCAsmInfo { +class WebAssemblyMCAsmInfo final : public MCAsmInfoELF { public: explicit WebAssemblyMCAsmInfo(const Triple &T); ~WebAssemblyMCAsmInfo() override; diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp new file mode 100644 index 0000000..f409bd7 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -0,0 +1,92 @@ +//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyMCCodeEmitter class. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + +namespace { +class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { + const MCInstrInfo &MCII; + const MCContext &Ctx; + + // Implementation generated by tablegen. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + +public: + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) {} +}; +} // end anonymous namespace + +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx) { + return new WebAssemblyMCCodeEmitter(MCII, Ctx); +} + +void WebAssemblyMCCodeEmitter::encodeInstruction( + const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // FIXME: This is not the real binary encoding. This is an extremely + // over-simplified encoding where we just use uint64_t for everything. This + // is a temporary measure. 
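// Concretely: an instruction with opcode K and two register operands is
// written as three little-endian 64-bit words,
//
//   [ K ][ reg0 ][ reg1 ]
//
// and a variadic instruction inserts one extra word right after the opcode
// holding the count of operands beyond Desc.NumOperands. The disassembler
// in WebAssemblyDisassembler.cpp reads this same layout back, and the fixup
// offsets created below are computed against it.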
+ support::endian::Writer<support::little>(OS).write<uint64_t>(MI.getOpcode()); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (Desc.isVariadic()) + support::endian::Writer<support::little>(OS).write<uint64_t>( + MI.getNumOperands() - Desc.NumOperands); + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getReg()); + } else if (MO.isImm()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getImm()); + } else if (MO.isFPImm()) { + support::endian::Writer<support::little>(OS).write<double>(MO.getFPImm()); + } else if (MO.isExpr()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(0); + Fixups.push_back(MCFixup::create( + (1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t), + MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4, + MI.getLoc())); + ++MCNumFixups; + } else { + llvm_unreachable("unexpected operand kind"); + } + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +#include "WebAssemblyGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 224aa77..37000f1 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -15,10 +15,10 @@ #include "WebAssemblyMCTargetDesc.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "WebAssemblyMCAsmInfo.h" +#include "WebAssemblyTargetStreamer.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -26,34 +26,98 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-target-desc" +#define GET_INSTRINFO_MC_DESC +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_MC_DESC #include "WebAssemblyGenSubtargetInfo.inc" #define GET_REGINFO_MC_DESC #include "WebAssemblyGenRegisterInfo.inc" -static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI, - const Triple &TT) { - MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT); - return MAI; +static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, + const Triple &TT) { + return new WebAssemblyMCAsmInfo(TT); +} + +static MCInstrInfo *createMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitWebAssemblyMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitWebAssemblyMCRegisterInfo(X, 0); + return X; +} + +static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + assert(SyntaxVariant == 0); + return new WebAssemblyInstPrinter(MAI, MII, MRI); +} + +static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo & /*MRI*/, + MCContext &Ctx) { + return createWebAssemblyMCCodeEmitter(MCII, Ctx); +} + +static MCAsmBackend *createAsmBackend(const Target & /*T*/, + const MCRegisterInfo & /*MRI*/, + const Triple &TT, StringRef /*CPU*/) { + return createWebAssemblyAsmBackend(TT); } -static MCInstPrinter * -createWebAssemblyMCInstPrinter(const Triple &T, unsigned 
SyntaxVariant, - const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { - if (SyntaxVariant == 0 || SyntaxVariant == 1) - return new WebAssemblyInstPrinter(MAI, MII, MRI); - return nullptr; +static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS) { + return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) { + return new WebAssemblyTargetELFStreamer(S); +} + +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter * /*InstPrint*/, + bool /*isVerboseAsm*/) { + return new WebAssemblyTargetAsmStreamer(S, OS); } // Force static initialization. extern "C" void LLVMInitializeWebAssemblyTargetMC() { for (Target *T : {&TheWebAssemblyTarget32, &TheWebAssemblyTarget64}) { // Register the MC asm info. - RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo); + RegisterMCAsmInfoFn X(*T, createMCAsmInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo); // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter); + + // Register the MC code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter); + + // Register the ASM Backend. + TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo); + + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index eebf5b7..9bac4f8 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -15,32 +15,61 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" -#include <string> namespace llvm { -class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCRegisterInfo; class MCObjectWriter; -class MCStreamer; class MCSubtargetInfo; -class MCTargetStreamer; -class StringRef; class Target; class Triple; -class raw_ostream; +class raw_pwrite_stream; extern Target TheWebAssemblyTarget32; extern Target TheWebAssemblyTarget64; -MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx); + +MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); + +MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint8_t OSABI); + +namespace WebAssembly { +enum OperandType { + /// Basic block label in a branch construct. + OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET, + /// Floating-point immediate. 
+ OPERAND_FPIMM +}; + +/// WebAssembly-specific directive identifiers. +enum Directive { + // FIXME: This is not the real binary encoding. + DotParam = UINT64_MAX - 0, ///< .param + DotResult = UINT64_MAX - 1, ///< .result + DotLocal = UINT64_MAX - 2, ///< .local + DotEndFunc = UINT64_MAX - 3, ///< .endfunc +}; + +} // end namespace WebAssembly + +namespace WebAssemblyII { +enum { + // For variadic instructions, this flag indicates whether an operand + // in the variable_ops range is an immediate value. + VariableOpIsImmediate = (1 << 0), + // For immediate values in the variable_ops range, this flag indicates + // whether the value represents a control-flow label. + VariableOpImmediateIsLabel = (1 << 1), +}; +} // end namespace WebAssemblyII } // end namespace llvm @@ -50,6 +79,11 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, #define GET_REGINFO_ENUM #include "WebAssemblyGenRegisterInfo.inc" +// Defines symbolic names for the WebAssembly instructions. +// +#define GET_INSTRINFO_ENUM +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_ENUM #include "WebAssemblyGenSubtargetInfo.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp new file mode 100644 index 0000000..1d28228 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -0,0 +1,94 @@ +//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetStreamer.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "WebAssemblyMCTargetDesc.h" +#include "WebAssemblyTargetObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S) {} + +WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer( + MCStreamer &S, formatted_raw_ostream &OS) + : WebAssemblyTargetStreamer(S), OS(OS) {} + +WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S) + : WebAssemblyTargetStreamer(S) {} + +static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) { + bool First = true; + for (MVT Type : Types) { + if (First) + First = false; + else + OS << ", "; + OS << WebAssembly::TypeToString(Type); + } + OS << '\n'; +} + +void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) { + OS << "\t.param \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) { + OS << "\t.result \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) { + OS << "\t.local \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } + +// FIXME: What follows is not the real binary encoding. + +static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) { + Streamer.EmitIntValue(Types.size(), sizeof(uint64_t)); + for (MVT Type : Types) + Streamer.EmitIntValue(Type.SimpleTy, sizeof(uint64_t)); +} + +void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotParam, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotResult, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotLocal, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitEndFunc() { + Streamer.EmitIntValue(WebAssembly::DotEndFunc, sizeof(uint64_t)); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h new file mode 100644 index 0000000..c66a515 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -0,0 +1,68 @@ +//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H + +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCELFStreamer; + +/// WebAssembly-specific streamer interface, to implement support +/// WebAssembly-specific assembly directives. +class WebAssemblyTargetStreamer : public MCTargetStreamer { +public: + explicit WebAssemblyTargetStreamer(MCStreamer &S); + + /// .param + virtual void emitParam(ArrayRef<MVT> Types) = 0; + /// .result + virtual void emitResult(ArrayRef<MVT> Types) = 0; + /// .local + virtual void emitLocal(ArrayRef<MVT> Types) = 0; + /// .endfunc + virtual void emitEndFunc() = 0; +}; + +/// This part is for ascii assembly output +class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { + formatted_raw_ostream &OS; + +public: + WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +/// This part is for ELF object output +class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer { +public: + explicit WebAssemblyTargetELFStreamer(MCStreamer &S); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt index 63e02c4..b97ea45 100644 --- a/contrib/llvm/lib/Target/WebAssembly/README.txt +++ b/contrib/llvm/lib/Target/WebAssembly/README.txt @@ -12,6 +12,16 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +The backend is built, tested and archived on the following waterfall: + https://build.chromium.org/p/client.wasm.llvm/console + +The backend's bringup is done using the GCC torture test suite first since it +doesn't require C library support. Current known failures are in +known_gcc_test_failures.txt, all other tests should pass. The waterfall will +turn red if not. Once most of these pass, further testing will use LLVM's own +test suite. The tests can be run locally using: + github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py + Interesting work that remains to be done: * Write a pass to restructurize irreducible control flow. This needs to be done before register allocation to be efficient, because it may duplicate basic @@ -19,8 +29,60 @@ Interesting work that remains to be done: level. Note that LLVM's GPU code has such a pass, but it linearizes control flow (e.g. both sides of branches execute and are masked) which is undesirable for WebAssembly. -* Basic relooper to expose control flow as an AST. -* Figure out how to properly use MC for virtual ISAs. This may require some - refactoring of MC. + +//===---------------------------------------------------------------------===// + +set_local instructions have a return value. We should (a) model this, +and (b) write optimizations which take advantage of it. Keep in mind that +many set_local instructions are implicit! 
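For instance, under the AstSemantics draft cited at the top of this file,
  (set_local 0 (i32.add (get_local 1) (get_local 2)))
evaluates to the stored sum, so a consumer could use that result directly
rather than re-reading local 0 afterwards.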
+ +//===---------------------------------------------------------------------===// + +Br, br_if, and tableswitch instructions can support having a value on the +expression stack across the jump (sometimes). We should (a) model this, and +(b) extend the stackifier to utilize it. + +//===---------------------------------------------------------------------===// + +The min/max operators aren't exactly a<b?a:b because of NaN and negative zero +behavior. The ARM target has the same kind of min/max instructions and has +implemented optimizations for them; we should do similar optimizations for +WebAssembly. + +//===---------------------------------------------------------------------===// + +AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM. +Would these be useful to run for WebAssembly too? Also, it has an option to +run SimplifyCFG after running the AtomicExpand pass. Would this be useful for +us too? + +//===---------------------------------------------------------------------===// + +When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly? + +//===---------------------------------------------------------------------===// + +Register stackification uses the EXPR_STACK physical register to impose +ordering dependencies on instructions with stack operands. This is pessimistic; +we should consider alternate ways to model stack dependencies. + +//===---------------------------------------------------------------------===// + +Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly, +there are numerous optimization-related hooks that can be overridden in +WebAssemblyTargetLowering. + +//===---------------------------------------------------------------------===// + +Instead of the OptimizeReturned pass, which should consider preserving the +"returned" attribute through to MachineInstrs and extending the StoreResults +pass to do this optimization on calls too. That would also let the +WebAssemblyPeephole pass clean up dead defs for such calls, as it does for +stores. + +//===---------------------------------------------------------------------===// + +Memset/memcpy/memmove should be marked with the "returned" attribute somehow, +even when they are translated through intrinsics. //===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp new file mode 100644 index 0000000..9b718ef --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp @@ -0,0 +1,984 @@ +//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// \brief This implements the Relooper algorithm. This implementation includes +/// optimizations added since the original academic paper [1] was published. +/// +/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In +/// Proceedings of the ACM international conference companion on Object +/// oriented programming systems languages and applications companion +/// (SPLASH '11). ACM, New York, NY, USA, 301-312. 
DOI=10.1145/2048147.2048224 +/// http://doi.acm.org/10.1145/2048147.2048224 +/// +//===-------------------------------------------------------------------===// + +#include "Relooper.h" +#include "WebAssembly.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include <cstring> +#include <cstdlib> +#include <functional> +#include <list> +#include <stack> +#include <string> + +#define DEBUG_TYPE "relooper" + +using namespace llvm; +using namespace Relooper; + +static cl::opt<int> RelooperSplittingFactor( + "relooper-splitting-factor", + cl::desc( + "How much to discount code size when deciding whether to split a node"), + cl::init(5)); + +static cl::opt<unsigned> RelooperMultipleSwitchThreshold( + "relooper-multiple-switch-threshold", + cl::desc( + "How many entries to allow in a multiple before we use a switch"), + cl::init(10)); + +static cl::opt<unsigned> RelooperNestingLimit( + "relooper-nesting-limit", + cl::desc( + "How much nesting is acceptable"), + cl::init(20)); + + +namespace { +/// +/// Implements the relooper algorithm for a function's blocks. +/// +/// Implementation details: The Relooper instance has +/// ownership of the blocks and shapes, and frees them when done. +/// +struct RelooperAlgorithm { + std::deque<Block *> Blocks; + std::deque<Shape *> Shapes; + Shape *Root; + bool MinSize; + int BlockIdCounter; + int ShapeIdCounter; + + RelooperAlgorithm(); + ~RelooperAlgorithm(); + + void AddBlock(Block *New, int Id = -1); + + // Calculates the shapes + void Calculate(Block *Entry); + + // Sets us to try to minimize size + void SetMinSize(bool MinSize_) { MinSize = MinSize_; } +}; + +struct RelooperAnalysis final : public FunctionPass { + static char ID; + RelooperAnalysis() : FunctionPass(ID) {} + const char *getPassName() const override { return "relooper"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + bool runOnFunction(Function &F) override; +}; +} + +// RelooperAnalysis + +char RelooperAnalysis::ID = 0; +FunctionPass *llvm::createWebAssemblyRelooper() { + return new RelooperAnalysis(); +} + +bool RelooperAnalysis::runOnFunction(Function &F) { + DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n"); + RelooperAlgorithm R; + // FIXME: remove duplication between relooper's and LLVM's BBs. + std::map<const BasicBlock *, Block *> BB2B; + std::map<const Block *, const BasicBlock *> B2BB; + for (const BasicBlock &BB : F) { + // FIXME: getName is wrong here, Code is meant to represent amount of code. + // FIXME: use BranchVarInit for switch. + Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr); + R.AddBlock(B); + assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice"); + assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice"); + BB2B[&BB] = B; + B2BB[B] = &BB; + } + for (Block *B : R.Blocks) { + const BasicBlock *BB = B2BB[B]; + for (const BasicBlock *Successor : successors(BB)) + // FIXME: add branch's Condition and Code below. + B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr); + } + R.Calculate(BB2B[&F.getEntryBlock()]); + return false; // Analysis passes don't modify anything. 
+} + +// Helpers + +typedef MapVector<Block *, BlockSet> BlockBlockSetMap; +typedef std::list<Block *> BlockList; + +template <class T, class U> +static bool contains(const T &container, const U &contained) { + return container.count(contained); +} + + +// Branch + +Branch::Branch(const char *ConditionInit, const char *CodeInit) + : Ancestor(nullptr), Labeled(true) { + // FIXME: move from char* to LLVM data structures + Condition = ConditionInit ? strdup(ConditionInit) : nullptr; + Code = CodeInit ? strdup(CodeInit) : nullptr; +} + +Branch::~Branch() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Condition))); + free(static_cast<void *>(const_cast<char *>(Code))); +} + +// Block + +Block::Block(const char *CodeInit, const char *BranchVarInit) + : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) { + // FIXME: move from char* to LLVM data structures + Code = strdup(CodeInit); + BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr; +} + +Block::~Block() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Code))); + free(static_cast<void *>(const_cast<char *>(BranchVar))); +} + +void Block::AddBranchTo(Block *Target, const char *Condition, + const char *Code) { + assert(!contains(BranchesOut, Target) && + "cannot add more than one branch to the same target"); + BranchesOut[Target] = make_unique<Branch>(Condition, Code); +} + +// Relooper + +RelooperAlgorithm::RelooperAlgorithm() + : Root(nullptr), MinSize(false), BlockIdCounter(1), + ShapeIdCounter(0) { // block ID 0 is reserved for clearings +} + +RelooperAlgorithm::~RelooperAlgorithm() { + for (auto Curr : Blocks) + delete Curr; + for (auto Curr : Shapes) + delete Curr; +} + +void RelooperAlgorithm::AddBlock(Block *New, int Id) { + New->Id = Id == -1 ? BlockIdCounter++ : Id; + Blocks.push_back(New); +} + +struct RelooperRecursor { + RelooperAlgorithm *Parent; + RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} +}; + +void RelooperAlgorithm::Calculate(Block *Entry) { + // Scan and optimize the input + struct PreOptimizer : public RelooperRecursor { + PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + BlockSet Live; + + void FindLive(Block *Root) { + BlockList ToInvestigate; + ToInvestigate.push_back(Root); + while (!ToInvestigate.empty()) { + Block *Curr = ToInvestigate.front(); + ToInvestigate.pop_front(); + if (contains(Live, Curr)) + continue; + Live.insert(Curr); + for (const auto &iter : Curr->BranchesOut) + ToInvestigate.push_back(iter.first); + } + } + + // If a block has multiple entries but no exits, and it is small enough, it + // is useful to split it. A common example is a C++ function where + // everything ends up at a final exit block and does some RAII cleanup. 
+ // Without splitting, we will be forced to introduce labelled loops to + // allow reaching the final block + void SplitDeadEnds() { + unsigned TotalCodeSize = 0; + for (const auto &Curr : Live) { + TotalCodeSize += strlen(Curr->Code); + } + BlockSet Splits; + BlockSet Removed; + for (const auto &Original : Live) { + if (Original->BranchesIn.size() <= 1 || + !Original->BranchesOut.empty()) + continue; // only dead ends, for now + if (contains(Original->BranchesOut, Original)) + continue; // cannot split a looping node + if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) > + TotalCodeSize / RelooperSplittingFactor) + continue; // if splitting increases raw code size by a significant + // amount, abort + // Split the node (for simplicity, we replace all the blocks, even + // though we could have reused the original) + DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n"); + for (const auto &Prior : Original->BranchesIn) { + Block *Split = new Block(Original->Code, Original->BranchVar); + Parent->AddBlock(Split, Original->Id); + Split->BranchesIn.insert(Prior); + std::unique_ptr<Branch> Details; + Details.swap(Prior->BranchesOut[Original]); + Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition, + Details->Code); + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Branch *Details = iter.second.get(); + Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition, + Details->Code); + Post->BranchesIn.insert(Split); + } + Splits.insert(Split); + Removed.insert(Original); + } + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Post->BranchesIn.remove(Original); + } + } + for (const auto &iter : Splits) + Live.insert(iter); + for (const auto &iter : Removed) + Live.remove(iter); + } + }; + PreOptimizer Pre(this); + Pre.FindLive(Entry); + + // Add incoming branches from live blocks, ignoring dead code + for (unsigned i = 0; i < Blocks.size(); i++) { + Block *Curr = Blocks[i]; + if (!contains(Pre.Live, Curr)) + continue; + for (const auto &iter : Curr->BranchesOut) + iter.first->BranchesIn.insert(Curr); + } + + if (!MinSize) + Pre.SplitDeadEnds(); + + // Recursively process the graph + + struct Analyzer : public RelooperRecursor { + Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + + // Add a shape to the list of shapes in this Relooper calculation + void Notice(Shape *New) { + New->Id = Parent->ShapeIdCounter++; + Parent->Shapes.push_back(New); + } + + // Create a list of entries from a block. 
If LimitTo is provided, only + // results in that set will appear + void GetBlocksOut(Block *Source, BlockSet &Entries, + BlockSet *LimitTo = nullptr) { + for (const auto &iter : Source->BranchesOut) + if (!LimitTo || contains(*LimitTo, iter.first)) + Entries.insert(iter.first); + } + + // Converts/processes all branchings to a specific target + void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor, + BlockSet &From) { + DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type + << "\n"); + for (auto iter = Target->BranchesIn.begin(); + iter != Target->BranchesIn.end();) { + Block *Prior = *iter; + if (!contains(From, Prior)) { + iter++; + continue; + } + std::unique_ptr<Branch> PriorOut; + PriorOut.swap(Prior->BranchesOut[Target]); + PriorOut->Ancestor = Ancestor; + PriorOut->Type = Type; + if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor)) + Multiple->Breaks++; // We are breaking out of this Multiple, so need a + // loop + iter++; // carefully increment iter before erasing + Target->BranchesIn.remove(Prior); + Target->ProcessedBranchesIn.insert(Prior); + Prior->ProcessedBranchesOut[Target].swap(PriorOut); + } + } + + Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) { + DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n"); + SimpleShape *Simple = new SimpleShape; + Notice(Simple); + Simple->Inner = Inner; + Inner->Parent = Simple; + if (Blocks.size() > 1) { + Blocks.remove(Inner); + GetBlocksOut(Inner, NextEntries, &Blocks); + BlockSet JustInner; + JustInner.insert(Inner); + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Direct, Simple, JustInner); + } + return Simple; + } + + Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries, + BlockSet &NextEntries) { + // Find the inner blocks in this loop. Proceed backwards from the entries + // until + // you reach a seen block, collecting as you go. + BlockSet InnerBlocks; + BlockSet Queue = Entries; + while (!Queue.empty()) { + Block *Curr = *(Queue.begin()); + Queue.remove(*Queue.begin()); + if (!contains(InnerBlocks, Curr)) { + // This element is new, mark it as inner and remove from outer + InnerBlocks.insert(Curr); + Blocks.remove(Curr); + // Add the elements prior to it + for (const auto &iter : Curr->BranchesIn) + Queue.insert(iter); + } + } + assert(!InnerBlocks.empty()); + + for (const auto &Curr : InnerBlocks) { + for (const auto &iter : Curr->BranchesOut) { + Block *Possible = iter.first; + if (!contains(InnerBlocks, Possible)) + NextEntries.insert(Possible); + } + } + + LoopShape *Loop = new LoopShape(); + Notice(Loop); + + // Solipsize the loop, replacing with break/continue and marking branches + // as Processed (will not affect later calculations) + // A. Branches to the loop entries become a continue to this shape + for (const auto &iter : Entries) + Solipsize(iter, Branch::Continue, Loop, InnerBlocks); + // B. Branches to outside the loop (a next entry) become breaks on this + // shape + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Break, Loop, InnerBlocks); + // Finish up + Shape *Inner = Process(InnerBlocks, Entries, nullptr); + Loop->Inner = Inner; + return Loop; + } + + // For each entry, find the independent group reachable by it. The + // independent group is the entry itself, plus all the blocks it can + // reach that cannot be directly reached by another entry. Note that we + // ignore directly reaching the entry itself by another entry. 
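+    //
+    // As an illustrative sketch (block names are hypothetical): with
+    // entries A and B and edges A->C, B->D, C->E, D->E, the independent
+    // groups are {A, C} and {B, D}; E is reachable from both entries and
+    // so belongs to no group.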
+ // @param Ignore - previous blocks that are irrelevant + void FindIndependentGroups(BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, + BlockSet *Ignore = nullptr) { + typedef std::map<Block *, Block *> BlockBlockMap; + + struct HelperClass { + BlockBlockSetMap &IndependentGroups; + BlockBlockMap Ownership; // For each block, which entry it belongs to. + // We have reached it from there. + + HelperClass(BlockBlockSetMap &IndependentGroupsInit) + : IndependentGroups(IndependentGroupsInit) {} + void InvalidateWithChildren(Block *New) { + // Being in the list means you need to be invalidated + BlockList ToInvalidate; + ToInvalidate.push_back(New); + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Block *Owner = Ownership[Invalidatee]; + // Owner may have been invalidated, do not add to + // IndependentGroups! + if (contains(IndependentGroups, Owner)) + IndependentGroups[Owner].remove(Invalidatee); + if (Ownership[Invalidatee]) { // may have been seen before and + // invalidated already + Ownership[Invalidatee] = nullptr; + for (const auto &iter : Invalidatee->BranchesOut) { + Block *Target = iter.first; + BlockBlockMap::iterator Known = Ownership.find(Target); + if (Known != Ownership.end()) { + Block *TargetOwner = Known->second; + if (TargetOwner) + ToInvalidate.push_back(Target); + } + } + } + } + } + }; + HelperClass Helper(IndependentGroups); + + // We flow out from each of the entries, simultaneously. + // When we reach a new block, we add it as belonging to the one we got to + // it from. + // If we reach a new block that is already marked as belonging to someone, + // it is reachable by two entries and is not valid for any of them. + // Remove it and all it can reach that have been visited. + + // Being in the queue means we just added this item, and + // we need to add its children + BlockList Queue; + for (const auto &Entry : Entries) { + Helper.Ownership[Entry] = Entry; + IndependentGroups[Entry].insert(Entry); + Queue.push_back(Entry); + } + while (!Queue.empty()) { + Block *Curr = Queue.front(); + Queue.pop_front(); + Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership + // map if we are in the queue + if (!Owner) + continue; // we have been invalidated meanwhile after being reached + // from two entries + // Add all children + for (const auto &iter : Curr->BranchesOut) { + Block *New = iter.first; + BlockBlockMap::iterator Known = Helper.Ownership.find(New); + if (Known == Helper.Ownership.end()) { + // New node. Add it, and put it in the queue + Helper.Ownership[New] = Owner; + IndependentGroups[Owner].insert(New); + Queue.push_back(New); + continue; + } + Block *NewOwner = Known->second; + if (!NewOwner) + continue; // We reached an invalidated node + if (NewOwner != Owner) + // Invalidate this and all reachable that we have seen - we reached + // this from two locations + Helper.InvalidateWithChildren(New); + // otherwise, we have the same owner, so do nothing + } + } + + // Having processed all the interesting blocks, we remain with just one + // potential issue: + // If a->b, and a was invalidated, but then b was later reached by + // someone else, we must invalidate b. To check for this, we go over all + // elements in the independent groups, if an element has a parent which + // does *not* have the same owner, we/ must remove it and all its + // children. 
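+      // (An illustration with hypothetical blocks: if a->b was recorded
+      // while a still had an owner, and a was invalidated later, b can be
+      // left in that owner's group even though its parent a no longer has
+      // an owner; the scan below finds such children and invalidates them.)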
+ + for (const auto &iter : Entries) { + BlockSet &CurrGroup = IndependentGroups[iter]; + BlockList ToInvalidate; + for (const auto &iter : CurrGroup) { + Block *Child = iter; + for (const auto &iter : Child->BranchesIn) { + Block *Parent = iter; + if (Ignore && contains(*Ignore, Parent)) + continue; + if (Helper.Ownership[Parent] != Helper.Ownership[Child]) + ToInvalidate.push_back(Child); + } + } + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Helper.InvalidateWithChildren(Invalidatee); + } + } + + // Remove empty groups + for (const auto &iter : Entries) + if (IndependentGroups[iter].empty()) + IndependentGroups.erase(iter); + } + + Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, Shape *Prev, + BlockSet &NextEntries) { + bool Fused = isa<SimpleShape>(Prev); + MultipleShape *Multiple = new MultipleShape(); + Notice(Multiple); + BlockSet CurrEntries; + for (auto &iter : IndependentGroups) { + Block *CurrEntry = iter.first; + BlockSet &CurrBlocks = iter.second; + // Create inner block + CurrEntries.clear(); + CurrEntries.insert(CurrEntry); + for (const auto &CurrInner : CurrBlocks) { + // Remove the block from the remaining blocks + Blocks.remove(CurrInner); + // Find new next entries and fix branches to them + for (auto iter = CurrInner->BranchesOut.begin(); + iter != CurrInner->BranchesOut.end();) { + Block *CurrTarget = iter->first; + auto Next = iter; + Next++; + if (!contains(CurrBlocks, CurrTarget)) { + NextEntries.insert(CurrTarget); + Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks); + } + iter = Next; // increment carefully because Solipsize can remove us + } + } + Multiple->InnerMap[CurrEntry->Id] = + Process(CurrBlocks, CurrEntries, nullptr); + // If we are not fused, then our entries will actually be checked + if (!Fused) + CurrEntry->IsCheckedMultipleEntry = true; + } + // Add entries not handled as next entries, they are deferred + for (const auto &Entry : Entries) + if (!contains(IndependentGroups, Entry)) + NextEntries.insert(Entry); + // The multiple has been created, we can decide how to implement it + if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) { + Multiple->UseSwitch = true; + Multiple->Breaks++; // switch captures breaks + } + return Multiple; + } + + // Main function. + // Process a set of blocks with specified entries, returns a shape + // The Make* functions receive a NextEntries. If they fill it with data, + // those are the entries for the ->Next block on them, and the blocks + // are what remains in Blocks (which Make* modify). In this way + // we avoid recursing on Next (imagine a long chain of Simples, if we + // recursed we could blow the stack). 
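+    //
+    // An illustrative sketch: a straight chain A->B->C (hypothetical block
+    // names) becomes three SimpleShapes linked through their Next fields;
+    // each pass through the loop below handles one link via Make, so chain
+    // length costs no extra call-stack depth.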
+ Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) { + BlockSet *Entries = &InitialEntries; + BlockSet TempEntries[2]; + int CurrTempIndex = 0; + BlockSet *NextEntries; + Shape *Ret = nullptr; + + auto Make = [&](Shape *Temp) { + if (Prev) + Prev->Next = Temp; + if (!Ret) + Ret = Temp; + Prev = Temp; + Entries = NextEntries; + }; + + while (1) { + CurrTempIndex = 1 - CurrTempIndex; + NextEntries = &TempEntries[CurrTempIndex]; + NextEntries->clear(); + + if (Entries->empty()) + return Ret; + if (Entries->size() == 1) { + Block *Curr = *(Entries->begin()); + if (Curr->BranchesIn.empty()) { + // One entry, no looping ==> Simple + Make(MakeSimple(Blocks, Curr, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // One entry, looping ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + + // More than one entry, try to eliminate through a Multiple groups of + // independent blocks from an entry/ies. It is important to remove + // through multiples as opposed to looping since the former is more + // performant. + BlockBlockSetMap IndependentGroups; + FindIndependentGroups(*Entries, IndependentGroups); + + if (!IndependentGroups.empty()) { + // We can handle a group in a multiple if its entry cannot be reached + // by another group. + // Note that it might be reachable by itself - a loop. But that is + // fine, we will create a loop inside the multiple block (which + // is the performant order to do it). + for (auto iter = IndependentGroups.begin(); + iter != IndependentGroups.end();) { + Block *Entry = iter->first; + BlockSet &Group = iter->second; + auto curr = iter++; // iterate carefully, we may delete + for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin(); + iterBranch != Entry->BranchesIn.end(); iterBranch++) { + Block *Origin = *iterBranch; + if (!contains(Group, Origin)) { + // Reached from outside the group, so we cannot handle this + IndependentGroups.erase(curr); + break; + } + } + } + + // As an optimization, if we have 2 independent groups, and one is a + // small dead end, we can handle only that dead end. + // The other then becomes a Next - without nesting in the code and + // recursion in the analysis. + // TODO: if the larger is the only dead end, handle that too + // TODO: handle >2 groups + // TODO: handle not just dead ends, but also that do not branch to the + // NextEntries. However, must be careful there since we create a + // Next, and that Next can prevent eliminating a break (since we no + // longer naturally reach the same place), which may necessitate a + // one-time loop, which makes the unnesting pointless. + if (IndependentGroups.size() == 2) { + // Find the smaller one + auto iter = IndependentGroups.begin(); + Block *SmallEntry = iter->first; + auto SmallSize = iter->second.size(); + iter++; + Block *LargeEntry = iter->first; + auto LargeSize = iter->second.size(); + if (SmallSize != LargeSize) { // ignore the case where they are + // identical - keep things symmetrical + // there + if (SmallSize > LargeSize) { + Block *Temp = SmallEntry; + SmallEntry = LargeEntry; + LargeEntry = Temp; // Note: we did not flip the Sizes too, they + // are now invalid. TODO: use the smaller + // size as a limit? 
+ } + // Check if dead end + bool DeadEnd = true; + BlockSet &SmallGroup = IndependentGroups[SmallEntry]; + for (const auto &Curr : SmallGroup) { + for (const auto &iter : Curr->BranchesOut) { + Block *Target = iter.first; + if (!contains(SmallGroup, Target)) { + DeadEnd = false; + break; + } + } + if (!DeadEnd) + break; + } + if (DeadEnd) + IndependentGroups.erase(LargeEntry); + } + } + + if (!IndependentGroups.empty()) + // Some groups removable ==> Multiple + Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev, + *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // No independent groups, must be loopable ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + } + }; + + // Main + + BlockSet AllBlocks; + for (const auto &Curr : Pre.Live) { + AllBlocks.insert(Curr); + } + + BlockSet Entries; + Entries.insert(Entry); + Root = Analyzer(this).Process(AllBlocks, Entries, nullptr); + assert(Root); + + /// + /// Relooper post-optimizer + /// + struct PostOptimizer { + RelooperAlgorithm *Parent; + std::stack<Shape *> LoopStack; + + PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} + + void ShapeSwitch(Shape* var, + std::function<void (SimpleShape*)> simple, + std::function<void (MultipleShape*)> multiple, + std::function<void (LoopShape*)> loop) { + switch (var->getKind()) { + case Shape::SK_Simple: { + simple(cast<SimpleShape>(var)); + break; + } + case Shape::SK_Multiple: { + multiple(cast<MultipleShape>(var)); + break; + } + case Shape::SK_Loop: { + loop(cast<LoopShape>(var)); + break; + } + } + } + + // Find the blocks that natural control flow can get us directly to, or + // through a multiple that we ignore + void FollowNaturalFlow(Shape *S, BlockSet &Out) { + ShapeSwitch(S, [&](SimpleShape* Simple) { + Out.insert(Simple->Inner); + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FollowNaturalFlow(iter.second, Out); + } + FollowNaturalFlow(Multiple->Next, Out); + }, [&](LoopShape* Loop) { + FollowNaturalFlow(Loop->Inner, Out); + }); + } + + void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) { + if (Root->Next) { + Root->Natural = Root->Next; + FindNaturals(Root->Next, Otherwise); + } else { + Root->Natural = Otherwise; + } + + ShapeSwitch(Root, [](SimpleShape* Simple) { + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FindNaturals(iter.second, Root->Natural); + } + }, [&](LoopShape* Loop){ + FindNaturals(Loop->Inner, Loop->Inner); + }); + } + + // Remove unneeded breaks and continues. + // A flow operation is trivially unneeded if the shape we naturally get to + // by normal code execution is the same as the flow forces us to. + void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr, + LoopShape *LastLoop = nullptr, + unsigned Depth = 0) { + BlockSet NaturalBlocks; + FollowNaturalFlow(Natural, NaturalBlocks); + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + ShapeSwitch( + Root, + [&](SimpleShape* Simple) { + if (Simple->Inner->BranchVar) + LastLoop = + nullptr; // a switch clears out the loop (TODO: only for + // breaks, not continue) + + if (Simple->Next) { + if (!Simple->Inner->BranchVar && + Simple->Inner->ProcessedBranchesOut.size() == 2 && + Depth < RelooperNestingLimit) { + // If there is a next block, we already know at Simple + // creation time to make direct branches, and we can do + // nothing more in general. 
But, we try to optimize the + // case of a break and a direct: This would normally be + // if (break?) { break; } .. + // but if we make sure to nest the else, we can save the + // break, + // if (!break?) { .. } + // This is also better because the more canonical nested + // form is easier to further optimize later. The + // downside is more nesting, which adds to size in builds with + // whitespace. + // Note that we avoid switches, as it complicates control flow + // and is not relevant for the common case we optimize here. + bool Found = false; + bool Abort = false; + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Found = true; + if (!contains(NaturalBlocks, Target)) + Abort = true; + } else if (Details->Type != Branch::Direct) + Abort = true; + } + if (Found && !Abort) { + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else { + assert(Details->Type == Branch::Direct); + Details->Type = Branch::Nested; + } + } + } + Depth++; // this optimization increases depth, for us and all + // our next chain (i.e., until this call returns) + } + Next = Simple->Next; + } else { + // If there is no next then Natural is where we will + // go to by doing nothing, so we can potentially optimize some + // branches to direct. + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type != Branch::Direct && + contains(NaturalBlocks, + Target)) { // note: cannot handle split blocks + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else if (Details->Type == Branch::Break && LastLoop && + LastLoop->Natural == Details->Ancestor->Natural) { + // it is important to simplify breaks, as simpler breaks + // enable other optimizations + Details->Labeled = false; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } + } + } + }, [&](MultipleShape* Multiple) + { + for (const auto &iter : Multiple->InnerMap) { + RemoveUnneededFlows(iter.second, Multiple->Next, + Multiple->Breaks ? 
nullptr : LastLoop, + Depth + 1); + } + Next = Multiple->Next; + }, [&](LoopShape* Loop) + { + RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1); + Next = Loop->Next; + }); + } + } + + // After we know which loops exist, we can calculate which need to be + // labeled + void FindLabeledLoops(Shape *Root) { + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + + ShapeSwitch( + Root, + [&](SimpleShape *Simple) { + MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next); + // If we are fusing a Multiple with a loop into this Simple, then + // visit it now + if (Fused && Fused->Breaks) + LoopStack.push(Fused); + if (Simple->Inner->BranchVar) + LoopStack.push(nullptr); // a switch means breaks are now useless, + // push a dummy + if (Fused) { + if (Fused->UseSwitch) + LoopStack.push(nullptr); // a switch means breaks are now + // useless, push a dummy + for (const auto &iter : Fused->InnerMap) { + FindLabeledLoops(iter.second); + } + } + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break || + Details->Type == Branch::Continue) { + assert(!LoopStack.empty()); + if (Details->Ancestor != LoopStack.top() && Details->Labeled) { + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) { + Multiple->Labeled = true; + } else { + LoopShape *Loop = cast<LoopShape>(Details->Ancestor); + Loop->Labeled = true; + } + } else { + Details->Labeled = false; + } + } + if (Fused && Fused->UseSwitch) + LoopStack.pop(); + if (Simple->Inner->BranchVar) + LoopStack.pop(); + if (Fused && Fused->Breaks) + LoopStack.pop(); + if (Fused) + Next = Fused->Next; + else + Next = Root->Next; + } + } + , [&](MultipleShape* Multiple) { + if (Multiple->Breaks) + LoopStack.push(Multiple); + for (const auto &iter : Multiple->InnerMap) + FindLabeledLoops(iter.second); + if (Multiple->Breaks) + LoopStack.pop(); + Next = Root->Next; + } + , [&](LoopShape* Loop) { + LoopStack.push(Loop); + FindLabeledLoops(Loop->Inner); + LoopStack.pop(); + Next = Root->Next; + }); + } + } + + void Process(Shape * Root) { + FindNaturals(Root); + RemoveUnneededFlows(Root); + FindLabeledLoops(Root); + } + }; + + PostOptimizer(this).Process(Root); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.h b/contrib/llvm/lib/Target/WebAssembly/Relooper.h new file mode 100644 index 0000000..7c564de --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.h @@ -0,0 +1,186 @@ +//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-------------------------------------------------------------------===// +/// +/// \file +/// \brief This defines an optimized C++ implemention of the Relooper +/// algorithm, originally developed as part of Emscripten, which +/// generates a structured AST from arbitrary control flow. 
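+///
+/// As an illustrative sketch (not a literal transcript of this API), an
+/// if/else diamond
+/// \code
+///   Entry -> Then -> Exit
+///   Entry -> Else -> Exit
+/// \endcode
+/// reloops into a SimpleShape for Entry, followed by a MultipleShape
+/// containing Then and Else, followed by a SimpleShape for Exit, with the
+/// pieces linked through Shape::Next.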
+/// +//===-------------------------------------------------------------------===// + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Casting.h" + +#include <cassert> +#include <cstdarg> +#include <cstdio> +#include <deque> +#include <list> +#include <map> +#include <memory> +#include <set> + +namespace llvm { + +namespace Relooper { + +struct Block; +struct Shape; + +/// +/// Info about a branching from one block to another +/// +struct Branch { + enum FlowType { + Direct = 0, // We will directly reach the right location through other + // means, no need for continue or break + Break = 1, + Continue = 2, + Nested = 3 // This code is directly reached, but we must be careful to + // ensure it is nested in an if - it is not reached + // unconditionally, other code paths exist alongside it that we need to make + // sure do not intertwine + }; + Shape + *Ancestor; // If not nullptr, this shape is the relevant one for purposes + // of getting to the target block. We break or continue on it + Branch::FlowType + Type; // If Ancestor is not nullptr, this says whether to break or + // continue + bool Labeled; // If a break or continue, whether we need to use a label + const char *Condition; // The condition for which we branch. For example, + // "my_var == 1". Conditions are checked one by one. + // One of the conditions should have nullptr as the + // condition, in which case it is the default + // FIXME: move from char* to LLVM data structures + const char *Code; // If provided, code that is run right before the branch is + // taken. This is useful for phis + // FIXME: move from char* to LLVM data structures + + Branch(const char *ConditionInit, const char *CodeInit = nullptr); + ~Branch(); +}; + +typedef SetVector<Block *> BlockSet; +typedef MapVector<Block *, Branch *> BlockBranchMap; +typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap; + +/// +/// Represents a basic block of code - some instructions that end with a +/// control flow modifier (a branch, return or throw). +/// +struct Block { + // Branches become processed after we finish the shape relevant to them. For + // example, when we recreate a loop, branches to the loop start become + // continues and are now processed. When we calculate what shape to generate + // from a set of blocks, we ignore processed branches. Blocks own the Branch + // objects they use, and destroy them when done. + OwningBlockBranchMap BranchesOut; + BlockSet BranchesIn; + OwningBlockBranchMap ProcessedBranchesOut; + BlockSet ProcessedBranchesIn; + Shape *Parent; // The shape we are directly inside + int Id; // A unique identifier, defined when added to relooper. Note that this + // uniquely identifies a *logical* block - if we split it, the two + // instances have the same content *and* the same Id + const char *Code; // The string representation of the code in this block. 
+ // Owning pointer (we copy the input) + // FIXME: move from char* to LLVM data structures + const char *BranchVar; // A variable whose value determines where we go; if + // this is not nullptr, emit a switch on that variable + // FIXME: move from char* to LLVM data structures + bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching + // us requires setting the label variable + + Block(const char *CodeInit, const char *BranchVarInit); + ~Block(); + + void AddBranchTo(Block *Target, const char *Condition, + const char *Code = nullptr); +}; + +/// +/// Represents a structured control flow shape +/// +struct Shape { + int Id; // A unique identifier. Used to identify loops, labels are Lx where x + // is the Id. Defined when added to relooper + Shape *Next; // The shape that will appear in the code right after this one + Shape *Natural; // The shape that control flow gets to naturally (if there is + // Next, then this is Next) + + /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.) + enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop }; + +private: + ShapeKind Kind; + +public: + ShapeKind getKind() const { return Kind; } + + Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {} +}; + +/// +/// Simple: No control flow at all, just instructions. +/// +struct SimpleShape : public Shape { + Block *Inner; + + SimpleShape() : Shape(SK_Simple), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Simple; } +}; + +/// +/// A shape that may be implemented with a labeled loop. +/// +struct LabeledShape : public Shape { + bool Labeled; // If we have a loop, whether it needs to be labeled + + LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {} +}; + +// Blocks with the same id were split and are identical, so we just care about +// ids in Multiple entries +typedef std::map<int, Shape *> IdShapeMap; + +/// +/// Multiple: A shape with more than one entry. If the next block to +/// be entered is among them, we run it and continue to +/// the next shape, otherwise we continue immediately to the +/// next shape. +/// +struct MultipleShape : public LabeledShape { + IdShapeMap InnerMap; // entry block ID -> shape + int Breaks; // If we have branches on us, we need a loop (or a switch). This + // is a counter of requirements, + // if we optimize it to 0, the loop is unneeded + bool UseSwitch; // Whether to switch on label as opposed to an if-else chain + + MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; } +}; + +/// +/// Loop: An infinite loop. 
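+/// Exits occur only via Break branches that reference this shape (see
+/// MakeLoop in Relooper.cpp); the body never falls off the end of the loop.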
+/// +struct LoopShape : public LabeledShape { + Shape *Inner; + + LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Loop; } +}; + +} // namespace Relooper + +} // namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h index 3ff19d4..e972da5 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -23,8 +23,22 @@ namespace llvm { class WebAssemblyTargetMachine; class FunctionPass; +FunctionPass *createWebAssemblyOptimizeReturned(); + FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, CodeGenOpt::Level OptLevel); +FunctionPass *createWebAssemblyArgumentMove(); + +FunctionPass *createWebAssemblyStoreResults(); +FunctionPass *createWebAssemblyRegStackify(); +FunctionPass *createWebAssemblyRegColoring(); +FunctionPass *createWebAssemblyPEI(); +FunctionPass *createWebAssemblyCFGStackify(); +FunctionPass *createWebAssemblyLowerBrUnless(); +FunctionPass *createWebAssemblyRegNumbering(); +FunctionPass *createWebAssemblyPeephole(); + +FunctionPass *createWebAssemblyRelooper(); } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td index a123bf6..551ad93 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is a target description file for the WebAssembly architecture, which is -// also known as "wasm". -// +/// +/// \file +/// \brief This is a target description file for the WebAssembly architecture, +/// which is also known as "wasm". +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -50,6 +51,9 @@ def WebAssemblyInstrInfo : InstrInfo; // Minimal Viable Product. def : ProcessorModel<"mvp", NoSchedModel, []>; +// Generic processor: latest stable version. +def : ProcessorModel<"generic", NoSchedModel, []>; + // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp new file mode 100644 index 0000000..3893c40 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp @@ -0,0 +1,110 @@ +//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling. +/// +/// Arguments are really live-in registers, however, since we use virtual +/// registers and LLVM doesn't support live-in virtual registers, we're +/// currently making do with ARGUMENT instructions which are placed at the top +/// of the entry block. The trick is to get them to *stay* at the top of the +/// entry block. 
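+///
+/// As an illustrative sketch (hypothetical MIR, register names invented),
+/// the scheduler may leave the entry block looking like
+/// \code
+///   %v0 = ARGUMENT_I32 0
+///   %v2 = CONST_I32 1
+///   %v1 = ARGUMENT_I32 1
+/// \endcode
+/// and this pass moves the trailing ARGUMENT_I32 up, so both ARGUMENTs
+/// again precede the first non-ARGUMENT instruction.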
+/// +/// The ARGUMENTS physical register keeps these instructions pinned in place +/// during liveness-aware CodeGen passes, however one thing which does not +/// respect this is the ScheduleDAG scheduler. This pass is therefore run +/// immediately after that. +/// +/// This is all hopefully a temporary solution until we find a better solution +/// for describing the live-in nature of arguments. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-argument-move" + +namespace { +class WebAssemblyArgumentMove final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyArgumentMove() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Argument Move"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // end anonymous namespace + +char WebAssemblyArgumentMove::ID = 0; +FunctionPass *llvm::createWebAssemblyArgumentMove() { + return new WebAssemblyArgumentMove(); +} + +/// Test whether the given instruction is an ARGUMENT. +static bool IsArgument(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + return true; + default: + return false; + } +} + +bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Argument Move **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + bool Changed = false; + MachineBasicBlock &EntryMBB = MF.front(); + MachineBasicBlock::iterator InsertPt = EntryMBB.end(); + + // Look for the first NonArg instruction. + for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) { + MachineInstr *MI = MII; + if (!IsArgument(MI)) { + InsertPt = MII; + break; + } + } + + // Now move any argument instructions later in the block + // to before our first NonArg instruction. + for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) { + MachineInstr *MI = I; + if (IsArgument(MI)) { + EntryMBB.insert(InsertPt, MI->removeFromParent()); + Changed = true; + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp new file mode 100644 index 0000000..45ac99d --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -0,0 +1,294 @@ +//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains a printer that converts from our internal +/// representation of machine-dependent LLVM code to the WebAssembly assembly +/// language. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblyRegisterInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class WebAssemblyAsmPrinter final : public AsmPrinter { + const MachineRegisterInfo *MRI; + const WebAssemblyFunctionInfo *MFI; + +public: + WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {} + +private: + const char *getPassName() const override { + return "WebAssembly Assembly Printer"; + } + + //===------------------------------------------------------------------===// + // MachineFunctionPass Implementation. + //===------------------------------------------------------------------===// + + bool runOnMachineFunction(MachineFunction &MF) override { + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<WebAssemblyFunctionInfo>(); + return AsmPrinter::runOnMachineFunction(MF); + } + + //===------------------------------------------------------------------===// + // AsmPrinter Implementation. + //===------------------------------------------------------------------===// + + void EmitJumpTableInfo() override; + void EmitConstantPool() override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void EmitInstruction(const MachineInstr *MI) override; + const MCExpr *lowerConstant(const Constant *CV) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + + MVT getRegType(unsigned RegNo) const; + const char *toString(MVT VT) const; + std::string regToString(const MachineOperand &MO); + WebAssemblyTargetStreamer *getTargetStreamer(); +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Helpers. +//===----------------------------------------------------------------------===// + +MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterClass *TRC = + TargetRegisterInfo::isVirtualRegister(RegNo) + ? 
MRI->getRegClass(RegNo) + : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); + for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + if (TRC->hasType(T)) + return T; + DEBUG(errs() << "Unknown type for register number: " << RegNo); + llvm_unreachable("Unknown register type"); + return MVT::Other; +} + +const char *WebAssemblyAsmPrinter::toString(MVT VT) const { + return WebAssembly::TypeToString(VT); +} + +std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { + unsigned RegNo = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(RegNo) && + "Unlowered physical register encountered during assembly printing"); + assert(!MFI->isVRegStackified(RegNo)); + unsigned WAReg = MFI->getWAReg(RegNo); + assert(WAReg != WebAssemblyFunctionInfo::UnusedReg); + return '$' + utostr(WAReg); +} + +WebAssemblyTargetStreamer * +WebAssemblyAsmPrinter::getTargetStreamer() { + MCTargetStreamer *TS = OutStreamer->getTargetStreamer(); + return static_cast<WebAssemblyTargetStreamer *>(TS); +} + +//===----------------------------------------------------------------------===// +// WebAssemblyAsmPrinter Implementation. +//===----------------------------------------------------------------------===// + +void WebAssemblyAsmPrinter::EmitConstantPool() { + assert(MF->getConstantPool()->getConstants().empty() && + "WebAssembly disables constant pools"); +} + +void WebAssemblyAsmPrinter::EmitJumpTableInfo() { + // Nothing to do; jump tables are incorporated into the instruction stream. +} + +static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, + Type *Ty, SmallVectorImpl<MVT> &ValueVTs) { + const DataLayout &DL(F.getParent()->getDataLayout()); + const WebAssemblyTargetLowering &TLI = + *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering(); + SmallVector<EVT, 4> VTs; + ComputeValueVTs(TLI, DL, Ty, VTs); + + for (EVT VT : VTs) { + unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT); + MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT); + for (unsigned i = 0; i != NumRegs; ++i) + ValueVTs.push_back(RegisterVT); + } +} + +void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { + if (!MFI->getParams().empty()) + getTargetStreamer()->emitParam(MFI->getParams()); + + SmallVector<MVT, 4> ResultVTs; + const Function &F(*MF->getFunction()); + ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs); + + // If the return type needs to be legalized it will get converted into + // passing a pointer. + if (ResultVTs.size() == 1) + getTargetStreamer()->emitResult(ResultVTs); + + bool AnyWARegs = false; + SmallVector<MVT, 16> LocalTypes; + for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned WAReg = MFI->getWAReg(VReg); + // Don't declare unused registers. + if (WAReg == WebAssemblyFunctionInfo::UnusedReg) + continue; + // Don't redeclare parameters. + if (WAReg < MFI->getParams().size()) + continue; + // Don't declare stackified registers. 
+ if (int(WAReg) < 0) + continue; + LocalTypes.push_back(getRegType(VReg)); + AnyWARegs = true; + } + auto &PhysRegs = MFI->getPhysRegs(); + for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) { + if (PhysRegs[PReg] == -1U) + continue; + LocalTypes.push_back(getRegType(PReg)); + AnyWARegs = true; + } + if (AnyWARegs) + getTargetStreamer()->emitLocal(LocalTypes); + + AsmPrinter::EmitFunctionBodyStart(); +} + +void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() { + getTargetStreamer()->emitEndFunc(); +} + +void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { + DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); + + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + // These represent values which are live into the function entry, so there's + // no instruction to emit. + break; + default: { + WebAssemblyMCInstLower MCInstLowering(OutContext, *this); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + break; + } + } +} + +const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + if (GV->getValueType()->isFunctionTy()) + return MCSymbolRefExpr::create( + getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); + return AsmPrinter::lowerConstant(CV); +} + +bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS)) + return false; + + if (!ExtraCode) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + OS << MO.getImm(); + return false; + case MachineOperand::MO_Register: + OS << regToString(MO); + return false; + case MachineOperand::MO_GlobalAddress: + getSymbol(MO.getGlobal())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_ExternalSymbol: + GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_MachineBasicBlock: + MO.getMBB()->getSymbol()->print(OS, MAI); + return false; + default: + break; + } + } + + return true; +} + +bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + if (!ExtraCode) { + // TODO: For now, we just hard-code 0 as the constant offset; teach + // SelectInlineAsmMemoryOperand how to do address mode matching. + OS << "0(" + regToString(MI->getOperand(OpNo)) + ')'; + return false; + } + + return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS); +} + +// Force static initialization. 
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() { + RegisterAsmPrinter<WebAssemblyAsmPrinter> X(TheWebAssemblyTarget32); + RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(TheWebAssemblyTarget64); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp new file mode 100644 index 0000000..a39349c --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -0,0 +1,493 @@ +//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a CFG stacking pass. +/// +/// This pass reorders the blocks in a function to put them into a reverse +/// post-order [0], with special care to keep the order as similar as possible +/// to the original order, and to keep loops contiguous even in the case of +/// split backedges. +/// +/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since +/// scope boundaries serve as the labels for WebAssembly's control transfers. +/// +/// This is sufficient to convert arbitrary CFGs into a form that works on +/// WebAssembly, provided that all loops are single-entry. +/// +/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-cfg-stackify" + +namespace { +class WebAssemblyCFGStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly CFG Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyCFGStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyCFGStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyCFGStackify() { + return new WebAssemblyCFGStackify(); +} + +static void EliminateMultipleEntryLoops(MachineFunction &MF, + const MachineLoopInfo &MLI) { + SmallPtrSet<MachineBasicBlock *, 8> InSet; + for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF); + I != E; ++I) { + const std::vector<MachineBasicBlock *> &CurrentSCC = *I; + + // Skip trivial SCCs. 
+ if (CurrentSCC.size() == 1) + continue; + + InSet.insert(CurrentSCC.begin(), CurrentSCC.end()); + MachineBasicBlock *Header = nullptr; + for (MachineBasicBlock *MBB : CurrentSCC) { + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (InSet.count(Pred)) + continue; + if (!Header) { + Header = MBB; + break; + } + // TODO: Implement multiple-entry loops. + report_fatal_error("multiple-entry loops are not supported yet"); + } + } + assert(MLI.isLoopHeader(Header)); + + InSet.clear(); + } +} + +namespace { +/// Post-order traversal stack entry. +struct POStackEntry { + MachineBasicBlock *MBB; + SmallVector<MachineBasicBlock *, 0> Succs; + + POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI); +}; +} // end anonymous namespace + +static bool LoopContains(const MachineLoop *Loop, + const MachineBasicBlock *MBB) { + return Loop ? Loop->contains(MBB) : true; +} + +POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI) + : MBB(MBB), Succs(MBB->successors()) { + // RPO is not a unique form, since at every basic block with multiple + // successors, the DFS has to pick which order to visit the successors in. + // Sort them strategically (see below). + MachineLoop *Loop = MLI.getLoopFor(MBB); + MachineFunction::iterator Next = next(MachineFunction::iterator(MBB)); + MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next; + std::stable_sort( + Succs.begin(), Succs.end(), + [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) { + if (A == B) + return false; + + // Keep loops contiguous by preferring the block that's in the same + // loop. + bool LoopContainsA = LoopContains(Loop, A); + bool LoopContainsB = LoopContains(Loop, B); + if (LoopContainsA && !LoopContainsB) + return true; + if (!LoopContainsA && LoopContainsB) + return false; + + // Minimize perturbation by preferring the block which is the immediate + // layout successor. + if (A == LayoutSucc) + return true; + if (B == LayoutSucc) + return false; + + // TODO: More sophisticated orderings may be profitable here. + + return false; + }); +} + +/// Return the "bottom" block of a loop. This differs from +/// MachineLoop::getBottomBlock in that it works even if the loop is +/// discontiguous. +static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) { + MachineBasicBlock *Bottom = Loop->getHeader(); + for (MachineBasicBlock *MBB : Loop->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} + +/// Sort the blocks in RPO, taking special care to make sure that loops are +/// contiguous even in the case of split backedges. +/// +/// TODO: Determine whether RPO is actually worthwhile, or whether we should +/// move to just a stable-topological-sort-based approach that would preserve +/// more of the original order. +static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { + // Note that we do our own RPO rather than using + // "llvm/ADT/PostOrderIterator.h" because we want control over the order that + // successors are visited in (see above). Also, we can sort the blocks in the + // MachineFunction as we go. 
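+  //
+  // A sketch of the scheme below: run an explicit DFS using POStackEntry
+  // frames; when a block's successor list is exhausted, move that block to
+  // the front of the function. Completing blocks in postorder and placing
+  // each at the front yields a reverse post-order overall.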
+ SmallPtrSet<MachineBasicBlock *, 16> Visited; + SmallVector<POStackEntry, 16> Stack; + + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); + + for (;;) { + POStackEntry &Entry = Stack.back(); + SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs; + if (!Succs.empty()) { + MachineBasicBlock *Succ = Succs.pop_back_val(); + if (Visited.insert(Succ).second) + Stack.push_back(POStackEntry(Succ, MF, MLI)); + continue; + } + + // Put the block in its position in the MachineFunction. + MachineBasicBlock &MBB = *Entry.MBB; + MBB.moveBefore(&*MF.begin()); + + // Branch instructions may utilize a fallthrough, so update them if a + // fallthrough has been added or removed. + if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() && + !MBB.back().isBarrier()) + report_fatal_error( + "Non-branch terminator with fallthrough cannot yet be rewritten"); + if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch()) + MBB.updateTerminator(); + + Stack.pop_back(); + if (Stack.empty()) + break; + } + + // Now that we've sorted the blocks in RPO, renumber them. + MF.RenumberBlocks(); + +#ifndef NDEBUG + SmallSetVector<MachineLoop *, 8> OnStack; + + // Insert a sentinel representing the degenerate loop that starts at the + // function entry block and includes the entire function as a "loop" that + // executes once. + OnStack.insert(nullptr); + + for (auto &MBB : MF) { + assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative."); + + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (Loop && &MBB == Loop->getHeader()) { + // Loop header. The loop predecessor should be sorted above, and the other + // predecessors should be backedges below. + for (auto Pred : MBB.predecessors()) + assert( + (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) && + "Loop header predecessors must be loop predecessors or backedges"); + assert(OnStack.insert(Loop) && "Loops should be declared at most once."); + } else { + // Not a loop header. All predecessors should be sorted above. + for (auto Pred : MBB.predecessors()) + assert(Pred->getNumber() < MBB.getNumber() && + "Non-loop-header predecessors should be topologically sorted"); + assert(OnStack.count(MLI.getLoopFor(&MBB)) && + "Blocks must be nested in their loops"); + } + while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back())) + OnStack.pop_back(); + } + assert(OnStack.pop_back_val() == nullptr && + "The function entry block shouldn't actually be a loop header"); + assert(OnStack.empty() && + "Control flow stack pushes and pops should be balanced."); +#endif +} + +/// Test whether Pred has any terminators explicitly branching to MBB, as +/// opposed to falling through. Note that it's possible (eg. in unoptimized +/// code) for a branch instruction to both branch to a block and fallthrough +/// to it, so we check the actual branch operands to see if there are any +/// explicit mentions. +static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, + MachineBasicBlock *MBB) { + for (MachineInstr &MI : Pred->terminators()) + for (MachineOperand &MO : MI.explicit_operands()) + if (MO.isMBB() && MO.getMBB() == MBB) + return true; + return false; +} + +/// Insert a BLOCK marker for branches to MBB (if needed). 
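+/// An illustrative sketch (block numbers are hypothetical): if BB0 branches
+/// forward to BB2 over BB1, a BLOCK is inserted in BB0 and an END_BLOCK at
+/// the top of BB2, so the branch in BB0 can later be rewritten to target the
+/// end of that scope.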
+static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + const WebAssemblyInstrInfo &TII, + const MachineLoopInfo &MLI, + MachineDominatorTree &MDT) { + // First compute the nearest common dominator of all forward non-fallthrough + // predecessors so that we minimize the time that the BLOCK is on the stack, + // which reduces overall stack height. + MachineBasicBlock *Header = nullptr; + bool IsBranchedTo = false; + int MBBNumber = MBB.getNumber(); + for (MachineBasicBlock *Pred : MBB.predecessors()) + if (Pred->getNumber() < MBBNumber) { + Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; + if (ExplicitlyBranchesTo(Pred, &MBB)) + IsBranchedTo = true; + } + if (!Header) + return; + if (!IsBranchedTo) + return; + + assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors"); + MachineBasicBlock *LayoutPred = &*prev(MachineFunction::iterator(&MBB)); + + // If the nearest common dominator is inside a more deeply nested context, + // walk out to the nearest scope which isn't more deeply nested. + for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) { + if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { + if (ScopeTop->getNumber() > Header->getNumber()) { + // Skip over an intervening scope. + I = next(MachineFunction::iterator(ScopeTop)); + } else { + // We found a scope level at an appropriate depth. + Header = ScopeTop; + break; + } + } + } + + // If there's a loop which ends just before MBB which contains Header, we can + // reuse its label instead of inserting a new BLOCK. + for (MachineLoop *Loop = MLI.getLoopFor(LayoutPred); + Loop && Loop->contains(LayoutPred); Loop = Loop->getParentLoop()) + if (Loop && LoopBottom(Loop) == LayoutPred && Loop->contains(Header)) + return; + + // Decide where in Header to put the BLOCK. + MachineBasicBlock::iterator InsertPos; + MachineLoop *HeaderLoop = MLI.getLoopFor(Header); + if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) { + // Header is the header of a loop that does not lexically contain MBB, so + // the BLOCK needs to be above the LOOP. + InsertPos = Header->begin(); + } else { + // Otherwise, insert the BLOCK as late in Header as we can, but before the + // beginning of the local expression tree and any nested BLOCKs. + InsertPos = Header->getFirstTerminator(); + while (InsertPos != Header->begin() && + prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) && + prev(InsertPos)->getOpcode() != WebAssembly::LOOP && + prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK && + prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP) + --InsertPos; + } + + // Add the BLOCK. + BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)); + + // Mark the end of the block. + InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::END_BLOCK)); + + // Track the farthest-spanning scope that ends at this point. + int Number = MBB.getNumber(); + if (!ScopeTops[Number] || + ScopeTops[Number]->getNumber() > Header->getNumber()) + ScopeTops[Number] = Header; +} + +/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header). 
+static void PlaceLoopMarker( + MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + DenseMap<const MachineInstr *, const MachineBasicBlock *> &LoopTops, + const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) { + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (!Loop || Loop->getHeader() != &MBB) + return; + + // The operand of a LOOP is the first block after the loop. If the loop is the + // bottom of the function, insert a dummy block at the end. + MachineBasicBlock *Bottom = LoopBottom(Loop); + auto Iter = next(MachineFunction::iterator(Bottom)); + if (Iter == MF.end()) { + MachineBasicBlock *Label = MF.CreateMachineBasicBlock(); + // Give it a fake predecessor so that AsmPrinter prints its label. + Label->addSuccessor(Label); + MF.push_back(Label); + Iter = next(MachineFunction::iterator(Bottom)); + } + MachineBasicBlock *AfterLoop = &*Iter; + + // Mark the beginning of the loop (after the end of any existing loop that + // ends here). + auto InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::LOOP)); + + // Mark the end of the loop. + MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(), + TII.get(WebAssembly::END_LOOP)); + LoopTops[End] = &MBB; + + assert((!ScopeTops[AfterLoop->getNumber()] || + ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) && + "With RPO we should visit the outer-most loop for a block first."); + if (!ScopeTops[AfterLoop->getNumber()]) + ScopeTops[AfterLoop->getNumber()] = &MBB; +} + +static unsigned +GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack, + const MachineBasicBlock *MBB) { + unsigned Depth = 0; + for (auto X : reverse(Stack)) { + if (X == MBB) + break; + ++Depth; + } + assert(Depth < Stack.size() && "Branch destination should be in scope"); + return Depth; +} + +/// Insert LOOP and BLOCK markers at appropriate places. +static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, + const WebAssemblyInstrInfo &TII, + MachineDominatorTree &MDT) { + // For each block whose label represents the end of a scope, record the block + // which holds the beginning of the scope. This will allow us to quickly skip + // over scoped regions when walking blocks. We allocate one more than the + // number of blocks in the function to accommodate for the possible fake block + // we may insert at the end. + SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1); + + // For eacn LOOP_END, the corresponding LOOP. + DenseMap<const MachineInstr *, const MachineBasicBlock *> LoopTops; + + for (auto &MBB : MF) { + // Place the LOOP for MBB if MBB is the header of a loop. + PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI); + + // Place the BLOCK for MBB if MBB is branched to from above. + PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT); + } + + // Now rewrite references to basic blocks to be depth immediates. 
+ SmallVector<const MachineBasicBlock *, 8> Stack; + for (auto &MBB : reverse(MF)) { + for (auto &MI : reverse(MBB)) { + switch (MI.getOpcode()) { + case WebAssembly::BLOCK: + assert(ScopeTops[Stack.back()->getNumber()] == &MBB && + "Block should be balanced"); + Stack.pop_back(); + break; + case WebAssembly::LOOP: + assert(Stack.back() == &MBB && "Loop top should be balanced"); + Stack.pop_back(); + Stack.pop_back(); + break; + case WebAssembly::END_BLOCK: + Stack.push_back(&MBB); + break; + case WebAssembly::END_LOOP: + Stack.push_back(&MBB); + Stack.push_back(LoopTops[&MI]); + break; + default: + if (MI.isTerminator()) { + // Rewrite MBB operands to be depth immediates. + SmallVector<MachineOperand, 4> Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.RemoveOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) + MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB())); + MI.addOperand(MF, MO); + } + } + break; + } + } + } + assert(Stack.empty() && "Control flow should be balanced"); +} + +bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** CFG Stackifying **********\n" + "********** Function: " + << MF.getName() << '\n'); + + const auto &MLI = getAnalysis<MachineLoopInfo>(); + auto &MDT = getAnalysis<MachineDominatorTree>(); + // Liveness is not tracked for EXPR_STACK physreg. + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MF.getRegInfo().invalidateLiveness(); + + // RPO sorting needs all loops to be single-entry. + EliminateMultipleEntryLoops(MF, MLI); + + // Sort the blocks in RPO, with contiguous loops. + SortBlocks(MF, MLI); + + // Place the BLOCK and LOOP markers to indicate the beginnings of scopes. + PlaceMarkers(MF, MLI, TII, MDT); + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp new file mode 100644 index 0000000..1b761b1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -0,0 +1,81 @@ +//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the WebAssembly-specific support for the FastISel +/// class. Some of the target-specific code is generated by tablegen in the file +/// WebAssemblyGenFastISel.inc, which is #included here. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "WebAssemblyTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-fastisel" + +namespace { + +class WebAssemblyFastISel final : public FastISel { + /// Keep a pointer to the WebAssemblySubtarget around so that we can make the + /// right decision when generating code for different targets. + const WebAssemblySubtarget *Subtarget; + LLVMContext *Context; + + // Call handling routines. +private: +public: + // Backend specific FastISel code. + WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { + Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>(); + Context = &FuncInfo.Fn->getContext(); + } + + bool fastSelectInstruction(const Instruction *I) override; + +#include "WebAssemblyGenFastISel.inc" +}; + +} // end anonymous namespace + +bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + // TODO: add fast-isel selection cases here... + } + + // Fall back to target-independent instruction selection. + return selectOperator(I, I->getOpcode()); +} + +FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + return new WebAssemblyFastISel(FuncInfo, LibInfo); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index e4ca82e..0eefd57 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -35,11 +35,20 @@ using namespace llvm; #define DEBUG_TYPE "wasm-frame-info" // TODO: Implement a red zone? +// TODO: wasm64 +// TODO: Prolog/epilog should be stackified too. This pass runs after register +// stackification, so we'll have to do it manually. +// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions /// Return true if the specified function should have a dedicated frame pointer /// register. 
bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const { - llvm_unreachable("TODO: implement hasFP"); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const auto *RegInfo = + MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// Under normal circumstances, when a frame pointer is not required, we reserve @@ -52,23 +61,115 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame( return !MF.getFrameInfo()->hasVarSizedObjects(); } + +/// Adjust the stack pointer by a constant amount. +static void adjustStackPointer(unsigned StackSize, + bool AdjustUp, + MachineFunction& MF, + MachineBasicBlock& MBB, + const TargetInstrInfo* TII, + MachineBasicBlock::iterator InsertPt, + const DebugLoc& DL) { + auto &MRI = MF.getRegInfo(); + unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg) + .addExternalSymbol(SPSymbol); + // This MachinePointerInfo should reference __stack_pointer as well but + // doesn't because MachinePointerInfo() takes a GV which we don't have for + // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry + // is appropriate instead. (likewise for EmitEpologue below) + auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOLoad, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg) + .addImm(0) + .addReg(SPReg) + .addMemOperand(LoadMMO); + // Add/Subtract the frame size + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + BuildMI(MBB, InsertPt, DL, + TII->get(AdjustUp ? WebAssembly::ADD_I32 : WebAssembly::SUB_I32), + WebAssembly::SP32) + .addReg(SPReg) + .addReg(OffsetReg); + // The SP32 register now has the new stacktop. Also write it back to memory. 
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); +} + void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - llvm_unreachable("TODO: implement eliminateCallFramePseudoInstr"); + const auto *TII = + static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + unsigned Amount = I->getOperand(0).getImm(); + if (Amount) + adjustStackPointer(Amount, IsDestroy, MF, MBB, + TII, I, DL); + MBB.erase(I); } void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitPrologue"); + // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions + auto *MFI = MF.getFrameInfo(); + assert(MFI->getCalleeSavedInfo().empty() && + "WebAssembly should not have callee-saved registers"); + assert(!hasFP(MF) && "Functions needing frame pointers not yet supported"); + uint64_t StackSize = MFI->getStackSize(); + if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0)) + return; + + const auto *TII = MF.getSubtarget().getInstrInfo(); + + auto InsertPt = MBB.begin(); + DebugLoc DL; + + adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL); } void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitEpilogue"); -} + uint64_t StackSize = MF.getFrameInfo()->getStackSize(); + if (!StackSize) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto InsertPt = MBB.getFirstTerminator(); + DebugLoc DL; + + if (InsertPt != MBB.end()) { + DL = InsertPt->getDebugLoc(); + } -void WebAssemblyFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - llvm_unreachable("TODO: implement processFunctionBeforeCalleeSavedScan"); + // Restore the stack pointer. 
Without FP its value is just SP32 - stacksize + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + // Re-use OffsetReg to hold the address of the stacktop + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index 0b112d0..5f4708f 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -38,9 +38,6 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def new file mode 100644 index 0000000..3a03fa5 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -0,0 +1,25 @@ +//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file describes the various WebAssembly ISD node types. +/// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +HANDLE_NODETYPE(CALL1) +HANDLE_NODETYPE(CALL0) +HANDLE_NODETYPE(RETURN) +HANDLE_NODETYPE(ARGUMENT) +HANDLE_NODETYPE(Wrapper) +HANDLE_NODETYPE(BR_IF) +HANDLE_NODETYPE(TABLESWITCH) + +// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 518ef33..8390f79 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -56,13 +56,68 @@ public: SDNode *Select(SDNode *Node) override; + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + +// Include the pieces autogenerated from the target description. +#include "WebAssemblyGenDAGISel.inc" + private: // add select functions here... }; } // end anonymous namespace SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) { - llvm_unreachable("TODO: implement Select"); + // Dump information about the Node being selected. + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + + // If we have a custom node, we already have selected! 
+ if (Node->isMachineOpcode()) { + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); + Node->setNodeId(-1); + return nullptr; + } + + // Few custom selection stuff. + SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); + + switch (Node->getOpcode()) { + default: + break; + // If we need WebAssembly-specific selection, it would go here. + (void)VT; + } + + // Select the default instruction. + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { + switch (ConstraintID) { + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + // We just support simple memory operands that just have a single address + // operand and need no special handling. + OutOps.push_back(Op); + return false; + default: + break; + } + + return true; } /// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4184eb6..e9933b0 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -17,10 +17,13 @@ #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyTargetMachine.h" -#include "WebAssemblyTargetObjectFile.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" @@ -32,14 +35,254 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +namespace { +// Diagnostic information for unimplemented or unsupported feature reporting. +// TODO: This code is copied from BPF and AMDGPU; consider factoring it out +// and sharing code. +class DiagnosticInfoUnsupported final : public DiagnosticInfo { +private: + // Debug location where this diagnostic is triggered. 
+ DebugLoc DLoc; + const Twine &Description; + const Function &Fn; + SDValue Value; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc, + SDValue Value) + : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()), + Description(Desc), Fn(Fn), Value(Value) {} + + void print(DiagnosticPrinter &DP) const override { + std::string Str; + raw_string_ostream OS(Str); + + if (DLoc) { + auto DIL = DLoc.get(); + StringRef Filename = DIL->getFilename(); + unsigned Line = DIL->getLine(); + unsigned Column = DIL->getColumn(); + OS << Filename << ':' << Line << ':' << Column << ' '; + } + + OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n' + << Description; + if (Value) + Value->print(OS); + OS << '\n'; + OS.flush(); + DP << Str; + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} // end anonymous namespace + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32; + + // Booleans always contain 0 or 1. + setBooleanContents(ZeroOrOneBooleanContent); // WebAssembly does not produce floating-point exceptions on normal floating // point operations. setHasFloatingPointExceptions(false); // We don't know the microarchitecture here, so just reduce register pressure. setSchedulingPreference(Sched::RegPressure); + // Tell ISel that we have a stack pointer. + setStackPointerRegisterToSaveRestore( + Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32); + // Set up the register classes. + addRegisterClass(MVT::i32, &WebAssembly::I32RegClass); + addRegisterClass(MVT::i64, &WebAssembly::I64RegClass); + addRegisterClass(MVT::f32, &WebAssembly::F32RegClass); + addRegisterClass(MVT::f64, &WebAssembly::F64RegClass); + // Compute derived properties from the register classes. + computeRegisterProperties(Subtarget->getRegisterInfo()); + + setOperationAction(ISD::GlobalAddress, MVTPtr, Custom); + setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom); + setOperationAction(ISD::JumpTable, MVTPtr, Custom); + + // Take the default expansion for va_arg, va_copy, and va_end. There is no + // default action for va_start, so we do that custom. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + for (auto T : {MVT::f32, MVT::f64}) { + // Don't expand the floating-point types to constant pools. + setOperationAction(ISD::ConstantFP, T, Legal); + // Expand floating-point comparisons. + for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE, + ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE}) + setCondCodeAction(CC, T, Expand); + // Expand floating-point library function operators. + for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW, + ISD::FREM, ISD::FMA}) + setOperationAction(Op, T, Expand); + // Note supported floating-point library function operators that otherwise + // default to expand. 
+ for (auto Op : + {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT}) + setOperationAction(Op, T, Legal); + // Support minnan and maxnan, which otherwise default to expand. + setOperationAction(ISD::FMINNAN, T, Legal); + setOperationAction(ISD::FMAXNAN, T, Legal); + } + + for (auto T : {MVT::i32, MVT::i64}) { + // Expand unavailable integer operations. + for (auto Op : + {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, + ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, + ISD::SUBE}) { + setOperationAction(Op, T, Expand); + } + } + + // As a special case, these operators use the type to mean the type to + // sign-extend from. + for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32}) + setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); + + // Dynamic stack allocation: use the default expansion. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand); + + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // Expand these forms; we pattern-match the forms that we can handle in isel. + for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + for (auto Op : {ISD::BR_CC, ISD::SELECT_CC}) + setOperationAction(Op, T, Expand); + + // We have custom switch handling. + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + // WebAssembly doesn't have: + // - Floating-point extending loads. + // - Floating-point truncating stores. + // - i1 extending loads. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + for (auto T : MVT::integer_valuetypes()) + for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) + setLoadExtAction(Ext, T, MVT::i1, Promote); + + // Trap lowers to wasm unreachable + setOperationAction(ISD::TRAP, MVT::Other, Legal); +} + +FastISel *WebAssemblyTargetLowering::createFastISel( + FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { + return WebAssembly::createFastISel(FuncInfo, LibInfo); +} + +bool WebAssemblyTargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode * /*GA*/) const { + // All offsets can be folded. + return true; +} + +MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, + EVT VT) const { + unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1); + if (BitWidth > 1 && BitWidth < 8) + BitWidth = 8; + + if (BitWidth > 64) { + BitWidth = 64; + assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) && + "64-bit shift counts ought to be enough for anyone"); + } + + MVT Result = MVT::getIntegerVT(BitWidth); + assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE && + "Unable to represent scalar shift amount type"); + return Result; +} + +const char * +WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { + case WebAssemblyISD::FIRST_NUMBER: + break; +#define HANDLE_NODETYPE(NODE) \ + case WebAssemblyISD::NODE: \ + return "WebAssemblyISD::" #NODE; +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE + } + return nullptr; +} + +std::pair<unsigned, const TargetRegisterClass *> +WebAssemblyTargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + // First, see if this is a constraint that directly corresponds to a + // WebAssembly register class. 
+ if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + assert(VT != MVT::iPTR && "Pointer MVT not expected here"); + if (VT.isInteger() && !VT.isVector()) { + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &WebAssembly::I32RegClass); + if (VT.getSizeInBits() <= 64) + return std::make_pair(0U, &WebAssembly::I64RegClass); + } + break; + default: + break; + } + } + + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const { + // Assume ctz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const { + // Assume clz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, + Type *Ty, + unsigned AS) const { + // WebAssembly offsets are added as unsigned without wrapping. The + // isLegalAddressingMode gives us no way to determine if wrapping could be + // happening, so we approximate this by accepting only non-negative offsets. + if (AM.BaseOffs < 0) + return false; + + // WebAssembly has no scale register operands. + if (AM.Scale != 0) + return false; + + // Everything else is legal. + return true; } //===----------------------------------------------------------------------===// @@ -50,16 +293,367 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Lowering Code //===----------------------------------------------------------------------===// +static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue())); +} + +// Test whether the given calling convention is supported. +static bool CallingConvSupported(CallingConv::ID CallConv) { + // We currently support the language-independent target-independent + // conventions. We don't yet have a way to annotate calls with properties like + // "cold", and we don't have any call-clobbered registers, so these are mostly + // all handled the same. + return CallConv == CallingConv::C || CallConv == CallingConv::Fast || + CallConv == CallingConv::Cold || + CallConv == CallingConv::PreserveMost || + CallConv == CallingConv::PreserveAll || + CallConv == CallingConv::CXX_FAST_TLS; +} + +SDValue +WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc DL = CLI.DL; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + MachineFunction &MF = DAG.getMachineFunction(); + + CallingConv::ID CallConv = CLI.CallConv; + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, + "WebAssembly doesn't support language-specific or target-specific " + "calling conventions yet"); + if (CLI.IsPatchPoint) + fail(DL, DAG, "WebAssembly doesn't support patch point yet"); + + // WebAssembly doesn't currently support explicit tail calls. If they are + // required, fail. Otherwise, just disable them. 
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall && + MF.getTarget().Options.GuaranteedTailCallOpt) || + (CLI.CS && CLI.CS->isMustTailCall())) + fail(DL, DAG, "WebAssembly doesn't support tail call yet"); + CLI.IsTailCall = false; + + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + if (Ins.size() > 1) + fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet"); + + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + for (const ISD::OutputArg &Out : Outs) { + if (Out.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (Out.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + } + + bool IsVarArg = CLI.IsVarArg; + unsigned NumFixedArgs = CLI.NumFixedArgs; + auto PtrVT = getPointerTy(MF.getDataLayout()); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + // Outgoing non-fixed arguments are placed at the top of the stack. First + // compute their offsets and the total amount of argument stack space + // needed. + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + EVT VT = Arg.getValueType(); + assert(VT != MVT::iPTR && "Legalized args should be concrete"); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + unsigned Offset = + CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty), + MF.getDataLayout().getABITypeAlignment(Ty)); + CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(), + Offset, VT.getSimpleVT(), + CCValAssign::Full)); + } + } + + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + + SDValue NB; + if (NumBytes) { + NB = DAG.getConstant(NumBytes, DL, PtrVT, true); + Chain = DAG.getCALLSEQ_START(Chain, NB, DL); + } + + if (IsVarArg) { + // For non-fixed arguments, next emit stores to store the argument values + // to the stack at the offsets computed above. + SDValue SP = DAG.getCopyFromReg( + Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT); + unsigned ValNo = 0; + SmallVector<SDValue, 8> Chains; + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + assert(ArgLocs[ValNo].getValNo() == ValNo && + "ArgLocs should remain in order and only hold varargs args"); + unsigned Offset = ArgLocs[ValNo++].getLocMemOffset(); + SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP, + DAG.getConstant(Offset, DL, PtrVT)); + Chains.push_back(DAG.getStore(Chain, DL, Arg, Add, + MachinePointerInfo::getStack(MF, Offset), + false, false, 0)); + } + if (!Chains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + // Compute the operands for the CALLn node. + SmallVector<SDValue, 16> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs + // isn't reliable. + Ops.append(OutVals.begin(), + IsVarArg ? 
OutVals.begin() + NumFixedArgs : OutVals.end()); + + SmallVector<EVT, 8> Tys; + for (const auto &In : Ins) { + assert(!In.Flags.isByVal() && "byval is not valid for return values"); + assert(!In.Flags.isNest() && "nest is not valid for return values"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, + "WebAssembly hasn't implemented cons regs last return values"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + Tys.push_back(In.VT); + } + Tys.push_back(MVT::Other); + SDVTList TyList = DAG.getVTList(Tys); + SDValue Res = + DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1, + DL, TyList, Ops); + if (Ins.empty()) { + Chain = Res; + } else { + InVals.push_back(Res); + Chain = Res.getValue(1); + } + + if (NumBytes) { + SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT); + Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL); + } + + return Chain; +} + +bool WebAssemblyTargetLowering::CanLowerReturn( + CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext & /*Context*/) const { + // WebAssembly can't currently handle returning tuples. + return Outs.size() <= 1; +} + +SDValue WebAssemblyTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + SelectionDAG &DAG) const { + assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + SmallVector<SDValue, 4> RetOps(1, Chain); + RetOps.append(OutVals.begin(), OutVals.end()); + Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps); + + // Record the number and types of the return values. + for (const ISD::OutputArg &Out : Outs) { + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); + assert(Out.IsFixed && "non-fixed return value is not valid"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); + } + + return Chain; +} + +SDValue WebAssemblyTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + // Set up the incoming ARGUMENTS value, which serves to represent the liveness + // of the incoming values before they're represented by virtual registers. 
+ MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS); + + for (const ISD::InputArg &In : Ins) { + if (In.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (In.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + InVals.push_back( + In.Used + ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT, + DAG.getTargetConstant(InVals.size(), DL, MVT::i32)) + : DAG.getUNDEF(In.VT)); + + // Record the number and types of arguments. + MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT); + } + + // Incoming varargs arguments are on the stack and will be accessed through + // va_arg, so we don't need to do anything for them here. + + return Chain; +} + //===----------------------------------------------------------------------===// -// Other Lowering Code +// Custom lowering hooks. //===----------------------------------------------------------------------===// +SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operation lowering"); + return SDValue(); + case ISD::FrameIndex: + return LowerFrameIndex(Op, DAG); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::BR_JT: + return LowerBR_JT(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + } +} + +SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + int FI = cast<FrameIndexSDNode>(Op)->getIndex(); + return DAG.getTargetFrameIndex(FI, Op.getValueType()); +} + +SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *GA = cast<GlobalAddressSDNode>(Op); + EVT VT = Op.getValueType(); + assert(GA->getTargetFlags() == 0 && + "Unexpected target flags on generic GlobalAddressSDNode"); + if (GA->getAddressSpace() != 0) + fail(DL, DAG, "WebAssembly only expects the 0 address space"); + return DAG.getNode( + WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset())); +} + +SDValue +WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *ES = cast<ExternalSymbolSDNode>(Op); + EVT VT = Op.getValueType(); + assert(ES->getTargetFlags() == 0 && + "Unexpected target flags on generic ExternalSymbolSDNode"); + // Set the TargetFlags to 0x1 which indicates that this is a "function" + // symbol rather than a data symbol. We do this unconditionally even though + // we don't know anything about the symbol other than its name, because all + // external symbols used in target-independent SelectionDAG code are for + // functions. 
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetExternalSymbol(ES->getSymbol(), VT, + /*TargetFlags=*/0x1)); +} + +SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // There's no need for a Wrapper node because we always incorporate a jump + // table operand into a TABLESWITCH instruction, rather than ever + // materializing it in a register. + const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(), + JT->getTargetFlags()); +} + +SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1)); + SDValue Index = Op.getOperand(2); + assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Index); + + MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); + const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs; + + // TODO: For now, we just pick something arbitrary for a default case for now. + // We really want to sniff out the guard and put in the real default case (and + // delete the guard). + Ops.push_back(DAG.getBasicBlock(MBBs[0])); + + // Add an operand for each case. + for (auto MBB : MBBs) + Ops.push_back(DAG.getBasicBlock(MBB)); + + return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops); +} + +SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout()); + + // The incoming non-fixed arguments are placed on the top of the stack, with + // natural alignment, at the point of the call, so the base pointer is just + // the current frame pointer. + DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); + unsigned FP = + Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + //===----------------------------------------------------------------------===// // WebAssembly Optimization Hooks //===----------------------------------------------------------------------===// - -MCSection *WebAssemblyTargetObjectFile::SelectSectionForGlobal( - const GlobalValue *GV, SectionKind Kind, Mangler &Mang, - const TargetMachine &TM) const { - return getDataSection(); -} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index efd60a7..e7232a0 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -22,10 +22,11 @@ namespace llvm { namespace WebAssemblyISD { -enum { +enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... +#define HANDLE_NODETYPE(NODE) NODE, +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE }; } // end namespace WebAssemblyISD @@ -42,8 +43,51 @@ private: /// Keep a pointer to the WebAssemblySubtarget around so that we can make the /// right decision when generating code for different targets. 
const WebAssemblySubtarget *Subtarget; + + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + // Custom lowering hooks. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; }; +namespace WebAssembly { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 6b5b6cd..cfa1519 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -12,10 +12,63 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * call_direct: call function directly - * call_indirect: call function indirectly - * addressof: obtain a function pointer value for a given function - */ +// TODO: addr64: These currently assume the callee address is 32-bit. + +let Defs = [ARGUMENTS] in { + +// Call sequence markers. These have an immediate which represents the amount of +// stack space to allocate or free, which is used for varargs lowering. 
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), + [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; +} // isCodeGenOnly = 1 + +multiclass CALL<WebAssemblyRegClass vt, string prefix> { + def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], + !strconcat(prefix, "call\t$dst, $callee")>; + def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 I32:$callee))], + !strconcat(prefix, "call_indirect\t$dst, $callee")>; +} +let Uses = [SP32, SP64], isCall = 1 in { + defm : CALL<I32, "i32.">; + defm : CALL<I64, "i64.">; + defm : CALL<F32, "f32.">; + defm : CALL<F64, "f64.">; + + def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops), + [(WebAssemblycall0 (i32 imm:$callee))], + "call \t$callee">; + def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), + [(WebAssemblycall0 I32:$callee)], + "call_indirect\t$callee">; +} // Uses = [SP32,SP64], isCall = 1 + +} // Defs = [ARGUMENTS] + +// Patterns for matching a direct call to a global address. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I32 tglobaladdr:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I64 tglobaladdr:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F32 tglobaladdr:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F64 tglobaladdr:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), + (CALL_VOID tglobaladdr:$callee)>; + +// Patterns for matching a direct call to an external symbol. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I32 texternalsym:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I64 texternalsym:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F32 texternalsym:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F64 texternalsym:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), + (CALL_VOID texternalsym:$callee)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td new file mode 100644 index 0000000..fda9595 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -0,0 +1,87 @@ +//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly control-flow code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +let Defs = [ARGUMENTS] in { + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { +// The condition operand is a boolean value which WebAssembly represents as i32. 
+def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst), + [(brcond I32:$cond, bb:$dst)], + "br_if \t$cond, $dst">; +let isCodeGenOnly = 1 in +def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [], + "br_unless\t$cond, $dst">; +let isBarrier = 1 in { +def BR : I<(outs), (ins bb_op:$dst), + [(br bb:$dst)], + "br \t$dst">; +} // isBarrier = 1 +} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), + (BR_IF I32:$cond, bb_op:$dst)>; +def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), + (BR_UNLESS I32:$cond, bb_op:$dst)>; + +let Defs = [ARGUMENTS] in { + +// TODO: SelectionDAG's lowering insists on using a pointer as the index for +// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode +// currently. +// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates. +// Set TSFlags{1} to 1 to indicate that the immediates represent labels. +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I32:$index, bb:$default)], + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} +def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I64:$index, bb:$default)], + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +// Placemarkers to indicate the start or end of a block or loop scope. These +// use/clobber EXPR_STACK to prevent them from being moved into the middle of +// an expression tree. +let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in { +def BLOCK : I<(outs), (ins), [], "block">; +def LOOP : I<(outs), (ins), [], "loop">; +def END_BLOCK : I<(outs), (ins), [], "end_block">; +def END_LOOP : I<(outs), (ins), [], "end_loop">; +} // Uses = [EXPR_STACK], Defs = [EXPR_STACK] + +multiclass RETURN<WebAssemblyRegClass vt> { + def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)], + "return \t$val">; +} + +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +let isReturn = 1 in { + defm : RETURN<I32>; + defm : RETURN<I64>; + defm : RETURN<F32>; + defm : RETURN<F64>; + def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">; +} // isReturn = 1 + def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 3fa2906..931f4a9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,32 +13,99 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. 
- * - * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer - * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer - * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer - * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer - * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer - * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer - * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer - * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer - * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer - * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer - * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer - * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer - * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer - * float32.demote[float64]: demote a 64-bit float to a 32-bit float - * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float - * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float - * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float - * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float - * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float - * float64.promote[float32]: promote a 32-bit float to a 64-bit float - * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float - * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float - * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float - * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float - * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float - */ +let Defs = [ARGUMENTS] in { + +def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), + [(set I32:$dst, (trunc I64:$src))], + "i32.wrap/i64\t$dst, $src">; + +def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (sext I32:$src))], + "i64.extend_s/i32\t$dst, $src">; +def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (zext I32:$src))], + "i64.extend_u/i32\t$dst, $src">; + +} // defs = [ARGUMENTS] + +// Expand a "don't care" extend into zero-extend (chosen over sign-extend +// somewhat arbitrarily, although it favors popular hardware architectures +// and is conceptually a simpler operation). +def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; + +let Defs = [ARGUMENTS] in { + +// Conversion from floating point to integer traps on overflow and invalid. 
+let hasSideEffects = 1 in { +def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_sint F32:$src))], + "i32.trunc_s/f32\t$dst, $src">; +def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_uint F32:$src))], + "i32.trunc_u/f32\t$dst, $src">; +def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_sint F32:$src))], + "i64.trunc_s/f32\t$dst, $src">; +def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_uint F32:$src))], + "i64.trunc_u/f32\t$dst, $src">; +def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_sint F64:$src))], + "i32.trunc_s/f64\t$dst, $src">; +def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_uint F64:$src))], + "i32.trunc_u/f64\t$dst, $src">; +def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_sint F64:$src))], + "i64.trunc_s/f64\t$dst, $src">; +def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_uint F64:$src))], + "i64.trunc_u/f64\t$dst, $src">; +} // hasSideEffects = 1 + +def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (sint_to_fp I32:$src))], + "f32.convert_s/i32\t$dst, $src">; +def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (uint_to_fp I32:$src))], + "f32.convert_u/i32\t$dst, $src">; +def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (sint_to_fp I32:$src))], + "f64.convert_s/i32\t$dst, $src">; +def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (uint_to_fp I32:$src))], + "f64.convert_u/i32\t$dst, $src">; +def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (sint_to_fp I64:$src))], + "f32.convert_s/i64\t$dst, $src">; +def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (uint_to_fp I64:$src))], + "f32.convert_u/i64\t$dst, $src">; +def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (sint_to_fp I64:$src))], + "f64.convert_s/i64\t$dst, $src">; +def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (uint_to_fp I64:$src))], + "f64.convert_u/i64\t$dst, $src">; + +def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), + [(set F64:$dst, (fextend F32:$src))], + "f64.promote/f32\t$dst, $src">; +def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), + [(set F32:$dst, (fround F64:$src))], + "f32.demote/f64\t$dst, $src">; + +def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (bitconvert F32:$src))], + "i32.reinterpret/f32\t$dst, $src">; +def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (bitconvert I32:$src))], + "f32.reinterpret/i32\t$dst, $src">; +def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (bitconvert F64:$src))], + "i64.reinterpret/f64\t$dst, $src">; +def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (bitconvert I64:$src))], + "f64.reinterpret/i64\t$dst, $src">; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 30ef633..5520c6d 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -12,33 +12,90 @@ /// //===----------------------------------------------------------------------===// -defm FADD : BinaryFP<fadd>; -defm 
FSUB : BinaryFP<fsub>; -defm FMUL : BinaryFP<fmul>; -defm FDIV : BinaryFP<fdiv>; -defm FABS : UnaryFP<fabs>; -defm FNEG : UnaryFP<fneg>; -defm COPYSIGN : BinaryFP<fcopysign>; -defm CEIL : UnaryFP<fceil>; -defm FLOOR : UnaryFP<ffloor>; -defm TRUNC : UnaryFP<ftrunc>; -defm NEARESTINT : UnaryFP<fnearbyint>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.eq: compare equal - * float32.lt: less than - * float32.le: less than or equal - * float32.gt: greater than - * float32.ge: greater than or equal - */ - -defm SQRT : UnaryFP<fsqrt>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.min: minimum (binary operator); if either operand is NaN, returns NaN - * float32.max: maximum (binary operator); if either operand is NaN, returns NaN - */ +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in +defm ADD : BinaryFP<fadd, "add ">; +defm SUB : BinaryFP<fsub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryFP<fmul, "mul ">; +defm DIV : BinaryFP<fdiv, "div ">; +defm SQRT : UnaryFP<fsqrt, "sqrt">; + +defm ABS : UnaryFP<fabs, "abs ">; +defm NEG : UnaryFP<fneg, "neg ">; +defm COPYSIGN : BinaryFP<fcopysign, "copysign">; + +let isCommutable = 1 in { +defm MIN : BinaryFP<fminnan, "min ">; +defm MAX : BinaryFP<fmaxnan, "max ">; +} // isCommutable = 1 + +defm CEIL : UnaryFP<fceil, "ceil">; +defm FLOOR : UnaryFP<ffloor, "floor">; +defm TRUNC : UnaryFP<ftrunc, "trunc">; +defm NEAREST : UnaryFP<fnearbyint, "nearest">; + +} // Defs = [ARGUMENTS] + +// DAGCombine oddly folds casts into the rhs of copysign. Unfold them. +def : Pat<(fcopysign F64:$lhs, F32:$rhs), + (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; +def : Pat<(fcopysign F32:$lhs, F64:$rhs), + (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>; + +// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. +def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; +def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; + +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in { +defm EQ : ComparisonFP<SETOEQ, "eq ">; +defm NE : ComparisonFP<SETUNE, "ne ">; +} // isCommutable = 1 +defm LT : ComparisonFP<SETOLT, "lt ">; +defm LE : ComparisonFP<SETOLE, "le ">; +defm GT : ComparisonFP<SETOGT, "gt ">; +defm GE : ComparisonFP<SETOGE, "ge ">; + +} // Defs = [ARGUMENTS] + +// Don't care floating-point comparisons, supported via other comparisons. 
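+// (Editor's illustrative note, not part of the original patch: the plain
+// condition codes matched below, e.g. (seteq x, y), leave NaN behavior
+// unspecified, so it is legal to map them onto the ordered eq/lt/le/gt/ge
+// and unordered ne instructions defined above; NaN operands simply take
+// whichever result that opcode produces.)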
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs), + [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], + "f32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs), + [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], + "f64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. +def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 513c36f..8008dd3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,4 +1,4 @@ -// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-// +//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -12,44 +12,68 @@ /// //===----------------------------------------------------------------------===// -// WebAssembly Instruction Format -class WebAssemblyInst<string cstr> : Instruction { +// WebAssembly Instruction Format. +class WebAssemblyInst<string asmstr> : Instruction { field bits<0> Inst; // Instruction encoding. let Namespace = "WebAssembly"; let Pattern = []; - let Constraints = cstr; + let AsmString = asmstr; } -// Normal instructions -class I<dag oops, dag iops, list<dag> pattern, string cstr = ""> - : WebAssemblyInst<cstr> { +// Normal instructions. +class I<dag oops, dag iops, list<dag> pattern, string asmstr = ""> + : WebAssemblyInst<asmstr> { dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; } // Unary and binary instructions, for the local types that WebAssembly supports. 
-multiclass UnaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$src), - [(set Int32:$dst, (node Int32:$src))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$src), - [(set Int64:$dst, (node Int64:$src))]>; -} -multiclass BinaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs), - [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs), - [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>; -} -multiclass UnaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$src), - [(set Float32:$dst, (node Float32:$src))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$src), - [(set Float64:$dst, (node Float64:$src))]>; -} -multiclass BinaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs), - [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs), - [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>; +multiclass UnaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$src), + [(set I32:$dst, (node I32:$src))], + !strconcat("i32.", !strconcat(name, "\t$dst, $src"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$src), + [(set I64:$dst, (node I64:$src))], + !strconcat("i64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (node I32:$lhs, I32:$rhs))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), + [(set I64:$dst, (node I64:$lhs, I64:$rhs))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass UnaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$src), + [(set F32:$dst, (node F32:$src))], + !strconcat("f32.", !strconcat(name, "\t$dst, $src"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$src), + [(set F64:$dst, (node F64:$src))], + !strconcat("f64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs), + [(set F32:$dst, (node F32:$lhs, F32:$rhs))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs), + [(set F64:$dst, (node F64:$lhs, F64:$rhs))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonInt<CondCode cond, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), + [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonFP<CondCode cond, string name> { + def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs), + [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs), + [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index ea8937c..028e9af 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ 
b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -24,5 +24,145 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-instr-info"
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "WebAssemblyGenInstrInfo.inc"
+
 WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
-    : RI(STI.getTargetTriple()) {}
+    : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+                              WebAssembly::ADJCALLSTACKUP),
+      RI(STI.getTargetTriple()) {}
+
+void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       DebugLoc DL, unsigned DestReg,
+                                       unsigned SrcReg, bool KillSrc) const {
+  // This method is called by post-RA expansion, which expects only physical
+  // registers ("pregs") to exist. However, we need to handle both here.
+  auto &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      TargetRegisterInfo::isVirtualRegister(DestReg)
+          ? MRI.getRegClass(DestReg)
+          : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg);
+
+  unsigned CopyLocalOpcode;
+  if (RC == &WebAssembly::I32RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_I32;
+  else if (RC == &WebAssembly::I64RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_I64;
+  else if (RC == &WebAssembly::F32RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_F32;
+  else if (RC == &WebAssembly::F64RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_F64;
+  else
+    llvm_unreachable("Unexpected register class");
+
+  BuildMI(MBB, I, DL, get(CopyLocalOpcode), DestReg)
+      .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
+}
+
+// Branch analysis.
+bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                         MachineBasicBlock *&TBB,
+                                         MachineBasicBlock *&FBB,
+                                         SmallVectorImpl<MachineOperand> &Cond,
+                                         bool /*AllowModify*/) const {
+  bool HaveCond = false;
+  for (MachineInstr &MI : MBB.terminators()) {
+    switch (MI.getOpcode()) {
+    default:
+      // Unhandled instruction; bail out.
+      return true;
+    case WebAssembly::BR_IF:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(1).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(true));
+      Cond.push_back(MI.getOperand(0));
+      TBB = MI.getOperand(1).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR_UNLESS:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(1).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(false));
+      Cond.push_back(MI.getOperand(0));
+      TBB = MI.getOperand(1).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR:
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(0).isMBB())
+        return true;
+      if (!HaveCond)
+        TBB = MI.getOperand(0).getMBB();
+      else
+        FBB = MI.getOperand(0).getMBB();
+      break;
+    }
+    if (MI.isBarrier())
+      break;
+  }
+
+  return false;
+}
+
+unsigned WebAssemblyInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  unsigned Count = 0;
+
+  while (I != MBB.instr_begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isTerminator())
+      break;
+    // Remove the branch.
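+    // (Editor's illustrative note, not part of the original patch: erasing
+    // invalidates the iterator, so the scan restarts from instr_end() after
+    // each erase; debug values are skipped above so a DBG_VALUE between
+    // terminators doesn't end branch removal early.)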
+ I->eraseFromParent(); + I = MBB.instr_end(); + ++Count; + } + + return Count; +} + +unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + if (Cond.empty()) { + if (!TBB) + return 0; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB); + return 1; + } + + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + + if (Cond[0].getImm()) { + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)) + .addOperand(Cond[1]) + .addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)) + .addOperand(Cond[1]) + .addMBB(TBB); + } + if (!FBB) + return 1; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB); + return 2; +} + +bool WebAssemblyInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); + return false; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index 1c4ae22..5ddd9b3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -19,17 +19,35 @@ #include "WebAssemblyRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#define GET_INSTRINFO_HEADER +#include "WebAssemblyGenInstrInfo.inc" + namespace llvm { class WebAssemblySubtarget; -class WebAssemblyInstrInfo final { +class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { const WebAssemblyRegisterInfo RI; public: explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI); const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index fe3ca76..2e682a4 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,20 +25,58 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===// +def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_WebAssemblyCallSeqEnd : + SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>; +def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; +def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; +def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; + //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Nodes. //===----------------------------------------------------------------------===// +def WebAssemblycallseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def WebAssemblycallseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0", + SDT_WebAssemblyCall0, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1", + SDT_WebAssemblyCall1, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH", + SDT_WebAssemblyTableswitch, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT", + SDT_WebAssemblyArgument>; +def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", + SDT_WebAssemblyReturn, [SDNPHasChain]>; +def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", + SDT_WebAssemblyWrapper>; + //===----------------------------------------------------------------------===// // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * get_local: read the current value of a local variable - * set_local: set the current value of a local variable -*/ +let OperandNamespace = "WebAssembly" in { + +let OperandType = "OPERAND_BASIC_BLOCK" in +def bb_op : Operand<OtherVT>; + +let OperandType = "OPERAND_FPIMM" in { +def f32imm_op : Operand<f32>; +def f64imm_op : Operand<f64>; +} // OperandType = "OPERAND_FPIMM" + +} // OperandNamespace = "WebAssembly" //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. @@ -47,13 +85,75 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// +// Additional instructions. +//===----------------------------------------------------------------------===// + +multiclass ARGUMENT<WebAssemblyRegClass vt> { + let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), + [(set vt:$res, (WebAssemblyargument timm:$argno))]>; +} +defm : ARGUMENT<I32>; +defm : ARGUMENT<I64>; +defm : ARGUMENT<F32>; +defm : ARGUMENT<F64>; + +let Defs = [ARGUMENTS] in { + +// get_local and set_local are not generated by instruction selection; they +// are implied by virtual register uses and defs in most contexts. However, +// they are explicitly emitted for special purposes. 
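+// (Editor's illustrative note, not part of the original patch: "implied"
+// means that ordinary defs and uses of virtual registers are understood to
+// write and read wasm locals, so instruction selection does not create
+// explicit get_local/set_local machine instructions for them.)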
+multiclass LOCAL<WebAssemblyRegClass vt> { + def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [], + "get_local\t$res, $regno">; + // TODO: set_local returns its operand value + def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [], + "set_local\t$regno, $src">; + + // COPY_LOCAL is not an actual instruction in wasm, but since we allow + // get_local and set_local to be implicit, we can have a COPY_LOCAL which + // is actually a no-op because all the work is done in the implied + // get_local and set_local. + let isAsCheapAsAMove = 1 in + def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [], + "copy_local\t$res, $src">; +} +defm : LOCAL<I32>; +defm : LOCAL<I64>; +defm : LOCAL<F32>; +defm : LOCAL<F64>; + +let isMoveImm = 1 in { +def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm), + [(set I32:$res, imm:$imm)], + "i32.const\t$res, $imm">; +def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm), + [(set I64:$res, imm:$imm)], + "i64.const\t$res, $imm">; +def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm), + [(set F32:$res, fpimm:$imm)], + "f32.const\t$res, $imm">; +def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), + [(set F64:$res, fpimm:$imm)], + "f64.const\t$res, $imm">; +} // isMoveImm = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), + (CONST_I32 tglobaladdr:$addr)>; +def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), + (CONST_I32 texternalsym:$addr)>; + +//===----------------------------------------------------------------------===// // Additional sets of instructions. //===----------------------------------------------------------------------===// include "WebAssemblyInstrMemory.td" include "WebAssemblyInstrCall.td" +include "WebAssemblyInstrControl.td" include "WebAssemblyInstrInteger.td" -include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrConv.td" +include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 5f60fe8..09e5eaf 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -12,34 +12,77 @@ /// //===----------------------------------------------------------------------===// -defm ADD : BinaryInt<add>; -defm SUB : BinaryInt<sub>; -defm MUL : BinaryInt<mul>; -defm SDIV : BinaryInt<sdiv>; -defm UDIV : BinaryInt<udiv>; -defm SREM : BinaryInt<srem>; -defm UREM : BinaryInt<urem>; -defm AND : BinaryInt<and>; -defm IOR : BinaryInt<or>; -defm XOR : BinaryInt<xor>; -defm SHL : BinaryInt<shl>; -defm SHR : BinaryInt<srl>; -defm SAR : BinaryInt<sra>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * int32.eq: signed-less compare equal - * int32.slt: signed less than - * int32.sle: signed less than or equal - * int32.ult: unsigned less than - * int32.ule: unsigned less than or equal - * int32.sgt: signed greater than - * int32.sge: signed greater than or equal - * int32.ugt: unsigned greater than - * int32.uge: unsigned greater than or equal - */ - -defm CLZ : UnaryInt<ctlz>; -defm CTZ : UnaryInt<cttz>; -defm POPCNT : UnaryInt<ctpop>; +let Defs = [ARGUMENTS] in { + +// The spaces after the names are for aesthetic purposes only, to make +// operands line up vertically after tab expansion. 
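+// (Editor's illustrative note, not part of the original patch: the padding
+// makes e.g. "i32.add \t$dst, ..." and "i32.div_s\t$dst, ..." reach the same
+// tab stop, so the operand columns line up in the emitted assembly.)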
+let isCommutable = 1 in +defm ADD : BinaryInt<add, "add ">; +defm SUB : BinaryInt<sub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryInt<mul, "mul ">; +// Divide and remainder trap on a zero denominator. +let hasSideEffects = 1 in { +defm DIV_S : BinaryInt<sdiv, "div_s">; +defm DIV_U : BinaryInt<udiv, "div_u">; +defm REM_S : BinaryInt<srem, "rem_s">; +defm REM_U : BinaryInt<urem, "rem_u">; +} // hasSideEffects = 1 +let isCommutable = 1 in { +defm AND : BinaryInt<and, "and ">; +defm OR : BinaryInt<or, "or ">; +defm XOR : BinaryInt<xor, "xor ">; +} // isCommutable = 1 +defm SHL : BinaryInt<shl, "shl ">; +defm SHR_U : BinaryInt<srl, "shr_u">; +defm SHR_S : BinaryInt<sra, "shr_s">; + +let isCommutable = 1 in { +defm EQ : ComparisonInt<SETEQ, "eq ">; +defm NE : ComparisonInt<SETNE, "ne ">; +} // isCommutable = 1 +defm LT_S : ComparisonInt<SETLT, "lt_s">; +defm LE_S : ComparisonInt<SETLE, "le_s">; +defm LT_U : ComparisonInt<SETULT, "lt_u">; +defm LE_U : ComparisonInt<SETULE, "le_u">; +defm GT_S : ComparisonInt<SETGT, "gt_s">; +defm GE_S : ComparisonInt<SETGE, "ge_s">; +defm GT_U : ComparisonInt<SETUGT, "gt_u">; +defm GE_U : ComparisonInt<SETUGE, "ge_u">; + +defm CLZ : UnaryInt<ctlz, "clz ">; +defm CTZ : UnaryInt<cttz, "ctz ">; +defm POPCNT : UnaryInt<ctpop, "popcnt">; + +} // Defs = [ARGUMENTS] + +// Expand the "don't care" operations to supported operations. +def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>; +def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>; +def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>; +def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs), + [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], + "i32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs), + [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], + "i64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. +def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 5ab40e8..b39ac52 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -12,35 +12,576 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * Each has optional alignment and immediate byte offset. 
- * - * int32.load_sx[int8]: sign-extend to int32 - * int32.load_sx[int16]: sign-extend to int32 - * int32.load_zx[int8]: zero-extend to int32 - * int32.load_zx[int16]: zero-extend to int32 - * int32.load[int32]: (no conversion) - * int64.load_sx[int8]: sign-extend to int64 - * int64.load_sx[int16]: sign-extend to int64 - * int64.load_sx[int32]: sign-extend to int64 - * int64.load_zx[int8]: zero-extend to int64 - * int64.load_zx[int16]: zero-extend to int64 - * int64.load_zx[int32]: zero-extend to int64 - * int64.load[int64]: (no conversion) - * float32.load[float32]: (no conversion) - * float64.load[float64]: (no conversion) - * - * int32.store[int8]: wrap int32 to int8 - * int32.store[int16]: wrap int32 to int16 - * int32.store[int32]: (no conversion) - * int64.store[int8]: wrap int64 to int8 - * int64.store[int16]: wrap int64 to int16 - * int64.store[int32]: wrap int64 to int32 - * int64.store[int64]: (no conversion) - * float32.store[float32]: (no conversion) - * float64.store[float64]: (no conversion) - * - * load_global: load the value of a given global variable - * store_global: store a given value to a given global variable - */ +// TODO: +// - HasAddr64 +// - WebAssemblyTargetLowering having to do with atomics +// - Each has optional alignment. + +// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16 +// local types. These memory-only types instead zero- or sign-extend into local +// types when loading, and truncate when storing. + +// WebAssembly constant offsets are performed as unsigned with infinite +// precision, so we need to check for NoUnsignedWrap so that we don't fold an +// offset for an add that needs wrapping. +def regPlusImm : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ return N->getFlags()->hasNoUnsignedWrap(); }]>; + +// GlobalAddresses are conceptually unsigned values, so we can also fold them +// into immediate values as long as their offsets are non-negative. +def regPlusGA : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ + return N->getFlags()->hasNoUnsignedWrap() || + (N->getOperand(1)->getOpcode() == WebAssemblyISD::Wrapper && + isa<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) && + cast<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) + ->getOffset() >= 0); +}]>; + +// We don't need a regPlusES because external symbols never have constant +// offsets folded into them, so we can just use add. + +let Defs = [ARGUMENTS] in { + +// Basic load. +def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load\t$dst, ${off}(${addr})">; +def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load\t$dst, ${off}(${addr})">; +def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [], + "f32.load\t$dst, ${off}(${addr})">; +def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [], + "f64.load\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select loads with no constant offset. +def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>; +def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>; +def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; +def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; + +// Select loads with a constant offset. 
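+// (Editor's illustrative note, not part of the original patch: these folds
+// rely on the NoUnsignedWrap check in regPlusImm above. E.g. for $addr =
+// 0xffffffff and $off = 2, a wrapping i32 add yields address 1, while wasm
+// evaluates offset+address with infinite precision and would trap, so a
+// plain wrapping add must not be folded into the offset field.)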
+def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_I32 imm:$off, $addr)>; +def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_I64 imm:$off, $addr)>; +def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_F32 imm:$off, $addr)>; +def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_F64 imm:$off, $addr)>; +def : Pat<(i32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_I64 tglobaladdr:$off, $addr)>; +def : Pat<(f32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_F32 tglobaladdr:$off, $addr)>; +def : Pat<(f64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_F64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_I64 texternalsym:$off, $addr)>; +def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_F32 texternalsym:$off, $addr)>; +def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_F64 texternalsym:$off, $addr)>; + +// Select loads with just a constant offset. +def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>; +def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F64 texternalsym:$off, (CONST_I32 0))>; + +let Defs = [ARGUMENTS] in { + +// Extending load. 
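+// (Editor's illustrative note, not part of the original patch: a C read like
+//   int f(signed char *p) { return *p; }
+// reaches the DAG as (i32 (sextloadi8 ...)) and selects to i32.load8_s,
+// since i8 is a memory-only type that extends into an i32 local on load.)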
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_u\t$dst, ${off}(${addr})">; +def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_u\t$dst, ${off}(${addr})">; +def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_s\t$dst, ${off}(${addr})">; +def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_u\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select extending loads with no constant offset. +def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select extending loads with a constant offset. 
+def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_S_I64 texternalsym:$off, 
$addr)>; +def : Pat<(i64 (zextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select extending loads with just a constant offset. +def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>; + +// Resolve "don't care" extending loads to zero-extending loads. 
This is +// somewhat arbitrary, but zero-extending is conceptually simpler. + +// Select "don't care" extending loads with no constant offset. +def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select "don't care" extending loads with a constant offset. +def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select "don't care" extending loads with just a constant offset. 
+def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+// Editor's note: the output operand below originally read tglobaladdr:$off,
+// a copy-paste slip; the pattern matches an external symbol, so the selected
+// instruction must use texternalsym:$off as well.
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Basic store.
+// Note that we split the patterns out of the instruction definitions because
+// WebAssembly's stores return their operand value, and tablegen doesn't like
+// instruction definition patterns that don't reference all of the output
+// operands.
+// Note: WebAssembly inverts SelectionDAG's usual operand order.
+def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
+                  "i32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+                  "i64.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [],
+                  "f32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [],
+                  "f64.store\t$dst, ${off}(${addr}), $val">;
+
+} // Defs = [ARGUMENTS]
+
+// Select stores with no constant offset.
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>;
+
+// Select stores with a constant offset.
+def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_F32 imm:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_F64 imm:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>; + +// Select stores with just a constant offset. +def : Pat<(store I32:$val, imm:$off), + (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, imm:$off), + (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, imm:$off), + (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, imm:$off), + (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Truncating store. 
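+// (Editor's illustrative note, not part of the original patch: a C write like
+//   void g(char *p, int v) { *p = v; }
+// reaches the DAG as (truncstorei8 ...) and selects to i32.store8; the store
+// itself truncates the i32 local to the i8 memory type, so no separate wrap
+// instruction is needed.)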
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store16\t$dst, ${off}(${addr}), $val">; +def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store16\t$dst, ${off}(${addr}), $val">; +def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store32\t$dst, ${off}(${addr}), $val">; + +} // Defs = [ARGUMENTS] + +// Select truncating stores with no constant offset. +def : Pat<(truncstorei8 I32:$val, I32:$addr), + (STORE8_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, I32:$addr), + (STORE16_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, I32:$addr), + (STORE8_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, I32:$addr), + (STORE16_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, I32:$addr), + (STORE32_I64 0, I32:$addr, I64:$val)>; + +// Select truncating stores with a constant offset. +def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE8_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE16_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE8_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE16_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE32_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>; + +// Select truncating stores with just a constant offset. 
+def : Pat<(truncstorei8 I32:$val, imm:$off), + (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, imm:$off), + (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, imm:$off), + (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, imm:$off), + (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, imm:$off), + (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Memory size. +def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins), + [(set I32:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr32]>; +def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins), + [(set I64:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr64]>; + +// Grow memory. +def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta), + [(int_wasm_grow_memory I32:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr32]>; +def GROW_MEMORY_I64 : I<(outs), (ins I64:$delta), + [(int_wasm_grow_memory I64:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr64]>; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp new file mode 100644 index 0000000..b009a4e --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -0,0 +1,133 @@ +//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file lowers br_unless into br_if with an inverted condition. +/// +/// br_unless is not currently in the spec, but it's very convenient for LLVM +/// to use. This pass allows LLVM to use it, for now. 
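+/// (Editor's illustrative note, not part of the original patch: semantically,
+/// "br_unless $cond, $dst" branches when $cond is zero, i.e. br_if with an
+/// inverted condition. Keeping it as a pseudo-instruction lets earlier passes
+/// negate branch conditions for free and defers materializing the inversion
+/// to this pass, which can usually fold it into the defining comparison.)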
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-lower-br_unless" + +namespace { +class WebAssemblyLowerBrUnless final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Lower br_unless"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyLowerBrUnless::ID = 0; +FunctionPass *llvm::createWebAssemblyLowerBrUnless() { + return new WebAssemblyLowerBrUnless(); +} + +bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Lowering br_unless **********\n" + "********** Function: " + << MF.getName() << '\n'); + + auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + + for (auto &MBB : MF) { + for (auto MII = MBB.begin(); MII != MBB.end(); ) { + MachineInstr *MI = &*MII++; + if (MI->getOpcode() != WebAssembly::BR_UNLESS) + continue; + + unsigned Cond = MI->getOperand(0).getReg(); + bool Inverted = false; + + // Attempt to invert the condition in place. 
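+      // For example, a stackified condition defined by (GE_S_I32 $a, $b)
+      // can be rewritten in place to (LT_S_I32 $a, $b), letting the
+      // br_unless become a br_if on the same register.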
+      if (MFI.isVRegStackified(Cond)) {
+        assert(MRI.hasOneDef(Cond));
+        MachineInstr *Def = MRI.getVRegDef(Cond);
+        switch (Def->getOpcode()) {
+        using namespace WebAssembly;
+        case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
+        case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
+        case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
+        case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break;
+        case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break;
+        case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break;
+        case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break;
+        case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break;
+        case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break;
+        case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break;
+        case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break;
+        case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break;
+        case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break;
+        case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break;
+        case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break;
+        case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break;
+        case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break;
+        case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break;
+        case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break;
+        case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break;
+        case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break;
+        case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break;
+        case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break;
+        case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break;
+        default: break;
+        }
+      }
+
+      // If we weren't able to invert the condition in place, insert an
+      // expression to invert it.
+      if (!Inverted) {
+        unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+        MFI.stackifyVReg(ZeroReg);
+        BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32),
+                ZeroReg)
+            .addImm(0);
+        unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+        MFI.stackifyVReg(Tmp);
+        BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp)
+            .addReg(Cond)
+            .addReg(ZeroReg);
+        Cond = Tmp;
+        Inverted = true;
+      }
+
+      // The br_unless condition has now been inverted. Insert a br_if and
+      // delete the br_unless.
+      assert(Inverted);
+      BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
+          .addReg(Cond)
+          .addOperand(MI->getOperand(1));
+      MBB.erase(MI);
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
new file mode 100644
index 0000000..022a448
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -0,0 +1,115 @@
+// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// corresponding MCInst records.
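+///
+/// For example, a CONST_I32 MachineInstr with a virtual register def and an
+/// immediate operand is lowered to an MCInst with the same opcode, the
+/// function-local WebAssembly register number, and an immediate MCOperand.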
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Constants.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +MCSymbol * +WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} + +MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( + const MachineOperand &MO) const { + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, + int64_t Offset, + bool IsFunc) const { + MCSymbolRefExpr::VariantKind VK = + IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION + : MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); + + if (Offset != 0) { + if (IsFunc) + report_fatal_error("Function addresses with offsets not supported"); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + } + + return MCOperand::createExpr(Expr); +} + +void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, + MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_MachineBasicBlock: + MI->dump(); + llvm_unreachable("MachineBasicBlock operand should have been rewritten"); + case MachineOperand::MO_Register: { + // Ignore all implicit register operands. + if (MO.isImplicit()) + continue; + const WebAssemblyFunctionInfo &MFI = + *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>(); + unsigned WAReg = MFI.getWAReg(MO.getReg()); + MCOp = MCOperand::createReg(WAReg); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_FPImmediate: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + const ConstantFP *Imm = MO.getFPImm(); + if (Imm->getType()->isFloatTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat()); + else if (Imm->getType()->isDoubleTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble()); + else + llvm_unreachable("unknown floating point immediate type"); + break; + } + case MachineOperand::MO_GlobalAddress: + assert(MO.getTargetFlags() == 0 && + "WebAssembly does not use target flags on GlobalAddresses"); + MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), + MO.getGlobal()->getValueType()->isFunctionTy()); + break; + case MachineOperand::MO_ExternalSymbol: + // The target flag indicates whether this is a symbol for a + // variable or a function. 
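+      // Bit 0 carries that flag; the assertion below verifies that no other
+      // target flag bits are set.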
+      assert((MO.getTargetFlags() & -2) == 0 &&
+             "WebAssembly uses only one target flag bit on ExternalSymbols");
+      MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0,
+                                MO.getTargetFlags() & 1);
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
new file mode 100644
index 0000000..ab4ba1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -0,0 +1,46 @@
+//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// their corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+
+/// This class is used to lower a MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset,
+                               bool IsFunc) const;
+
+public:
+  WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+      : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 542d984..225c5d3 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -17,3 +17,9 @@
 using namespace llvm;
 
 WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {}
+
+void WebAssemblyFunctionInfo::initWARegs() {
+  assert(WARegs.empty());
+  unsigned Reg = UnusedReg;
+  WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index fc5e910..6a60280 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-// WebAssemblyMachineFuctionInfo.h-WebAssembly machine function info -*- C++ -*-
+// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
 //
 // The LLVM Compiler Infrastructure
 //
@@ -16,8 +16,7 @@
 #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
 
-#include "WebAssemblyRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 namespace llvm {
@@ -27,9 +26,70 @@ namespace llvm {
 class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   MachineFunction &MF;
+  std::vector<MVT> Params;
+
+  /// A mapping from CodeGen vreg index to WebAssembly register number.
+  std::vector<unsigned> WARegs;
+
+  /// A mapping from CodeGen vreg index to a boolean value indicating whether
+  /// the given register is considered to be "stackified", meaning it has been
+  /// determined or made to meet the stack requirements:
+  ///   - single use (per path)
+  ///   - single def (per path)
+  ///   - defined and used in LIFO order with other stack registers
+  BitVector VRegStackified;
+
+  // One entry for each possible target reg. We expect it to be small.
+  std::vector<unsigned> PhysRegs;
+
 public:
-  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
+    PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
+  }
   ~WebAssemblyFunctionInfo() override;
+
+  void addParam(MVT VT) { Params.push_back(VT); }
+  const std::vector<MVT> &getParams() const { return Params; }
+
+  static const unsigned UnusedReg = -1u;
+
+  void stackifyVReg(unsigned VReg) {
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
+    VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+  bool isVRegStackified(unsigned VReg) const {
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      return false;
+    return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+
+  void initWARegs();
+  void setWAReg(unsigned VReg, unsigned WAReg) {
+    assert(WAReg != UnusedReg);
+    assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
+    WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+  }
+  unsigned getWAReg(unsigned Reg) const {
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+      return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+    }
+    return PhysRegs[Reg];
+  }
+  // If new virtual registers are created after initWARegs has been called,
+  // this function can be used to add WebAssembly register mappings for them.
+  void addWAReg(unsigned VReg, unsigned WAReg) {
+    assert(TargetRegisterInfo::virtReg2Index(VReg) == WARegs.size());
+    WARegs.push_back(WAReg);
+  }
+
+  void addPReg(unsigned PReg, unsigned WAReg) {
+    assert(PReg < WebAssembly::NUM_TARGET_REGS);
+    assert(WAReg < -1U);
+    PhysRegs[PReg] = WAReg;
+  }
+  const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
 };
 
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
new file mode 100644
index 0000000..4dc401a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -0,0 +1,76 @@
+//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize calls with "returned" attributes for WebAssembly.
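+///
+/// As an illustrative sketch (the callee name is hypothetical), given
+///
+///   %r = call i8* @copy_like(i8* returned %p)
+///
+/// every use of %p dominated by the call can be replaced with %r, making the
+/// equivalence explicit for later passes.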
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-returned"
+
+namespace {
+class OptimizeReturned final : public FunctionPass,
+                               public InstVisitor<OptimizeReturned> {
+  const char *getPassName() const override {
+    return "WebAssembly Optimize Returned";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  DominatorTree *DT;
+
+public:
+  static char ID;
+  OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+
+  void visitCallSite(CallSite CS);
+};
+} // end anonymous namespace
+
+char OptimizeReturned::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
+  return new OptimizeReturned();
+}
+
+void OptimizeReturned::visitCallSite(CallSite CS) {
+  for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
+    if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+      Instruction *Inst = CS.getInstruction();
+      Value *Arg = CS.getArgOperand(i);
+      // Ignore constants, globals, undef, etc.
+      if (isa<Constant>(Arg))
+        continue;
+      // Like replaceDominatedUsesWith but using Instruction/Use dominance.
+      for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) {
+        Use &U = *UI++;
+        if (DT->dominates(Inst, U))
+          U.set(Inst);
+      }
+    }
+}
+
+bool OptimizeReturned::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  visit(F);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp
new file mode 100644
index 0000000..d570d42
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp
@@ -0,0 +1,1066 @@
+//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does
+// not assert that all virtual registers are gone (because WebAssembly currently
+// uses virtual rather than physical registers), and only runs
+// MRI.clearVirtRegs() if scavenging happened (which it never does). It also
+// uses a different class name so it can be registered via INITIALIZE_PASS.
+// It is otherwise unmodified, so any changes to the target-independent PEI
+// can be easily applied.
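+//
+// As an illustrative example (not from this patch), a load or store that
+// refers to FrameIndex<0> before this pass refers to a concrete SP-relative
+// offset afterwards, as computed by the frame layout code below.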
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <climits> + +using namespace llvm; + +#define DEBUG_TYPE "pei" +namespace llvm { +void initializeWasmPEIPass(PassRegistry&); +} +namespace { +class WasmPEI : public MachineFunctionPass { +public: + static char ID; + WasmPEI() : MachineFunctionPass(ID) { + initializeWasmPEIPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// runOnMachineFunction - Insert prolog/epilog code and replace abstract + /// frame indexes with appropriate references. + /// + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + RegScavenger *RS; + + // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved + // stack frame indexes. + unsigned MinCSFrameIndex, MaxCSFrameIndex; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + SmallVector<MachineBasicBlock *, 1> SaveBlocks; + SmallVector<MachineBasicBlock *, 4> RestoreBlocks; + + // Flag to control whether to use the register scavenger to resolve + // frame index materialization registers. Set according to + // TRI->requiresFrameIndexScavenging() for the current function. 
+ bool FrameIndexVirtualScavenging; + + void calculateSets(MachineFunction &Fn); + void calculateCallsInformation(MachineFunction &Fn); + void assignCalleeSavedSpillSlots(MachineFunction &Fn, + const BitVector &SavedRegs); + void insertCSRSpillsAndRestores(MachineFunction &Fn); + void calculateFrameObjectOffsets(MachineFunction &Fn); + void replaceFrameIndices(MachineFunction &Fn); + void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj); + void scavengeFrameVirtualRegs(MachineFunction &Fn); + void insertPrologEpilogCode(MachineFunction &Fn); +}; +} // namespace + +char WasmPEI::ID = 0; + +namespace llvm { +FunctionPass *createWebAssemblyPEI() { + return new WasmPEI(); +} +} + +static cl::opt<unsigned> +WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1), + cl::desc("Warn for stack size bigger than the given" + " number")); + +INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(StackProtector) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion & Frame Finalization", + false, false) + +STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); +STATISTIC(NumBytesStackSpace, + "Number of bytes used for stack in all functions"); + +void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<StackProtector>(); + AU.addRequired<TargetPassConfig>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Compute the set of return blocks +void WasmPEI::calculateSets(MachineFunction &Fn) { + const MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI->getSavePoint()) { + SaveBlocks.push_back(MFI->getSavePoint()); + assert(MFI->getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI->getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. + SaveBlocks.push_back(&Fn.front()); + for (MachineBasicBlock &MBB : Fn) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +/// StackObjSet - A set of stack object indexes +typedef SmallSetVector<int, 8> StackObjSet; + +/// runOnMachineFunction - Insert prolog/epilog code and replace abstract +/// frame indexes with appropriate references. +/// +bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) { + const Function* F = Fn.getFunction(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + + // LOCALMOD: assert removed from target-independent PEI + //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); + + RS = TRI->requiresRegisterScavenging(Fn) ? 
new RegScavenger() : nullptr;
+  FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+
+  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+  // function's frame information. Also eliminates call frame pseudo
+  // instructions.
+  calculateCallsInformation(Fn);
+
+  // Determine which of the registers in the callee save list should be saved.
+  BitVector SavedRegs;
+  TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+
+  // Insert spill code for any callee saved registers that are modified.
+  assignCalleeSavedSpillSlots(Fn, SavedRegs);
+
+  // Determine placement of CSR spill/restore code:
+  // place all spills in the entry block, all restores in return blocks.
+  calculateSets(Fn);
+
+  // Add the code to save and restore the callee saved registers.
+  if (!F->hasFnAttribute(Attribute::Naked))
+    insertCSRSpillsAndRestores(Fn);
+
+  // Allow the target machine to make final modifications to the function
+  // before the frame layout is finalized.
+  TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+
+  // Calculate actual frame offsets for all abstract stack objects...
+  calculateFrameObjectOffsets(Fn);
+
+  // Add prolog and epilog code to the function. This function is required
+  // to align the stack frame as necessary for any stack variables or
+  // called functions. Because of this, calculateCalleeSavedRegisters()
+  // must be called before this function in order to set the AdjustsStack
+  // and MaxCallFrameSize variables.
+  if (!F->hasFnAttribute(Attribute::Naked))
+    insertPrologEpilogCode(Fn);
+
+  // Replace all MO_FrameIndex operands with physical register references
+  // and actual offsets.
+  //
+  replaceFrameIndices(Fn);
+
+  // If register scavenging is needed, as we've enabled doing it as a
+  // post-pass, scavenge the virtual registers that frame index elimination
+  // inserted.
+  if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
+    scavengeFrameVirtualRegs(Fn);
+    // Clear any vregs created by virtual scavenging.
+    // LOCALMOD: made this call conditional with scavengeFrameVirtualRegs()
+    Fn.getRegInfo().clearVirtRegs();
+  }
+
+  // Warn on stack size when it exceeds the given limit.
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  uint64_t StackSize = MFI->getStackSize();
+  if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
+    DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
+    F->getContext().diagnose(DiagStackSize);
+  }
+
+  delete RS;
+  SaveBlocks.clear();
+  RestoreBlocks.clear();
+  return true;
+}
+
+/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void WasmPEI::calculateCallsInformation(MachineFunction &Fn) {
+  const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  unsigned MaxCallFrameSize = 0;
+  bool AdjustsStack = MFI->adjustsStack();
+
+  // Get the function call frame set-up and tear-down instruction opcode
+  unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+  unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+  // Early exit for targets which have no call frame setup/destroy pseudo
+  // instructions.
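+  // (~0u is the sentinel returned by getCallFrameSetupOpcode() and
+  // getCallFrameDestroyOpcode() when a target defines no such pseudos.)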
+  if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+    return;
+
+  std::vector<MachineBasicBlock::iterator> FrameSDOps;
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+        assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+               " instructions should have a single immediate argument!");
+        unsigned Size = I->getOperand(0).getImm();
+        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+        AdjustsStack = true;
+        FrameSDOps.push_back(I);
+      } else if (I->isInlineAsm()) {
+        // Some inline asm's need a stack frame, as indicated by operand 1.
+        unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+          AdjustsStack = true;
+      }
+
+  MFI->setAdjustsStack(AdjustsStack);
+  MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+  for (std::vector<MachineBasicBlock::iterator>::iterator
+         i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+    MachineBasicBlock::iterator I = *i;
+
+    // If call frames are not being included as part of the stack frame, and
+    // the target doesn't indicate otherwise, remove the call frame pseudos
+    // here. The sub/add sp instruction pairs are still inserted, but we don't
+    // need to track the SP adjustment for frame index elimination.
+    if (TFI->canSimplifyCallFramePseudos(Fn))
+      TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+  }
+}
+
+void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F,
+                                          const BitVector &SavedRegs) {
+  // These are used to keep track of the callee-save area. Initialize them.
+  MinCSFrameIndex = INT_MAX;
+  MaxCSFrameIndex = 0;
+
+  if (SavedRegs.empty())
+    return;
+
+  const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+
+  std::vector<CalleeSavedInfo> CSI;
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    if (SavedRegs.test(Reg))
+      CSI.push_back(CalleeSavedInfo(Reg));
+  }
+
+  const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+  MachineFrameInfo *MFI = F.getFrameInfo();
+  if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
+    // If target doesn't implement this, use generic code.
+
+    if (CSI.empty())
+      return; // Early exit if no callee saved registers are modified!
+
+    unsigned NumFixedSpillSlots;
+    const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+        TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+    // Now that we know which registers need to be saved and restored, allocate
+    // stack slots for them.
+    for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
+         I != E; ++I) {
+      unsigned Reg = I->getReg();
+      const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+      int FrameIdx;
+      if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+        I->setFrameIdx(FrameIdx);
+        continue;
+      }
+
+      // Check to see if this physreg must be spilled to a particular stack slot
+      // on this target.
+      const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+      while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+             FixedSlot->Reg != Reg)
+        ++FixedSlot;
+
+      if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+        // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); + + // We may not be able to satisfy the desired alignment specification of + // the TargetRegisterClass if the stack alignment is smaller. Use the + // min. + Align = std::min(Align, StackAlign); + FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } else { + // Spill it to the stack where we must. + FrameIdx = + MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + } + + I->setFrameIdx(FrameIdx); + } + } + + MFI->setCalleeSavedInfo(CSI); +} + +/// Helper function to update the liveness information for the callee-saved +/// registers. +static void updateLiveness(MachineFunction &MF) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + // Visited will contain all the basic blocks that are in the region + // where the callee saved registers are alive: + // - Anything that is not Save or Restore -> LiveThrough. + // - Save -> LiveIn. + // - Restore -> LiveOut. + // The live-out is not attached to the block, so no need to keep + // Restore in this set. + SmallPtrSet<MachineBasicBlock *, 8> Visited; + SmallVector<MachineBasicBlock *, 8> WorkList; + MachineBasicBlock *Entry = &MF.front(); + MachineBasicBlock *Save = MFI->getSavePoint(); + + if (!Save) + Save = Entry; + + if (Entry != Save) { + WorkList.push_back(Entry); + Visited.insert(Entry); + } + Visited.insert(Save); + + MachineBasicBlock *Restore = MFI->getRestorePoint(); + if (Restore) + // By construction Restore cannot be visited, otherwise it + // means there exists a path to Restore that does not go + // through Save. + WorkList.push_back(Restore); + + while (!WorkList.empty()) { + const MachineBasicBlock *CurBB = WorkList.pop_back_val(); + // By construction, the region that is after the save point is + // dominated by the Save and post-dominated by the Restore. + if (CurBB == Save && Save != Restore) + continue; + // Enqueue all the successors not already visited. + // Those are by construction either before Save or after Restore. + for (MachineBasicBlock *SuccBB : CurBB->successors()) + if (Visited.insert(SuccBB).second) + WorkList.push_back(SuccBB); + } + + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (MachineBasicBlock *MBB : Visited) { + MCPhysReg Reg = CSI[i].getReg(); + // Add the callee-saved register as live-in. + // It's killed at the spill. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + } + } +} + +/// insertCSRSpillsAndRestores - Insert spill and restore code for +/// callee saved registers used in the function. +/// +void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { + // Get callee saved register information. + MachineFrameInfo *MFI = Fn.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + MFI->setCalleeSavedInfoValid(true); + + // Early exit if no callee saved registers are modified! + if (CSI.empty()) + return; + + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + MachineBasicBlock::iterator I; + + // Spill using target interface. 
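+  // If the target does not implement spillCalleeSavedRegisters, fall back to
+  // emitting a generic storeRegToStackSlot for each CSR below.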
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) { + I = SaveBlock->begin(); + if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Insert the spill to the stack frame. + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), + RC, TRI); + } + } + // Update the live-in information of all the blocks up to the save point. + updateLiveness(Fn); + } + + // Restore using target interface. + for (MachineBasicBlock *MBB : RestoreBlocks) { + I = MBB->end(); + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = I; + while (I2 != MBB->begin() && (--I2)->isTerminator()) + I = I2; + + bool AtStart = I == MBB->begin(); + MachineBasicBlock::iterator BeforeI = I; + if (!AtStart) + --BeforeI; + + // Restore all registers immediately before the return and any + // terminators that precede it. + if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); + assert(I != MBB->begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + if (AtStart) + I = MBB->begin(); + else { + I = BeforeI; + ++I; + } + } + } + } +} + +/// AdjustStackOffset - Helper function used to adjust the stack frame offset. +static inline void +AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, + bool StackGrowsDown, int64_t &Offset, + unsigned &MaxAlign, unsigned Skew) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + if (StackGrowsDown) { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset + } else { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, Offset); + Offset += MFI->getObjectSize(FrameIdx); + } +} + +/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e., +/// those required to be close to the Stack Protector) to stack offsets. +static void +AssignProtectedObjSet(const StackObjSet &UnassignedObjs, + SmallSet<int, 16> &ProtectedObjs, + MachineFrameInfo *MFI, bool StackGrowsDown, + int64_t &Offset, unsigned &MaxAlign, unsigned Skew) { + + for (StackObjSet::const_iterator I = UnassignedObjs.begin(), + E = UnassignedObjs.end(); I != E; ++I) { + int i = *I; + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + ProtectedObjs.insert(i); + } +} + +/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the +/// abstract stack objects. 
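+///
+/// As a worked example (illustrative): on a downward-growing stack, an
+/// 8-byte object with 8-byte alignment placed at running offset 4 first
+/// advances the offset past the object to 12, which is then rounded up to
+/// 16, so the object lands at SP[-16].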
+/// +void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + StackProtector *SP = &getAnalysis<StackProtector>(); + + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Start at the beginning of the local area. + // The Offset is the distance from the stack top in the direction + // of stack growth -- so it's always nonnegative. + int LocalAreaOffset = TFI.getOffsetOfLocalArea(); + if (StackGrowsDown) + LocalAreaOffset = -LocalAreaOffset; + assert(LocalAreaOffset >= 0 + && "Local area offset should be in direction of stack growth"); + int64_t Offset = LocalAreaOffset; + + // Skew to be applied to alignment. + unsigned Skew = TFI.getStackAlignmentSkew(Fn); + + // If there are fixed sized objects that are preallocated in the local area, + // non-fixed objects can't be allocated right at the start of local area. + // We currently don't support filling in holes in between fixed sized + // objects, so we adjust 'Offset' to point to the end of last fixed sized + // preallocated object. + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int64_t FixedOff; + if (StackGrowsDown) { + // The maximum distance from the stack pointer is at lower address of + // the object -- which is given by offset. For down growing stack + // the offset is negative, so we negate the offset to get the distance. + FixedOff = -MFI->getObjectOffset(i); + } else { + // The maximum distance from the start pointer is at the upper + // address of the object. + FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); + } + if (FixedOff > Offset) Offset = FixedOff; + } + + // First assign frame offsets to stack objects that are used to spill + // callee saved registers. + if (StackGrowsDown) { + for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) { + // If the stack grows down, we need to add the size to find the lowest + // address of the object. + Offset += MFI->getObjectSize(i); + + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, -Offset); // Set the computed offset + } + } else { + int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex; + for (int i = MaxCSFI; i >= MinCSFI ; --i) { + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, Offset); + Offset += MFI->getObjectSize(i); + } + } + + unsigned MaxAlign = MFI->getMaxAlignment(); + + // Make sure the special register scavenging spill slot is closest to the + // incoming stack pointer if a frame pointer is required and is closer + // to the incoming rather than the final stack pointer. 
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo(); + bool EarlyScavengingSlots = (TFI.hasFP(Fn) && + TFI.isFPCloseToIncomingSP() && + RegInfo->useFPForScavengingIndex(Fn) && + !RegInfo->needsStackRealignment(Fn)); + if (RS && EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // FIXME: Once this is working, then enable flag will change to a target + // check for whether the frame is large enough to want to use virtual + // frame index registers. Functions which don't want/need this optimization + // will continue to use the existing code path. + if (MFI->getUseLocalStackAllocationBlock()) { + unsigned Align = MFI->getLocalFrameMaxAlign(); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); + + // Resolve offsets for objects in the local block. + for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { + std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i); + int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + + // Make sure that the stack protector comes before the local variables on the + // stack. + SmallSet<int, 16> ProtectedObjs; + if (MFI->getStackProtectorIndex() >= 0) { + StackObjSet LargeArrayObjs; + StackObjSet SmallArrayObjs; + StackObjSet AddrOfObjs; + + AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, + Offset, MaxAlign, Skew); + + // Assign large stack objects first. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + + switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) { + case StackProtector::SSPLK_None: + continue; + case StackProtector::SSPLK_SmallArray: + SmallArrayObjs.insert(i); + continue; + case StackProtector::SSPLK_AddrOf: + AddrOfObjs.insert(i); + continue; + case StackProtector::SSPLK_LargeArray: + LargeArrayObjs.insert(i); + continue; + } + llvm_unreachable("Unexpected SSPLayoutKind."); + } + + AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + } + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. 
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + if (ProtectedObjs.count(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // Make sure the special register scavenging spill slot is closest to the + // stack pointer. + if (RS && !EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + if (!TFI.targetHandlesStackFrameRounding()) { + // If we have reserved argument space for call sites in the function + // immediately on entry to the current function, count it as part of the + // overall stack size. + if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI.getStackAlignment(); + else + StackAlign = TFI.getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + Offset = RoundUpToAlignment(Offset, StackAlign, Skew); + } + + // Update frame info to pretend that this is part of the stack... + int64_t StackSize = Offset - LocalAreaOffset; + MFI->setStackSize(StackSize); + NumBytesStackSpace += StackSize; +} + +/// insertPrologEpilogCode - Scan the function for modified callee saved +/// registers, insert spill code for these callee saved registers, then add +/// prolog and epilog code to the function. +/// +void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + + // Add prologue to the function... + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.emitPrologue(Fn, *SaveBlock); + + // Add epilogue to restore the callee-save registers in each exiting block. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + TFI.emitEpilogue(Fn, *RestoreBlock); + + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.inlineStackProbe(Fn, *SaveBlock); + + // Emit additional code that is required to support segmented stacks, if + // we've been asked for it. This, when linked with a runtime with support + // for segmented stacks (libgcc is one), will result in allocating stack + // space in small chunks instead of one large contiguous block. + if (Fn.shouldSplitStack()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + } + + // Emit additional code that is required to explicitly handle the stack in + // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. 
The + // approach is rather similar to that of Segmented Stacks, but it uses a + // different conditional check and another BIF for allocating more stack + // space. + if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForHiPEPrologue(Fn, *SaveBlock); +} + +/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical +/// register references and actual offsets. +/// +void WasmPEI::replaceFrameIndices(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + if (!TFI.needsFrameIndexResolution(Fn)) return; + + // Store SPAdj at exit of a basic block. + SmallVector<int, 8> SPState; + SPState.resize(Fn.getNumBlockIDs()); + SmallPtrSet<MachineBasicBlock*, 8> Reachable; + + // Iterate over the reachable blocks in DFS order. + for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); + DFI != DFE; ++DFI) { + int SPAdj = 0; + // Check the exit state of the DFS stack predecessor. + if (DFI.getPathLength() >= 2) { + MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2); + assert(Reachable.count(StackPred) && + "DFS stack predecessor is already visited.\n"); + SPAdj = SPState[StackPred->getNumber()]; + } + MachineBasicBlock *BB = *DFI; + replaceFrameIndices(BB, Fn, SPAdj); + SPState[BB->getNumber()] = SPAdj; + } + + // Handle the unreachable blocks. + for (auto &BB : Fn) { + if (Reachable.count(&BB)) + // Already handled in DFS traversal. + continue; + int SPAdj = 0; + replaceFrameIndices(&BB, Fn, SPAdj); + } +} + +void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj) { + assert(Fn.getSubtarget().getRegisterInfo() && + "getRegisterInfo() must be implemented!"); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + + if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); + + bool InsideCallSequence = false; + + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { + + if (I->getOpcode() == FrameSetupOpcode || + I->getOpcode() == FrameDestroyOpcode) { + InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + SPAdj += TII.getSPAdjust(I); + + MachineBasicBlock::iterator PrevI = BB->end(); + if (I != BB->begin()) PrevI = std::prev(I); + TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); + + // Visit the instructions created by eliminateCallFramePseudoInstr(). + if (PrevI == BB->end()) + I = BB->begin(); // The replaced instr was the first in the block. + else + I = std::next(PrevI); + continue; + } + + MachineInstr *MI = I; + bool DoIncr = true; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (!MI->getOperand(i).isFI()) + continue; + + // Frame indices in debug values are encoded in a target independent + // way with simply the frame index and offset rather than any + // target-specific addressing mode. 
+      if (MI->isDebugValue()) {
+        assert(i == 0 && "Frame indices can only appear as the first "
+                         "operand of a DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(1);
+        Offset.setImm(Offset.getImm() +
+                      TFI->getFrameIndexReference(
+                          Fn, MI->getOperand(0).getIndex(), Reg));
+        MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // TODO: This code should be commoned with the code for
+      // PATCHPOINT. There's no good reason for the difference in
+      // implementation other than historical accident. The only
+      // remaining difference is the unconditional use of the stack
+      // pointer as the base register.
+      if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+        assert((!MI->isDebugValue() || i == 0) &&
+               "Frame indices can only appear as the first operand of a "
+               "DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(i + 1);
+        const unsigned refOffset =
+            TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
+                                              Reg);
+
+        Offset.setImm(Offset.getImm() + refOffset);
+        MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // Some instructions (e.g. inline asm instructions) can have
+      // multiple frame indices and/or cause eliminateFrameIndex
+      // to insert more than one instruction. We need the register
+      // scavenger to go through all of these instructions so that
+      // it can update its register information. We keep the
+      // iterator at the point before insertion so that we can
+      // revisit them in full.
+      bool AtBeginning = (I == BB->begin());
+      if (!AtBeginning) --I;
+
+      // If this instruction has a FrameIndex operand, we need to
+      // use that target machine register info object to eliminate
+      // it.
+      TRI.eliminateFrameIndex(MI, SPAdj, i,
+                              FrameIndexVirtualScavenging ? nullptr : RS);
+
+      // Reset the iterator if we were at the beginning of the BB.
+      if (AtBeginning) {
+        I = BB->begin();
+        DoIncr = false;
+      }
+
+      MI = nullptr;
+      break;
+    }
+
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
+    if (DoIncr && I != BB->end()) ++I;
+
+    // Update register states.
+    if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+  }
+}
+
+/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
+///
+/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
+/// iterate over the vreg use list, which at this point only contains machine
+/// operands for which eliminateFrameIndex needs a new scratch reg.
+void
+WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+  // Run through the instructions and find any virtual registers.
+  for (MachineFunction::iterator BB = Fn.begin(),
+       E = Fn.end(); BB != E; ++BB) {
+    RS->enterBasicBlock(&*BB);
+
+    int SPAdj = 0;
+
+    // The instruction stream may change in the loop, so check BB->end()
+    // directly.
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+      // We might end up here again with a NULL iterator if we scavenged a
+      // register for which we inserted spill code for definition by what was
+      // originally the first instruction in BB.
+      if (I == MachineBasicBlock::iterator(nullptr))
+        I = BB->begin();
+
+      MachineInstr *MI = I;
+      MachineBasicBlock::iterator J = std::next(I);
+      MachineBasicBlock::iterator P =
+          I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
+                           : std::prev(I);
+
+      // RS should process this instruction before we might scavenge at this
+      // location. This is because we might be replacing a virtual register
+      // defined by this instruction, and if so, registers killed by this
+      // instruction are available, and defined registers are not.
+      RS->forward(I);
+
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (MI->getOperand(i).isReg()) {
+          MachineOperand &MO = MI->getOperand(i);
+          unsigned Reg = MO.getReg();
+          if (Reg == 0)
+            continue;
+          if (!TargetRegisterInfo::isVirtualRegister(Reg))
+            continue;
+
+          // When we first encounter a new virtual register, it
+          // must be a definition.
+          assert(MI->getOperand(i).isDef() &&
+                 "frame index virtual missing def!");
+          // Scavenge a new scratch register
+          const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+          unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+          ++NumScavengedRegs;
+
+          // Replace this reference to the virtual register with the
+          // scratch register.
+          assert(ScratchReg && "Missing scratch register!");
+          Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
+
+          // Because this instruction was processed by the RS before this
+          // register was allocated, make sure that the RS now records the
+          // register as being used.
+          RS->setRegUsed(ScratchReg);
+        }
+      }
+
+      // If the scavenger needed to use one of its spill slots, the
+      // spill code will have been inserted in between I and J. This is a
+      // problem because we need the spill code before I: Move I to just
+      // prior to J.
+      if (I != std::prev(J)) {
+        BB->splice(J, &*BB, I);
+
+        // Before we move I, we need to prepare the RS to visit I again.
+        // Specifically, RS will assert if it sees uses of registers that
+        // it believes are undefined. Because we have already processed
+        // register kills in I, when it visits I again, it will believe that
+        // those registers are undefined. To avoid this situation, unprocess
+        // the instruction I.
+        assert(RS->getCurrentPosition() == I &&
+               "The register scavenger has an unexpected position");
+        I = P;
+        RS->unprocess(P);
+      } else
+        ++I;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
new file mode 100644
index 0000000..4ad6eed
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -0,0 +1,86 @@
+//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Late peephole optimizations for WebAssembly.
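+///
+/// For example, when a store's result register would be the same virtual
+/// register as its value operand, the result is rewritten to a fresh dead
+/// register so it can be discarded (see the STORE cases below).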
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-peephole" + +namespace { +class WebAssemblyPeephole final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly late peephole optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; + WebAssemblyPeephole() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyPeephole::ID = 0; +FunctionPass *llvm::createWebAssemblyPeephole() { + return new WebAssemblyPeephole(); +} + +bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + for (auto &MBB : MF) + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: { + // Store instructions return their value operand. If we ended up using + // the same register for both, replace it with a dead def so that it + // can use $discard instead. + MachineOperand &MO = MI.getOperand(0); + unsigned OldReg = MO.getReg(); + // TODO: Handle SP/physregs + if (OldReg == MI.getOperand(3).getReg() + && TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) { + Changed = true; + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + MO.setReg(NewReg); + MO.setIsDead(); + MFI.stackifyVReg(NewReg); + MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp new file mode 100644 index 0000000..9ec6659 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -0,0 +1,175 @@ +//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a virtual register coloring pass. +/// +/// WebAssembly doesn't have a fixed number of registers, but it is still +/// desirable to minimize the total number of registers used in each function. +/// +/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp. 
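// Aside: a toy model of the rewrite in the STORE cases above: when the
// result operand names the same virtual register as the value operand, give
// the result a fresh register and mark it dead so it can become $discard.
// ToyStore and createVReg are illustrative stand-ins, not LLVM API.
#include <cassert>

struct ToyStore {
  unsigned ResultReg;          // operand 0: the value the store returns
  unsigned ValueReg;           // operand 3: the value being stored
  bool ResultIsDead = false;
};

unsigned NextVReg = 100;
unsigned createVReg() { return NextVReg++; } // stands in for MRI.createVirtualRegister

bool rewriteStoreResult(ToyStore &S) {
  if (S.ResultReg != S.ValueReg)
    return false;             // distinct registers; nothing to do
  S.ResultReg = createVReg(); // fresh register...
  S.ResultIsDead = true;      // ...that nothing reads, so it can be discarded
  return true;
}

int main() {
  ToyStore S{7, 7};
  assert(rewriteStoreResult(S) && S.ResultIsDead && S.ResultReg != 7);
}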
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-coloring" + +namespace { +class WebAssemblyRegColoring final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegColoring() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Register Coloring"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<LiveIntervals>(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyRegColoring::ID = 0; +FunctionPass *llvm::createWebAssemblyRegColoring() { + return new WebAssemblyRegColoring(); +} + +// Compute the total spill weight for VReg. +static float computeWeight(const MachineRegisterInfo *MRI, + const MachineBlockFrequencyInfo *MBFI, + unsigned VReg) { + float weight = 0.0f; + for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg)) + weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI, + MO.getParent()); + return weight; +} + +bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Register Coloring **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual + // registers could be modified before the longjmp is executed, resulting in + // the wrong value being used afterwards. (See <rdar://problem/8007500>.) + // TODO: Does WebAssembly need to care about setjmp for register coloring? + if (MF.exposesReturnsTwice()) + return false; + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + LiveIntervals *Liveness = &getAnalysis<LiveIntervals>(); + const MachineBlockFrequencyInfo *MBFI = + &getAnalysis<MachineBlockFrequencyInfo>(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + // Gather all register intervals into a list and sort them. + unsigned NumVRegs = MRI->getNumVirtRegs(); + SmallVector<LiveInterval *, 0> SortedIntervals; + SortedIntervals.reserve(NumVRegs); + + DEBUG(dbgs() << "Interesting register intervals:\n"); + for (unsigned i = 0; i < NumVRegs; ++i) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(i); + if (MFI.isVRegStackified(VReg)) + continue; + // Skip unused registers, which can use $discard. + if (MRI->use_empty(VReg)) + continue; + + LiveInterval *LI = &Liveness->getInterval(VReg); + assert(LI->weight == 0.0f); + LI->weight = computeWeight(MRI, MBFI, VReg); + DEBUG(LI->dump()); + SortedIntervals.push_back(LI); + } + DEBUG(dbgs() << '\n'); + + // Sort them to put arguments first (since we don't want to rename live-in + // registers), by weight next, and then by position. + // TODO: Investigate more intelligent sorting heuristics. For starters, we + // should try to coalesce adjacent live intervals before non-adjacent ones. 
+ std::sort(SortedIntervals.begin(), SortedIntervals.end(), + [MRI](LiveInterval *LHS, LiveInterval *RHS) { + if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg)) + return MRI->isLiveIn(LHS->reg); + if (LHS->weight != RHS->weight) + return LHS->weight > RHS->weight; + if (LHS->empty() || RHS->empty()) + return !LHS->empty() && RHS->empty(); + return *LHS < *RHS; + }); + + DEBUG(dbgs() << "Coloring register intervals:\n"); + SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u); + SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments( + SortedIntervals.size()); + BitVector UsedColors(SortedIntervals.size()); + bool Changed = false; + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + LiveInterval *LI = SortedIntervals[i]; + unsigned Old = LI->reg; + size_t Color = i; + const TargetRegisterClass *RC = MRI->getRegClass(Old); + + // Check if it's possible to reuse any of the used colors. + if (!MRI->isLiveIn(Old)) + for (int C(UsedColors.find_first()); C != -1; + C = UsedColors.find_next(C)) { + if (MRI->getRegClass(SortedIntervals[C]->reg) != RC) + continue; + for (LiveInterval *OtherLI : Assignments[C]) + if (!OtherLI->empty() && OtherLI->overlaps(*LI)) + goto continue_outer; + Color = C; + break; + continue_outer:; + } + + unsigned New = SortedIntervals[Color]->reg; + SlotMapping[i] = New; + Changed |= Old != New; + UsedColors.set(Color); + Assignments[Color].push_back(LI); + DEBUG(dbgs() << "Assigning vreg" + << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg" + << TargetRegisterInfo::virtReg2Index(New) << "\n"); + } + if (!Changed) + return false; + + // Rewrite register operands. + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + unsigned Old = SortedIntervals[i]->reg; + unsigned New = SlotMapping[i]; + if (Old != New) + MRI->replaceRegWith(Old, New); + } + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp new file mode 100644 index 0000000..f621db0 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -0,0 +1,109 @@ +//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a pass which assigns WebAssembly register +/// numbers for CodeGen virtual registers. 
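// Aside: the coloring loop above is a greedy scan over sorted intervals:
// each one takes the first compatible existing color (no overlap with any
// member) or opens a new color. A standalone model over half-open ranges
// (toy Interval type, not LiveInterval):
#include <cassert>
#include <cstddef>
#include <vector>

struct Interval { unsigned Start, End; }; // half-open [Start, End)

bool overlaps(const Interval &A, const Interval &B) {
  return A.Start < B.End && B.Start < A.End;
}

// For each interval, returns the index of the interval whose color it
// shares (itself if it opened a new color), mirroring SlotMapping above.
std::vector<std::size_t> greedyColor(const std::vector<Interval> &Sorted) {
  std::vector<std::size_t> ColorOf(Sorted.size());
  std::vector<std::vector<std::size_t>> Assignments; // members per color
  std::vector<std::size_t> ColorRep;                 // first interval per color
  for (std::size_t i = 0; i != Sorted.size(); ++i) {
    std::size_t Color = Assignments.size();
    for (std::size_t C = 0; C != Assignments.size(); ++C) {
      bool Clash = false;
      for (std::size_t Member : Assignments[C])
        if (overlaps(Sorted[i], Sorted[Member])) { Clash = true; break; }
      if (!Clash) { Color = C; break; }
    }
    if (Color == Assignments.size()) { // no reusable color; open a new one
      Assignments.emplace_back();
      ColorRep.push_back(i);
    }
    Assignments[Color].push_back(i);
    ColorOf[i] = ColorRep[Color];
  }
  return ColorOf;
}

int main() {
  // {10,20} doesn't overlap {0,10} (half-open), so it reuses color 0.
  std::vector<Interval> S = {{0, 10}, {5, 15}, {10, 20}};
  std::vector<std::size_t> C = greedyColor(S);
  assert(C[0] == 0 && C[1] == 1 && C[2] == 0);
}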
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-numbering" + +namespace { +class WebAssemblyRegNumbering final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Numbering"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegNumbering() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegNumbering::ID = 0; +FunctionPass *llvm::createWebAssemblyRegNumbering() { + return new WebAssemblyRegNumbering(); +} + +bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Register Numbering **********\n" + "********** Function: " + << MF.getName() << '\n'); + + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo &FrameInfo = *MF.getFrameInfo(); + + MFI.initWARegs(); + + // WebAssembly argument registers are in the same index space as local + // variables. Assign the numbers for them first. + MachineBasicBlock &EntryMBB = MF.front(); + for (MachineInstr &MI : EntryMBB) { + switch (MI.getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); + break; + default: + break; + } + } + + // Then assign regular WebAssembly registers for all remaining used + // virtual registers. TODO: Consider sorting the registers by frequency of + // use, to maximize usage of small immediate fields. + unsigned NumArgRegs = MFI.getParams().size(); + unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs(); + unsigned NumStackRegs = 0; + unsigned CurReg = 0; + for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx); + // Handle stackified registers. + if (MFI.isVRegStackified(VReg)) { + MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++); + continue; + } + // Skip unused registers. 
+ if (MRI.use_empty(VReg)) + continue; + if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) + MFI.setWAReg(VReg, NumArgRegs + CurReg++); + } + // Allocate locals for used physical registers + if (FrameInfo.getStackSize() > 0) + MFI.addPReg(WebAssembly::SP32, CurReg++); + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp new file mode 100644 index 0000000..537c147 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -0,0 +1,267 @@ +//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a register stacking pass. +/// +/// This pass reorders instructions to put register uses and defs in an order +/// such that they form single-use expression trees. Registers fitting this form +/// are then marked as "stackified", meaning references to them are replaced by +/// "push" and "pop" from the stack. +/// +/// This is primarily a code size optimization, since temporary values on the +/// expression don't need to be named. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_* +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-stackify" + +namespace { +class WebAssemblyRegStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<LiveIntervals>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreservedID(MachineDominatorsID); + AU.addPreservedID(LiveVariablesID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyRegStackify() { + return new WebAssemblyRegStackify(); +} + +// Decorate the given instruction with implicit operands that enforce the +// expression stack ordering constraints for an instruction which is on +// the expression stack. +static void ImposeStackOrdering(MachineInstr *MI) { + // Write the opaque EXPR_STACK register. + if (!MI->definesRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/true, + /*isImp=*/true)); + + // Also read the opaque EXPR_STACK register. 
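// Aside: stepping back to the numbering loop in WebAssemblyRegNumbering
// above, a standalone model of its scheme: argument registers keep their
// argument index, stackified registers get ids tagged with the INT32_MIN
// bit, and the rest are numbered densely after the arguments. VRegInfo and
// numberRegs are toy stand-ins, not LLVM types.
#include <cstddef>
#include <cstdint>
#include <vector>

struct VRegInfo {
  bool IsStackified = false;
  bool IsUsed = true;
  int ArgIndex = -1; // >= 0 if this vreg holds an incoming argument
};

std::vector<uint32_t> numberRegs(const std::vector<VRegInfo> &VRegs,
                                 uint32_t NumArgRegs) {
  std::vector<uint32_t> WAReg(VRegs.size(), UINT32_MAX); // unassigned
  uint32_t NumStackRegs = 0, CurReg = 0;
  for (std::size_t i = 0; i != VRegs.size(); ++i) {
    if (VRegs[i].ArgIndex >= 0)
      WAReg[i] = uint32_t(VRegs[i].ArgIndex);            // args share the local index space
    else if (VRegs[i].IsStackified)
      WAReg[i] = uint32_t(INT32_MIN) | NumStackRegs++;   // tagged, not a real local
    else if (VRegs[i].IsUsed)
      WAReg[i] = NumArgRegs + CurReg++;                  // dense numbering after the args
  }
  return WAReg;
}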
+  if (!MI->readsRegister(WebAssembly::EXPR_STACK))
+    MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK,
+                                             /*isDef=*/false,
+                                             /*isImp=*/true));
+}
+
+// Test whether it's safe to move Def to just before Insert.
+// TODO: Compute memory dependencies in a way that doesn't require always
+// walking the block.
+// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
+// more precise.
+static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+                         AliasAnalysis &AA, LiveIntervals &LIS,
+                         MachineRegisterInfo &MRI) {
+  assert(Def->getParent() == Insert->getParent());
+  bool SawStore = false, SawSideEffects = false;
+  MachineBasicBlock::const_iterator D(Def), I(Insert);
+
+  // Check for register dependencies.
+  for (const MachineOperand &MO : Def->operands()) {
+    if (!MO.isReg() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+
+    // If the register is dead here and at Insert, ignore it.
+    if (MO.isDead() && Insert->definesRegister(Reg) &&
+        !Insert->readsRegister(Reg))
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // If the physical register is never modified, ignore it.
+      if (!MRI.isPhysRegModified(Reg))
+        continue;
+      // Otherwise, it's a physical register with unknown liveness.
+      return false;
+    }
+
+    // Ask LiveIntervals whether moving this virtual register use or def to
+    // Insert will change which value numbers are seen.
+    const LiveInterval &LI = LIS.getInterval(Reg);
+    VNInfo *DefVNI = MO.isDef() ?
+        LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) :
+        LI.getVNInfoBefore(LIS.getInstructionIndex(Def));
+    assert(DefVNI && "Instruction input missing value number");
+    VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert));
+    if (InsVNI && DefVNI != InsVNI)
+      return false;
+  }
+
+  // Check for memory dependencies and side effects.
+  for (--I; I != D; --I)
+    SawSideEffects |= I->isSafeToMove(&AA, SawStore);
+  return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) &&
+         !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
+}
+
+bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Register Stackifying **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // Walk the instructions from the bottom up. Currently we don't look past
+  // block boundaries, and the blocks aren't ordered so the block visitation
+  // order isn't significant, but we may want to change this in the future.
+  for (MachineBasicBlock &MBB : MF) {
+    // Don't use a range-based for loop, because we modify the list as we're
+    // iterating over it and the end iterator may change.
+    for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
+      MachineInstr *Insert = &*MII;
+      // Don't nest anything inside a phi.
+      if (Insert->getOpcode() == TargetOpcode::PHI)
+        break;
+
+      // Don't nest anything inside an inline asm, because we don't have
+      // constraints for $push inputs.
+      if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+        break;
+
+      // Iterate through the inputs in reverse order, since we'll be pulling
+      // operands off the stack in LIFO order.
+      bool AnyStackified = false;
+      for (MachineOperand &Op : reverse(Insert->uses())) {
+        // We're only interested in explicit virtual register operands.
+        if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+          continue;
+
+        unsigned Reg = Op.getReg();
+
+        // Only consider registers with a single definition.
+        // TODO: Eventually we may relax this, to stackify phi transfers.
+        MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+        if (!Def)
+          continue;
+
+        // There's no use in nesting implicit defs inside anything.
+        if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+          continue;
+
+        // Don't nest an INLINE_ASM def into anything, because we don't have
+        // constraints for $pop outputs.
+        if (Def->getOpcode() == TargetOpcode::INLINEASM)
+          continue;
+
+        // Don't nest PHIs inside of anything.
+        if (Def->getOpcode() == TargetOpcode::PHI)
+          continue;
+
+        // Argument instructions represent live-in registers and not real
+        // instructions.
+        if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_I64 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_F32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_F64)
+          continue;
+
+        // Single-use expression trees require defs that have one use.
+        // TODO: Eventually we'll relax this, to take advantage of set_local
+        // returning its result.
+        if (!MRI.hasOneUse(Reg))
+          continue;
+
+        // For now, be conservative and don't look across block boundaries.
+        // TODO: Be more aggressive?
+        if (Def->getParent() != &MBB)
+          continue;
+
+        // Don't move instructions that have side effects or memory dependencies
+        // or other complications.
+        if (!IsSafeToMove(Def, Insert, AA, LIS, MRI))
+          continue;
+
+        Changed = true;
+        AnyStackified = true;
+        // Move the def down and nest it in the current instruction.
+        MBB.splice(Insert, &MBB, Def);
+        LIS.handleMove(Def);
+        MFI.stackifyVReg(Reg);
+        ImposeStackOrdering(Def);
+        Insert = Def;
+      }
+      if (AnyStackified)
+        ImposeStackOrdering(&*MII);
+    }
+  }
+
+  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
+  // so that it never looks like a use-before-def.
+  if (Changed) {
+    MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
+    for (MachineBasicBlock &MBB : MF)
+      MBB.addLiveIn(WebAssembly::EXPR_STACK);
+  }
+
+#ifndef NDEBUG
+  // Verify that pushes and pops are performed in LIFO order.
+  SmallVector<unsigned, 0> Stack;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+        if (!MO.isReg())
+          continue;
+        unsigned VReg = MO.getReg();
+
+        // Don't stackify physregs like SP or FP.
+        if (!TargetRegisterInfo::isVirtualRegister(VReg))
+          continue;
+
+        if (MFI.isVRegStackified(VReg)) {
+          if (MO.isDef())
+            Stack.push_back(VReg);
+          else
+            assert(Stack.pop_back_val() == VReg);
+        }
+      }
+    }
+    // TODO: Generalize this code to support keeping values on the stack across
+    // basic block boundaries.
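// Aside: the #ifndef NDEBUG check above is a classic stack-discipline
// verification: defs of stackified registers push, uses pop, and each pop
// must match the most recent push. A standalone sketch with toy events:
#include <cassert>
#include <vector>

struct StackEvent { bool IsDef; unsigned VReg; };

// Returns true iff the events respect LIFO order and leave the stack empty,
// mirroring the assertions in the block above.
bool respectsLIFOOrder(const std::vector<StackEvent> &Events) {
  std::vector<unsigned> Stack;
  for (const StackEvent &E : Events) {
    if (E.IsDef) {
      Stack.push_back(E.VReg);
    } else {
      if (Stack.empty() || Stack.back() != E.VReg)
        return false;
      Stack.pop_back();
    }
  }
  return Stack.empty();
}

int main() {
  // push a, push b, pop b, pop a -- valid LIFO nesting.
  assert(respectsLIFOOrder({{true, 1}, {true, 2}, {false, 2}, {false, 1}}));
  // push a, push b, pop a -- violates LIFO.
  assert(!respectsLIFOOrder({{true, 1}, {true, 2}, {false, 1}}));
}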
+ assert(Stack.empty()); + } +#endif + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 385c40b..90d8dda 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -43,7 +43,7 @@ WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { } BitVector -WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { +WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const { BitVector Reserved(getNumRegs()); for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32, WebAssembly::FP64}) @@ -52,9 +52,43 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } void WebAssemblyRegisterInfo::eliminateFrameIndex( - MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME + MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger * /*RS*/) const { + assert(SPAdj == 0); + MachineInstr &MI = *II; + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + const MachineFrameInfo& MFI = *MF.getFrameInfo(); + int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); + + if (MI.mayLoadOrStore()) { + // If this is a load or store, make it relative to SP and fold the frame + // offset directly in. + assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0); + int64_t Offset = MI.getOperand(1).getImm() + FrameOffset; + + if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) { + // If this happens the program is invalid, but better to error here than + // generate broken code. + report_fatal_error("Memory offset field overflow"); + } + MI.getOperand(1).setImm(Offset); + MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false); + } else { + // Otherwise create an i32.add SP, offset and make it the operand. + auto &MRI = MF.getRegInfo(); + const auto *TII = MF.getSubtarget().getInstrInfo(); + + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(FrameOffset); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false); + } } unsigned @@ -67,21 +101,11 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; } -bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const { - return !MF.getFunction()->hasFnAttribute("no-realign-stack"); -} - -// FIXME: share this with other backends with identical implementation? 
-bool WebAssemblyRegisterInfo::needsStackRealignment( - const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); +const TargetRegisterClass * +WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); + if (MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) + return &WebAssembly::I64RegClass; + return &WebAssembly::I32RegClass; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index dbdb9d0..ad1d71e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -42,9 +42,9 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 2ba42eb..80a83fa 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -33,22 +33,26 @@ def FP64 : WebAssemblyReg<"%FP64">; def SP32 : WebAssemblyReg<"%SP32">; def SP64 : WebAssemblyReg<"%SP64">; -// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do -// away with it? Try deleting once the backend works. -// WebAssembly uses virtual registers, but the backend defines a few physical -// registers here to keep SDAG and the MachineInstr layers happy. -foreach i = 0-4 in { - def I#i : WebAssemblyReg<"%i."#i>; // i32 - def L#i : WebAssemblyReg<"%l."#i>; // i64 - def F#i : WebAssemblyReg<"%f."#i>; // f32 - def D#i : WebAssemblyReg<"%d."#i>; // f64 -} +// The register allocation framework requires register classes have at least +// one register, so we define a few for the floating point register classes +// since we otherwise don't need a physical register in those classes. +def F32_0 : WebAssemblyReg<"%f32.0">; +def F64_0 : WebAssemblyReg<"%f64.0">; + +// The expression stack "register". This is an opaque entity which serves to +// order uses and defs that must remain in LIFO order. +def EXPR_STACK : WebAssemblyReg<"STACK">; + +// The incoming arguments "register". This is an opaque entity which serves to +// order the ARGUMENT instructions that are emulating live-in registers and +// must not be scheduled below other instructions. 
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">; //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// -def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>; -def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>; -def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; -def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>; +def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>; +def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>; +def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; +def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp new file mode 100644 index 0000000..4e08b2b --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -0,0 +1,124 @@ +//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements an optimization pass using store result values. +/// +/// WebAssembly's store instructions return the stored value. This is to enable +/// an optimization wherein uses of the stored value can be replaced by uses of +/// the store's result value, making the stored value register more likely to +/// be single-use, thus more likely to be useful to register stackifying, and +/// potentially also exposing the store to register stackifying. These both can +/// reduce get_local/set_local traffic. 
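// Aside: a toy model of the rewrite this pass performs. In this
// single-block sketch, "dominates" degenerates to instruction order: any
// use of FromReg after the store may read the store's result ToReg instead.
// ToyUse and rewriteUses are illustrative stand-ins, not LLVM types.
#include <cassert>
#include <cstddef>
#include <vector>

struct ToyUse { std::size_t InstIndex; unsigned Reg; };

// StoreIndex: position of the store; ToReg: its result register.
unsigned rewriteUses(std::vector<ToyUse> &Uses, std::size_t StoreIndex,
                     unsigned FromReg, unsigned ToReg) {
  unsigned Changed = 0;
  for (ToyUse &U : Uses)
    if (U.Reg == FromReg && U.InstIndex > StoreIndex) { // "dominated" here
      U.Reg = ToReg;
      ++Changed;
    }
  return Changed;
}

int main() {
  std::vector<ToyUse> Uses = {{0, 5}, {3, 5}, {4, 9}};
  assert(rewriteUses(Uses, 1, 5, 8) == 1);
  assert(Uses[0].Reg == 5 && Uses[1].Reg == 8 && Uses[2].Reg == 9);
}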
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-store-results" + +namespace { +class WebAssemblyStoreResults final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyStoreResults() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Store Results"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyStoreResults::ID = 0; +FunctionPass *llvm::createWebAssemblyStoreResults() { + return new WebAssemblyStoreResults(); +} + +bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Store Results **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + bool Changed = false; + + assert(MRI.isSSA() && "StoreResults depends on SSA form"); + + for (auto &MBB : MF) { + DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: + unsigned ToReg = MI.getOperand(0).getReg(); + unsigned FromReg = MI.getOperand(3).getReg(); + for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { + MachineOperand &O = *I++; + MachineInstr *Where = O.getParent(); + if (Where->getOpcode() == TargetOpcode::PHI) { + // PHIs use their operands on their incoming CFG edges rather than + // in their parent blocks. Get the basic block paired with this use + // of FromReg and check that MI's block dominates it. + MachineBasicBlock *Pred = + Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + continue; + } else { + // For a non-PHI, check that MI dominates the instruction in the + // normal way. + if (&MI == Where || !MDT.dominates(&MI, Where)) + continue; + } + Changed = true; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where + << " from " << MI << "\n"); + O.setReg(ToReg); + // If the store's def was previously dead, it is no longer. But the + // dead flag shouldn't be set yet. 
+ assert(!MI.getOperand(0).isDead() && "Dead flag set on store result"); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 3d9e7aa..cb2d5a6 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } +bool WebAssemblySubtarget::useAA() const { return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 6f17619..f530a29 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -61,9 +61,15 @@ public: const WebAssemblyTargetLowering *getTargetLowering() const override { return &TLInfo; } + const WebAssemblyInstrInfo *getInstrInfo() const override { + return &InstrInfo; + } + const WebAssemblyRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override; - bool useAA() const override { return true; } + bool useAA() const override; // Predicates used by WebAssemblyInstrInfo.td. bool hasAddr64() const { return TargetTriple.isArch64Bit(); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 6f93248..b290b4b 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -45,11 +45,17 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT.isArch64Bit() - ? "e-p:64:64-i64:64-v128:8:128-n32:64-S128" - : "e-p:32:32-i64:64-v128:8:128-n32:64-S128", + : LLVMTargetMachine(T, + TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128" + : "e-m:e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<WebAssemblyTargetObjectFile>()) { + // WebAssembly type-checks expressions, but a noreturn function with a return + // type that doesn't match the context will cause a check failure. So we lower + // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's + // 'unreachable' expression which is meant for that case. + this->Options.TrapUnreachable = true; + initAsmInfo(); // We need a reducible CFG, so disable some optimizations which tend to @@ -77,7 +83,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
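// Aside: a best-effort decoding of the new datalayout strings above, per my
// reading of the LLVM DataLayout grammar (worth double-checking against
// docs/LangRef):
//   e                  little-endian
//   m:e                ELF-style symbol mangling (new in this patch)
//   p:32:32 / p:64:64  pointer size and ABI alignment, in bits
//   i64:64             i64 is 64-bit aligned
//   n32:64             native integer widths are 32 and 64 bits
//   S128               natural stack alignment is 128 bits
// The v128:8:128 vector entry from the old strings is gone.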
resetTargetOptions(F); - I = make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); + I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); } return I.get(); } @@ -94,23 +100,18 @@ public: } FunctionPass *createTargetRegisterAllocator(bool) override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addIRPasses() override; - bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; void addPreRegAlloc() override; - void addRegAllocPasses(bool Optimized); void addPostRegAlloc() override; - void addPreSched2() override; void addPreEmitPass() override; }; } // end anonymous namespace TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); }); } @@ -124,50 +125,86 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -void WebAssemblyPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(false); -} - -void WebAssemblyPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(true); -} - //===----------------------------------------------------------------------===// // The following functions are called from lib/CodeGen/Passes.cpp to modify // the CodeGen pass sequence. //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - // FIXME: the default for this option is currently POSIX, whereas - // WebAssembly's MVP should default to Single. if (TM->Options.ThreadModel == ThreadModel::Single) + // In "single" mode, atomics get lowered to non-atomics. addPass(createLowerAtomicPass()); else // Expand some atomic operations. WebAssemblyTargetLowering has hooks which // control specifically what gets lowered. addPass(createAtomicExpandPass(TM)); + // Optimize "returned" function attributes. + addPass(createWebAssemblyOptimizeReturned()); + TargetPassConfig::addIRPasses(); } -bool WebAssemblyPassConfig::addPreISel() { return false; } - bool WebAssemblyPassConfig::addInstSelector() { + (void)TargetPassConfig::addInstSelector(); addPass( createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel())); + // Run the argument-move pass immediately after the ScheduleDAG scheduler + // so that we can fix up the ARGUMENT instructions before anything else + // sees them in the wrong place. + addPass(createWebAssemblyArgumentMove()); return false; } -bool WebAssemblyPassConfig::addILPOpts() { return true; } +bool WebAssemblyPassConfig::addILPOpts() { + (void)TargetPassConfig::addILPOpts(); + return true; +} + +void WebAssemblyPassConfig::addPreRegAlloc() { + TargetPassConfig::addPreRegAlloc(); -void WebAssemblyPassConfig::addPreRegAlloc() {} + // Prepare store instructions for register stackifying. + addPass(createWebAssemblyStoreResults()); +} -void WebAssemblyPassConfig::addRegAllocPasses(bool Optimized) {} +void WebAssemblyPassConfig::addPostRegAlloc() { + // TODO: The following CodeGen passes don't currently support code containing + // virtual registers. Consider removing their restrictions and re-enabling + // them. 
+ // + // We use our own PrologEpilogInserter which is very slightly modified to + // tolerate virtual registers. + disablePass(&PrologEpilogCodeInserterID); + // Fails with: should be run after register allocation. + disablePass(&MachineCopyPropagationID); + + // Mark registers as representing wasm's expression stack. + addPass(createWebAssemblyRegStackify()); + + // Run the register coloring pass to reduce the total number of registers. + addPass(createWebAssemblyRegColoring()); + + TargetPassConfig::addPostRegAlloc(); + + // Run WebAssembly's version of the PrologEpilogInserter. Target-independent + // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run + // PEI before ShrinkWrap but otherwise in the same position in the order. + addPass(createWebAssemblyPEI()); +} -void WebAssemblyPassConfig::addPostRegAlloc() {} +void WebAssemblyPassConfig::addPreEmitPass() { + TargetPassConfig::addPreEmitPass(); -void WebAssemblyPassConfig::addPreSched2() {} + // Put the CFG in structured form; insert BLOCK and LOOP markers. + addPass(createWebAssemblyCFGStackify()); -void WebAssemblyPassConfig::addPreEmitPass() {} + // Lower br_unless into br_if. + addPass(createWebAssemblyLowerBrUnless()); + + // Create a mapping from LLVM CodeGen virtual registers to wasm registers. + addPass(createWebAssemblyRegNumbering()); + + // Perform the very last peephole optimizations on the code. + addPass(createWebAssemblyPeephole()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp new file mode 100644 index 0000000..74e33b9 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp @@ -0,0 +1,24 @@ +//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the functions of the WebAssembly-specific subclass +/// of TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetObjectFile.h" +#include "WebAssemblyTargetMachine.h" +using namespace llvm; + +void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h index ee78b94..39e50c9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h @@ -16,50 +16,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" namespace llvm { -class GlobalVariable; - -class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFile { +class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF { public: - WebAssemblyTargetObjectFile() { - TextSection = nullptr; - DataSection = nullptr; - BSSSection = nullptr; - ReadOnlySection = nullptr; - - StaticCtorSection = nullptr; - StaticDtorSection = nullptr; - LSDASection = nullptr; - EHFrameSection = nullptr; - DwarfAbbrevSection = nullptr; - DwarfInfoSection = nullptr; - DwarfLineSection = nullptr; - DwarfFrameSection = nullptr; - DwarfPubTypesSection = nullptr; - DwarfDebugInlineSection = nullptr; - DwarfStrSection = nullptr; - DwarfLocSection = nullptr; - DwarfARangesSection = nullptr; - DwarfRangesSection = nullptr; - } - - MCSection *getSectionForConstant(SectionKind Kind, - const Constant *C) const override { - return ReadOnlySection; - } - - MCSection *getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override { - return DataSection; - } - - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override; + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index fa88ed5..3566317 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -21,8 +21,7 @@ using namespace llvm; #define DEBUG_TYPE "wasmtti" TargetTransformInfo::PopcntSupportKind -WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) { +WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - // TODO: Make Math.popcount32 happen in WebAssembly. 
- return TTI::PSK_Software; + return TargetTransformInfo::PSK_FastHardware; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 7ffb604..26dc388 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -38,7 +38,7 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { const WebAssemblyTargetLowering *getTLI() const { return TLI; } public: - WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F) + WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -54,7 +54,7 @@ public: // TODO: Implement more Scalar TTI for WebAssembly - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; /// @} diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt new file mode 100644 index 0000000..91b3fff --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -0,0 +1,309 @@ +# Tests which are known to fail from the GCC torture test suite. + +# Core dump. +920908-1.c +pr38151.c +va-arg-22.c + +# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. +struct-ret-1.c +va-arg-11.c +va-arg-21.c +va-arg-24.c +va-arg-trap-1.c + +# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed. +20000815-1.c +20010129-1.c +930628-1.c +980707-1.c + +# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed. +20030914-2.c +20040703-1.c +20081117-1.c +920625-1.c +931004-11.c +931004-13.c +980223.c +bitfld-5.c +complex-7.c +pr38969.c +pr51323.c +pr52129.c +pr57130.c + +# These were previously "Cannot select FrameIndex." Now most of them fail +# because they contain call frame pseudos (e.g. call a vararg func), +# frame pointers, or similar. This list will be updated again soon. 
+20000519-1.c +20000706-4.c +20000706-5.c +20000801-2.c +20000801-4.c +20011126-2.c + +20020529-1.c +20021024-1.c + +20030828-1.c +20030914-1.c + +20040302-1.c +20040625-1.c +20040823-1.c + +20041113-1.c + +20041214-1.c + +20050826-2.c + +20071213-1.c + +20080506-2.c +20080519-1.c + +20081103-1.c +20090113-1.c +20090113-2.c +20090113-3.c + +20090623-1.c + +920501-6.c +920501-8.c +920726-1.c +930518-1.c + +931004-10.c +931004-12.c +931004-14.c +931004-2.c +931004-4.c +931004-6.c +931004-8.c + +980205.c +980608-1.c +980709-1.c +980716-1.c +990127-1.c + +991216-2.c + +#cbrt.c +complex-5.c +complex-6.c + +enum-3.c +fprintf-chk-1.c +frame-address.c +loop-15.c +loop-ivopts-2.c +mayalias-3.c + +multi-ix.c + +pr20466-1.c + + +pr28778.c +pr28982b.c + +pr30778.c +pr31448-2.c +pr31448.c + +pr33870-1.c +pr33870.c + +pr38051.c + +pr39100.c + +pr39339.c + +pr43987.c + +pr44575.c + +pr44942.c +pr46309.c +pr47538.c +pr47925.c + +pr49390.c +pr49419.c + +#pr51877.c + +#pr52979-1.c +#pr52979-2.c +pr53645-2.c +pr53645.c + +pr56205.c + +pr56866.c + +pr57876.c +pr58277-1.c + +pr59643.c + +printf-chk-1.c +pta-field-1.c +pta-field-2.c + +stdarg-1.c +stdarg-2.c +stdarg-3.c +stdarg-4.c +strct-stdarg-1.c +strct-varg-1.c + +va-arg-1.c +va-arg-10.c +va-arg-12.c +va-arg-13.c +va-arg-14.c +va-arg-15.c +va-arg-16.c +va-arg-17.c +va-arg-18.c +va-arg-19.c +va-arg-2.c +va-arg-20.c +va-arg-23.c +va-arg-26.c +va-arg-4.c +va-arg-5.c +va-arg-6.c +va-arg-7.c +va-arg-8.c +va-arg-9.c +va-arg-pack-1.c +vfprintf-1.c +vfprintf-chk-1.c +vprintf-1.c +vprintf-chk-1.c + +# Cannot select callseq_end. +20040811-1.c +pr43220.c +vla-dealloc-1.c + +# Cannot select brind. +20071210-1.c +920501-4.c +920501-5.c + +# Cannot select BlockAddress. +comp-goto-1.c +980526-1.c +990208-1.c + +# WebAssembly hasn't implemented byval arguments. +20000412-3.c +20000419-1.c +20000706-1.c +20000706-2.c +20000707-1.c +20000717-1.c +20000717-5.c +20000808-1.c +20010605-2.c +20011113-1.c +20020215-1.c +20020810-1.c +20021118-1.c +20040707-1.c +20040709-1.c +20040709-2.c +20041201-1.c +20050713-1.c +20070614-1.c +920908-2.c +921112-1.c +921117-1.c +921123-2.c +921204-1.c +930126-1.c +930208-1.c +931004-5.c +931004-9.c +931031-1.c +950607-2.c +960416-1.c +990525-1.c +991118-1.c +bf64-1.c +complex-1.c +complex-2.c +pr15262-2.c +pr20621-1.c +pr23135.c +pr30185.c +pr42248.c + +# unimplemented operation lowering. +20010122-1.c +20030323-1.c +20030811-1.c +pr17377.c + +# Error: invalid output constraint '=t' in asm. +990413-2.c +990826-0.c + +# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target. +built-in-setjmp.c +pr60003.c + +# Error in the program / unsupported by Clang. 
+scal-to-vec1.c +scal-to-vec2.c +scal-to-vec3.c +20000822-1.c +20010209-1.c +20010605-1.c +20030501-1.c +20040520-1.c +20061220-1.c +20090219-1.c +920415-1.c +920428-2.c +920501-7.c +920612-2.c +920721-4.c +921017-1.c +921215-1.c +931002-1.c +comp-goto-2.c +nest-align-1.c +nest-stdar-1.c +nestfunc-1.c +nestfunc-2.c +nestfunc-3.c +nestfunc-5.c +nestfunc-6.c +nestfunc-7.c +pr22061-3.c +pr22061-4.c +pr24135.c +pr51447.c +20020412-1.c +20040308-1.c +20040423-1.c +20041218-2.c +20070919-1.c +align-nest.c +pr41935.c +20050107-1.c +20050119-1.c +20050119-2.c +920302-1.c +920501-3.c +920728-1.c +pr28865.c diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9eee4a0..09cc53a 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -10,10 +10,8 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86AsmInstrumentation.h" #include "X86Operand.h" -#include "X86RegisterInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -118,11 +116,6 @@ bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } -std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__asan_report_") + (IsWrite ? "store" : "load") + - utostr(AccessSize); -} - class X86AddressSanitizer : public X86AsmInstrumentation { public: struct RegisterContext { @@ -136,26 +129,26 @@ public: public: RegisterContext(unsigned AddressReg, unsigned ShadowReg, unsigned ScratchReg) { - BusyRegs.push_back(convReg(AddressReg, MVT::i64)); - BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); - BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); } - unsigned AddressReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); } - unsigned ShadowReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); } - unsigned ScratchReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); } void AddBusyReg(unsigned Reg) { if (Reg != X86::NoRegister) - BusyRegs.push_back(convReg(Reg, MVT::i64)); + BusyRegs.push_back(convReg(Reg, 64)); } void AddBusyRegs(const X86Operand &Op) { @@ -163,36 +156,36 @@ public: AddBusyReg(Op.getMemIndexReg()); } - unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + unsigned ChooseFrameReg(unsigned Size) const { static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, X86::RCX, X86::RDX, X86::RDI, X86::RSI }; for (unsigned Reg : Candidates) { if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) - return convReg(Reg, VT); + return convReg(Reg, Size); } return X86::NoRegister; } private: - unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { - return Reg == X86::NoRegister ? 
Reg : getX86SubSuperRegister(Reg, VT); + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); } std::vector<unsigned> BusyRegs; }; - X86AddressSanitizer(const MCSubtargetInfo &STI) + X86AddressSanitizer(const MCSubtargetInfo *&STI) : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - virtual ~X86AddressSanitizer() {} + ~X86AddressSanitizer() override {} // X86AsmInstrumentation implementation: - virtual void InstrumentAndEmitInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { InstrumentMOVS(Inst, Operands, Ctx, MII, Out); if (RepPrefix) EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); @@ -240,17 +233,16 @@ public: protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, - MCStreamer &Out) { - assert(VT == MVT::i32 || VT == MVT::i64); + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); MCInst Inst; - Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT))); + Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out); // Creates new memory operand with Displacement added to an original @@ -261,13 +253,13 @@ protected: MCContext &Ctx, int64_t *Residue); bool is64BitMode() const { - return STI.getFeatureBits()[X86::Mode64Bit]; + return STI->getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { - return STI.getFeatureBits()[X86::Mode32Bit]; + return STI->getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { - return STI.getFeatureBits()[X86::Mode16Bit]; + return STI->getFeatureBits()[X86::Mode16Bit]; } unsigned getPointerWidth() { @@ -437,7 +429,7 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, } void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - MVT::SimpleValueType VT, + unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out) { int64_t Displacement = 0; @@ -450,14 +442,14 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, // Emit Op as is. 
if (Displacement == 0) { - EmitLEA(Op, VT, Reg, Out); + EmitLEA(Op, Size, Reg, Out); return; } int64_t Residue; std::unique_ptr<X86Operand> NewOp = AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, VT, Reg, Out); + EmitLEA(*NewOp, Size, Reg, Out); while (Residue != 0) { const MCConstantExpr *Disp = @@ -465,7 +457,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, std::unique_ptr<X86Operand> DispOp = X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); - EmitLEA(*DispOp, VT, Reg, Out); + EmitLEA(*DispOp, Size, Reg, Out); Residue -= Disp->getValue(); } } @@ -503,16 +495,16 @@ class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; - X86AddressSanitizer32(const MCSubtargetInfo &STI) + X86AddressSanitizer32(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer32() {} + ~X86AddressSanitizer32() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i32); + return getX86SubSuperRegister(FrameReg, 32); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -535,10 +527,10 @@ public: OrigSPOffset += 4; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -558,24 +550,24 @@ public: MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); } - SpillReg(Out, RegCtx.AddressReg(MVT::i32)); - SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); unsigned FrameReg = GetFrameReg(Ctx, Out); if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { @@ -586,18 +578,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned 
AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, @@ -610,10 +602,11 @@ private: .addReg(X86::ESP) .addImm(-16)); EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -623,14 +616,14 @@ private: void X86AddressSanitizer32::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -673,7 +666,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -698,10 +691,10 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( void X86AddressSanitizer32::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -760,16 +753,16 @@ class X86AddressSanitizer64 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x7fff8000; - X86AddressSanitizer64(const MCSubtargetInfo &STI) + X86AddressSanitizer64(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer64() {} + ~X86AddressSanitizer64() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned 
FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i64); + return getX86SubSuperRegister(FrameReg, 64); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -792,10 +785,10 @@ public: OrigSPOffset += 8; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -816,24 +809,24 @@ public: } EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); - SpillReg(Out, RegCtx.AddressReg(MVT::i64)); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); EmitAdjustRSP(Ctx, Out, 128); unsigned FrameReg = GetFrameReg(Ctx, Out); @@ -845,18 +838,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { @@ -864,7 +857,7 @@ private: std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i64, X86::RSP, Out); + EmitLEA(*Op, 64, X86::RSP, Out); OrigSPOffset += Offset; } @@ -878,12 +871,13 @@ private: .addReg(X86::RSP) .addImm(-16)); - if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + if (RegCtx.AddressReg(64) != X86::RDI) { EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(MVT::i64))); + RegCtx.AddressReg(64))); } 
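// (For orientation, assuming hypothetical argument values: the Twine-built
// symbol below concatenates the access kind and width, so IsWrite == false
// with AccessSize == 8 requests __asan_report_load8, and IsWrite == true
// with AccessSize == 4 requests __asan_report_store4.)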
- const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); @@ -893,16 +887,16 @@ private: void X86AddressSanitizer64::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -944,7 +938,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -969,10 +963,10 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( void X86AddressSanitizer64::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -1030,7 +1024,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) : STI(STI), InitialFrameReg(0) {} X86AsmInstrumentation::~X86AsmInstrumentation() {} @@ -1043,7 +1037,7 @@ void X86AsmInstrumentation::InstrumentAndEmitInstruction( void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, *STI); } unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, @@ -1067,17 +1061,17 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI) { - Triple T(STI.getTargetTriple()); + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple 
T(STI->getTargetTriple()); const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && MCOptions.SanitizeAddress) { - if (STI.getFeatureBits()[X86::Mode32Bit] != 0) + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) return new X86AddressSanitizer32(STI); - if (STI.getFeatureBits()[X86::Mode64Bit] != 0) + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) return new X86AddressSanitizer64(STI); } return new X86AsmInstrumentation(STI); } -} // End llvm namespace +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 19ebcc4..470cead 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -28,7 +28,8 @@ class X86AsmInstrumentation; X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); class X86AsmInstrumentation { public: @@ -48,15 +49,16 @@ public: protected: friend X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); - X86AsmInstrumentation(const MCSubtargetInfo &STI); + X86AsmInstrumentation(const MCSubtargetInfo *&STI); unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst); - const MCSubtargetInfo &STI; + const MCSubtargetInfo *&STI; unsigned InitialFrameReg; }; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index bca059d..4d8ffac 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,7 +11,6 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" -#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -26,6 +25,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -57,10 +57,10 @@ static const char OpPrecedence[] = { }; class X86AsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -154,6 +154,7 @@ private: // Push the new operator. InfixOperatorStack.push_back(Op); } + int64_t execute() { // Push any remaining operators onto the postfix stack. 
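// (A hypothetical trace of this evaluator: after scanning "2 + 3 * 4" the
// postfix stack holds 2, 3 and 4 while the operator stack still holds '+'
// and '*'; draining the operators below completes the postfix form
// 2 3 4 * +, which the evaluation loop reduces to 14.)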
while (!InfixOperatorStack.empty()) { @@ -268,6 +269,7 @@ private: bool StopOnLBrac, AddImmPrefix; InfixCalculator IC; InlineAsmIdentifierInfo Info; + public: IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), @@ -712,10 +714,10 @@ private: SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info); + bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -758,23 +760,24 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Mode16Bit]; } void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; unsigned FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); - + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); } @@ -798,12 +801,12 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); Instrumentation.reset( CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } @@ -912,6 +915,11 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) RegNo = MatchRegisterName(Tok.getString().lower()); + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + if (!is64BitMode()) { // FIXME: This should be done using Requires<Not64BitMode> and // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also @@ -1042,8 +1050,11 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) .Cases("XMMWORD", "xmmword", 128) .Cases("YMMWORD", "ymmword", 256) .Cases("ZMMWORD", "zmmword", 512) @@ -1062,8 +1073,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Insert an explicit size if the user didn't have one. 
if (!Size) { Size = getPointerWidth(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } // Create an absolute memory reference in order to match against @@ -1082,8 +1093,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( if (!Size) { Size = Info.Type * 8; // Size is in terms of bits in this context. if (Size) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } } @@ -1097,13 +1108,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( } static void -RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, StringRef SymName, int64_t ImmDisp, int64_t FinalImmDisp, SMLoc &BracLoc, SMLoc &StartInBrac, SMLoc &End) { // Remove the '[' and ']' from the IR string. - AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); // If ImmDisp is non-zero, then we parsed a displacement before the // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) @@ -1114,15 +1125,14 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have an immediate displacement before the bracketed expression. // Adjust this to match the final immediate displacement. bool Found = false; - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() > BracLoc.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { assert (!Found && "ImmDisp already rewritten."); - (*I).Kind = AOK_Imm; - (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); - (*I).Val = FinalImmDisp; + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; Found = true; break; } @@ -1133,28 +1143,27 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have a symbolic and an immediate displacement, but no displacement // before the bracketed expression. Put the immediate displacement // before the bracketed expression. - AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); } } // Remove all the ImmPrefix rewrites within the brackets. - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix) - (*I).Kind = AOK_Delete; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); } // Skip everything after the symbol. 
if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); } } @@ -1162,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); + AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { bool UpdateLocLex = true; @@ -1205,7 +1215,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Tok.getLoc(), "Unexpected identifier!"); } else { // This is a dot operator, not an adjacent identifier. - if (Identifier.find('.') != StringRef::npos) { + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { return false; } else { InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1223,8 +1234,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Integer: { StringRef ErrMsg; if (isParsingInlineAsm() && SM.getAddImmPrefix()) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, - Tok.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); // Look for 'b' or 'f' following an Integer as a directional label SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); @@ -1237,7 +1247,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; const MCExpr *Val = - MCSymbolRefExpr::create(Sym, Variant, getContext()); + MCSymbolRefExpr::create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); @@ -1275,6 +1285,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (!Done && UpdateLocLex) End = consumeToken(); + + PrevTK = TK; } return false; } @@ -1302,7 +1314,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // A symbolic displacement. Disp = Sym; if (isParsingInlineAsm()) - RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), ImmDisp, SM.getImm(), BracLoc, StartInBrac, End); } @@ -1359,7 +1371,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { MCAsmParser &Parser = getParser(); - assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); @@ -1372,15 +1384,17 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); - while (true) { + do { End = Tok.getEndLoc(); getLexer().Lex(); - - assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); - if (End.getPointer() == EndPtr) break; - } + } while (End.getPointer() < EndPtr); Identifier = LineBuf; + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. 
+ assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + // If the identifier lookup was unsuccessful, assume that we are dealing with // a label. if (!Result) { @@ -1389,9 +1403,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, Loc, false); assert(InternalName.size() && "We should have an internal name here."); // Push a rewrite for replacing the identifier name with the internal name. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, - Identifier.size(), - InternalName)); + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); } // Create the symbol reference. @@ -1418,8 +1431,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. if (isParsingInlineAsm()) - InstInfo->AsmRewrites->push_back( - AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); if (getLexer().isNot(AsmToken::LBrac)) { // An immediate following a 'segment register', 'colon' token sequence can @@ -1588,8 +1600,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, - Val)); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); @@ -1613,7 +1624,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { return nullptr; // Don't emit the offset operator. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); // The offset operator will have an 'r' constraint, thus we need to create // register operand to ensure proper matching. Just pick a GPR based on @@ -1664,7 +1675,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { // Rewrite the type operator and the C or C++ type or variable in terms of an // immediate. E.g. TYPE foo -> $$4 unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); return X86Operand::CreateImm(Imm, Start, End); @@ -1688,12 +1699,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. + PtrInOperand = true; } Start = Tok.getLoc(); @@ -1711,10 +1724,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); if (StartTok.getString().size() == Len) // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); else // Otherwise, rewrite the complex expression as a single immediate. 
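// (Hypothetical instance: for "mov eax, 2+2" the span "2+2" is longer than
// the single starting token, so one AOK_Imm rewrite carrying the folded
// value 4 replaces it wholesale.)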
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); } if (getLexer().isNot(AsmToken::LBrac)) { @@ -1740,7 +1753,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } // rounding mode token - if (STI.getFeatureBits()[X86::FeatureAVX512] && + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && getLexer().is(AsmToken::LCurly)) return ParseRoundingModeOp(Start, End); @@ -1749,9 +1762,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. - if (getLexer().isNot(AsmToken::Colon)) + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } return X86Operand::CreateReg(RegNo, Start, End); - + } + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } @@ -1798,7 +1818,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { } case AsmToken::LCurly:{ SMLoc Start = Parser.getTok().getLoc(), End; - if (STI.getFeatureBits()[X86::FeatureAVX512]) + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) return ParseRoundingModeOp(Start, End); return ErrorOperand(Start, "unknown token in expression"); } @@ -1808,7 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); - if(STI.getFeatureBits()[X86::FeatureAVX512]) { + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); @@ -1983,12 +2003,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } // Validate the scale amount. - if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && ScaleVal != 1) { Error(Loc, "scale factor in 16-bit address must be 1"); return nullptr; - } - if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); return nullptr; } @@ -2175,7 +2196,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name == "repne" || Name == "repnz" || Name == "rex64" || Name == "data16"; - // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2213,6 +2233,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. 
+ // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. @@ -2242,9 +2276,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Append default arguments to "ins[bwld]" if (Name.startswith("ins") && Operands.size() == 1 && - (Name == "insb" || Name == "insw" || Name == "insl" || - Name == "insd" )) { - AddDefaultSrcDestOperands(Operands, + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); } @@ -2346,98 +2379,21 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); - if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && - cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { - Operands.erase(Operands.begin() + 1); - static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); - } + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } } return false; } -static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, - bool isCmp) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - if (!isCmp) - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(Inst.getOperand(0)); - Inst = TmpInst; - return true; -} - -static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti16i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::AX, isCmp); -} - -static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti32i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::EAX, isCmp); -} - -static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti64i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); -} - -bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { - switch (Inst.getOpcode()) { - default: return true; - case X86::INT: - X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); - assert(Op.isImm() && "expected immediate"); - int64_t Res; - if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) { - Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); - return false; - } - return true; - } - llvm_unreachable("handle the instruction appropriately"); -} - bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return 
false; - case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); - case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8); - case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8); - case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8); - case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8); - case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8); - case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8); - case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8); - case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8); - case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true); - case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true); - case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true); - case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8); - case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8); - case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8); - case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8); - case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8); - case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8); - case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8); - case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8); - case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8); - case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); - case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); - case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -2457,18 +2413,19 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } Inst.setOpcode(NewOpc); return true; 
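// (Background, with hypothetical operands: each _REV opcode is the same
// architectural register-register move with the encoding roles of its two
// operands swapped, so "vmovaps %xmm1, %xmm2" can be emitted through either
// VMOVAPSrr or VMOVAPSrr_REV; switching to the _REV form lets the assembler
// select the alternate, sometimes shorter, encoding.)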
@@ -2573,9 +2530,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. @@ -2819,9 +2773,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -2898,10 +2849,29 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); return true; } +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { @@ -2909,10 +2879,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 7610806..54538c8 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -13,30 +13,25 @@ namespace llvm { inline bool isImmSExti16i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); } inline bool isImmSExti32i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<32>(Value) && 
isInt<8>(static_cast<int32_t>(Value))); } inline bool isImmSExti64i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value); } inline bool isImmSExti64i32Value(uint64_t Value) { - return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<32>(Value); } inline bool isImmUnsignedi8Value(uint64_t Value) { - return (( Value <= 0x00000000000000FFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isUInt<8>(Value) || isInt<8>(Value); } } // End of namespace llvm diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index cfc3ee2..ce8fcf1 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -95,11 +95,13 @@ X86GenericDisassembler::X86GenericDisassembler( llvm_unreachable("Invalid CPU mode"); } +namespace { struct Region { ArrayRef<uint8_t> Bytes; uint64_t Base; Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} }; +} // end anonymous namespace /// A callback function that wraps the readByte method from Region. /// @@ -831,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM256: case TYPE_XMM512: case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: case TYPE_VK8: case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: @@ -962,6 +968,7 @@ static bool translateInstruction(MCInst &mcInst, return true; } + mcInst.clear(); mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index f73fa75..040143b 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) { * then it should be disassembled as a xacquire/xrelease not repne/rep. */ if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | + ((nextByte == 0xf0) || ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) insn->xAcquireRelease = true; /* @@ -980,6 +980,47 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { insn->opcode == 0xE3) attrMask ^= ATTR_ADSIZE; + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. + if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. 
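// (In the two-byte 0x0F map these values, 0x82 through 0x8F, are the Jcc
// rel32 forms; the opcodeType guard just below keeps one-byte opcodes that
// share the same values, such as lea at 0x8D, out of this special case.)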
+ if (insn->opcodeType == TWOBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + } + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -1447,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_XMM: \ return prefix##_XMM0 + index; \ case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ case TYPE_VK8: \ case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index a79a923..28a628e 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -572,8 +572,6 @@ struct InternalInstruction { // The last byte of the opcode, not counting any ModR/M extension uint8_t opcode; - // The ModR/M byte of the instruction, if it is an opcode extension - uint8_t modRMExtension; // decode state diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index ea727e6..b4c0bc4 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 62b6b73..bbb3090 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index 91b144a..73f654c 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -21,6 +21,26 @@ using namespace llvm; +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + /// \brief Extracts the src/dst types for a given zero extension instruction. 
/// \note While the number of elements in DstVT type correct, the /// number in the SrcVT type is expanded to fill the src xmm register and the @@ -107,6 +127,75 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { } } +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -127,23 +216,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPDrri: case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPDrmi: case X86::VBLENDPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
case X86::VBLENDPDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f64, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -152,23 +232,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPSrri: case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPSrmi: case X86::VBLENDPSrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VBLENDPSYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8f32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -177,23 +248,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PBLENDWrri: case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PBLENDWrmi: case X86::VPBLENDWrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDWYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPBLENDWYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v16i16, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -201,23 +263,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::VPBLENDDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
+ case X86::VPBLENDDrmi: case X86::VPBLENDDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -239,6 +291,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVLHPSrr: case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -247,569 +300,327 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSHDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVSLDUPYrm: + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::VMOVDDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVDDUPYrm: + CASE_MOVDUP(MOVDDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask); - break; - - case X86::MOVDDUPrr: - case X86::VMOVDDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
- case X86::MOVDDUPrm: - case X86::VMOVDDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); break; case X86::PSLLDQri: case X86::VPSLLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSLLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v32i8, + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSRLDQri: case X86::VPSRLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSRLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v32i8, + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PALIGNR128rr: case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: Src1Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PALIGNR128rm: case X86::VPALIGNR128rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPALIGNR256rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPALIGNR256rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v32i8, + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFDri: case X86::VPSHUFDri: + case X86::VPSHUFDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFDmi: case X86::VPSHUFDmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFDYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8i32, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFHWri: case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
case X86::PSHUFHWmi: case X86::VPSHUFHWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFHWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFHWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v16i16, + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; + case X86::PSHUFLWri: case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: case X86::VPSHUFLWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFLWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFLWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v16i16, + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; - case X86::PUNPCKHBWrr: - case X86::VPUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - case X86::VPUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKHBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKHWDrr: - case X86::VPUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - case X86::VPUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKHWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKHDQrr: - case X86::VPUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - case X86::VPUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKHDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHDQYrm: + case X86::MMX_PSHUFWri: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); - break; - case X86::VPUNPCKHDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKHDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i32, ShuffleMask); - break; - case X86::PUNPCKHQDQrr: - case X86::VPUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - case X86::VPUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::MMX_PSHUFWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i64, ShuffleMask); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); break; - case X86::PUNPCKLBWrr: - case X86::VPUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLBWrm: - case X86::VPUNPCKLBWrm: + case X86::PSWAPDrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKLBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKLWDrr: - case X86::VPUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLWDrm: - case X86::VPUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::PSWAPDrm: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); break; - case X86::VPUNPCKLWDYrr: + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLWDYrm: + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::PUNPCKLDQrr: - case X86::VPUNPCKLDQrr: + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLDQrm: - case X86::VPUNPCKLDQrm: + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLDQYrr: + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKLDQYrm: + CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::VPUNPCKLDQZrr: + + CASE_UNPCK(PUNPCKHQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLDQZrm: + CASE_UNPCK(PUNPCKHQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::PUNPCKLQDQrr: - case X86::VPUNPCKLQDQrr: + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLQDQrm: - case X86::VPUNPCKLQDQrm: + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQYrr: + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQYrm: + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQZrr: + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQZrm: + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::SHUFPDrri: - case X86::VSHUFPDrri: + CASE_UNPCK(PUNPCKLQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::SHUFPDrmi: - case X86::VSHUFPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPDYrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); + CASE_UNPCK(PUNPCKLQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::SHUFPSrri: - case X86::VSHUFPSrri: + CASE_SHUF(SHUFPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::SHUFPSrmi: - case X86::VSHUFPSrmi: + CASE_SHUF(SHUFPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VSHUFPSYrri: + + CASE_SHUF(SHUFPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VSHUFPSYrmi: + CASE_SHUF(SHUFPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v8f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKLPDrr: - case X86::VUNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - case X86::VUNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDYrm: - DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDZrm: - DecodeUNPCKLMask(MVT::v8f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKLPSrr: - case X86::VUNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPSrm: - case X86::VUNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSYrm: - DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSZrm: - DecodeUNPCKLMask(MVT::v16f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPDrr: - case X86::VUNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - case X86::VUNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::VUNPCKHPDYrm: - DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } break; - case X86::VUNPCKHPDZrr: + } + + CASE_UNPCK(UNPCKLPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPDZrm: - DecodeUNPCKHMask(MVT::v8f64, ShuffleMask); + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKHPSrr: - case X86::VUNPCKHPSrr: + + CASE_UNPCK(UNPCKLPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::UNPCKHPSrm: - case X86::VUNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSYrr: + + CASE_UNPCK(UNPCKHPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSYrm: - DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSZrr: + + CASE_UNPCK(UNPCKHPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSZrm: - DecodeUNPCKHMask(MVT::v16f32, ShuffleMask); + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPSri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSYmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDri: + + CASE_VPERM(PERMILPS, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
- case X86::VPERMILPDmi: + CASE_VPERM(PERMILPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v2f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPDYri: + + CASE_VPERM(PERMILPD, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. - case X86::VPERMILPDYmi: + CASE_VPERM(PERMILPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERM2F128rr: case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -824,6 +635,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: case X86::VPERMPDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -846,6 +658,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVSSrr: case X86::VMOVSSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -861,6 +674,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::MOVQI2PQIrm: @@ -869,9 +683,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVQI2PQIrm: case X86::VMOVZQI2PQIrm: case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVDI2PDIrm: case X86::VMOVDI2PDIrm: DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6e371da..20cd7ff 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class MCOperand; - class X86IntelInstPrinter final : public MCInstPrinter { public: X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 629802f..133bd0e 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -69,15 +69,19 @@ public: class X86AsmBackend : public MCAsmBackend { const StringRef CPU; bool HasNopl; - const uint64_t MaxNopLength; + uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef CPU) - : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && CPU != "c3" && CPU != "c3-2"; + // The longest true long-nop instruction is 15 bytes; the longest + // alternative (replacement) nop sequence is 7 bytes. Because of Silvermont + // microarchitectural features, the maximum nop length is likewise reduced + // for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; } unsigned getNumFixupKinds() const override { @@ -200,6 +204,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::ADD64ri8: return X86::ADD64ri32; case X86::ADD64mi8: return X86::ADD64mi32; + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + // SUB case X86::SUB16ri8: return X86::SUB16ri; case X86::SUB16mi8: return X86::SUB16mi; @@ -208,6 +220,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::SUB64ri8: return X86::SUB64ri32; case X86::SUB64mi8: return X86::SUB64mi32; + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + // CMP case X86::CMP16ri8: return X86::CMP16ri; case X86::CMP16mi8: return X86::CMP16mi; @@ -279,7 +299,7 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { /// bytes. /// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - static const uint8_t Nops[10][10] = { + static const uint8_t TrueNops[10][10] = { // nop {0x90}, // xchg %ax,%ax @@ -302,17 +322,31 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, }; - // This CPU doesn't support long nops. If needed add more. - // FIXME: Can we get this from the subtarget somehow? - // FIXME: We could generated something better than plain 0x90. - if (!HasNopl) { - for (uint64_t i = 0; i < Count; ++i) - OW->write8(0x90); - return true; - } + // Alternative nop instructions for CPUs that don't support long nops. + static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we determine whether the CPU supports long nops from the + // subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // Emit as many maximum-length nops as needed, then emit a nop of the + // remaining length.
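The comment just above states the strategy the loop below implements: greedily emit the largest nop that fits, then repeat on the remainder. A minimal standalone sketch of just that size computation (an illustration only; the real emitter also indexes the nop tables and adds 0x66 prefixes):

    // Sketch: split a Count-byte padding request (Count > 0) into nop sizes
    // no larger than MaxNopLength, mirroring the do/while loop that follows.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> nopSizes(uint64_t Count, uint64_t MaxNopLength) {
      std::vector<uint8_t> Sizes;
      do {
        const uint8_t This = (uint8_t)std::min(Count, MaxNopLength);
        Sizes.push_back(This);
        Count -= This;
      } while (Count != 0);
      return Sizes;
    }
    // nopSizes(20, 15) -> {15, 5};  nopSizes(20, 7) -> {7, 7, 6}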
do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; @@ -359,6 +393,17 @@ public: } }; +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -610,13 +655,13 @@ private: /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -780,6 +825,10 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + return new ELFX86_32AsmBackend(T, OSABI, CPU); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index f0d00b0..9ff85b9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,6 +41,16 @@ namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -675,7 +685,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: - return -1; + return -1; case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: @@ -696,23 +706,27 @@ namespace X86II { // Start from 0, skip registers encoded in VEX_VVVV or a mask register. 
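Put differently: for these memory-only encodings the memory reference starts at operand 0 unless a VEX.vvvv-encoded register and/or an EVEX mask register (k1-k7) come first, each occupying one operand slot. A hedged restatement of the computation in the return that follows:

    // Sketch of the index computation `0 + HasVEX_4V + HasEVEX_K`.
    unsigned FirstMemOp = 0;
    if (HasVEX_4V) ++FirstMemOp; // skip the register encoded in VEX.vvvv
    if (HasEVEX_K) ++FirstMemOp; // skip the EVEX merge/zero mask register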
return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: return -1; } } @@ -740,7 +754,7 @@ namespace X86II { case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: - return true; + return true; } return false; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index a33468d..736c39d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -32,9 +32,11 @@ namespace { X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) - : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, - // Only i386 uses Rel instead of RelA. - /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} X86ELFObjectWriter::~X86ELFObjectWriter() {} @@ -246,7 +248,8 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Modifier, Type, IsPCRel); - assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); return getRelocType32(Modifier, getType32(Type), IsPCRel); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index deaad2a..30d5c80 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -20,39 +20,42 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit X86MCAsmInfoDarwin(const Triple &Triple); - }; - - struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { - explicit X86_64MCAsmInfoDarwin(const Triple &Triple); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; - }; - - class X86ELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit X86ELFMCAsmInfo(const Triple &Triple); - }; - - class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit X86MCAsmInfoMicrosoft(const Triple &Triple); - }; - - class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); - }; +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 10c434c..dfab6ec 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -510,8 +510,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, - Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); return; } @@ -988,6 +988,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, static unsigned DetermineREXPrefix(const MCInst 
&MI, uint64_t TSFlags, const MCInstrDesc &Desc) { unsigned REX = 0; + bool UsesHighByteReg = false; + if (TSFlags & X86II::REX_W) REX |= 1 << 3; // set REX.W @@ -1004,6 +1006,8 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const MCOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything // that returns non-zero. @@ -1073,6 +1077,9 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, } break; } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + return REX; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 83b4091..53a6550 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -122,7 +122,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, } else if (TheTriple.isOSBinFormatELF()) { // Force the use of an ELF container. MAI = new X86ELFMCAsmInfo(TheTriple); - } else if (TheTriple.isWindowsMSVCEnvironment()) { + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { @@ -267,3 +268,184 @@ extern "C" void LLVMInitializeX86TargetMC() { TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); } + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case 
X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: 
case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 6221bab..2d2836f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -79,7 +79,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, MCCodeEmitter *CE, - bool RelaxAll); + bool RelaxAll, bool IncrementalLinkerCompatible); /// Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, @@ -98,6 +98,17 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. +/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 9e801fc..191ebea 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. 
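A brief aside on the register-width helpers added to X86MCTargetDesc above: getX86SubSuperRegisterOrZero walks those alias tables to find the 8-, 16-, 32-, or 64-bit name of any general-purpose register, and getX86SubSuperRegister is the asserting wrapper. A hypothetical caller (the results follow directly from the switch tables; the surrounding includes are assumed):

    // Sketch: mapping GPR aliases with the helpers declared above.
    unsigned R16  = getX86SubSuperRegister(X86::EAX, 16);         // X86::AX
    unsigned R8   = getX86SubSuperRegister(X86::RCX, 8);          // X86::CL
    unsigned R8H  = getX86SubSuperRegister(X86::DX, 8, true);     // X86::DH
    unsigned Zero = getX86SubSuperRegisterOrZero(X86::XMM0, 32);  // 0: not a GPR

Back in the Mach-O writer, the check below enforces the restriction the preceding comment states.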
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. - if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,16 +173,20 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); + return; } Value += Writer->getSymbolAddress(*A, Layout) - @@ -244,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -266,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -292,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -309,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - 
report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ -350,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -363,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. 
// @@ -387,12 +416,12 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } MachO::any_relocation_info MRE; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 92f42b6..d045118 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -50,9 +50,11 @@ void X86WinCOFFStreamer::FinishImpl() { MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, - MCCodeEmitter *CE, bool RelaxAll) { + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index cae865a..619f7c8 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" -#include "llvm/IR/Constants.h" #include "llvm/CodeGen/MachineValueType.h" //===----------------------------------------------------------------------===// @@ -140,13 +139,14 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm, } } -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; unsigned NewImm = Imm; @@ -191,6 +191,16 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm, } } +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -222,7 +232,7 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. 
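Concretely, under the per-lane rule the comment above describes, UNPCKH interleaves the high half of each 128-bit lane from both sources, with second-source elements offset by NumElts in the mask. A small sketch reproducing the loop that follows, with a worked v8i32 result:

    // Sketch: lane-local UNPCKH mask; LaneElts = elements per 128-bit lane.
    #include "llvm/ADT/SmallVector.h"

    static void unpckhMask(unsigned NumElts, unsigned LaneElts,
                           llvm::SmallVectorImpl<int> &Mask) {
      for (unsigned l = 0; l != NumElts; l += LaneElts)
        for (unsigned i = l + LaneElts / 2; i != l + LaneElts; ++i) {
          Mask.push_back(i);           // high-half element from source 1
          Mask.push_back(i + NumElts); // matching element from source 2
        }
    }
    // unpckhMask(8, 4, M) for v8i32: M == {2, 10, 3, 11, 6, 14, 7, 15}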
unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { @@ -253,6 +263,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { } } +/// \brief Decode a shuffle of packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements() / 2; @@ -265,54 +295,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> - // - // <4 x i32> <i32 -2147483648, i32 -2147483648, - // i32 -2147483648, i32 -2147483648> - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; - // This is a straightforward byte vector. - if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { - int NumElements = MaskTy->getVectorNumElements(); - ShuffleMask.reserve(NumElements); - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } - // TODO: Handle funny-looking vectors too.
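The constant-pool decoder removed above (its raw-mask sibling remains) documents PSHUFB's semantics in its comments: if bit 7 of a control byte is set, the result byte is zeroed; otherwise the low four bits select a byte within the containing 16-byte lane. A minimal sketch of that rule, assuming this file's SM_SentinelZero marker (-2):

    // Sketch: byte-wise PSHUFB decode at 16-byte-lane granularity.
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"

    static void pshufbBytes(llvm::ArrayRef<uint8_t> Ctrl,
                            llvm::SmallVectorImpl<int> &Mask) {
      for (size_t i = 0, e = Ctrl.size(); i != e; ++i) {
        const unsigned LaneBase = (i / 16) * 16; // never crosses a lane
        if (Ctrl[i] & 0x80)
          Mask.push_back(-2 /* SM_SentinelZero: element is zeroed */);
        else
          Mask.push_back(LaneBase + (Ctrl[i] & 0xf));
      }
    }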
-} - void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, SmallVectorImpl<int> &ShuffleMask) { for (int i = 0, e = RawMask.size(); i < e; ++i) { @@ -357,46 +339,6 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && - "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } -} - void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); @@ -503,4 +445,20 @@ void DecodeINSERTQIMask(int Len, int Idx, ShuffleMask.push_back(SM_SentinelUndef); } +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 3d10d18..72db6a8 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -23,7 +23,6 @@ //===----------------------------------------------------------------------===// namespace llvm { -class Constant; class MVT; enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; @@ -54,6 +53,9 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. 
@@ -69,9 +71,6 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// different datatypes and vector widths. void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a PSHUFB mask from an IR-level vector constant. -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a PSHUFB mask from a raw array of constants such as from /// BUILD_VECTOR. void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, @@ -83,13 +82,15 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &ShuffleMask); @@ -108,6 +109,14 @@ void DecodeEXTRQIMask(int Len, int Idx, /// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. void DecodeINSERTQIMask(int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 8403ae6..01e65b8 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -23,56 +23,48 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. -FunctionPass* createX86GlobalBaseRegPass(); +/// This pass initializes a global base register for PIC on x86-32. +FunctionPass *createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. 
-/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86EmitCodeToMemory - Returns a pass that converts a register -/// allocated function into raw machine code in a dynamically -/// allocated chunk of memory. -/// -FunctionPass *createEmitX86CodeToMemory(); - -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that removes redundant LEA instructions and redundant address +/// recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. 
FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 8522674..8902a85 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -37,14 +37,26 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; -def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", - "Enable MMX instructions">; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all // SSE1+ processors support them. - [FeatureMMX, FeatureCMOV]>; + [FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", [FeatureSSE1]>; @@ -60,6 +72,11 @@ def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions", [FeatureMMX]>; @@ -79,16 +96,13 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; -// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that -// explicit. Also, it seems this would be the default state for most chips -// going forward, so it would probably be better to negate the logic and -// match the 32-byte "slow mem" feature below. -def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", - "IsUAMemFast", "true", - "Fast unaligned memory access">; +// FIXME: This should not apply to CPUs that do not have SSE. 
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", - "Slow unaligned 32-byte memory access">; + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -120,6 +134,8 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", "Enable AVX-512 Vector Length eXtensions", [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -168,9 +184,11 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; -def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", @@ -181,6 +199,11 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. +// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. 
def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; @@ -208,278 +231,473 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -def : Proc<"generic", []>; -def : Proc<"i386", []>; -def : Proc<"i486", []>; -def : Proc<"i586", []>; -def : Proc<"pentium", []>; -def : Proc<"pentium-mmx", [FeatureMMX]>; -def : Proc<"i686", []>; -def : Proc<"pentiumpro", [FeatureCMOV]>; -def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; -def : Proc<"pentium3", [FeatureSSE1]>; -def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSSE3, FeatureSlowBTMem]>; + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; // NetBurst. -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; // Intel Core 2 Solo/Duo. -def : ProcessorModel<"core2", SandyBridgeModel, - [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : ProcessorModel<"penryn", SandyBridgeModel, - [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; // Atom CPUs. 
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ - ProcIntelAtom, - FeatureSSSE3, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureSlowBTMem, - FeatureLeaForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions - ]>; + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ - ProcIntelSLM, - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowBTMem, - FeatureFastUAMem - ]>; + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; def : WestmereProc<"westmere">; // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureADX, - FeatureRDSEED, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : BroadwellProc<"broadwell">; // FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureMPX]>; +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; def : KnightsLandingProc<"knl">; // FIXME: define SKX model -class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureCDI, - FeatureDQI, FeatureBWI, FeatureVLX, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - 
FeatureSlowIncDec, FeatureMPX]>; +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. // AMD CPUs. -def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [Feature3DNow]>; -def : Proc<"k6-3", [Feature3DNow]>; -def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + 
FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"barcelona", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + // Bobcat -def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD]>; +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, - [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD]>; - -// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Bulldozer -def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD]>; +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Piledriver -def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Steamroller -def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD, - FeatureFSGSBase]>; +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; // Excavator -def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, - FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, - 
FeaturePCLMUL, FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureSSE4A, - FeatureFSGSBase]>; - -def : Proc<"geode", [Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [Feature3DNow]>; -def : Proc<"c3", [Feature3DNow]>; -def : Proc<"c3-2", [FeatureSSE1]>; +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -492,8 +710,8 @@ def : Proc<"c3-2", [FeatureSSE1]>; // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; //===----------------------------------------------------------------------===// // Register File Description @@ -520,10 +738,6 @@ include "X86CallingConv.td" // Assembly Parser //===----------------------------------------------------------------------===// -def ATTAsmParser : AsmParser { - string AsmParserClassName = "AsmParser"; -} - def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; @@ -568,7 +782,6 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Information about the instructions... let InstructionSet = X86InstrInfo; - let AssemblyParsers = [ATTAsmParser]; let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index ba33248..2170e62 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -217,10 +217,10 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? - MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : - ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); - Reg = getX86SubSuperRegister(Reg, VT); + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } O << X86ATTInstPrinter::getRegisterName(Reg); return; @@ -361,22 +361,21 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, switch (Mode) { default: return true; // Unknown mode. 
case 'b': // Print QImode register - Reg = getX86SubSuperRegister(Reg, MVT::i8); + Reg = getX86SubSuperRegister(Reg, 8); break; case 'h': // Print QImode high register - Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + Reg = getX86SubSuperRegister(Reg, 8, true); break; case 'w': // Print HImode register - Reg = getX86SubSuperRegister(Reg, MVT::i16); + Reg = getX86SubSuperRegister(Reg, 16); break; case 'k': // Print SImode register - Reg = getX86SubSuperRegister(Reg, MVT::i32); + Reg = getX86SubSuperRegister(Reg, 32); break; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. - MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32; - Reg = getX86SubSuperRegister(Reg, Ty); + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); break; } @@ -535,6 +534,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { S, MCConstantExpr::create(int64_t(1), MMI->getContext())); } } + OutStreamer->EmitSyntaxDirective(); } static void @@ -565,10 +565,11 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(Kind, C))) { + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { if (MCSymbol *Sym = S->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 7f5d127..9c8bd98 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // outputting it to the OutStream. This allows the shadow tracker to minimise // the number of NOPs used for stackmap padding. 
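All of the asm-modifier cases above now funnel into the size-based getX86SubSuperRegister. A few concrete mappings (a sketch; the signature is inferred from the call sites in this patch):

#include "X86RegisterInfo.h" // declares getX86SubSuperRegister

static void subSuperRegExamples() {
  using namespace llvm;
  unsigned B = getX86SubSuperRegister(X86::EAX, 8);       // X86::AL  ('b')
  unsigned H = getX86SubSuperRegister(X86::EAX, 8, true); // X86::AH  ('h')
  unsigned W = getX86SubSuperRegister(X86::EAX, 16);      // X86::AX  ('w')
  unsigned K = getX86SubSuperRegister(X86::RAX, 32);      // X86::EAX ('k')
  unsigned Q = getX86SubSuperRegister(X86::EAX, 64);      // X86::RAX ('q')
  (void)B; (void)H; (void)W; (void)K; (void)Q;
}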
void EmitAndCountInstruction(MCInst &Inst); - - void InsertStackMapShadows(MachineFunction &MF); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 031ba4b..fc6ee17 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -53,10 +54,13 @@ private: // Information we know about a particular call site struct CallContext { CallContext() - : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; + : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){} - // Actuall call instruction + // Iterator referring to the frame setup instruction + MachineBasicBlock::iterator FrameSetup; + + // Actual call instruction MachineInstr *Call; // A copy of the stack pointer @@ -75,17 +79,16 @@ private: bool UsePush; }; - typedef DenseMap<MachineInstr *, CallContext> ContextMap; + typedef SmallVector<CallContext, 8> ContextVector; bool isLegal(MachineFunction &MF); - bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap); void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); - bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, - const CallContext &Context); + bool adjustCallSequence(MachineFunction &MF, const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); @@ -100,7 +103,8 @@ private: const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; - const TargetFrameLowering *TFL; + const X86FrameLowering *TFL; + const X86Subtarget *STI; const MachineRegisterInfo *MRI; static char ID; }; @@ -124,8 +128,15 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // No point in running this in 64-bit mode, since some arguments are // passed in-register in all common calling conventions, so the pattern // we're looking for will never match. - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - if (STI.is64Bit()) + if (STI->is64Bit()) + return false; + + // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset + // in the compact unwind encoding that Darwin uses. So, bail if there + // is a danger of that being generated. + if (STI->isTargetDarwin() && + (!MF.getMMI().getLandingPads().empty() || + (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; // You would expect straight-line code between call-frame setup and @@ -161,7 +172,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // Check whether this trasnformation is profitable for a particular // function - in terms of code size. 
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, - ContextMap &CallSeqMap) { + ContextVector &CallSeqVector) { // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. @@ -170,24 +181,20 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, return true; // Don't do this when not optimizing for size. - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); - - if (!OptForSize) + if (!MF.getFunction()->optForSize()) return false; unsigned StackAlign = TFL->getStackAlignment(); int64_t Advantage = 0; - for (auto CC : CallSeqMap) { + for (auto CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. - if (CC.second.NoStackParams) + if (CC.NoStackParams) continue; - if (!CC.second.UsePush) { + if (!CC.UsePush) { // If we don't use pushes for a particular call site, // we pay for not having a reserved call frame with an // additional sub/add esp pair. The cost is ~3 bytes per instruction, @@ -200,11 +207,11 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need and sub before - if (CC.second.ExpectedDist % StackAlign) + if (CC.ExpectedDist % StackAlign) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. - Advantage += (CC.second.ExpectedDist / 4) * 3; + Advantage += (CC.ExpectedDist / 4) * 3; } } @@ -212,8 +219,9 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, } bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getSubtarget().getInstrInfo(); - TFL = MF.getSubtarget().getFrameLowering(); + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); MRI = &MF.getRegInfo(); if (!isLegal(MF)) @@ -223,21 +231,22 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - ContextMap CallSeqMap; + ContextVector CallSeqVector; for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) if (I->getOpcode() == FrameSetupOpcode) { - CallContext &Context = CallSeqMap[I]; + CallContext Context; collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); } - if (!isProfitable(MF, CallSeqMap)) + if (!isProfitable(MF, CallSeqVector)) return false; - for (auto CC : CallSeqMap) - if (CC.second.UsePush) - Changed |= adjustCallSequence(MF, CC.first, CC.second); + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); return Changed; } @@ -307,13 +316,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // Check that this particular call sequence is amenable to the // transformation. 
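The UsePush branch of the Advantage computation above, pulled out as a standalone sketch (same constants; assumed inputs, not the pass itself):

// ~3 bytes per instruction is the approximation used throughout.
static int pushSiteAdvantage(unsigned ExpectedDist, unsigned StackAlign) {
  int Advantage = 0;
  Advantage -= 3;                      // the add esp, N after the call
  if (ExpectedDist % StackAlign)
    Advantage -= 3;                    // plus a realigning sub before it
  Advantage += (ExpectedDist / 4) * 3; // each converted push saves ~3 bytes
  return Advantage; // e.g. ExpectedDist = 16, StackAlign = 16 -> +9, a win
}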
const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - unsigned StackPtr = RegInfo.getStackRegister(); + STI->getRegisterInfo()); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. @@ -338,7 +347,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (!I->isCopy() || !I->getOperand(0).isReg()) return; Context.SPCopy = I++; - StackPtr = Context.SPCopy->getOperand(0).getReg(); + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of MOV32mi or MOV32mr @@ -434,22 +444,22 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, } bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, - MachineBasicBlock::iterator I, const CallContext &Context) { // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. - MachineBasicBlock::iterator FrameSetup = I; - MachineBasicBlock &MBB = *(I->getParent()); + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); FrameSetup->getOperand(1).setImm(Context.ExpectedDist); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = FrameSetup->getDebugLoc(); // Now, iterate through the vector in reverse order, and replace the movs // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; if (MOV->getOpcode() == X86::MOV32mi) { unsigned PushOpcode = X86::PUSHi32; // If the operand is a small (8-bit) immediate, we can use a @@ -461,21 +471,20 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = X86::PUSH32i8; } - BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); } else { unsigned int Reg = PushOp.getReg(); // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. - const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. 
MachineInstr *DefMov = nullptr; if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { - MachineInstr *Push = - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -483,12 +492,19 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DefMov->eraseFromParent(); } else { - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) .addReg(Reg) .getInstr(); } } + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + MBB.erase(MOV); } @@ -532,13 +548,10 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( DefMI->getParent() != FrameSetup->getParent()) return nullptr; - // Now, make sure everything else up until the ADJCALLSTACK is a sequence - // of MOVs. To be less conservative would require duplicating a lot of the - // logic from PeepholeOptimizer. - // FIXME: A possibly better approach would be to teach the PeepholeOptimizer - // to be smarter about folding into pushes. + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. for (auto I = DefMI; I != FrameSetup; ++I) - if (I->getOpcode() != X86::MOV32rm) + if (I->isLoadFoldBarrier()) return nullptr; return DefMI; diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index 0eb2494..a08160f 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" @@ -42,6 +43,64 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. 
+ if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 8f88888..e8b96e7 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -158,6 +158,7 @@ def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, @@ -202,6 +203,16 @@ def RetCC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -227,6 +238,9 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -280,7 +294,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType<v64i8>>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -305,7 +319,7 @@ def CC_X86_64_C : CallingConv<[ // Long doubles get stack slots whose size and alignment depends on the // subtarget. - CCIfType<[f80], CCAssignToStack<0, 0>>, + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, @@ -319,6 +333,23 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<64, 64>> ]>; +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. 
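Tracing CC_X86_32_MCUInReg above with RegList = {EAX, EDX, ECX} gives assignments like the following (hypothetical signatures on a 32-bit MCU target; a sketch of the rules, not compiler output):

// i32 arguments take registers one at a time; a split i64/double must fit
// entirely in the registers still free, and never uses more than two.
void f(long long x);               // x -> EAX:EDX (two free, two needed)
void g(int a, long long b);        // a -> EAX; b -> EDX:ECX
void h(int a, int b, long long c); // a -> EAX, b -> EDX; c -> stack,
                                   // since only ECX would remain for it
// An i128 argument splits into four pending i32 parts, which exceeds the
// two-register cap, so all four parts take 4-byte stack slots.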
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + // Calling convention used on Win64 def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. @@ -561,6 +592,23 @@ def CC_X86_32_C : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + def CC_X86_32_FastCall : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -708,18 +756,28 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -734,6 +792,9 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -764,6 +825,18 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. +def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + +// CSRs that are handled by prologue, epilogue. 
+def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; + +// CSRs that are handled explicitly via copies. +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; @@ -778,6 +851,11 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15, RBP, (sequence "XMM%u", 0, 15))>; +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, (sequence "XMM%u", 16, 31))>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, @@ -804,3 +882,6 @@ def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 6a5a28e..a09d065 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,9 +19,10 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/IR/GlobalValue.h" using namespace llvm; @@ -141,6 +142,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index b4319c8..f48b479 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -298,8 +298,8 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. 
@@ -433,6 +433,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -449,35 +454,59 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; - case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; case MVT::f32: - Opc = X86ScalarSSEf32 ? - (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; break; case MVT::f64: - Opc = X86ScalarSSEf64 ? - (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + if (X86ScalarSSEf64) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; break; case MVT::v4f32: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; - else + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } @@ -973,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && @@ -1069,12 +1101,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. We also do the same with %eax for Win32. 
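The IsNonTemporal flag consulted above comes from !nontemporal metadata on the IR store. A quick way to produce such stores, assuming a Clang front end (__builtin_nontemporal_store emits exactly that metadata):

    // With SSE2 available, fast-isel now selects MOVNTImr (a streaming
    // store that bypasses the cache) for each of these i32 stores,
    // instead of the ordinary MOV32mr.
    void stream_fill(int *dst, int v, int n) {
      for (int i = 0; i < n; ++i)
        __builtin_nontemporal_store(v, &dst[i]);
    }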
- if (F.hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); @@ -1431,17 +1462,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addMBB(TrueMBB); } - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); - - // Emits an unconditional branch to the FalseBB, obtains the branch - // weight, and adds it to the successor list. - fastEmitBranch(FalseMBB, DbgLoc); - + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1472,12 +1493,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } @@ -1492,12 +1509,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1511,12 +1523,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1945,6 +1952,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned ResultReg; if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + // If we have AVX, create 1 blendv instead of 3 logic instructions. // Blendv was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many @@ -1955,10 +1965,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned BlendOpcode = (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill, CmpReg, true); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2806,10 +2819,12 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; - if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) - return 0; - if (CS && CS->paramHasAttr(1, Attribute::InReg)) - return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + return 4; } @@ -2924,7 +2939,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -3020,8 +3035,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, - ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; @@ -3252,6 +3267,30 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { updateValueMap(I, Reg); return true; } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the first operand. 
+ updateValueMap(I, Reg); + return true; + } } return false; @@ -3384,8 +3423,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - DL.getPointerSize(), Align); + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index 5eb4fae..1dd69e8 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -9,6 +9,7 @@ // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. // //===----------------------------------------------------------------------===// @@ -61,6 +62,11 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); @@ -89,6 +95,8 @@ public: private: MachineFunction *MF; const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; }; char FixupLEAPass::ID = 0; } @@ -150,7 +158,10 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); - if (!ST.LEAusesAG() && !ST.slowLEA()) + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) return false; TII = ST.getInstrInfo(); @@ -187,7 +198,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { - if (MFI->isPredecessor(MFI)) { + if (MFI->isPredecessor(&*MFI)) { I = --MFI->end(); return true; } else @@ -222,6 +233,60 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return nullptr; } +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one of these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool 
FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. @@ -265,8 +330,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr *MI = I; const int opcode = MI->getOpcode(); - if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && - opcode != X86::LEA64_32r) + if (!isLEA(opcode)) return; if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -280,7 +344,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { - default: llvm_unreachable("Unexpected LEA instruction"); + default: + llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -330,10 +395,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) - processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } } return false; } diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a..97bb8ab 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -120,12 +120,10 @@ namespace { // Return a bitmask of FP registers in block's live-in list. 
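A concrete before/after for the fixupIncDec transform implemented above (illustrative assembly; the exact mnemonic follows the LEA width):

    // Eligible only when base == dest, there is no index or segment
    // register, the displacement is +/-1, and isSafeToClobberEFLAGS
    // proves the INC/DEC flag results cannot be observed:
    //   leal  1(%eax), %eax   ->   incl %eax
    //   leaq -1(%rcx), %rcx   ->   decq %rcx
    // INC/DEC encode shorter than LEA, which is why the pass also runs
    // under optForMinSize even on subtargets where INC/DEC are slow.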
static unsigned calcLiveInMask(MachineBasicBlock *MBB) { unsigned Mask = 0; - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (Reg < X86::FP0 || Reg > X86::FP6) + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) continue; - Mask |= 1 << (Reg - X86::FP0); + Mask |= 1 << (LI.PhysReg - X86::FP0); } return Mask; } @@ -301,8 +299,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } @@ -321,7 +320,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. SmallPtrSet<MachineBasicBlock*, 8> Processed; - MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock *Entry = &MF.front(); bool Changed = false; for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) @@ -329,9 +328,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process any unreachable blocks in arbitrary order now. if (MF.size() != Processed.size()) - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB).second) - Changed |= processBasicBlock(MF, *BB); + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); LiveBundles.clear(); @@ -348,13 +347,12 @@ void FPS::bundleCFG(MachineFunction &MF) { LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; - const unsigned Mask = calcLiveInMask(MBB); + for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); if (!Mask) continue; // Update MBB ingoing bundle mask. 
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; } } @@ -546,17 +544,9 @@ namespace { }; } -#ifndef NDEBUG -static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { - for (unsigned i = 0; i != NumEntries-1; ++i) - if (!(Table[i] < Table[i+1])) return false; - return true; -} -#endif - -static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); - if (I != Table+N && I->from == Opcode) +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) return I->to; return -1; } @@ -567,7 +557,7 @@ static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { #define ASSERT_SORTED(TABLE) \ { static bool TABLE##Checked = false; \ if (!TABLE##Checked) { \ - assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ "All lookup tables must be sorted for efficient access!"); \ TABLE##Checked = true; \ } \ @@ -746,7 +736,7 @@ static const TableEntry OpcodeTable[] = { static unsigned getConcreteOpcode(unsigned Opcode) { ASSERT_SORTED(OpcodeTable); - int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + int Opc = Lookup(OpcodeTable, Opcode); assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); return Opc; } @@ -797,7 +787,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { RegMap[Stack[--StackTop]] = ~0; // Update state // Check to see if there is a popping version of this instruction... - int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + int Opcode = Lookup(PopTable, I->getOpcode()); if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::UCOM_FPPr) @@ -1193,7 +1183,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // We decide which form to use based on what is on the top of the stack, and // which operand is killed by this instruction. - const TableEntry *InstTable; + ArrayRef<TableEntry> InstTable; bool isForward = TOS == Op0; bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); if (updateST0) { @@ -1208,8 +1198,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { InstTable = ReverseSTiTable; } - int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), - MI->getOpcode()); + int Opcode = Lookup(InstTable, MI->getOpcode()); assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); // NotTOS - The register which is not on the top of stack... @@ -1520,31 +1509,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { return; } - case X86::WIN_FTOL_32: - case X86::WIN_FTOL_64: { - // Push the operand into ST0. - MachineOperand &Op = MI->getOperand(0); - assert(Op.isUse() && Op.isReg() && - Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); - unsigned FPReg = getFPReg(Op); - if (Op.isKill()) - moveToTop(FPReg, Inst); - else - duplicateToTop(FPReg, ScratchFPReg, Inst); - - // Emit the call. This will pop the operand. 
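The deletion in progress here is part of dropping the WIN_FTOL pseudos: 32-bit Windows fp-to-unsigned conversions no longer call the _ftol2 runtime helper and are expanded with ordinary instruction sequences instead. The kind of source that used to exercise this path (a sketch; the exact replacement sequence depends on subtarget features):

    // On i686-pc-windows-msvc this conversion previously emitted a
    // WIN_FTOL pseudo, i.e. a call to _ftol2 with the value pushed onto
    // the x87 stack; it is now lowered inline.
    unsigned toUnsigned(double D) { return static_cast<unsigned>(D); }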
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) - .addExternalSymbol("_ftol2") - .addReg(X86::ST0, RegState::ImplicitKill) - .addReg(X86::ECX, RegState::ImplicitDefine) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::EDX, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - --StackTop; - - break; - } - case X86::RETQ: case X86::RETL: case X86::RETIL: diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 3a21b57..8b5fd27 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -18,25 +18,23 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include <cstdlib> using namespace llvm; -// FIXME: completely move here. -extern cl::opt<bool> ForceStackAlign; - X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, @@ -92,7 +90,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn() || + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint()); } @@ -148,21 +146,14 @@ static unsigned getLEArOpcode(unsigned IsLP64) { /// to this register without worry about clobbering it. static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetRegisterInfo *TRI, + const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; - static const uint16_t CallerSavedRegs32Bit[] = { - X86::EAX, X86::EDX, X86::ECX, 0 - }; - - static const uint16_t CallerSavedRegs64Bit[] = { - X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, - X86::R8, X86::R9, X86::R10, X86::R11, 0 - }; + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); unsigned Opc = MBBI->getOpcode(); switch (Opc) { @@ -191,10 +182,9 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, Uses.insert(*AI); } - const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; - for (; *CS; ++CS) - if (!Uses.count(*CS)) - return *CS; + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; } } @@ -214,8 +204,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. 
+/// This would be the case if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -225,15 +219,27 @@ if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check that this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -289,6 +295,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); + else + MI->setFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; continue; } @@ -298,6 +306,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) MI.setMIFlag(MachineInstr::FrameSetup); + else + MI.setMIFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; } @@ -312,7 +322,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -347,30 +361,6 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. 
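The hazard the renamed predicate above guards against, spelled out (illustrative assembly):

    //   cmpl %esi, %edi     ; defines EFLAGS
    //   ...                 ; epilogue stack adjustment inserted here
    //   jne  .LBB0_2        ; terminator reads EFLAGS
    //
    // An ADD/SUB-based adjustment would clobber EFLAGS between the def
    // and the use, so BuildStackAdjustment above must fall back to LEA,
    // which computes the same new stack pointer without touching flags.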
-static -void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - if (MBBI == MBB.begin()) return; - - MachineBasicBlock::iterator PI = std::prev(MBBI); - unsigned Opc = PI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || - Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += PI->getOperand(2).getImm(); - MBB.erase(PI); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= PI->getOperand(2).getImm(); - MBB.erase(PI); - } -} - int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { @@ -436,27 +426,265 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, } } -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. -static bool usesTheStack(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - for (MachineRegisterInfo::reg_instr_iterator - ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); - ri != re; ++ri) - if (ri->isCopy()) - return true; +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} - return false; +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; + + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) const { +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. 
+ // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. + MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); + ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); + ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + // Some useful constants + const int64_t ThreadEnvironmentStackLimit = 0x10; + const int64_t PageSize = 0x1000; + const int64_t PageMask = ~(PageSize - 1); + + // Registers we need. For the normal case we use virtual + // registers. For the prolog expansion we use RAX, RCX and RDX. + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RegClass = &X86::GR64RegClass; + const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + : MRI.createVirtualRegister(RegClass), + ZeroReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + CopyReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + TestReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + FinalReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + RoundedReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + LimitReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + JoinReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + ProbeReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass); + + // SP-relative offsets where we can save RCX and RDX. + int64_t RCXShadowSlot = 0; + int64_t RDXShadowSlot = 0; + + // If inlining in the prolog, save RCX and RDX. + // Future optimization: don't save or restore if not live in. + if (InProlog) { + // Compute the offsets. We need to account for things already + // pushed onto the stack at this point: return address, frame + // pointer (if used), and callee saves. + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); + const bool HasFP = hasFP(MF); + RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + RDXShadowSlot = RCXShadowSlot + 8; + // Emit the saves. 
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); + } else { + // Not in the prolog. Copy RAX to a virtual reg. + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); + } + + // Add code to MBB to check for overflow and set the new target stack pointer + // to zero if so. + BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) + .addReg(ZeroReg, RegState::Undef) + .addReg(ZeroReg, RegState::Undef); + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); + BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) + .addReg(CopyReg) + .addReg(SizeReg); + BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + .addReg(TestReg) + .addReg(ZeroReg); + + // FinalReg now holds final stack pointer value, or zero if + // allocation would overflow. Compare against the current stack + // limit from the thread environment block. Note this limit is the + // lowest touched page on the stack, not the point at which the OS + // will cause an overflow exception, so this is just an optimization + // to avoid unnecessarily touching pages that are below the current + // SP but already committed to the stack by the OS. + BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit and RoundedReg the + // page-rounded final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. 
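Two details of the probe sequence above, made explicit (values are illustrative):

    // 1) The LimitReg load is 'movq %gs:0x10, %rcx': on Win64 GS points
    //    at the TEB, and offset 0x10 is NT_TIB::StackLimit -- the lowest
    //    page the OS has committed for this thread's stack.
    // 2) The AND64ri32 with PageMask rounds the prospective stack
    //    pointer down to a page boundary before the probe loop walks
    //    toward it:
    static_assert((0x7FFDEADB433ULL & ~(0x1000ULL - 1)) == 0x7FFDEADB000ULL,
                  "PageMask = ~(PageSize - 1) keeps the page-aligned bits");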
+ MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -478,6 +706,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, Symbol = "_chkstk"; MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); // All current stack probes take AX and SP as input, clobber flags, and // preserve all registers. x86_64 probes leave RSP unmodified. @@ -507,6 +736,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(X86::RSP) .addReg(X86::RAX); } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -526,7 +775,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); - if (ForceStackAlign) { + if (MF.getFunction()->hasFnAttribute("stackrealign")) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) @@ -537,15 +786,14 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, + DebugLoc DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -646,6 +894,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. 
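With the ForceStackAlign command-line option gone, calculateMaxStackAlign above keys off a per-function string attribute instead. A sketch of the producer side — Clang attaches this attribute for -mstackrealign, and a pass or test can set it directly:

    #include "llvm/IR/Function.h"

    void requestStackRealignment(llvm::Function &F) {
      F.addFnAttr("stackrealign"); // the name calculateMaxStackAlign tests
    }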
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -655,9 +910,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. @@ -686,11 +943,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // push and pop from the stack. if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64CC && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI->hasOpaqueSPAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -723,6 +980,24 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, uint64_t NumBytes = 0; int stackGrowth = -SlotSize; + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; @@ -739,7 +1014,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. - MFI->setOffsetAdjustment(-NumBytes); + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::PUSH64r : X86::PUSH32r)) @@ -765,35 +1044,46 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (!IsWin64Prologue) { + if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - } - if (NeedsDwarfCFI) { - // Mark effective beginning of when frame pointer becomes valid. - // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } } - // Mark the FramePtr as live-in in every block. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(MachineFramePtr); + // Mark the FramePtr as live-in in every block. Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; @@ -818,9 +1108,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. - if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -839,7 +1129,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; - if (IsWin64Prologue && TRI->needsStackRealignment(MF)) + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. @@ -876,26 +1166,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? 
NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); } - // Save a pointer to the MI where we set AX. - MachineBasicBlock::iterator SetRAX = MBBI; - --SetRAX; - // Call __chkstk, __chkstk_ms, or __alloca. - emitStackProbeCall(MF, MBB, MBBI, DL); - - // Apply the frame setup flag to all inserted instrs. - for (; SetRAX != MBBI; ++SetRAX) - SetRAX->setFlag(MachineInstr::FrameSetup); + emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } @@ -909,19 +1191,72 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + if (IsWin64Prologue && HasFP) { - SEHFrameOffset = calculateSetFPREG(NumBytes); + // Set RBP to a small fixed offset from RSP. In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), - StackPtr, false, SEHFrameOffset); + SPOrEstablisher, false, SEHFrameOffset); else - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); - if (NeedsWinCFI) + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. Save ESP + // into the registration node so that the runtime will restore it for us. 
+ if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { @@ -932,7 +1267,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { - int Offset = getFrameIndexOffset(MF, FI); + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) @@ -948,14 +1284,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer @@ -964,7 +1319,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens @@ -972,18 +1327,21 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } - if (X86FI->getHasSEHFramePtrSave()) { + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. 
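The recurring mechanical change in these frame-lowering hunks, isolated:

    // Old query, offset only (the register it is relative to was implicit):
    //   int Off = getFrameIndexOffset(MF, FI);
    // New query, offset plus the register it is measured from:
    //   unsigned FrameReg;
    //   int Off = getFrameIndexReference(MF, FI, FrameReg);
    // Callers that do not care whether FP, SP or the base pointer was
    // chosen pass a scratch IgnoredFrameReg, as the SEH_SaveXMM hunk
    // above does.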
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, - getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } @@ -1015,6 +1373,69 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue( return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. +unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. + unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. 
+ return FrameSizeMinusRBP - CSSize; +} + void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1027,12 +1448,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); @@ -1040,7 +1462,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - if (hasFP(MF)) { + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; @@ -1052,7 +1494,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } @@ -1063,26 +1506,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->isTerminator()) + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. 
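    // (Background, not part of the patch: under the MSVC C++ EH ABI the
    // catch funclet returns the address at which the runtime should resume
    // execution, so the CATCHRET epilogue must leave TargetMBB's address in
    // EAX/RAX as done above.)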
+ TargetMBB->setHasAddressTaken(); + } + if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) - mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was - // realigned. - if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); @@ -1134,9 +1601,24 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the @@ -1207,48 +1689,62 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, return Offset + FPDelta; } -int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { - // We can't calculate offset from frame pointer if the stack is realigned, - // so enforce usage of stack/base pointer. The base pointer is used when we - // have dynamic allocas in addition to dynamic realignment. - if (TRI->hasBasePointer(MF)) - FrameReg = TRI->getBaseRegister(); - else if (TRI->needsStackRealignment(MF)) - FrameReg = TRI->getStackRegister(); - else - FrameReg = TRI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); -} - -// Simplified from getFrameIndexOffset keeping only StackPointer cases -int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG - // Note: LLVM arranges the stack as: - // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) - // > "Stack Slots" (<--SP) - // We can always address StackSlots from RSP. 
We can usually (unless - // needsStackRealignment) address CSRs from RSP, but sometimes need to - // address them from RBP. FixedObjects can be placed anywhere in the stack - // frame depending on their specific requirements (i.e. we can actually - // refer to arguments to the function which are stored in the *callers* - // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs - // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - // We don't handle tail calls, and shouldn't be seeing them - // either. + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. + // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. int TailCallReturnAddrDelta = MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is @@ -1280,15 +1776,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F return Offset + StackSize; } -// Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - FrameReg = TRI->getStackRegister(); - return getFrameIndexOffsetFromSP(MF, FI); -} bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1358,6 +1845,11 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? 
X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { @@ -1399,6 +1891,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. @@ -1420,7 +1928,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } return true; } @@ -1450,8 +1959,16 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, } // Spill the BasePtr if it's used. - if (TRI->hasBasePointer(MF)) + if (TRI->hasBasePointer(MF)) { SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. + if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } } static bool @@ -1545,11 +2062,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. 
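// (Background, not part of the patch: the MachineVerifier rejects ordinary
// instructions placed after a terminator, so the MOV into R10 cannot simply
// be appended after allocMBB's RET; it has to live in a preceding block.)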
- for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; i++) { - allocMBB->addLiveIn(*i); - checkMBB->addLiveIn(*i); + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); } if (IsNested) @@ -1682,8 +2197,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1821,11 +2334,9 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), - E = PrologueMBB.livein_end(); - I != E; I++) { - stackCheckMBB->addLiveIn(*I); - incStackMBB->addLiveIn(*I); + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); @@ -1870,16 +2381,84 @@ void X86FrameLowering::adjustForHiPEPrologue( .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&PrologueMBB, 99); - stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&PrologueMBB, 99); - incStackMBB->addSuccessor(incStackMBB, 1); + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef XDEBUG MF.verify(); #endif } +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. + if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? 
X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -1895,8 +2474,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - if (Amount == 0) - return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1904,15 +2481,68 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + if (Amount) { // Add Amount to SP to destroy a frame, and subtract to setup. int Offset = isDestroy ? Amount : -Amount; - BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? 
-CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } + } + return; } @@ -1933,12 +2563,136 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. + if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. 
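  // (Worked example with made-up numbers, not part of the patch: a 16-byte
  // registration node at EBP-24 gives EHRegOffset = -24 and
  // EndOffset = -(-24) - 16 = 8, the non-negative value the assert below
  // insists on.)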
+ + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. + Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. 
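  // (Background, not part of the patch: as far as the WinEH scheme goes, -2
  // is the initial sentinel the Win64 C++ runtime expects to find in
  // UnwindHelp before any exception has been thrown.)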
+ MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 495cfcd..3ab41b4 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -47,11 +47,17 @@ public: unsigned StackPtr; - /// Emit a call to the target's stack probe function. This is required for all + /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize - /// the number of bytes to probe in RAX/EAX. - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL) const; + /// the number of bytes to probe in RAX/EAX. Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -91,11 +97,9 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; @@ -103,6 +107,11 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + /// Check the instruction before/after the passed instruction. If /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and @@ -125,7 +134,9 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; -private: + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. @@ -135,22 +146,56 @@ private: MachineBasicBlock::iterator I, uint64_t Amount) const; - uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; - /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, MCCFIInstruction CFIInst) const; + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. 
+ MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - uint64_t MaxAlign) const; + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; /// Adjusts the stack pointer using LEA, SUB, or ADD. MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int64_t Offset, bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d5351d2..868ae4e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -46,9 +46,8 @@ STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// namespace { - /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses - /// SDValue's instead of register numbers for the leaves of the matched - /// tree. + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, @@ -87,8 +86,7 @@ namespace { IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } - /// isRIPRelative - Return true if this addressing mode is already RIP - /// relative. + /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = @@ -147,16 +145,16 @@ namespace { namespace { //===--------------------------------------------------------------------===// - /// ISel - X86 specific code to select X86 machine instructions for + /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// OptForSize - If true, selector should try to optimize for code size - /// instead of performance. 
+ /// If true, selector should try to optimize for code size instead of + /// performance. bool OptForSize; public: @@ -184,8 +182,7 @@ namespace { return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); } - // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // sign extended field. + // True if the 64-bit immediate fits in a 32-bit sign-extended field. inline bool i64immSExt32(SDNode *N) const { uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); return (int64_t)v == (int32_t)v; @@ -196,50 +193,50 @@ namespace { private: SDNode *Select(SDNode *N) override; - SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); - - bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); - bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); - bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); - bool MatchAddress(SDValue N, X86ISelAddressMode &AM); - bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); - bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); - bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectMOV64Imm32(SDValue N, SDValue &Imm); - bool SelectLEAAddr(SDValue N, SDValue &Base, + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectScalarSSELoad(SDNode *Root, SDValue N, + bool selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); - bool TryFoldLoad(SDNode *P, SDValue N, + bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. + /// Implement addressing mode selection for inline asm expressions. 
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

-    void EmitSpecialCodeForMain();
+    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL,
                                   SDValue &Base, SDValue &Scale,
@@ -252,7 +249,7 @@ namespace {
              : AM.Base_Reg;
      Scale = getI8Imm(AM.Scale, DL);
      Index = AM.IndexReg;
-      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
@@ -283,32 +280,105 @@ namespace {
      Segment = CurDAG->getRegister(0, MVT::i32);
    }

-    /// getI8Imm - Return a target constant with the specified value, of type
-    /// i8.
+    // Utility function to determine whether we should avoid selecting
+    // immediate forms of instructions for better code size or not.
+    // At a high level, we'd like to avoid such instructions when
+    // we have similar constants used within the same basic block
+    // that can be kept in a register.
+    //
+    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+      uint32_t UseCount = 0;
+
+      // We do not want to hoist if we're not optimizing for size.
+      // TODO: We'd like to remove this restriction.
+      // See the comment in X86InstrInfo.td for more info.
+      if (!OptForSize)
+        return false;
+
+      // Walk all the users of the immediate.
+      for (SDNode::use_iterator UI = N->use_begin(),
+           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+        SDNode *User = *UI;
+
+        // This user is already selected. Count it as a legitimate use and
+        // move on.
+        if (User->isMachineOpcode()) {
+          UseCount++;
+          continue;
+        }
+
+        // We want to count stores of immediates as real uses.
+        if (User->getOpcode() == ISD::STORE &&
+            User->getOperand(1).getNode() == N) {
+          UseCount++;
+          continue;
+        }
+
+        // We don't currently match users that have > 2 operands (except
+        // for stores, which are handled above).
+        // Those instructions won't match in ISel, for now, and would
+        // be counted incorrectly.
+        // This may change in the future as we add additional instruction
+        // types.
+        if (User->getNumOperands() != 2)
+          continue;
+
+        // Immediates that are used for offsets as part of stack
+        // manipulation should be left alone. These are typically
+        // used to indicate SP offsets for argument passing and
+        // will get pulled into stores/pushes (implicitly).
+        if (User->getOpcode() == X86ISD::ADD ||
+            User->getOpcode() == ISD::ADD ||
+            User->getOpcode() == X86ISD::SUB ||
+            User->getOpcode() == ISD::SUB) {
+
+          // Find the other operand of the add/sub.
+          SDValue OtherOp = User->getOperand(0);
+          if (OtherOp.getNode() == N)
+            OtherOp = User->getOperand(1);
+
+          // Don't count if the other operand is SP.
+          RegisterSDNode *RegNode;
+          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+              (RegNode = dyn_cast_or_null<RegisterSDNode>(
+                 OtherOp->getOperand(1).getNode())))
+            if ((RegNode->getReg() == X86::ESP) ||
+                (RegNode->getReg() == X86::RSP))
+              continue;
+        }
+
+        // ... otherwise, count this and move on.
+        UseCount++;
+      }
+
+      // If we have more than one use, recommend hoisting.
+      return (UseCount > 1);
+    }
+
+    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

-    /// getI32Imm - Return a target constant with the specified value, of type
-    /// i32.
+    /// Return a target constant with the specified value, of type i32.

inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } - /// getGlobalBaseReg - Return an SDNode that returns the value of - /// the global base register. Output instructions required to - /// initialize the global base register, if necessary. - /// + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. SDNode *getGlobalBaseReg(); - /// getTargetMachine - Return a reference to the TargetMachine, casted - /// to the target-specific type. + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. const X86TargetMachine &getTargetMachine() const { return static_cast<const X86TargetMachine &>(TM); } - /// getInstrInfo - Return a reference to the TargetInstrInfo, casted - /// to the target-specific type. + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } @@ -386,9 +456,9 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } -/// MoveBelowCallOrigChain - Replace the original chain operand of the call with +/// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. -static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector<SDValue, 8> Ops; SDValue Chain = OrigChain.getOperand(0); @@ -418,7 +488,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } -/// isCalleeLoad - Return true if call address is a load and it can be +/// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call @@ -462,11 +532,11 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + OptForSize = MF->getFunction()->optForSize(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && // Only does this when target favors doesn't favor register indirect @@ -500,7 +570,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; - MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } @@ -577,9 +647,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } -/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in -/// the main function. -void X86DAGToDAGISel::EmitSpecialCodeForMain() { +/// Emit any code that needs to be executed only in the main function. 
+void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); @@ -599,7 +668,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(); + emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -612,7 +681,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) { return isInt<31>(Val); } -bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // Cannot combine ExternalSymbol displacements with integer offsets. if (Offset != 0 && (AM.ES || AM.MCSym)) @@ -634,7 +703,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, } -bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. @@ -658,11 +727,10 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ return true; } -/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes -/// into an addressing mode. These wrap things that will resolve down into a -/// symbol reference. If no match is possible, this returns true, otherwise it -/// returns false. -bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. if (AM.hasSymbolicDisplacement()) @@ -685,7 +753,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); - if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + if (foldOffsetIntoAddress(G->getOffset(), AM)) { AM = Backup; return true; } @@ -694,7 +762,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); AM.SymbolFlags = CP->getTargetFlags(); - if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { AM = Backup; return true; } @@ -710,7 +778,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); - if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { AM = Backup; return true; } @@ -758,11 +826,10 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { return true; } -/// MatchAddress - Add the specified node to the specified addressing mode, -/// returning true if it cannot be done. This just pattern matches for the -/// addressing mode. -bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { - if (MatchAddressRecursively(N, AM, 0)) +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. 
This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has @@ -790,15 +857,49 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. -static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { - DAG.RepositionNode(Pos.getNode(), N.getNode()); + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos.getNode()->getNodeId()); } } @@ -807,7 +908,7 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. -static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -835,12 +936,12 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. 
- InsertDAGNode(DAG, N, Eight); - InsertDAGNode(DAG, N, Srl); - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, And); - InsertDAGNode(DAG, N, ShlCount); - InsertDAGNode(DAG, N, Shl); + insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); @@ -850,7 +951,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. -static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -880,9 +981,9 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, NewAnd); - InsertDAGNode(DAG, N, NewShift); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); AM.Scale = 1 << ShiftAmt; @@ -917,7 +1018,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. -static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -973,7 +1074,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); - InsertDAGNode(DAG, N, NewX); + insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); @@ -987,10 +1088,10 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewSRLAmt); - InsertDAGNode(DAG, N, NewSRL); - InsertDAGNode(DAG, N, NewSHLAmt); - InsertDAGNode(DAG, N, NewSHL); + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; @@ -998,7 +1099,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return false; } -bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); DEBUG({ @@ -1007,7 +1108,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, }); // Limit recursion. if (Depth > 5) - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. 
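Editor's aside: the rewrite quoted a few hunks up -- "(X << C1) & C2" to "(X & (C2>>C1)) << C1" -- is easy to validate by brute force. The following standalone sketch (not part of the patch; C1 = 3 and C2 = 0xF8 are arbitrary constants satisfying the precondition that C2's low C1 bits are clear) checks the identity that lets the shift become an LEA scale:

// Standalone sanity check of the masked-shift-to-scaled-mask identity.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 3, C2 = 0xF8; // C2 >> C1 == 0x1F; low 3 bits of C2 clear
  for (uint32_t X = 0; X < (1u << 16); ++X)
    assert(((X << C1) & C2) == ((X & (C2 >> C1)) << C1)); // folded form agrees
  return 0;
}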
@@ -1020,7 +1121,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return true; if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) - if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } @@ -1038,19 +1139,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); - if (!FoldOffsetIntoAddress(Val, AM)) + if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: - if (!MatchWrapper(N, AM)) + if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: - if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) return false; break; @@ -1087,7 +1188,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; - if (!FoldOffsetIntoAddress(Disp, AM)) + if (!foldOffsetIntoAddress(Disp, AM)) return false; } @@ -1119,7 +1220,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Try to fold the mask and shift into the scale, and return false if we // succeed. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } @@ -1153,7 +1254,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); - if (FoldOffsetIntoAddress(Disp, AM)) + if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getNode()->getOperand(0); } else { Reg = N.getNode()->getOperand(0); @@ -1179,7 +1280,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1227,56 +1328,26 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - InsertDAGNode(*CurDAG, N, Zero); - InsertDAGNode(*CurDAG, N, Neg); + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); return false; } - case ISD::ADD: { - // Add an artificial use to this node so that we can keep track of - // it if it gets CSE'd with a different node. - HandleSDNode Handle(N); - - X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) - return false; - AM = Backup; - - // Try again after commuting the operands. - if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& - !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) return false; - AM = Backup; - - // If we couldn't fold both operands into the address at the same time, - // see if we can just put each operand into a register and fold at least - // the add. 
- if (AM.BaseType == X86ISelAddressMode::RegBase && - !AM.Base_Reg.getNode() && - !AM.IndexReg.getNode()) { - N = Handle.getValue(); - AM.Base_Reg = N.getOperand(0); - AM.IndexReg = N.getOperand(1); - AM.Scale = 1; - return false; - } - N = Handle.getValue(); break; - } case ISD::OR: - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - if (CurDAG->isBaseWithConstantOffset(N)) { - X86ISelAddressMode Backup = AM; - ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); - - // Start with the LHS as an addr mode. - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) - return false; - AM = Backup; - } + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; break; case ISD::AND: { @@ -1299,27 +1370,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. - if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; break; } } - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); } -/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the +/// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. -bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. 
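Editor's aside: the new ISD::OR case above relies on the fact that when two operands share no set bits, 'or' computes the same value as 'add' (no carries can occur), so the address is matched exactly like an add and folded into an LEA. A standalone check of that fact, using the (and x, 1) / (shl y, 3) shape from the comment (hypothetical driver, not part of the patch):

// Standalone check: disjoint-bit operands make OR and ADD agree.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 256; ++x)
    for (uint32_t y = 0; y < 256; ++y) {
      uint32_t a = x & 1;  // only bit 0 can be set
      uint32_t b = y << 3; // only bits 3 and up can be set
      assert((a | b) == (a + b)); // no common bits => no carries => or == add
    }
  return 0;
}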
@@ -1339,7 +1410,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1362,7 +1433,7 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, // If Base is 0, the whole address is in index and the Scale is 1 if (isa<ConstantSDNode>(Base)) { - assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() && + assert(cast<ConstantSDNode>(Base)->isNullValue() && "Unexpected base in gather/scatter"); Scale = getI8Imm(1, DL); Base = CurDAG->getRegister(0, MVT::i32); @@ -1375,14 +1446,14 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectAddr - returns true if it is able pattern match an addressing mode. +/// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. -bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; @@ -1404,7 +1475,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); } - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); @@ -1420,14 +1491,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to -/// match a load whose top elements are either undef or zeros. The load flavor -/// is derived from the type of N, which is either v4f32 or v2f64. +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. /// /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. -bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, +bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, @@ -1439,7 +1510,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; return true; } @@ -1457,7 +1528,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. 
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; PatternNodeWithChain = SDValue(LD, 0); return true; @@ -1466,7 +1537,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } -bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); if ((uint32_t)ImmVal != (uint64_t)ImmVal) @@ -1495,10 +1566,10 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { return TM.getCodeModel() == CodeModel::Small; } -bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; SDLoc DL(N); @@ -1533,9 +1604,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, return true; } -/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing +/// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. -bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1546,7 +1617,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; @@ -1572,13 +1643,12 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA - // to a LEA. This is determined with some expermentation but is by no means + // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { - // For X86-64, we should always use lea to materialize RIP relative - // addresses. + // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else @@ -1596,8 +1666,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, return true; } -/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. -bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, +/// This is only run on TargetGlobalTLSAddress nodes. 
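// A quick worked example for selectMOV64Imm32 above (the constant is made up
// for illustration): 0x00000000DEADBEEF passes the (uint32_t)Imm == Imm test,
// so it can be materialized with the 5-byte "movl $0xDEADBEEF, %eax", whose
// implicit zero-extension fills the upper half of RAX, instead of the 10-byte
// "movabsq $0xDEADBEEF, %rax".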
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); @@ -1621,7 +1691,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, } -bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1630,14 +1700,13 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, !IsLegalToFold(N, P, P, OptLevel)) return false; - return SelectAddr(N.getNode(), + return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } -/// getGlobalBaseReg - Return an SDNode that returns the value of -/// the global base register. Output instructions required to -/// initialize the global base register, if necessary. -/// +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); @@ -1828,7 +1897,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, return Val; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) return nullptr; @@ -1841,7 +1910,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. @@ -1933,9 +2002,9 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { return CurDAG->getMergeValues(RetVals, dl).getNode(); } -/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has -/// any uses which require the SF or OF bits to be accurate. -static bool HasNoSignedComparisonUses(SDNode *N) { +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { // Examine each user of the node. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { @@ -1995,9 +2064,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } -/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode -/// is suitable for doing the {load; increment or decrement; store} to modify -/// transformation. +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, SDValue StoredVal, SelectionDAG *CurDAG, LoadSDNode* &LoadNode, SDValue &InputChain) { @@ -2081,8 +2149,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, return true; } -/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory -/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. 
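// For example (illustrative assembly), the {load; increment or decrement;
// store} rewrite enabled by isLoadIncOrDecStore turns
//   movl counter(%rip), %eax
//   incl %eax
//   movl %eax, counter(%rip)
// into the single read-modify-write form whose opcode is chosen below:
//   incl counter(%rip)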
static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { if (Opc == X86ISD::DEC) { if (LdVT == MVT::i64) return X86::DEC64m; if (LdVT == MVT::i32) return X86::DEC32m; @@ -2099,9 +2167,8 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } -/// SelectGather - Customized ISel for GATHER operations. -/// -SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { // Operands of Gather: VSrc, Base, VIdx, VMask, Scale SDValue Chain = Node->getOperand(0); SDValue VSrc = Node->getOperand(2); @@ -2148,6 +2215,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -2190,7 +2278,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; } - SDNode *RetVal = SelectGather(Node, Opc); + SDNode *RetVal = selectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside selectGather. return nullptr; @@ -2217,7 +2305,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; break; @@ -2404,10 +2492,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commutative. if (!foldedLoad) { - foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } @@ -2549,7 +2637,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; @@ -2557,7 +2645,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH).
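// (Background: an 8-bit DIV takes its 16-bit dividend in AX and writes the
// quotient to AL and the remainder to AH, so the MOVZX load below guarantees
// that AH starts out zero.)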
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; - if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, @@ -2692,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) + hasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to @@ -2709,7 +2797,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, convert "testl %eax, $8" to "testb %al, $8" if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && (!(C->getZExtValue() & 0x80) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); @@ -2743,7 +2831,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, "testl %eax, $2048" to "testb %ah, $8". if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); @@ -2781,7 +2869,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && N0.getValueType() != MVT::i16 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); SDValue Reg = N0.getNode()->getOperand(0); @@ -2804,7 +2892,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && N0.getValueType() == MVT::i64 && (!(C->getZExtValue() & 0x80000000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); SDValue Reg = N0.getNode()->getOperand(0); @@ -2854,7 +2942,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) break; @@ -2903,7 +2991,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: - if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } @@ -2916,9 +3004,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. 
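// A worked example for the test-narrowing cases above (sketch): rewriting
//   testl $imm, %eax   -->   testb $imm, %al
// leaves ZF intact, but SF would now come from bit 7 of the masked result
// instead of bit 31, so each case only narrows unconditionally when the
// immediate cannot set the new sign bit ((imm & 0x80) == 0) and otherwise
// requires hasNoSignedComparisonUses().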
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 0f29b51..1ec93b5 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18,6 +18,7 @@ #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" @@ -25,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -67,19 +69,14 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -// Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2); - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. - static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -118,13 +115,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - - // The _ftol2 runtime function has an unusual calling conv, which - // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -175,14 +165,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit - // FILD for other targets. + // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } @@ -206,23 +200,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } - // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 - // are Legal, f80 is custom lowered. 
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed @@ -232,8 +232,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) @@ -242,14 +248,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - } - if (isTargetFTOL()) { - // Use the _ftol2 runtime function, which has a pseudo-instruction - // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } @@ -262,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. 
This exposes @@ -274,8 +278,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -295,6 +298,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -302,6 +306,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -312,7 +317,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. 
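// (The CRT inline is approximately, for illustration:
//    static __inline float fmodf(float x, float y)
//      { return (float)fmod((double)x, (double)y); }
//  so promoting FREM f32 to f64 mirrors what compiled C code already gets.)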
+ setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); @@ -404,15 +419,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -456,8 +477,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); @@ -473,13 +493,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - if (Subtarget->is64Bit()) { - setExceptionPointerRegister(X86::RAX); - setExceptionSelectorRegister(X86::RDX); - } else { - setExceptionPointerRegister(X86::EAX); - setExceptionSelectorRegister(X86::EDX); - } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); @@ -492,8 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { - // TargetInfo::X86_64ABIBuiltinVaList + if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { @@ -505,7 +517,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -613,8 +625,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. + // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -846,15 +866,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to custom lower non-power-of-2 vectors - if (!isPowerOf2_32(VT.getVectorNumElements())) - continue; - // Do not attempt to custom lower non-128-bit vectors - if (!VT.is128BitVector()) - continue; + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -892,13 +914,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
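// (Bitwise operations are width-agnostic, so funneling AND/OR/XOR for all the
// 128-bit integer types through v2i64, via the AddPromotedToType calls below,
// lets a single set of v2i64 patterns serve every element width.)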
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) - continue; - + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); @@ -1036,6 +1052,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); @@ -1126,7 +1153,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1202,6 +1238,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1243,15 +1292,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-256-bit vectors - if (!VT.is256BitVector()) - continue; - + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); @@ -1293,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); @@ -1311,6 +1354,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1318,19 +1362,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); @@ -1348,12 +1383,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, 
MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); @@ -1386,7 +1471,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); @@ -1395,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1439,9 +1525,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - } + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + 
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); @@ -1455,7 +1581,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1481,15 +1607,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; - + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } @@ -1515,22 +1637,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1541,19 +1676,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - const MVT VT = (MVT::SimpleValueType)i; + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); - if (EltSize < 32) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - } + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } @@ -1571,6 +1718,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -1595,8 +1744,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1604,9 +1755,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; // Add/Sub/Mul with overflow operations are custom lowered. - MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); @@ -1615,7 +1767,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, VT, Custom); } - if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. 
setLibcallName(RTLIB::SHL_I128, nullptr); @@ -1658,12 +1809,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1671,24 +1826,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); - // On Darwin, -Os means optimize for size without hurting performance, - // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. - // Predictable cmov don't hurt on atom because it's in-order. + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. @@ -1716,40 +1871,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, if (!VT.isVector()) return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; - const unsigned NumElts = VT.getVectorNumElements(); - const EVT EltVT = VT.getVectorElementType(); - if (VT.is512BitVector()) { - if (Subtarget->hasAVX512()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - } - if (Subtarget->hasBWI()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 32: return MVT::v32i1; - case 64: return MVT::v64i1; - } - } + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } - if (VT.is256BitVector() || VT.is128BitVector()) { - if (Subtarget->hasVLX()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } - if (Subtarget->hasBWI() && Subtarget->hasVLX()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - case 32: return MVT::v32i1; - } + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } } return VT.changeVectorElementTypeToInteger(); @@ -1769,9 +1927,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign); + getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) @@ -1821,10 +1979,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1842,6 +2001,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. 
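// (Concretely, the fallback below: 64-bit targets copy in i64 chunks once at
// least 8 bytes are being moved; everything else uses i32 chunks.)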
if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1860,8 +2022,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. return true; } @@ -1964,6 +2140,32 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } +Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + unsigned AddressSpace, Offset; + if (Subtarget->is64Bit()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x48; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x24 on i386 + Offset = 0x24; + AddressSpace = 256; + } + + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); @@ -1977,11 +2179,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, #include "X86GenCallingConv.inc" -bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { +bool X86TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); @@ -2001,6 +2201,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2025,7 +2228,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { - if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1) + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); @@ -2108,13 +2311,28 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const 
X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2193,7 +2411,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2244,28 +2462,28 @@ enum StructReturnType { StackStructReturn }; static StructReturnType -callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType -argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } @@ -2285,17 +2503,34 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// Return true if the calling convention is one that -/// supports tail call optimization. -static bool IsTailCallConvention(CallingConv::ID CC) { +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE); + CC == CallingConv::HiPE || CC == CallingConv::HHVM); +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } } -/// \brief Return true if the calling convention is a C calling convention. -static bool IsCCallConvention(CallingConv::ID CC) { - return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || - CC == CallingConv::X86_64_SysV); +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. 
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { @@ -2306,19 +2541,12 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) + if (!mayTailCallThisCC(CalleeCC)) return false; return true; } -/// Return true if the function is being made into -/// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, - bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && IsTailCallConvention(CC); -} - SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, @@ -2329,7 +2557,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2344,6 +2572,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, else ValVT = VA.getValVT(); + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -2352,14 +2593,24 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); return ExtendedInMem ? 
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2413,15 +2664,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } -SDValue -X86TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) - const { +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); @@ -2436,9 +2682,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -2471,6 +2725,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -2547,8 +2803,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. - if (FuncIsMadeTailCallSafe(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -2561,13 +2817,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } - MachineModuleInfo &MMI = MF.getMMI(); - const Function *WinEHParent = nullptr; - if (MMI.hasWinEHFuncInfo(Fn)) - WinEHParent = MMI.getWinEHParent(Fn); - bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; - bool IsWinEHParent = WinEHParent && WinEHParent == Fn; - // Figure out if XMM registers are in use. 
assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && @@ -2631,10 +2880,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -2656,27 +2906,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } else if (IsWin64 && IsWinEHOutlined) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( - /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); - - MMI.getWinEHFuncInfo(Fn) - .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = - FuncInfo->getRegSaveFrameIndex(); - - // Store the second integer parameter (rdx) into rsp+16 relative to the - // stack pointer at the entry of the function. - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); - Chain = DAG.getStore( - Val.getValue(1), dl, Val, RSFIN, - MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), - /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { @@ -2723,12 +2952,15 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && + if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins) == StackStructReturn) + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2743,21 +2975,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); - if (IsWinEHParent) { - if (Is64Bit) { - int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); - SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); - MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; - SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); - Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, - MachinePointerInfo::getFixedStack(UnwindHelpFI), - /*isVolatile=*/true, - /*isNonTemporal=*/false, /*Alignment=*/0); - } else { - // Functions using Win32 EH are considered to have opaque SP adjustments - // to force local variables to be addressed from the frame or base - // pointers. 
- MFI->setHasOpaqueSPAdjustment(true); + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -2777,9 +3008,10 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } /// Emit a load of return address if tail call @@ -2813,11 +3045,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewReturnAddrFI), + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2835,11 +3080,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + if (Attr.getValueAsString() == "true") isTailCall = false; @@ -2878,7 +3126,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ++NumTailCalls; } - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. 
@@ -2892,13 +3140,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && - IsTailCallConvention(CallConv)) + canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2970,7 +3218,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && - Arg.getValueType().getScalarType() == MVT::i1) + Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. @@ -2987,9 +3235,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); Arg = SpillSlot; break; } @@ -3125,10 +3374,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Flags, DAG, dl)); } else { // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); } } @@ -3207,7 +3456,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -3261,9 +3511,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); - // If this is an invoke in a 32-bit function using an MSVC personality, assume - // the function clobbers all registers. If an exception is thrown, the runtime - // will not restore CSRs. + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { @@ -3272,7 +3522,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallerFn->hasPersonalityFn() ? 
classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; - if (isMSVCEHPersonality(Pers)) + if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } @@ -3300,7 +3550,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee @@ -3358,8 +3608,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // EDI // local1 .. -/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned -/// for a 16 byte align requirement. +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { @@ -3380,9 +3630,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, @@ -3435,25 +3684,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible -/// for tail call optimization. Targets which want to do tail call -/// optimization should implement this function. -bool -X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - Type *RetTy, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. 
- const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -3474,7 +3717,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) + if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } @@ -3493,19 +3736,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // An stdcall/thiscall caller is expected to clean up its arguments; the - // callee isn't going to do that. - // FIXME: this is more restrictive than needed. We could produce a tailcall - // when the stack adjustment matches. For example, with a thiscall that takes - // only one argument. - if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || - CallerCC == CallingConv::X86_ThisCall)) - return false; - // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { - // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) @@ -3573,6 +3806,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + unsigned StackArgsSize = 0; + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -3587,11 +3822,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); - if (CCInfo.getNextStackOffset()) { - MachineFunction &MF = DAG.getMachineFunction(); - if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) - return false; + StackArgsSize = CCInfo.getNextStackOffset(); + if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -3642,6 +3875,21 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
+ return false; + } + return true; } @@ -3672,6 +3920,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: + case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -3688,11 +3937,13 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: return true; } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3707,7 +3958,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3772,23 +4023,23 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, return false; } -/// isCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool TailCallOpt) { + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: return !is64Bit; - case CallingConv::Fast: - case CallingConv::GHC: - case CallingConv::HiPE: - if (IsVarArg) - return false; - return TailCallOpt; } } @@ -3807,11 +4058,26 @@ static bool isX86CCUnsigned(unsigned X86CC) { case X86::COND_BE: return true; case X86::COND_AE: return true; } - llvm_unreachable("covered switch fell through?!"); } -/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code, returning the condition code and the LHS/RHS of the +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. 
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { @@ -3833,19 +4099,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } - switch (SetCCOpcode) { - default: llvm_unreachable("Invalid integer condition!"); - case ISD::SETEQ: return X86::COND_E; - case ISD::SETGT: return X86::COND_G; - case ISD::SETGE: return X86::COND_GE; - case ISD::SETLT: return X86::COND_L; - case ISD::SETLE: return X86::COND_LE; - case ISD::SETNE: return X86::COND_NE; - case ISD::SETULT: return X86::COND_B; - case ISD::SETUGT: return X86::COND_A; - case ISD::SETULE: return X86::COND_BE; - case ISD::SETUGE: return X86::COND_AE; - } + return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. @@ -3898,8 +4152,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } -/// hasFPCMov - is there a floating point cmov for the specific X86 condition -/// code. Current x86 isa includes the following FP cmov instructions: +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { @@ -3917,7 +4171,36 @@ static bool hasFPCMov(unsigned X86CC) { } } -/// isFPImmLegal - Returns true if the target can instruction select the + +bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + if (!IntrData) + return false; + + switch (IntrData->Type) { + case LOADA: + case LOADU: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1); + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + +/// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -3970,7 +4253,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasLZCNT(); } -/// isUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) @@ -3979,19 +4262,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } -/// isUndefOrInRange - Return true if Val is undef or if its value falls within -/// the specified range (L, H]. +/// Return true if Val is undef or if its value falls within the +/// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrEqual - Val is either less than zero (undef) or equal to the -/// specified value. +/// Val is either less than zero (undef) or equal to the specified value. 
static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } -/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, @@ -4002,9 +4284,8 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isVEXTRACTIndex - Return true if the specified -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for instruction that extract 128 or 256 bit vectors +/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector +/// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4021,7 +4302,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return Result; } -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR +/// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { @@ -4057,8 +4338,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) { static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACT"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4072,8 +4353,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERT"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4085,53 +4366,71 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { return Index / NumElemsPerChunk; } -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. 
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } -/// getInsertVINSERT128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } -/// getInsertVINSERT256Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// isZero - Returns true if Elt is a constant integer zero -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - -/// isZeroNode - Returns true if Elt is a constant zero or a floating point -/// constant +0.0. +/// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (isZero(Elt)) - return true; - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) - return CFP->getValueAPF().isPosZero(); - return false; + return isNullConstant(Elt) || isNullFPConstant(Elt); } -/// getZeroVector - Returns a vector of specified type with all zero elements. -/// -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. +static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. 
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4163,7 +4462,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); - } else if (VT.getScalarType() == MVT::i1) { + } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -4195,19 +4494,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } @@ -4245,13 +4543,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -4279,7 +4577,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); @@ -4302,6 +4600,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); @@ -4316,6 +4615,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +/// Insert i1-subvector to i1-vector. 
+static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower @@ -4334,18 +4708,22 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -/// getOnesVector - Returns a vector of specified type with all bits set. +/// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 
/// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - SDLoc dl) { +static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; - if (VT.is256BitVector()) { - if (HasInt256) { // AVX2 + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX @@ -4360,19 +4738,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getBitcast(VT, Vec); } -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); -} - -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +/// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4384,7 +4750,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. +/// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4396,10 +4762,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified -/// vector of zero or undef vector. This produces a shuffle where the low -/// element of V2 is swizzled into the zero/undef vector, landing at element -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, @@ -4415,13 +4781,12 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. Sets -/// IsUnary to true if only uses one source. Note that this will set IsUnary for -/// shuffles which use a single input multiple times, and in those cases it will +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. 
Sets IsUnary to true if only +/// uses one source. Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. -/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. -static bool getTargetShuffleMask(SDNode *N, MVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4438,6 +4803,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::INSERTPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -4482,7 +4852,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. - EVT VT = MaskNode.getValueType(); + MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) @@ -4530,8 +4900,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); - if (Mask.empty()) - return false; break; } @@ -4549,11 +4917,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); - if (Mask.empty()) return false; - // Mask only contains negative index if an element is zero. - if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) - return false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -4572,9 +4936,128 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + break; + } + return false; + } default: llvm_unreachable("unknown target shuffle node"); } + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + + // Check if we're getting a shuffle mask with zero'd elements. + if (!AllowSentinelZero) + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. 
@@ -4586,7 +5069,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return true; } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { @@ -4613,19 +5096,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); - unsigned NumElems = ShufVT.getVectorNumElements(); + int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; - if (Elt < 0) + if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); + SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4650,8 +5133,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. -/// +/// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4721,8 +5203,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return DAG.getBitcast(MVT::v16i8, V); } -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. -/// +/// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4753,7 +5234,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +/// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -4924,7 +5405,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~(RequiredAlign-1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -5157,8 +5638,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
@@ -5188,9 +5668,10 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5329,7 +5810,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); @@ -5366,7 +5847,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); @@ -5600,7 +6081,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); @@ -5662,12 +6143,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); - if (InVec0.getValueType() != VT) + if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); - if (InVec1.getValueType() != VT) + if (InVec1.getSimpleValueType() != VT) return SDValue(); } @@ -5703,7 +6184,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; @@ -5845,7 +6326,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. 
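
ConvertI1VectorToInteger (whose spelling the hunk above fixes) folds a constant vXi1 build_vector into a single integer immediate, with lane 0 landing in bit 0, which is then bitcast to the mask register type. A rough equivalent of the packing step in plain C++ (packI1Lanes is an illustrative name, not the LLVM routine):

#include <cstdint>
#include <vector>

// Pack constant i1 lanes into an immediate, lane i -> bit i.
uint64_t packI1Lanes(const std::vector<bool> &Lanes) {
  uint64_t Imm = 0;
  for (size_t i = 0; i < Lanes.size(); ++i)
    if (Lanes[i])
      Imm |= uint64_t(1) << i;
  return Imm;
}

// e.g. a v4i1 constant <1,0,1,1> packs to 0b1101.
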
- if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5866,7 +6347,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Op; if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); @@ -5881,7 +6362,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumZero = 0; unsigned NumNonZero = 0; - unsigned NonZeros = 0; + uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet<SDValue, 8> Values; for (unsigned i = 0; i < NumElems; ++i) { @@ -5895,7 +6376,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (X86::isZeroNode(Elt)) NumZero++; else { - NonZeros |= (1 << i); + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } @@ -5919,7 +6401,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); - EVT VecVT = MVT::v4i32; + MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). @@ -6051,7 +6533,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, - Op.getOperand(Idx)); + Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); @@ -6059,13 +6541,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) - if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) - if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS @@ -6077,7 +6559,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -6177,7 +6659,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
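
The NonZeros widening in the hunk above matters once build_vectors reach 64 lanes (v64i8 with AVX-512BW): a 32-bit `1 << i` is undefined for i >= 32, so both the accumulator and the shifted constant must be 64-bit. A minimal sketch of the safe form (plain C++; markNonZeroLane is an illustrative name):

#include <cassert>
#include <cstdint>

// Record that build_vector lane i is nonzero; valid for i in [0, 63].
uint64_t markNonZeroLane(uint64_t NonZeros, unsigned i) {
  assert(i < sizeof(NonZeros) * 8 && "shift out of range");
  return NonZeros | (uint64_t(1) << i); // a plain 1 << i breaks for i >= 32
}

int main() {
  uint64_t NonZeros = markNonZeroLane(0, 40); // fine with a 64-bit mask
  assert(NonZeros == (uint64_t(1) << 40));
  return 0;
}
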
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -6193,8 +6675,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), @@ -6213,8 +6695,27 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + // Specialize the cases when all, or all but one, of the operands are undef. + unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector<SDValue, 2> Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) @@ -6227,31 +6728,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } + // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); - + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) - return getZeroVector(ResVT, Subtarget, DAG, dl); + return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResVT); - unsigned NumElems = ResVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); - V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); if (IsZeroV1) - return V2; + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - // Zero the upper bits of V1 - V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); - V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); - if (IsZeroV2) - return V1; - return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); + return 
DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -6272,7 +6780,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, return LowerAVXCONCAT_VECTORS(Op, DAG); } - //===----------------------------------------------------------------------===// // Vector shuffle lowering // @@ -6422,6 +6929,127 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. 
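
For a standalone picture of what computeZeroableShuffleElements (moved ahead of its users in this diff) computes: a lane is "zeroable" when its mask entry is undef or when it selects from an all-zeros input. A simplified plain-C++ sketch over flat arrays, omitting the build_vector digging the real code also performs (names illustrative):

#include <vector>

// Mask entries < 0 are undef, [0, Size) picks V1, [Size, 2*Size) picks V2.
std::vector<bool> computeZeroable(const std::vector<int> &Mask,
                                  bool V1IsZero, bool V2IsZero) {
  int Size = (int)Mask.size();
  std::vector<bool> Zeroable(Mask.size(), false);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero))
      Zeroable[i] = true;
  }
  return Zeroable;
}

// e.g. Mask <0, -1, 5, 7> with V2 all-zeros yields {0, 1, 1, 1}.
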
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -6431,7 +7059,7 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); - MVT EltVT = VT.getScalarType(); + MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, @@ -6458,22 +7086,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is in fact a blend. +/// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, + SDValue V2, ArrayRef<int> Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] >= Size) { - if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input! + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { BlendMask |= 1u << i; continue; } - if (Mask[i] >= 0 && Mask[i] != i) - return SDValue(); // Shuffled V1 input! 
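
The bitmask lowering added above models a blend-with-zero as a single AND: every surviving lane must come from one input at its identity position, and the built mask vector is all-ones in exactly those lanes. A scalar model of the emitted operation (plain C++; the real code emits one vector ISD::AND or X86ISD::FAND):

#include <cassert>
#include <cstdint>

int main() {
  // v4i32 input lanes and the AND-mask the lowering would build:
  // all-ones where the lane survives, zero where the shuffle needs a zero.
  uint32_t In[4]    = {11, 22, 33, 44};
  uint32_t VMask[4] = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};

  uint32_t Out[4];
  for (int i = 0; i < 4; ++i)
    Out[i] = In[i] & VMask[i]; // a single VPAND in the real lowering

  assert(Out[0] == 11 && Out[1] == 0 && Out[2] == 33 && Out[3] == 0);
  return 0;
}
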
+ if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -6493,12 +7161,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -6511,12 +7174,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -6541,9 +7199,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { - assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -6760,11 +7422,11 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } - assert(VT.getSizeInBits() == 128 && + assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); @@ -6785,92 +7447,6 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } -/// \brief Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. 
Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; - continue; - } - - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. - SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) - continue; - - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; - } - - return Zeroable; -} - -/// \brief Try to emit a bitmask instruction for a shuffle. -/// -/// This handles cases where we can model a blend exactly as a bitmask due to -/// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - MVT EltVT = VT.getScalarType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getBitcast(EltVT, Zero); - AllOnes = DAG.getBitcast(EltVT, AllOnes); - } - SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SDValue V; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Zeroable[i]) - continue; - if (Mask[i] % Size != i) - return SDValue(); // Not a blend. - if (!V) - V = Mask[i] < Size ? V1 : V2; - else if (V != (Mask[i] < Size ? V1 : V2)) - return SDValue(); // Can only let one input through the mask. - - VMaskOps[i] = AllOnes; - } - if (!V) - return SDValue(); // No non-zeroable elements! - - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; -} - /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -6982,7 +7558,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; - for (; Len >= 0; --Len) + for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); @@ -6997,8 +7573,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue &V = (M < Size ? V1 : V2); M = M % Size; - // All mask elements must be in the lower half. 
-      if (M > HalfSize)
+      // The extracted elements must start at a valid index and all mask
+      // elements must be in the lower half.
+      if (i > M || M >= HalfSize)
         return SDValue();
 
       if (Idx < 0 || (Src == V && Idx == (M - i))) {
@@ -7095,64 +7672,104 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
 ///
 /// Given a specific number of elements, element bit width, and extension
 /// stride, produce either a zero or any extension based on the available
-/// features of the subtarget.
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from a nonzero element offset in the input; to avoid excess
+/// shuffling, the offset must either be in the bottom lane or at the start
+/// of a higher lane. All extended elements must be from the same lane.
 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
+    SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
     ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   assert(Scale > 1 && "Need a scale to extend.");
-  int NumElements = VT.getVectorNumElements();
   int EltBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = 128 / EltBits;
+  int OffsetLane = Offset / NumEltsPerLane;
   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
          "Only 8, 16, and 32 bit elements can be extended.");
   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+  assert(0 <= Offset && "Extension offset must be non-negative.");
+  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+         "Extension offset must be in the first lane or start an upper lane.");
+
+  // Check that an index is in the same lane as the base offset.
+  auto SafeOffset = [&](int Idx) {
+    return OffsetLane == (Idx / NumEltsPerLane);
+  };
+
+  // Shift along an input so that the offset base moves to the first element.
+  auto ShuffleOffset = [&](SDValue V) {
+    if (!Offset)
+      return V;
+
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = 0; i * Scale < NumElements; ++i) {
+      int SrcIdx = i + Offset;
+      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+    }
+    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+  };
 
   // Found a valid zext mask! Try various lowering strategies based on the
   // input type and available ISA extensions.
   if (Subtarget->hasSSE41()) {
+    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
+    // PUNPCK will catch this in a later shuffle match.
+    if (Offset && Scale == 2 && VT.is128BitVector())
+      return SDValue();
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
-    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+    return DAG.getBitcast(VT, InputV);
  }
 
+  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
   if (AnyExt && EltBits == 32) {
-    int PSHUFDMask[4] = {0, -1, 1, -1};
+    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ?
Offset + 1 : -1, + -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { - int PSHUFDMask[4] = {0, -1, 0, -1}; + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); - int PSHUFHWMask[4] = {1, -1, -1, -1}; + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); - assert(VT.getSizeInBits() == 128 && "Unexpected vector width!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(0, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements/2, NumElements/2)) + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); - SDValue Hi = - DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(EltBits, DL, MVT::i8))); + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -7163,9 +7780,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; - for (int i = 0; i < 16; ++i) - PSHUFBMask[i] = - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8); + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, @@ -7173,13 +7792,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT::v16i8, PSHUFBMask))); } + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. + int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + // Otherwise emit a sequence of unpacks. 
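
The PSHUFB path in this hunk builds a byte-select control where every byte whose index is not a multiple of Scale gets 0x80, which PSHUFB turns into a zero byte. A standalone sketch of the control construction (plain C++, mirroring the loop above but omitting the SafeOffset clamp the real code applies to out-of-lane indices; buildZExtPshufbMask is an illustrative name):

#include <cstdint>
#include <vector>

// Byte i selects source byte Offset + i/Scale when i is a multiple of
// Scale, else 0x80 (PSHUFB zero-fill).
std::vector<uint8_t> buildZExtPshufbMask(int Scale, int Offset) {
  std::vector<uint8_t> M(16);
  for (int i = 0; i < 16; ++i) {
    int Idx = Offset + (i / Scale);
    M[i] = (i % Scale == 0) ? (uint8_t)Idx : 0x80;
  }
  return M;
}

// Scale == 4, Offset == 0 zero-extends the low four bytes:
// {0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, ...}.
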
do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); - InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; @@ -7205,7 +7841,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); @@ -7215,8 +7853,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; + int Offset = 0; + int Matches = 0; for (int i = 0; i < NumElements; ++i) { - if (Mask[i] == -1) + int M = Mask[i]; + if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. @@ -7230,14 +7871,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // Each of the base elements needs to be consecutive indices into the // same input vector. - SDValue V = Mask[i] < NumElements ? V1 : V2; - if (!InputV) + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { InputV = V; - else if (InputV != V) + Offset = M - (i / Scale); + } else if (InputV != V) return SDValue(); // Flip-flopping inputs. - if (Mask[i] % NumElements != i / Scale) + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. + if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. + Matches++; } // If we fail to find an input, we have a zero-shuffle which should always @@ -7246,8 +7902,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (!InputV) return SDValue(); + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7355,8 +8016,9 @@ static SDValue lowerVectorShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. 
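
The do/while loop in the hunk above performs the extension in log2(Scale) rounds, interleaving with a zero (or undef) vector and doubling the element width each time. A plain C++ model at byte granularity, which coincides with the word-level second step here because the interleaved filler bytes are zero (unpckl is an illustrative stand-in for UNPCKL):

#include <cassert>
#include <cstdint>
#include <vector>

// Model of one UNPCKL step: interleave the low halves of A and B.
std::vector<uint8_t> unpckl(const std::vector<uint8_t> &A,
                            const std::vector<uint8_t> &B) {
  std::vector<uint8_t> R;
  for (size_t i = 0; i < A.size() / 2; ++i) {
    R.push_back(A[i]);
    R.push_back(B[i]);
  }
  return R;
}

int main() {
  std::vector<uint8_t> V = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<uint8_t> Zero(8, 0);

  std::vector<uint8_t> R1 = unpckl(V, Zero);  // 1,0,2,0,3,0,4,0
  std::vector<uint8_t> R2 = unpckl(R1, Zero); // 1,0,0,0,2,0,0,0
  // Little-endian, each original byte now reads as a zero-extended element.
  assert(R2[0] == 1 && R2[1] == 0 && R2[3] == 0 && R2[4] == 2);
  return 0;
}
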
- if (SDValue V2S = getScalarValueForVectorElement( - V2, Mask[V2Index] - Mask.size(), DAG)) { + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { @@ -7431,11 +8093,65 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, @@ -7476,7 +8192,7 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = - BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; @@ -7491,6 +8207,22 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. 
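
lowerVectorShuffleAsTruncBroadcast above turns "broadcast sub-element k of a wider scalar" into a shift-right, truncate, and broadcast. The scalar arithmetic in isolation (plain C++, for an 8-bit sub-element of a 32-bit value; extractByte is an illustrative name):

#include <cassert>
#include <cstdint>

// The SRL + TRUNCATE pair emitted before the VBROADCAST: sub-element
// OffsetIdx of width 8 within a 32-bit scalar (x86 lane order).
uint8_t extractByte(uint32_t Scalar, unsigned OffsetIdx) {
  return (uint8_t)(Scalar >> (OffsetIdx * 8));
}

int main() {
  uint32_t S = 0xDDCCBBAA;
  assert(extractByte(S, 0) == 0xAA); // BroadcastIdx % Scale == 0: no shift
  assert(extractByte(S, 2) == 0xCC); // otherwise: shift, then truncate
  return 0;
}
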
+ // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + MVT BroadcastVT = VT; + + // Peek through any bitcast (only useful for loads). + SDValue BC = V; + while (BC.getOpcode() == ISD::BITCAST) + BC = BC.getOperand(0); + + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -7499,13 +8231,32 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(BC); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = BroadcastVT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); + V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); + return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -7595,9 +8346,10 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && @@ -7774,10 +8526,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+    return V;
 
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
@@ -7869,10 +8620,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8077,14 +8827,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
-  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+    return V;
 
   // Otherwise fall back to a SHUFPS lowering strategy.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
@@ -8161,14 +8906,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
-  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8184,8 +8924,8 @@
                                                     Mask, DAG);
 
   // Try to lower by permuting the inputs into an unpack instruction.
-  if (SDValue Unpack =
-          lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
+                                                            V2, Mask, DAG))
     return Unpack;
 
   // We implement this with SHUFPS because it can blend from two vectors.
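
All of the hand-written unpack patterns deleted in these hunks (and the analogous ones that follow) are produced from a single formula by the lowerVectorShuffleWithUNPCK helper added earlier in this diff. A standalone reimplementation of just the mask computation, with the expected results asserted for two representative types (plain C++):

#include <cassert>
#include <vector>

// Per-128-bit-lane interleave masks, as in lowerVectorShuffleWithUNPCK.
void makeUnpackMasks(int NumElts, int NumEltsInLane,
                     std::vector<int> &Unpckl, std::vector<int> &Unpckh) {
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    Unpckl.push_back(LoPos);
    Unpckh.push_back(LoPos + NumEltsInLane / 2);
  }
}

int main() {
  std::vector<int> Lo, Hi;
  makeUnpackMasks(4, 4, Lo, Hi); // v4i32 / v4f32
  assert((Lo == std::vector<int>{0, 4, 1, 5}));
  assert((Hi == std::vector<int>{2, 6, 3, 7}));

  Lo.clear(); Hi.clear();
  makeUnpackMasks(8, 4, Lo, Hi); // v8i32: interleave within each lane
  assert((Lo == std::vector<int>{0, 8, 1, 9, 4, 12, 5, 13}));
  return 0;
}
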
@@ -8218,7 +8958,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
-  assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
 
   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
@@ -8286,16 +9026,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
 
+    bool ThreeAInputs = AToAInputs.size() == 3;
+
     // Compute the index of dword with only one word among the three inputs in
     // a half by taking the sum of the half with three inputs and subtracting
     // the sum of the actual three inputs. The difference is the remaining
     // slot.
     int ADWord, BDWord;
-    int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
-    int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
-    int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
-    ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
-    int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
+    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
     int TripleNonInputIdx =
         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
@@ -8364,8 +9106,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
       FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
     } else {
       assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
-      int APinnedIdx =
-          AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+      int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
       FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
     }
   }
@@ -8751,10 +9492,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return Shift;
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
-      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
-    if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
-      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+      return V;
 
     // Try to use byte rotation instructions.
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
@@ -8798,10 +9538,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
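
The ThreeAInputs refactor above keeps the "sum trick" used to locate the one word slot not occupied by the three inputs: the word indices of a half sum to a known constant, so subtracting the three used indices leaves the free one. In isolation (plain C++, simplified to a half at offset 0):

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // The four word indices of the low half sum to 0 + 1 + 2 + 3 == 6.
  std::vector<int> TripleInputs = {0, 1, 3}; // the three occupied slots
  int TripleInputSum = 0 + 1 + 2 + 3;
  int FreeSlot = TripleInputSum - std::accumulate(TripleInputs.begin(),
                                                  TripleInputs.end(), 0);
  assert(FreeSlot == 2);
  return 0;
}
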
if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -8812,8 +9551,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it @@ -9037,17 +9776,14 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 0, 16, 1, 17, 2, 18, 3, 19, - // High half. - 4, 20, 5, 21, 6, 22, 7, 23})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 8, 24, 9, 25, 10, 26, 11, 27, - // High half. - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9086,8 +9822,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; } @@ -9296,7 +10032,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; - MVT ScalarVT = VT.getScalarType(); + MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build @@ -9308,7 +10044,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; @@ -9478,7 +10214,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, ArrayRef<int> Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be @@ -9682,6 +10418,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. 
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? 
V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -9776,16 +10614,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -9876,14 +10708,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -9941,14 +10768,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. 
We also need to squash the @@ -9974,9 +10796,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v8i32, VPermMask)), - V1); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10041,14 +10861,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; } // Try to use shift instructions. @@ -10115,18 +10930,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10215,22 +11021,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10296,12 +11089,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; - // There is a really nice hard cut-over between AVX1 and AVX2 that means we can - // check for those subtargets here and avoid much of the subtarget querying in - // the per-vector-type lowering routines. 
With AVX1 we have essentially *zero* - // ability to manipulate a 256-bit vector with integer types. Since we'll use - // floating point types there eventually, just immediately cast everything to - // a float and operate entirely in that domain. + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) @@ -10334,6 +11132,57 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } +/// \brief Try to lower a vector shuffle as 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() + // is most probably a better solution for that case. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -10345,21 +11194,21 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL.
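As an aside on the encoding in lowerV4X128VectorShuffle above: the loop packs one 2-bit source-lane selector per destination 128-bit lane into VSHUF64X2's immediate control byte. A standalone C++ sketch of the same packing, assuming sentinel values modeled on SM_SentinelUndef/SM_SentinelZero (everything here is illustrative, not the SelectionDAG code):

    #include <cstdio>
    #include <vector>

    // Sentinels modeled on LLVM's SM_SentinelUndef / SM_SentinelZero.
    constexpr int kUndef = -1;
    constexpr int kZero = -2;

    // Pack a widened 4-lane mask into a VSHUF64X2-style immediate:
    // two bits per destination lane select the source 128-bit lane.
    // Returns false for zeroed lanes, which the immediate cannot express.
    bool encodeShuf128Imm(const std::vector<int> &WidenedMask, unsigned &Imm) {
      Imm = 0;
      unsigned ControlBits = WidenedMask.size() / 2; // 2 bits for 4 lanes
      for (unsigned i = 0; i < WidenedMask.size(); ++i) {
        if (WidenedMask[i] == kZero)
          return false;
        // An undef lane may take any source; use lane 0, as the code above does.
        unsigned Lane = (WidenedMask[i] == kUndef) ? 0 : WidenedMask[i];
        Imm |= (Lane % WidenedMask.size()) << (i * ControlBits);
      }
      return true;
    }

    int main() {
      unsigned Imm;
      if (encodeShuf128Imm({1, 0, 3, 2}, Imm)) // swap adjacent lane pairs
        std::printf("imm = 0x%02x\n", Imm);    // prints imm = 0xb1
      return 0;
    }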
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); @@ -10367,22 +11216,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. @@ -10396,21 +11234,21 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); @@ -10418,22 +11256,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10448,8 +11275,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. @@ -10517,6 +11343,60 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to a SIMD +// vector, shuffle, and then truncate it back.
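That comment is the whole strategy behind lower1BitVectorShuffle, which follows: widen the i1 mask to an ordinary SIMD vector, shuffle there, and truncate back to bits. A scalar C++ model of that round-trip on an 8-bit mask (the helper names are invented for illustration):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Widen: bit i of the mask becomes byte i (0xFF / 0x00, i.e. sign-extended).
    std::array<uint8_t, 8> widenMaskToBytes(uint8_t Mask) {
      std::array<uint8_t, 8> Bytes{};
      for (int i = 0; i < 8; ++i)
        Bytes[i] = ((Mask >> i) & 1) ? 0xFF : 0x00;
      return Bytes;
    }

    // Truncate: byte i collapses back to bit i of the mask.
    uint8_t truncateBytesToMask(const std::array<uint8_t, 8> &Bytes) {
      uint8_t Mask = 0;
      for (int i = 0; i < 8; ++i)
        Mask |= (Bytes[i] & 1) << i;
      return Mask;
    }

    int main() {
      uint8_t Mask = 0x0D;                       // v8i1 value <1,0,1,1,0,0,0,0>
      int Shuffle[8] = {7, 6, 5, 4, 3, 2, 1, 0}; // reverse the elements
      std::array<uint8_t, 8> Wide = widenMaskToBytes(Mask), Out{};
      for (int i = 0; i < 8; ++i)
        Out[i] = Wide[Shuffle[i]];
      std::printf("0x%02x\n", (unsigned)truncateBytesToMask(Out)); // prints 0xb0
      return 0;
    }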
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10533,8 +11413,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -10572,7 +11454,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; - if (VT.getScalarSizeInBits() < 64 && + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) @@ -10640,17 +11522,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, } // For each vector width, delegate to a specialized lowering routine. - if (VT.getSizeInBits() == 128) + if (VT.is128BitVector()) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - if (VT.getSizeInBits() == 256) + if (VT.is256BitVector()) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - // Force AVX-512 vectors to be scalarized for now. - // FIXME: Implement AVX-512 support! 
- if (VT.getSizeInBits() == 512) + if (VT.is512BitVector()) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -10661,11 +11543,16 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + unsigned NumElemsInLane = NumElems / NumLanes; - // Blend for v16i16 should be symetric for the both lanes. + // Blend for v16i16 should be symmetric for both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = @@ -10673,20 +11560,25 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, int Lane1Cond = -1, Lane2Cond = -1; if (isa<ConstantSDNode>(EltCond)) - Lane1Cond = !isZero(EltCond); + Lane1Cond = !isNullConstant(EltCond); if (isa<ConstantSDNode>(SndLaneEltCond)) - Lane2Cond = !isZero(SndLaneEltCond); + Lane2Cond = !isNullConstant(SndLaneEltCond); + unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. - MaskValue |= !Lane1Cond << i; + LaneMask = !Lane1Cond << i; else if (Lane1Cond < 0) - MaskValue |= !Lane2Cond << i; + LaneMask = !Lane2Cond << i; else return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; } return true; } @@ -10711,7 +11603,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( - isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1); + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } @@ -10776,9 +11669,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT.getSizeInBits() == 16) { - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, @@ -10801,8 +11693,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || - (isa<ConstantSDNode>(Op.getOperand(1)) && - cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -10900,10 +11791,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); - //if (IdxVal >= NumElems/2) - // IdxVal -= NumElems/2; - IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; + // Find IdxVal modulo ElemsPerChunk.
Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } @@ -10918,8 +11810,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), @@ -10951,8 +11842,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. @@ -11039,7 +11929,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); @@ -11078,8 +11970,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - const Function *F = DAG.getMachineFunction().getFunction(); - bool MinSize = F->hasFnAttribute(Attribute::MinSize); + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -11199,14 +12090,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. 
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } @@ -11218,37 +12120,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.getVectorElementType() == MVT::i1) { - if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal - return Op; - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(OpVT); - unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); - - if (IdxVal == OpVT.getVectorNumElements() / 2) { - // Zero upper bits of the Vec - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - if (IdxVal == 0) { - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - // Zero upper bits of the Vec2 - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); - // Zero lower bits of the Vec - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - // Merge them together - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - } + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + return SDValue(); } @@ -11363,7 +12237,8 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -11430,7 +12305,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -11587,7 +12463,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -11599,10 +12476,18 @@ SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. 
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -11651,8 +12536,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -11825,15 +12714,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, ValueToStore, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -11855,10 +12752,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); - MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); @@ -11884,16 +12780,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); - Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, 
false, false, 0); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); } return Result; @@ -11937,16 +12833,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; @@ -11996,10 +12895,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. - EVT DestVT = Op.getValueType(); + MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, @@ -12025,14 +12925,23 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + SDLoc DL(Op); SDValue V = Op->getOperand(0); - EVT VecIntVT = V.getValueType(); + MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; - EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. - if (VecFloatVT != Op->getValueType(0)) + if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); @@ -12070,7 +12979,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue Low, High; if (Subtarget.hasSSE41()) { - EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + MVT VecI16VT = Is128 ? 
MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); @@ -12108,6 +13017,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; @@ -12137,11 +13047,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); case MVT::v16i8: case MVT::v16i16: - if (Subtarget->hasAVX512()) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } - llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -12150,7 +13059,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (Op.getValueType().isVector()) + if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't @@ -12161,6 +13070,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. + return Op; + } + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) @@ -12184,7 +13101,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -12193,10 +13116,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) 
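For reference, the vXi32 algorithm quoted in the comments a few hunks above can be checked in scalar form. This C++ sketch mirrors the magic constants (0x4b000000 is 2^23 as a float, 0x53000000 is 2^39) and keeps the two FADDs in the order the PR24512 comment warns must not be reassociated; it is a model of the idea, not the vectorized lowering:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar model of the uint32 -> float trick: place the low and high
    // 16-bit halves into the mantissas of two biased floats, strip the
    // biases from the high part, then add. Exact until the final FADD.
    float uint32ToFloat(uint32_t V) {
      uint32_t LoBits = (V & 0xFFFFu) | 0x4b000000u; // float 2^23 + lo
      uint32_t HiBits = (V >> 16) | 0x53000000u;     // float 2^39 + hi * 2^16
      float Lo, Hi;
      std::memcpy(&Lo, &LoBits, sizeof(float));
      std::memcpy(&Hi, &HiBits, sizeof(float));
      float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // remove both biases
      return Lo + FHi; // the single rounding step; must not be reassociated
    }

    int main() {
      uint32_t V = 0xFFFFFFFFu;
      std::printf("%a vs %a\n", uint32ToFloat(V), (float)V);
      return 0;
    }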
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, 8, 8); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; @@ -12223,24 +13145,52 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), - FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, 4); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. +// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. +// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. std::pair<SDValue,SDValue> -X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. 
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } @@ -12258,42 +13208,87 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 either into FISTP64 followed by a load from a temporary - // stack slot, or into the FTOL runtime function. + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; - if (!IsSigned && isIntegerTypeFTOL(DstTy)) - Opc = X86ISD::WIN_FTOL; - else - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. + // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. + + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
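The Adjust/FistSrc recipe spelled out in the UnsignedFixup comment above reduces to a short scalar identity. A C++ model (a hypothetical helper; a plain cast stands in for the FIST, so truncation semantics are assumed here):

    #include <cstdint>
    #include <cstdio>

    // Only a signed fp -> i64 convert is available, so values >= 2^63 are
    // rebased below the signed range first, and the sign bit is patched
    // back in afterwards (the XOR matches the high-word XOR above).
    uint64_t doubleToUint64ViaSigned(double Value) {
      const double Thresh = 0x1.0p63; // 2^63, exactly representable
      uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
      double FistSrc = (Value < Thresh) ? Value : Value - Thresh;
      return (uint64_t)(int64_t)FistSrc ^ Adjust;
    }

    int main() {
      // 1.5 * 2^63 is outside the signed range but converts correctly:
      std::printf("%llu\n",
                  (unsigned long long)doubleToUint64ViaSigned(0x1.8p63));
      // prints 13835058055282163712
      return 0;
    }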
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); @@ -12301,28 +13296,52 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, } MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); + + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - if (Opc != X86ISD::WIN_FTOL) { + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. + // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); - } else { - SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Chain, Value); - SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, - MVT::i32, ftol.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, - MVT::i32, eax.getValue(2)); - SDValue Ops[] = { eax, edx }; - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) - : DAG.getMergeValues(Ops, DL); - return std::make_pair(pair, SDValue()); } } @@ -12333,7 +13352,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1) + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); // Optimize vectors in AVX mode: @@ -12426,6 +13445,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shifting packed bytes is not supported natively; bitcast to packed words + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so the vector should be extended to packed dword/qword.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -12443,42 +13518,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); - - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -12583,7 +13633,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) return Op; + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12600,7 +13651,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; - assert(FIST.getNode() && "Unexpected failure"); + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 
+ if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12643,6 +13696,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -12650,11 +13705,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { MVT LogicVT; MVT EltVT; unsigned NumElts; - + if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -12675,9 +13735,10 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -12685,7 +13746,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -12704,6 +13765,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -12718,13 +13780,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector<Constant *, 4> CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). @@ -12737,11 +13802,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? 
MVT::v2f64 : MVT::v4f32; - SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -12750,8 +13817,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12761,18 +13829,21 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa<ConstantFPSDNode>(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12859,7 +13930,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } - EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) @@ -12999,14 +14070,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13135,13 +14206,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. 
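The constant-pool masks built in the FCOPYSIGN hunks above implement the usual AND/OR sign transfer. The scalar equivalent, for reference (plain bit manipulation, not the SSE constant-pool form):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // copysign via the same masking FCOPYSIGN lowers to: keep every bit
    // of Mag except the sign, take only the sign bit of Sgn, then OR.
    double copysignViaMasks(double Mag, double Sgn) {
      const uint64_t SignBit = 0x8000000000000000ULL;
      uint64_t MagBits, SgnBits;
      std::memcpy(&MagBits, &Mag, sizeof(double));
      std::memcpy(&SgnBits, &Sgn, sizeof(double));
      uint64_t Result = (MagBits & ~SignBit) | (SgnBits & SignBit);
      double Out;
      std::memcpy(&Out, &Result, sizeof(double));
      return Out;
    }

    int main() {
      std::printf("%f\n", copysignViaMasks(3.5, -0.0)); // prints -3.500000
      return 0;
    }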
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - if (Op0.getValueType() == MVT::i1) - llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13150,8 +14219,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -13188,6 +14256,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -13261,13 +14332,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. -bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { - return NumUsers > 1; -} - -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isAllOnesValue(); +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node @@ -13285,8 +14351,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. 
unsigned BitWidth = Op0.getValueSizeInBits(); @@ -13423,7 +14488,7 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, @@ -13467,8 +14532,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && - Op.getValueType().getScalarType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); @@ -13515,7 +14580,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); - if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. @@ -13606,13 +14671,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - EVT OpVT = Op1.getValueType(); + MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { - if (Op1.getValueType().is512BitVector() || + if (Op1.getSimpleValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); @@ -13628,6 +14693,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. + unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -13777,7 +14869,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); @@ -13818,11 +14910,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; @@ -13831,17 +14921,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -13854,8 +14941,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -13876,6 +14962,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -13918,7 +15021,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); - EVT VT = Op1.getValueType(); + MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops @@ -13927,7 +15030,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && - VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); @@ -13961,12 +15064,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. - EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); - EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); @@ -13980,26 +15083,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - if (VT.isVector() && VT.getScalarType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + 
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); } } @@ -14026,22 +15129,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14061,11 +15163,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14073,11 +15174,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -14136,15 +15235,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate if the high bits are known zero. + // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. 
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14166,11 +15264,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -14256,8 +15355,8 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); - MVT InSVT = InVT.getScalarType(); - assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); @@ -14276,7 +15375,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. - while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); @@ -14286,7 +15385,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = - CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } @@ -14346,7 +15445,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); @@ -14470,7 +15569,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize / MemVT.getScalarType().getSizeInBits()); + loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -14518,29 +15617,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, return Sext; } - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. 
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && - "We can't implement a sext load without an arithmetic right shift!"); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; - - SDValue Shuff = DAG.getVectorShuffle( - WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); - - Shuff = DAG.getBitcast(RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = - DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, dl, RegVT)); + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } @@ -14577,11 +15659,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -14597,8 +15677,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isa<ConstantSDNode>(Cond.getOperand(1)) && - cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || @@ -14625,11 +15704,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
@@ -14673,16 +15750,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } @@ -14844,8 +15919,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14877,54 +15951,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. 
- SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -14942,10 +16002,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); @@ -14967,9 +16025,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -14980,7 +16043,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -15019,10 +16083,11 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL)); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, - MachinePointerInfo(SV, 16), false, false, 0); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -15030,10 +16095,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && - "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. 
+ return DAG.expandVAArg(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); @@ -15061,8 +16129,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget->useSoftFloat() && - !(DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::NoImplicitFloat)) && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -15091,8 +16158,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); @@ -15230,72 +16303,126 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); - EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (and \p Op, \p Mask) for compare instructions or -/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. -static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), - MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); +/// \brief Return \p Mask with the necessary casting or extending +/// performed according to \p MaskVT when lowering masking intrinsics. +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } - assert(MaskVT.isSimple() && "invalid mask type"); + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In 32-bit mode, a bitcast of i64 is illegal; extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); - if (isAllOnes(Mask)) - return Op; + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT requires fewer than 64 bits. Truncate the mask (this should + // always succeed) and bitcast. 
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics. +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example, vpmovqb requires only AVX512, + // while a vselect that operates on byte elements requires BWI. + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is comming as MVT::i8 and it should be truncated +/// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for -/// a scalar instruction. +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. 
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - EVT VT = Op.getValueType(); - SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -15309,15 +16436,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -15334,29 +16462,35 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize - // ParentFP = RegNodeBase - RegNodeFrameOffset + // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); - return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { @@ -15365,6 +16499,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -15376,28 +16513,53 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; + // We always add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. if (Op.getNumOperands() == 4) - RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); - } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); + SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, Passthru, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); @@ -15405,7 +16567,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: - // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); @@ -15421,11 +16583,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK: { + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -15440,8 +16607,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1,Src2), + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { @@ -15449,7 +16616,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding modes. + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic has a rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
@@ -15486,7 +16698,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: - case VPERM_3OP_MASK: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { @@ -15494,11 +16726,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element - if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; @@ -15523,6 +16755,50 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. + if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. 
@@ -15534,12 +16810,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) - EVT VT = Op.getOperand(1).getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); @@ -15573,6 +16848,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); @@ -15584,6 +16885,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -15598,27 +16917,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -15832,23 +17199,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. 
@@ -15860,7 +17221,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -15871,25 +17232,19 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. @@ -15907,12 +17262,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); @@ -16034,64 +17388,59 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); - // Reload the base pointer (ESI) with the adjusted incoming EBP. - SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); - // Reload the spilled EBP value, now that the stack and base pointers are - // set up. - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - X86FI->setHasSEHFramePtrSave(true); - int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); - X86FI->setSEHFramePtrSaveIndex(FI); - SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), - MachinePointerInfo(), false, false, false, - VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); - } + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); - return Chain; + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16100,16 +17449,26 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) - return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasOpaqueSPAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. + return SDValue(); + } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { - default: - llvm_unreachable("Unknown Intrinsic Type"); - break; + default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. @@ -16208,14 +17567,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = DataToCompress.getValueType(); - if (isAllOnes(Mask)) // return just a store + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -16227,15 +17585,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -16248,6 +17611,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected 
MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -16359,6 +17741,21 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; +} + SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); @@ -16497,9 +17894,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. - InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" @@ -16588,8 +17987,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, 2, 2); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, @@ -16623,12 +18022,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support require VLX) are expended +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - EVT OpVT = VT; + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. 
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split vector, it's Lo and Hi parts will be handled in next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -16658,7 +18120,8 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -16686,13 +18149,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - unsigned NumBits = VT.getSizeInBits(); + unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); - Op = Op.getOperand(0); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. 
SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { @@ -16753,6 +18242,13 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -16885,7 +18381,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AhiBlo = Ahi; SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; A = DAG.getBitcast(MulVT, A); B = DAG.getBitcast(MulVT, B); @@ -16962,7 +18458,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); - EVT VT = Op0.getValueType(); + MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || @@ -17034,7 +18530,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } -// Return true if the requred (according to Opcode) shift-imm form is natively +// Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17054,14 +18550,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, } // The shift amount is a variable, but it is the same for all vector lanes. -// These instrcutions are defined together with shift-immediate. +// These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } -// Return true if the requred (according to Opcode) variable-shift form is +// Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17133,27 +18629,37 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // i64 SRA needs to be performed as partial shifts. 
if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA) + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - if (Op.getOpcode() == ISD::SHL) { - // Simple i8 add case - if (ShiftAmt == 1) - return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + + if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -17161,24 +18667,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } - - // R s>> a === ((R u>> a) ^ m) - m + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -17189,35 +18685,51 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. 
+ int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } - // Check remaining shift amounts. - for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { - uint64_t ShAmt = 0; - for (unsigned j = 0; j != Ratio; ++j) { - ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (!C) + + // Check remaining shift amounts (if not a splat). + if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) return SDValue(); - // 6 == Log2(64) - ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } - if (ShAmt != ShiftAmt) - return SDValue(); } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -17245,7 +18757,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { // Check if this build_vector node is doing a splat. @@ -17262,7 +18774,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { @@ -17327,11 +18839,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) - return V; + return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. 
+ if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { @@ -17343,6 +18870,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. @@ -17351,9 +18891,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector<SDValue, 8> Elts; - EVT SVT = VT.getScalarType(); + MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); - const APInt &One = APInt(SVTBits, 1); + APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { @@ -17364,7 +18904,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -17443,7 +18983,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. 
- EVT CastVT = MVT::v4i32; + MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); @@ -17507,7 +19047,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } - if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17627,7 +19168,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } - if (Subtarget->hasInt256() && VT == MVT::v16i16) { + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); @@ -17710,7 +19251,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); @@ -17743,6 +19284,40 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering @@ -17759,8 +19334,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -17775,8 +19349,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. 
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -17827,7 +19400,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) @@ -17844,21 +19417,23 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. -bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); - return needsCmpXchgNb(PTy->getElementType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { - return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -17869,14 +19444,14 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. - return AtomicRMWExpansionKind::None; + return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return !AI->use_empty() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -17884,7 +19459,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. - return AtomicRMWExpansionKind::CmpXChg; + return AtomicExpansionKind::CmpXChg; } } @@ -17898,7 +19473,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. 
@@ -17926,7 +19501,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear - // otherwise, we might be able to be more agressive on relaxed idempotent + // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) @@ -18034,24 +19609,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - EVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -18103,7 +19691,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -18119,9 +19708,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. 
@@ -18311,7 +19901,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(Op.getValueType().isVector() && + assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } @@ -18357,7 +19947,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getNode()->getSimpleValueType(0); + MVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -18435,31 +20025,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); - EVT VT = N->getValue().getValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + 
!VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } return Op; } @@ -18470,17 +20232,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); - EVT VT = Op.getValueType(); - assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); SDLoc dl(Op); - + MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -18572,6 +20376,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case 
ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -18592,12 +20397,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -18615,7 +20422,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -18634,14 +20447,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
-   if (VT != MVT::v2f32)
-     llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+   assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
@@ -18668,17 +20510,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    return;
  }
  case ISD::FP_TO_SINT:
-   // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
-   // (FP_TO_SINT (load f16)) to FP_TO_INT*.
-   if (N->getOperand(0).getValueType() == MVT::f16)
-     break;
-   // fallthrough
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
-
-   if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
-     return;
-
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -18707,6 +20541,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
+   // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
@@ -18740,6 +20575,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
    }
  }
+ case ISD::INTRINSIC_WO_CHAIN: {
+   if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+     Results.push_back(V);
+   return;
+ }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
@@ -18748,7 +20588,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
-   EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+   MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
@@ -18884,6 +20724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+ case X86ISD::IRET:               return "X86ISD::IRET";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
@@ -18910,6 +20751,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::FHADD:              return "X86ISD::FHADD";
  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
  case X86ISD::ABS:                return "X86ISD::ABS";
+ case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
@@ -18937,12 +20779,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+ case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
+ case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
  case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
  case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
+ case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -18951,6 +20795,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+ case X86ISD::VROTLI:             return "X86ISD::VROTLI";
+ case X86ISD::VROTRI:             return "X86ISD::VROTRI";
  case X86ISD::CMPP:               return "X86ISD::CMPP";
  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
@@ -18978,6 +20824,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::TESTM:              return "X86ISD::TESTM";
  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+ case X86ISD::KTEST:              return "X86ISD::KTEST";
  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
@@ -19000,6 +20847,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
@@ -19009,11 +20857,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -19022,10 +20872,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::SFENCE:             return "X86ISD::SFENCE";
  case X86ISD::LFENCE:             return "X86ISD::LFENCE";
  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
- case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
  case X86ISD::SAHF:               return "X86ISD::SAHF";
  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
+ case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
+ case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT:              return "X86ISD::VPROT";
+ case X86ISD::VPROTI:             return "X86ISD::VPROTI";
+ case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+ case X86ISD::VPSHL:              return "X86ISD::VPSHL";
+ case X86ISD::VPCOM:              return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
  case X86ISD::FMADD:              return "X86ISD::FMADD";
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
@@ -19038,7 +20895,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
- case X86ISD::RNDSCALE:           return "X86ISD::RNDSCALE";
+ case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
+ case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
+ case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST:              return "X86ISD::XTEST";
@@ -19064,6 +20923,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
  case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
+ case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
  }
  return nullptr;
}
@@ -19218,7 +21079,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
+ if (!Subtarget->hasAnyFMA())
    return false;

  VT = VT.getScalarType();
@@ -19253,11 +21114,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return false;

  // Not for i1 vectors
- if (VT.getScalarType() == MVT::i1)
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
- if (VT.getSizeInBits() == 64)
+ if (VT.getSimpleVT().getSizeInBits() == 64)
    return false;

  // We only care that the types being shuffled are legal. The lowering can
@@ -19282,8 +21143,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
  DebugLoc DL = MI->getDebugLoc();
  const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
@@ -19407,6 +21267,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
  return BB;
}

+static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert input VAL into EAX
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+     .addReg(MI->getOperand(0).getReg());
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+     .addReg(X86::ECX)
+     .addReg(X86::ECX);
+ // insert zero to EDX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
+     .addReg(X86::EDX)
+     .addReg(X86::EDX);
+ // insert WRPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+     .addReg(X86::ECX)
+     .addReg(X86::ECX);
+ // insert RDPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+         MI->getOperand(0).getReg())
+     .addReg(X86::EAX);
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
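
EmitWRPKRU/EmitRDPKRU above pin their operands to the fixed registers the instructions require. As a hedged inline-asm sketch of that contract (not part of the patch): WRPKRU consumes the new protection-key value from EAX and requires ECX = EDX = 0, while RDPKRU requires ECX = 0 and returns the value in EAX, zeroing EDX.

    #include <cstdint>

    static inline void wrpkru(uint32_t val) {
      // EAX = val, ECX = 0, EDX = 0 -- mirrors the XOR32rr zeroing above.
      asm volatile("wrpkru" : : "a"(val), "c"(0), "d"(0));
    }

    static inline uint32_t rdpkru() {
      uint32_t val, zero;
      // ECX = 0 on input; the result arrives in EAX, EDX is zeroed.
      asm volatile("rdpkru" : "=a"(val), "=d"(zero) : "c"(0));
      return val;
    }
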
+
 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                       const X86Subtarget *Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
@@ -19531,8 +21432,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

-   MachineFunction::iterator MBBIter = MBB;
-   ++MBBIter;
+   MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
@@ -19702,8 +21602,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
@@ -19727,7 +21626,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

- if (!Subtarget->isTargetWin64()) {
+ if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -19744,9 +21643,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
-   MachineMemOperand *MMO =
-     F->getMachineMemOperand(
-         MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+   MachineMemOperand *MMO = F->getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
@@ -19800,6 +21698,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
  return true;
}

+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+   return true;
+
+ default:
+   return false;
+ }
+}
+
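To make Case 1 of the CMOV lowering that follows concrete: source like this (hypothetical, purely for illustration) yields several CMOV pseudos that all test the same EFLAGS condition, so one inserted block with one PHI per select suffices instead of one branch diamond per select.

    struct MinMax { int lo, hi; };

    MinMax order(int a, int b) {
      bool lt = a < b;     // a single compare sets the condition
      MinMax r;
      r.lo = lt ? a : b;   // CMOV pseudo #1 on that condition
      r.hi = lt ? b : a;   // CMOV pseudo #2, same condition, opposite arms
      return r;
    }
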
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
@@ -19811,8 +21742,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();

  //  thisMBB:
  //  ...
@@ -19823,8 +21753,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

- // We also lower double CMOVs:
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here is that in a case like:
+ //
+ //   t2 = CMOV cond1 t1, f1
+ //   t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ //   t2 = PHI t1(BB1), f1(BB2)
+ //   t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ //   t2 = PHI t1(BB1), f1(BB2)
+ //   t3 = PHI t1(BB1), f2(BB2)
+ //
+ // In Case 2, we lower cascaded CMOVs such as
+ //
+ //   (CMOV (CMOV F, T, cc1), T, cc2)
+ //
+ // to two successive branches. For that, we look for another CMOV as the
  // following instruction.
  //
@@ -19890,19 +21853,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // .LBB5_4:
  //         retq
  //
- MachineInstr *NextCMOV = nullptr;
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = MI;
+ X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
     std::next(MachineBasicBlock::iterator(MI));
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+   // See if we have a string of CMOVS with the same condition.
+   while (NextMIIt != BB->end() &&
+          isCMOVPseudo(NextMIIt) &&
+          (NextMIIt->getOperand(3).getImm() == CC ||
+           NextMIIt->getOperand(3).getImm() == OppCC)) {
+     LastCMOV = &*NextMIIt;
+     ++NextMIIt;
+   }
+ }
+
+ // This checks for case 2, but only do this if we didn't already find
+ // case 1, as indicated by LastCMOV == MI.
+ if (LastCMOV == MI &&
+     NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
     NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-     NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
-   NextCMOV = &*NextMIIt;
+     NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+   CascadedCMOV = &*NextMIIt;
+ }

  MachineBasicBlock *jcc1MBB = nullptr;

- // If we have a double CMOV, we lower it to two successive branches to
+ // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block. EFLAGS is used by both, so mark it as live in the second.
- if (NextCMOV) {
+ if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
@@ -19917,7 +21903,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();

- MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
     !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
@@ -19926,12 +21912,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
+                 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
- if (NextCMOV) {
-   // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ if (CascadedCMOV) {
+   // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@@ -19946,13 +21932,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
- unsigned Opc =
-   X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

- if (NextCMOV) {
+ if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
-       (X86::CondCode)NextCMOV->getOperand(3).getImm());
+       (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

@@ -19964,28 +21949,110 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
- MachineInstrBuilder MIB =
-   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
-           MI->getOperand(0).getReg())
-     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
-     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+   std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from earlier PHI's
+ // destination registers, and the registers that went into the PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+   unsigned DestReg = MIIt->getOperand(0).getReg();
+   unsigned Op1Reg = MIIt->getOperand(1).getReg();
+   unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+   // If this CMOV we are generating is the opposite condition from
+   // the jump we generated, then we have to swap the operands for the
+   // PHI that is going to be generated.
+   if (MIIt->getOperand(3).getImm() == OppCC)
+     std::swap(Op1Reg, Op2Reg);
+
+   if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+     Op1Reg = RegRewriteTable[Op1Reg].first;
+
+   if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+     Op2Reg = RegRewriteTable[Op2Reg].second;
+
+   MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+                 TII->get(X86::PHI), DestReg)
+         .addReg(Op1Reg).addMBB(copy0MBB)
+         .addReg(Op2Reg).addMBB(thisMBB);

-  // If we have a double CMOV, the second Jcc provides the same incoming
+   // Add this PHI to the rewrite table.
+   RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
- if (NextCMOV) {
+ if (CascadedCMOV) {
    MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
-           DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+           DL, TII->get(TargetOpcode::COPY),
+           CascadedCMOV->getOperand(0).getReg())
        .addReg(MI->getOperand(0).getReg());
-   NextCMOV->eraseFromParent();
+   CascadedCMOV->eraseFromParent();
  }

- MI->eraseFromParent();   // The pseudo instruction is gone now.
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+   (MIIt++)->eraseFromParent();
+
  return sinkMBB;
}

MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ // Combine the following atomic floating-point modification pattern:
+ //   a.store(reg OP a.load(acquire), release)
+ // Transform them into:
+ //   OPss (%gpr), %xmm
+ //   movss %xmm, (%gpr)
+ // Or sd equivalent for 64-bit operations.
+ unsigned MOp, FOp;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+ case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+ case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ }
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineOperand MSrc = MI->getOperand(0);
+ unsigned VSrc = MI->getOperand(5).getReg();
+ const MachineOperand &Disp = MI->getOperand(3);
+ MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
+ bool hasDisp = Disp.isGlobal() || Disp.isImm();
+ if (hasDisp && MSrc.isReg())
+   MSrc.setIsKill(false);
+ MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
+     .addOperand(/*Base=*/MSrc)
+     .addImm(/*Scale=*/1)
+     .addReg(/*Index=*/0)
+     .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+     .addReg(0);
+ MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
+                             MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+     .addReg(VSrc)
+     .addOperand(/*Base=*/MSrc)
+     .addImm(/*Scale=*/1)
+     .addReg(/*Index=*/0)
+     .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+     .addReg(/*Segment=*/0);
+ MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
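
The pattern comment in EmitLoweredAtomicFP corresponds to source of roughly this shape (a sketch under the usual std::atomic lowering on x86; not taken from the patch):

    #include <atomic>

    // acquire load, FP add, release store to the same location: the
    // RELEASE_FADD32mr pseudo folds this into
    //   addss (%mem), %xmm
    //   movss %xmm, (%mem)
    void bump(std::atomic<float> &a, float delta) {
      a.store(a.load(std::memory_order_acquire) + delta,
              std::memory_order_release);
    }
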
+
+MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
@@ -20032,8 +22099,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
    sizeVReg = MI->getOperand(1).getReg(),
    physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;

- MachineFunction::iterator MBBIter = BB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++BB->getIterator();

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
@@ -20120,14 +22186,60 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
+ assert(!Subtarget->isTargetMachO());
  DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+     *BB->getParent(), *BB, MI, DL, false);
+ MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return ResumeBB;
+}

- assert(!Subtarget->isTargetMachO());
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
+ DebugLoc DL = MI->getDebugLoc();

- Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                   DL);
+ assert(!isAsynchronousEHPersonality(
+            classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+        "SEH does not use catchret!");

- MI->eraseFromParent();   // The pseudo instruction is gone now.
+ // Only 32-bit EH needs to worry about manually restoring stack pointers.
+ if (!Subtarget->is32Bit())
+   return BB;
+
+ // C++ EH creates a new target block to hold the restore code, and wires up
+ // the new block to the return destination with a normal JMP_4.
+ MachineBasicBlock *RestoreMBB =
+     MF->CreateMachineBasicBlock(BB->getBasicBlock());
+ assert(BB->succ_size() == 1);
+ MF->insert(std::next(BB->getIterator()), RestoreMBB);
+ RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(RestoreMBB);
+ MI->getOperand(0).setMBB(RestoreMBB);
+
+ auto RestoreMBBI = RestoreMBB->begin();
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
+ // Only 32-bit SEH requires special handling for catchpad.
+ if (IsSEH && Subtarget->is32Bit()) {
+   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+   DebugLoc DL = MI->getDebugLoc();
+   BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
+ }
+ MI->eraseFromParent();
  return BB;
}

@@ -20149,6 +22261,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
+     Subtarget->is64Bit() ?
+     Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
@@ -20198,8 +22312,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const BasicBlock *BB = MBB->getBasicBlock();

- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -20225,7 +22338,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
- //  buf[LabelOffset] = restoreMBB
+ //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
@@ -20245,6 +22358,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();

  MachineInstrBuilder MIB;

@@ -20511,35 +22625,74 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
+ case X86::CATCHRET:
+   return EmitLoweredCatchRet(MI, BB);
+ case X86::CATCHPAD:
+   return EmitLoweredCatchPad(MI, BB);
  case X86::SEG_ALLOCA_32:
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
- case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
- case X86::CMOV_V4F32:
+ case X86::CMOV_FR128:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
- case X86::CMOV_V8F32:
+ case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
- case X86::CMOV_GR16:
- case X86::CMOV_GR32:
- case X86::CMOV_RFP32:
- case X86::CMOV_RFP64:
- case X86::CMOV_RFP80:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return EmitLoweredSelect(MI, BB);

+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: {
+   DebugLoc DL = MI->getDebugLoc();
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   unsigned PushF =
+       MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+   unsigned Pop =
+       MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+   BuildMI(*BB, MI, DL, TII->get(PushF));
+   BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+   MI->eraseFromParent(); // The pseudo is gone now.
+   return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: {
+   DebugLoc DL = MI->getDebugLoc();
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   unsigned Push =
+       MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+   unsigned PopF =
+       MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+   BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+   BuildMI(*BB, MI, DL, TII->get(PopF));
+
+   MI->eraseFromParent(); // The pseudo is gone now.
+   return BB;
+ }
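
The RDFLAGS/WRFLAGS pseudos expand to the classic flag-save/restore idiom. An inline-asm sketch of the 64-bit case (illustrative only; the 32-bit variant uses the 32-bit pushf/popf and registers):

    #include <cstdint>

    static inline uint64_t read_flags() {
      uint64_t f;
      asm volatile("pushfq\n\tpopq %0" : "=r"(f));        // PUSHF64 + POP64r
      return f;
    }

    static inline void write_flags(uint64_t f) {
      asm volatile("pushq %0\n\tpopfq" : : "r"(f) : "cc"); // PUSH64r + POPF64
    }
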
+ case X86::RELEASE_FADD32mr:
+ case X86::RELEASE_FADD64mr:
+   return EmitLoweredAtomicFP(MI, BB);
+
  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
@@ -20652,7 +22805,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB, Subtarget);
-
+   // PKU feature
+ case X86::WRPKRU:
+   return EmitWRPKRU(MI, BB, Subtarget);
+ case X86::RDPKRU:
+   return EmitRDPKRU(MI, BB, Subtarget);
    // xbegin
  case X86::XBEGIN:
    return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
@@ -20793,7 +22950,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    unsigned Depth) const {
  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
-   return Op.getValueType().getScalarType().getSizeInBits();
+   return Op.getValueType().getScalarSizeInBits();

  // Fallback case.
  return 1;
@@ -20814,39 +22971,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -20854,7 +22980,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
@@ -20920,24 +23046,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    return DCI.CombineTo(N, InsV);
  }

- //===--------------------------------------------------------------------===//
- // Combine some shuffles into subvector extracts and inserts:
- //
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (isShuffleHigh128VectorInsertLow(SVOp)) {
-   SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-   SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-   return DCI.CombineTo(N, InsV);
- }
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (isShuffleLow128VectorInsertHigh(SVOp)) {
-   SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-   SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-   return DCI.CombineTo(N, InsV);
- }
-
  return SDValue();
}

@@ -20966,10 +23074,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  MVT RootVT = Root.getSimpleValueType();
  SDLoc DL(Root);

- // Just remove no-op shuffle masks.
  if (Mask.size() == 1) {
-   DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
-                 /*AddTo*/ true);
+   int Index = Mask[0];
+   assert((Index >= 0 || Index == SM_SentinelUndef ||
+           Index == SM_SentinelZero) &&
+          "Invalid shuffle index found!");
+
+   // We may end up with an accumulated mask of size 1 as a result of
+   // widening of shuffle operands (see function canWidenShuffleElements).
+   // If the only shuffle index is equal to SM_SentinelZero then propagate
+   // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle
+   // mask, and therefore the entire chain of shuffles can be folded away.
+   if (Index == SM_SentinelZero)
+     DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+   else
+     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+                   /*AddTo*/ true);
    return true;
  }

@@ -20985,7 +23105,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // doesn't preclude something switching to the shorter encoding post-RA.
  //
  // FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (FloatDomain && VT.is128BitVector()) {
    if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
      bool Lo = Mask.equals({0, 0});
      unsigned Shuffle;
@@ -21049,7 +23169,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
  // variants as none of these have single-instruction variants that are
  // superior to the UNPCK formulation.
- if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ if (!FloatDomain && VT.is128BitVector() &&
     (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
      Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
      Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
@@ -21176,7 +23296,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
    return false;
  SmallVector<int, 16> OpMask;
  bool IsUnary;
- bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
  // We only can combine unary shuffles which we can decode the mask for.
  if (!HaveMask || !IsUnary)
    return false;
@@ -21226,26 +23346,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,

  // See if we can recurse into the operand to combine more things.
  switch (Op.getOpcode()) {
-   case X86ISD::PSHUFB:
-     HasPSHUFB = true;
-   case X86ISD::PSHUFD:
-   case X86ISD::PSHUFHW:
-   case X86ISD::PSHUFLW:
-     if (Op.getOperand(0).hasOneUse() &&
-         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                       HasPSHUFB, DAG, DCI, Subtarget))
-       return true;
-     break;
+ case X86ISD::PSHUFB:
+   HasPSHUFB = true;
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+   if (Op.getOperand(0).hasOneUse() &&
+       combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                     HasPSHUFB, DAG, DCI, Subtarget))
+     return true;
+   break;

-   case X86ISD::UNPCKL:
-   case X86ISD::UNPCKH:
-     assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
-     // We can't check for single use, we have to check that this shuffle is the only user.
-     if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
-         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                       HasPSHUFB, DAG, DCI, Subtarget))
-       return true;
-     break;
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+   assert(Op.getOperand(0) == Op.getOperand(1) &&
+          "We only combine unary shuffles!");
+   // We can't check for single use; we have to check that this shuffle is the
+   // only user.
+   if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+       combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                     HasPSHUFB, DAG, DCI, Subtarget))
+     return true;
+   break;
  }

  // Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -21271,7 +23393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);
@@ -21360,8 +23482,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
-     if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
-         V.getSimpleValueType().getScalarType() != MVT::i16)
+     if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+         V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
@@ -21438,7 +23560,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
  return V;
}

-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
@@ -21520,6 +23643,66 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
+ case X86ISD::UNPCKL: {
+   // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+   // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+   // moves upper half elements into the lower half part. For example:
+   //
+   // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+   //     undef:v16i8
+   // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+   //
+   // will be combined to:
+   //
+   // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+   // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
+   // happen due to advanced instructions.
+   if (!VT.is128BitVector())
+     return SDValue();
+
+   auto Op0 = N.getOperand(0);
+   auto Op1 = N.getOperand(1);
+   if (Op0.getOpcode() == ISD::UNDEF &&
+       Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+     unsigned NumElts = VT.getVectorNumElements();
+     SmallVector<int, 8> ExpectedMask(NumElts, -1);
+     std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+               NumElts / 2);
+
+     auto ShufOp = Op1.getOperand(0);
+     if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+       return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+   }
+   return SDValue();
+ }
+ case X86ISD::BLENDI: {
+   SDValue V0 = N->getOperand(0);
+   SDValue V1 = N->getOperand(1);
+   assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+          "Unexpected input vector types");
+
+   // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+   // operands and changing the mask to 1. This saves us a bunch of
+   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+   // x86InstrInfo knows how to commute this back after instruction selection
+   // if it would help register allocation.
+
+   // TODO: If optimizing for size or a processor that doesn't suffer from
+   // partial register update stalls, this should be transformed into a MOVSD
+   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+   if (VT == MVT::v2f64)
+     if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+       if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+         SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+         return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+       }
+
+   return SDValue();
+ }
  default:
    return SDValue();
  }
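
The canonicalization above relies on bit i of the BLENDPD immediate selecting lane i from the second operand, so swapping the operands and changing the mask from 2 to 1 is value-preserving. A quick spot-check with SSE4.1 intrinsics (hypothetical test code, compile with -msse4.1; not from the patch):

    #include <immintrin.h>
    #include <cassert>

    int main() {
      __m128d v0 = _mm_set_pd(10.0, 1.0);   // lanes {1.0, 10.0}
      __m128d v1 = _mm_set_pd(20.0, 2.0);   // lanes {2.0, 20.0}
      __m128d a = _mm_blend_pd(v0, v1, 2);  // {v0[0], v1[1]} = {1.0, 20.0}
      __m128d b = _mm_blend_pd(v1, v0, 1);  // same lanes after the swap
      double xa[2], xb[2];
      _mm_storeu_pd(xa, a);
      _mm_storeu_pd(xb, b);
      assert(xa[0] == xb[0] && xa[1] == xb[1]);
      return 0;
    }
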
@@ -21535,7 +23718,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
-   assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
+   assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.
@@ -21613,9 +23796,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+     (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+   return SDValue();

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
@@ -21624,14 +23811,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
    return SDValue();

  auto *SVN = cast<ShuffleVectorSDNode>(N);
- ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int, 8> Mask;
+ for (int M : SVN->getMask())
+   Mask.push_back(M);
+
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

- // We require the first shuffle operand to be the SUB node, and the second to
- // be the ADD node.
- // FIXME: We should support the commuted patterns.
- if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
+ // We require the first shuffle operand to be the FSUB node, and the second to
+ // be the FADD node.
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+   ShuffleVectorSDNode::commuteMask(Mask);
+   std::swap(V1, V2);
+ } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return SDValue();

  // If there are other uses of these operations we can't fold them.
@@ -21652,12 +23844,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
       isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
    return SDValue();

- // Only specific types are legal at this point, assert so we notice if and
- // when these change.
- assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
-         VT == MVT::v4f64) &&
-        "Unknown vector type encountered!");
-
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}

@@ -21677,12 +23863,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
- if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
-   if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+ if (TLI.isTypeLegal(VT))
+   if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
     return AddSub;

  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (Subtarget->hasFp256() && VT.is256BitVector() &&
+ if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
     N->getOpcode() == ISD::VECTOR_SHUFFLE)
    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);

@@ -21780,6 +23966,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
+ EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();
@@ -21808,14 +23995,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,

  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+ if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                           ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+ int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+
+ if (Idx == SM_SentinelZero)
+   return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
+                            : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
+ if (Idx == SM_SentinelUndef)
+   return DAG.getUNDEF(EltVT);
+
+ assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                        : InVec.getOperand(1);
@@ -21840,7 +24035,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

- EVT EltVT = N->getValueType(0);
  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
@@ -21866,21 +24060,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                    EltNo);
}

-/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-/// special and don't usually play with other vector types, it's better to
-/// handle them early to be sure we emit efficient code by avoiding
-/// store-load conversions.
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::x86mmx ||
-      N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
-      N->getOperand(0)->getValueType(0) != MVT::v2i32)
-    return SDValue();
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);

-  SDValue V = N->getOperand(0);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
-  if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
-    return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
-                       N->getValueType(0), V.getOperand(0));
+ // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+ // special and don't usually play with other vector types, it's better to
+ // handle them early to be sure we emit efficient code by avoiding
+ // store-load conversions.
+ if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+     N0.getValueType() == MVT::v2i32 &&
+     isNullConstant(N0.getOperand(1))) {
+   SDValue N00 = N0->getOperand(0);
+   if (N00.getValueType() == MVT::i32)
+     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ }
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand and one constant operand into a floating-point
+ // logic operation. This may create a load of the constant, but that is
+ // cheaper than materializing the constant in an integer register and
+ // transferring it to an SSE register or transferring the SSE operand to
+ // integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+ if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
+      (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+     isa<ConstantSDNode>(N0.getOperand(1)) &&
+     N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+     N0.getOperand(0).getOperand(0).getValueType() == VT) {
+   SDValue N000 = N0.getOperand(0).getOperand(0);
+   SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
+   return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+ }

  return SDValue();
}
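
The shape this combine targets is integer bit-twiddling applied to the bits of a float, e.g. clearing the sign bit. A sketch of such source (illustrative; an fabs call is normally recognized by other means, this only shows the bitcast-logic-bitcast chain that becomes FAND):

    #include <cstdint>
    #include <cstring>

    float clear_sign(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof bits);  // bitcast f32 -> i32
      bits &= 0x7fffffffu;                  // (and (bitcast x), C)
      std::memcpy(&x, &bits, sizeof x);     // bitcast i32 -> f32
      return x;                             // becomes FAND(x, bitcast C), i.e. ANDPS
    }
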
@@ -21910,26 +24128,26 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                     InputVector.getNode()->getOperand(0));

    // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
-   SDValue MMXSrcOp = MMXSrc.getOperand(0);
    if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
-       MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
-       MMXSrcOp.getOpcode() == ISD::BITCAST &&
-       MMXSrcOp.getValueType() == MVT::v1i64 &&
-       MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
-     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
-                        N->getValueType(0),
-                        MMXSrcOp.getOperand(0));
+       MMXSrc.getValueType() == MVT::i64) {
+     SDValue MMXSrcOp = MMXSrc.getOperand(0);
+     if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+         MMXSrcOp.getValueType() == MVT::v1i64 &&
+         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                          N->getValueType(0), MMXSrcOp.getOperand(0));
+   }
  }

  EVT VT = N->getValueType(0);

- if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+ if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
     InputVector.getOpcode() == ISD::BITCAST &&
-     dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+     isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt =
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    uint64_t InputValue =
        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }
@@ -22036,96 +24254,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

-/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static std::pair<unsigned, bool>
-matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
-                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
-  if (!VT.isVector())
-    return std::make_pair(0, false);
-
-  bool NeedSplit = false;
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: return std::make_pair(0, false);
-  case MVT::v4i64:
-  case MVT::v2i64:
-    if (!Subtarget->hasVLX())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v64i8:
-  case MVT::v32i16:
-    if (!Subtarget->hasBWI())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v16i32:
-  case MVT::v8i64:
-    if (!Subtarget->hasAVX512())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v32i8:
-  case MVT::v16i16:
-  case MVT::v8i32:
-    if (!Subtarget->hasAVX2())
-      NeedSplit = true;
-    if (!Subtarget->hasAVX())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v16i8:
-  case MVT::v8i16:
-  case MVT::v4i32:
-    if (!Subtarget->hasSSE2())
-      return std::make_pair(0, false);
-  }
-
-  // SSE2 has only a small subset of the operations.
-  bool hasUnsigned = Subtarget->hasSSE41() ||
-                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
-  bool hasSigned = Subtarget->hasSSE41() ||
-                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
-
-  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
-  unsigned Opc = 0;
-  // Check for x CC y ? x : y.
-  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
-      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
-    switch (CC) {
-    default: break;
-    case ISD::SETULT:
-    case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMIN : 0; break;
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMAX : 0; break;
-    case ISD::SETLT:
-    case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMIN : 0; break;
-    case ISD::SETGT:
-    case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMAX : 0; break;
-    }
-  // Check for x CC y ? y : x -- a min/max with reversed arms.
-  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
-             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
-    switch (CC) {
-    default: break;
-    case ISD::SETULT:
-    case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMAX : 0; break;
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMIN : 0; break;
-    case ISD::SETLT:
-    case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMAX : 0; break;
-    case ISD::SETGT:
-    case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMIN : 0; break;
-    }
-  }
-
-  return std::make_pair(Opc, NeedSplit);
-}
-
static SDValue
transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
@@ -22189,7 +24317,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-     VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+     VT != MVT::f80 && VT != MVT::f128 &&
+     (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
     (Subtarget->hasSSE2() ||
      (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -22535,32 +24664,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    }
  }

- // Try to match a min/max vector operation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
-   std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
-   unsigned Opc = ret.first;
-   bool NeedSplit = ret.second;
-
-   if (Opc && NeedSplit) {
-     unsigned NumElems = VT.getVectorNumElements();
-     // Extract the LHS vectors
-     SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
-     SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
-
-     // Extract the RHS vectors
-     SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
-     SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
-
-     // Create min/max for each subvector
-     LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
-     RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
-
-     // Merge the result
-     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
-   } else if (Opc)
-     return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
-
  // Simplify vector selection if condition value type matches vselect
  // operand type
  if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
@@ -22635,7 +24738,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
     !DCI.isBeforeLegalize() &&
     !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
-   unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+   unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
@@ -22656,14 +24759,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // FIXME: We don't support i16-element blends currently. We could and
    // should support them by making *all* the bits in the condition be set
    // rather than just the high bit and using an i8-element blend.
-   if (VT.getScalarType() == MVT::i16)
+   if (VT.getVectorElementType() == MVT::i16)
      return SDValue();
    // Dynamic blending was only available from SSE4.1 onward.
-   if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+   if (VT.is128BitVector() && !Subtarget->hasSSE41())
      return SDValue();
    // Byte blends are only available in AVX2
-   if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
-       !Subtarget->hasAVX2())
+   if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -22773,12 +24875,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
      SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
-     ConstantSDNode *CS;
-     if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
-         CS->getZExtValue() == 1)
+     if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
-     if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
-         CS->getZExtValue() == 1)
+     if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx == -1)
        break;
@@ -22857,8 +24956,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
-   ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
-   if (!CondOp1C || !CondOp1C->isNullValue())
+   if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
@@ -23102,106 +25200,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

-static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
-                                                const X86Subtarget *Subtarget) {
-  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();
-  // SSE/AVX/AVX2 blend intrinsics.
-  case Intrinsic::x86_avx2_pblendvb:
-    // Don't try to simplify this intrinsic if we don't have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-    // FALL-THROUGH
-  case Intrinsic::x86_avx_blendv_pd_256:
-  case Intrinsic::x86_avx_blendv_ps_256:
-    // Don't try to simplify this intrinsic if we don't have AVX.
-    if (!Subtarget->hasAVX())
-      return SDValue();
-    // FALL-THROUGH
-  case Intrinsic::x86_sse41_blendvps:
-  case Intrinsic::x86_sse41_blendvpd:
-  case Intrinsic::x86_sse41_pblendvb: {
-    SDValue Op0 = N->getOperand(1);
-    SDValue Op1 = N->getOperand(2);
-    SDValue Mask = N->getOperand(3);
-
-    // Don't try to simplify this intrinsic if we don't have SSE4.1.
-    if (!Subtarget->hasSSE41())
-      return SDValue();
-
-    // fold (blend A, A, Mask) -> A
-    if (Op0 == Op1)
-      return Op0;
-    // fold (blend A, B, allZeros) -> A
-    if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-      return Op0;
-    // fold (blend A, B, allOnes) -> B
-    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
-      return Op1;
-
-    // Simplify the case where the mask is a constant i32 value.
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
-      if (C->isNullValue())
-        return Op0;
-      if (C->isAllOnesValue())
-        return Op1;
-    }
-
-    return SDValue();
-  }
-
-  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
- case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - EVT VT = Op0.getValueType(); - assert(VT.isVector() && "Expected a vector type!"); - - if (isa<BuildVectorSDNode>(Op1)) - Op1 = Op1.getOperand(0); - - if (!isa<ConstantSDNode>(Op1)) - return SDValue(); - - EVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - - ConstantSDNode *CND = cast<ConstantSDNode>(Op1); - const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - - // Don't try to convert this shift into a ISD::SRA if the shift - // count is bigger than or equal to the element size. - if (ShAmt >= SVTBits) - return SDValue(); - - // Trivial case: if the shift count is zero, then fold this - // into the first operand. - if (ShAmt == 0) - return Op0; - - // Replace this packed shift intrinsic with a target independent - // shift dag node. - SDLoc DL(N); - SDValue Splat = DAG.getConstant(C, DL, VT); - return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); - } - } -} - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -23228,9 +25235,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -23239,7 +25248,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -23253,10 +25261,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. 
DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -23272,18 +25301,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY || - ((N00.getOpcode() == ISD::ANY_EXTEND || - N00.getOpcode() == ISD::ZERO_EXTEND) && - N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); - if (Mask != 0) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, - N00, DAG.getConstant(Mask, DL, VT)); - } + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } @@ -23304,6 +25349,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -23321,14 +25419,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -23342,6 +25441,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (SDValue V = PerformSHLCombine(N, DAG)) return V; + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) @@ -23537,7 +25640,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. 
N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); @@ -23552,9 +25655,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + unsigned InBits = NarrowVT.getScalarSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + Mask = Mask.zext(VT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, DL, VT)); } @@ -23656,6 +25759,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(N0.getValueType(), NewShuffle); } +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -23668,6 +25806,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -23728,6 +25869,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); @@ -23799,7 +25943,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasSSE41()) return SDValue(); - EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -23813,9 +25957,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -23913,17 +26055,188 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + if (Subtarget->hasCMov()) if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. 
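// Aside: a standalone check of the arithmetic detectAVGPattern matches,
// assuming unsigned 8-bit lanes (illustrative only, not part of the patch).
// The zero extension makes the rounding +1 safe from wraparound, which is
// exactly the PAVGB semantics.
#include <cstdint>
static uint8_t avgRoundedUp(uint8_t A, uint8_t B) {
  uint32_t Wide = (uint32_t)A + (uint32_t)B + 1; // cannot overflow in i32
  return (uint8_t)(Wide >> 1);                   // lshr by one, then truncate
}
// avgRoundedUp(255, 255) == 255, while an eight-bit (A + B + 1) / 2 would
// have wrapped to 127; that is why the matched IR widens before adding.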
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern and it should be
+  // greater than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda that checks whether the given SDValue is a constant vector with
+  // every element in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check that each element of the vector is logically shifted right by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // elements are in the range [1, 256].
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit the X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + return SDValue(); } @@ -23940,10 +26253,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24012,8 +26328,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } @@ -24026,8 +26342,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -24055,7 +26371,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::NON_EXTLOAD); SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); - } /// PerformMSTORECombine - Resolve truncating stores static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24073,6 +26388,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. 
+ if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24096,12 +26420,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -24133,8 +26457,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24148,10 +26473,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - StVT == VT && !IsAligned) { + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24178,12 +26505,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. 
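// Aside: the truncating-store paths above widen the stored value and shuffle
// narrow element I to wide position I * SizeRatio so the elements become
// contiguous. A small sketch of how such a mask is built, using the usual -1
// convention for undef lanes (illustrative only, not part of the patch):
#include <vector>
static std::vector<int> truncStoreShuffleMask(unsigned NumElems,
                                              unsigned SizeRatio) {
  std::vector<int> Mask(NumElems * SizeRatio, -1); // unused lanes stay undef
  for (unsigned I = 0; I != NumElems; ++I)
    Mask[I] = (int)(I * SizeRatio); // pick the lane holding narrow element I
  return Mask;
}
// For NumElems = 4 and SizeRatio = 2 this yields {0, 2, 4, 6, -1, -1, -1, -1}.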
@@ -24306,7 +26650,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { - EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), @@ -24539,8 +26883,234 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. + if (OutVT == MVT::v8i8) { + Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); + Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], + DAG.getIntPtrConstant(0, DL)); + return Regs[0]; + } else if (RegNum > 1) { + Regs.resize(RegNum); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. +static SDValue +combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); + EVT OutVT = N->getValueType(0); + SDLoc DL(N); + + // Shift left by 16 bits, then arithmetic-shift right by 16 bits. 
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with
+/// each element extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
+  // to truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
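// Aside: the PACKSS path above depends on a sign-fill trick. Shifting each
// i32 lane left by 16 and then arithmetic-shifting right by 16 leaves a value
// in [-32768, 32767], so the signed-saturating pack can never clamp and acts
// as a plain truncation. One-lane sketch, assuming arithmetic right shift on
// signed int (illustrative only, not part of the patch):
#include <cstdint>
static int16_t packssLane(int32_t X) {
  int32_t SignFilled = (int32_t)((uint32_t)X << 16) >> 16; // sign-fill top half
  return (int16_t)SignFilled; // saturation is a no-op on this range
}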
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating an FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once they become available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
+
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ
+    // extension. These logic operations may be executed in the integer domain.
+    SDLoc dl(N);
+    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+    unsigned IntOpcode = 0;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected FP logic op");
+    case X86ISD::FOR: IntOpcode = ISD::OR; break;
+    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+    case X86ISD::FAND: IntOpcode = ISD::AND; break;
+    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
+    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+    return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+  }
+  return SDValue();
+}
 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
-static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   // F[X]OR(0.0, x) -> x
@@ -24552,7 +27122,8 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
-  return SDValue();
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
 }
 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
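// Aside: the FNEG-of-FMA rewrites above are plain sign algebra; negating each
// fused form flips it to its counterpart:
//   -(a*b + c)    == -(a*b) - c   (FMADD  -> FNMSUB)
//   -(a*b - c)    == -(a*b) + c   (FMSUB  -> FNMADD)
//   -(-(a*b) + c) ==  a*b - c     (FNMADD -> FMSUB)
//   -(-(a*b) - c) ==  a*b + c     (FNMSUB -> FMADD)
// A scalar reference for the first mapping (illustrative only, not part of
// the patch; the real nodes are fused, so only rounding differs):
static double fnmsubRef(double A, double B, double C) { return -(A * B) - C; }
// fnmsubRef(A, B, 0.0) also models the negated-FMUL case; the signed-zero
// corner cases introduced by the extra zero addend are why that path requires
// the no-signed-zeros flag.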
@@ -24576,8 +27147,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. + if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. 
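// Aside: a scalar model of the NaN handling in performFMinNumFMaxNumCombine
// above, assuming x86 maxsd semantics where the second source passes through
// whenever the compare is false, including on NaN (illustrative only, not
// part of the patch):
#include <cmath>
static double x86MaxRef(double A, double B) { return A > B ? A : B; }
static double fmaxnumRef(double Op0, double Op1) {
  double MinOrMax = x86MaxRef(Op1, Op0); // a NaN in either input yields Op0
  bool IsOp0Nan = std::isnan(Op0);       // the unordered self-compare (SETUO)
  return IsOp0Nan ? Op1 : MinOrMax;      // select away the NaN input
}
// If Op0 is NaN we return Op1 (a number, or Op1's NaN when both are NaN); if
// only Op1 is NaN the max already picked Op0; otherwise it is the ordinary
// maximum, matching the 2x2 table in the comment above.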
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24588,11 +27216,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24603,7 +27232,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -24673,6 +27302,83 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. + bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. 
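// Aside: concretely, both sext(x) and the extended constant lie in
// [-2^31, 2^31 - 1], so their i64 sum is exact and cannot wrap.
// A standalone sketch of the equivalence being exploited, valid only when the
// i32 add really was nsw (illustrative only, not part of the patch):
#include <cstdint>
static int64_t beforePromotion(int32_t X, int32_t C) {
  return (int64_t)(X + C); // sext(add nsw (x, C)); X + C must not overflow
}
static int64_t afterPromotion(int32_t X, int32_t C) {
  return (int64_t)X + (int64_t)C; // add(sext(x), C_sext), the LEA-friendly form
}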
+ SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). +static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + auto OpcodeN = N->getOpcode(); + auto OpcodeN0 = N0.getOpcode(); + if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || + (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); + if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + return SDValue(); + + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG + : X86ISD::UDIVREM8_ZEXT_HREG; + SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), + N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -24683,18 +27389,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, EVT InSVT = InVT.getScalarType(); SDLoc DL(N); - // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> - // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) - // This exposes the sext to the sdivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - InVT == MVT::i8 && VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { @@ -24763,13 +27459,13 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (!Subtarget->hasFp256()) - return SDValue(); - - if (VT.isVector() && VT.getSizeInBits() == 256) + if (Subtarget->hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + return SDValue(); } @@ -24783,9 +27479,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -24830,8 +27524,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) + if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, @@ -24856,19 +27549,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = 
WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - // (i8,i32 zext (udivrem (i8 x, i8 y)) -> - // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) - // This exposes the zext to the udivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::UDIVREM && - N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - (VT == MVT::i32 || VT == MVT::i64)) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; return SDValue(); } @@ -24884,21 +27566,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) - if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) - if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { @@ -24936,75 +27616,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), dl, - Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc dl(N); - MVT VT = N->getOperand(1)->getSimpleValueType(0); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "X86insertps is only defined for v4x32"); - - SDValue Ld = N->getOperand(1); - if (MayFoldLoad(Ld)) { - // Extract the countS bits from the immediate so we can get the proper - // address when narrowing the vector load to a specific element. 
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
-static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue V0 = N->getOperand(0);
-  SDValue V1 = N->getOperand(1);
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
-  // operands and changing the mask to 1. This saves us a bunch of
-  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
-  // x86InstrInfo knows how to commute this back after instruction selection
-  // if it would help register allocation.
-
-  // TODO: If optimizing for size or a processor that doesn't suffer from
-  // partial register update stalls, this should be transformed into a MOVSD
-  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
-  if (VT == MVT::v2f64)
-    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
-        SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
-        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
-      }
-
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
   return SDValue();
 }
@@ -25182,7 +27804,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   // a 32-bit target where SSE doesn't support i64->FP operations.
-  if (Op0.getOpcode() == ISD::LOAD) {
+  if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
     EVT LdVT = Ld->getValueType(0);
@@ -25357,15 +27979,14 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   }
   // Check if we can bypass extracting and re-inserting an element of an input
-  // vector. Essentialy:
+  // vector. Essentially:
   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
     SDValue ExtractedV = V.getOperand(0);
     SDValue OrigV = ExtractedV.getOperand(0);
-    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
-      if (ExtractIdx->getZExtValue() == 0) {
+    if (isNullConstant(ExtractedV.getOperand(1))) {
       MVT OrigVT = OrigV.getSimpleValueType();
       // Extract a subvector if necessary...
       if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
@@ -25394,7 +28015,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND:
     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
+  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -25414,12 +28035,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::FNEG:           return PerformFNEGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
-  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:
   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
-  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
-  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:        return performFMinNumFMaxNumCombine(N, DAG,
+                                                                Subtarget);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG, Subtarget);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG, Subtarget);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ANY_EXTEND:
@@ -25433,6 +28059,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
@@ -25447,14 +28074,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case ISD::INTRINSIC_WO_CHAIN:
-    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
-  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
   }
   return SDValue();
@@ -25489,6 +28110,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
 /// IsDesirableToPromoteOp - This method queries the target whether it is
 /// beneficial for the dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
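// Aside: hasCopyImplyingStackAdjustment above is a single any_of scan over
// every instruction that reads or writes EFLAGS. A generic standalone sketch
// of the idiom with a stand-in Instr type instead of MachineInstr
// (illustrative only, not part of the patch):
#include <algorithm>
#include <vector>
struct Instr {
  bool IsCopy;
};
static bool hasCopyOfFlags(const std::vector<Instr> &FlagUsers) {
  return std::any_of(FlagUsers.begin(), FlagUsers.end(),
                     [](const Instr &I) { return I.IsCopy; });
}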
@@ -26084,6 +28717,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -26168,17 +28802,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); - MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 - : Size == 16 ? MVT::i16 - : Size == 32 ? MVT::i32 - : Size == 64 ? MVT::i64 - : MVT::Other; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; - Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass - : SimpleTy == MVT::i16 ? &X86::GR16RegClass - : SimpleTy == MVT::i32 ? &X86::GR32RegClass + Res.second = Size == 8 ? &X86::GR8RegClass + : Size == 16 ? &X86::GR16RegClass + : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { @@ -26196,6 +28826,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) @@ -26244,6 +28875,63 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on x86 is expensive. However, when aggressively optimizing + // for code size, we prefer to use a div instruction, as it is usually smaller + // than the alternative sequence. + // The exception to this is vector division. Since x86 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); +} + +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) + return; + + // Update IsSplitCSR in X86MachineFunctionInfo. 
+ X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 723d530..0ab786e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -126,6 +126,9 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, @@ -182,6 +185,8 @@ namespace llvm { /// Compute Sum of Absolute Differences. PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, /// Bitwise Logical AND NOT of Packed FP values. ANDNP, @@ -211,6 +216,8 @@ namespace llvm { // FP vector get exponent FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, // FP Scale SCALEF, // Integer add/sub with unsigned saturation. @@ -236,6 +243,9 @@ namespace llvm { // Integer absolute value ABS, + // Detect Conflicts Within a Vector + CONFLICT, + /// Floating point max and min. FMAX, FMIN, @@ -282,9 +292,8 @@ namespace llvm { // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. VFPEXT, @@ -295,6 +304,9 @@ namespace llvm { // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, + // Convert a vector to mask, set bits base on MSB. + CVT2MASK, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -304,6 +316,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Bit rotate by immediate + VROTLI, VROTRI, + // Vector packed double/float comparison. CMPP, @@ -349,6 +364,7 @@ namespace llvm { // OR/AND test for masks KORTEST, + KTEST, // Several flavors of instructions with vector shuffle behaviors. 
PACKSS, @@ -382,12 +398,24 @@ namespace llvm { VPERMIV3, VPERMI, VPERM2X128, - //Fix Up Special Packed Float32/64 values + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values VFIXUPIMM, - //Range Restriction Calculation For Packed Pairs of Float32/64 values + // Range Restriction Calculation For Packed Pairs of Float32/64 values VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. + VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, // Broadcast scalar to vector VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, // Broadcast subvector to vector SUBV_BROADCAST, // Insert/Extract vector element @@ -397,13 +425,21 @@ namespace llvm { /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, @@ -418,7 +454,6 @@ namespace llvm { FNMSUB_RND, FMADDSUB_RND, FMSUBADD_RND, - RNDSCALE, // Compress and expand COMPRESS, @@ -443,9 +478,6 @@ namespace llvm { // falls back to heap allocation if not. SEG_ALLOCA, - // Windows's _ftol2 runtime routine to do fptoui. - WIN_FTOL, - // Memory barrier MEMBARRIER, MFENCE, @@ -580,15 +612,6 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); - /// AVX512 static rounding constants. These need to match the values in - /// avx512fintrin.h. - enum STATIC_ROUNDING { - TO_NEAREST_INT = 0, - TO_NEG_INF = 1, - TO_POS_INF = 2, - TO_ZERO = 3, - CUR_DIRECTION = 4 - }; } //===--------------------------------------------------------------------===// @@ -677,6 +700,10 @@ namespace llvm { /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return true if the MachineFunction contains a COPY which would imply + /// HasOpaqueSPAdjustment. + bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; @@ -813,6 +840,13 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map + /// to a MemIntrinsicNode (touches memory). If this is the case, it returns + /// true and stores the intrinsic information into the IntrinsicInfo that was + /// passed to the function. + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -850,16 +884,7 @@ namespace llvm { /// register, not on the X87 floating point stack. 
    bool isScalarFPTypeInSSEReg(EVT VT) const {
      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-            (VT == MVT::f32 && X86ScalarSSEf32);  // f32 is when SSE1
-    }
-
-    /// Return true if the target uses the MSVC _ftol2 routine for fptoui.
-    bool isTargetFTOL() const;
-
-    /// Return true if the MSVC _ftol2 routine should be used for fptoui to the
-    /// given type.
-    bool isIntegerTypeFTOL(EVT VT) const {
-      return isTargetFTOL() && VT == MVT::i64;
+            (VT == MVT::f32 && X86ScalarSSEf32);  // f32 is when SSE1
    }

    /// \brief Returns true if it is beneficial to convert a load of a constant
@@ -879,6 +904,16 @@ namespace llvm {
    unsigned getRegisterByName(const char* RegName, EVT VT,
                               SelectionDAG &DAG) const override;

+   /// If a physical register, this returns the register that receives the
+   /// exception address on entry to an EH pad.
+   unsigned
+   getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+   /// If a physical register, this returns the register that receives the
+   /// exception typeid on entry to a landing pad.
+   unsigned
+   getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
    /// This method returns a target specific FastISel object,
    /// or null if the target does not support "fast" ISel.
    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -890,6 +925,11 @@ namespace llvm {
    bool getStackCookieLocation(unsigned &AddressSpace,
                                unsigned &Offset) const override;

+   /// If the target stores the SafeStack pointer at a fixed offset in some
+   /// non-standard address space, this returns the address of that location
+   /// (the address space and offset are target-defined).
+   Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
                      SelectionDAG &DAG) const;

@@ -899,6 +939,8 @@ namespace llvm {
    /// \brief Customize the preferred legalization strategy for certain types.
    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;

+   bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
  protected:
    std::pair<const TargetRegisterClass *, uint8_t>
    findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -908,7 +950,6 @@ namespace llvm {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;
-   const DataLayout *TD;

    /// Select between SSE or x87 floating point ops.
    /// When SSE is available, use it for f32 operations.
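The isIntDivCheap override declared in this hunk (its body appears in the X86ISelLowering.cpp hunk earlier) encodes a size-versus-speed trade: under minsize a scalar division by a constant keeps the small div instruction, while at higher optimization levels it is strength-reduced to a multiply-by-magic-constant sequence; vector divisions are excluded because x86 has no vector integer divide and the expansion would have to scalarize. A rough illustration (divideBy10 is illustrative; the described output is typical codegen, not guaranteed):

// Built for x86 with -Oz (minsize), the division below typically stays a
// single small 'div' instruction; at -O2 it becomes the larger
// magic-number multiply plus shift sequence.
unsigned divideBy10(unsigned X) {
  return X / 10;
}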
@@ -955,7 +996,6 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; - bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const; @@ -969,7 +1009,6 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; @@ -994,9 +1033,9 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; @@ -1028,6 +1067,15 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; @@ -1042,27 +1090,16 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; - bool needsCmpXchgNb(const Type *MemType) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, max, min, umax, umin). It takes the corresponding instruction to - /// expand, the associated machine basic block, and the associated X86 - /// opcodes for reg/reg. - MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, add, sub, swap) for 64-bit operands on 32-bit target. - MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const; + bool needsCmpXchgNb(Type *MemType) const; // Utility function to emit the low-level va_arg code for X86-64. 
    MachineBasicBlock *EmitVAARG64WithCustomInserter(
@@ -1077,18 +1114,24 @@ namespace llvm {
    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
                                         MachineBasicBlock *BB) const;

+   MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+                                          MachineBasicBlock *BB) const;
+
    MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
                                            MachineBasicBlock *BB) const;

+   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+   MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
                                            MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
                                          MachineBasicBlock *BB) const;

-   MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
-                                         MachineBasicBlock *BB) const;
-
    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
                                        MachineBasicBlock *MBB) const;

@@ -1121,7 +1164,7 @@ namespace llvm {
                                 unsigned &RefinementSteps) const override;

    /// Reassociate floating point divisions into multiply by reciprocal.
-   bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+   unsigned combineRepeatedFPDivisors() const override;
  };

  namespace X86 {
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index faa9150..49be648 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -79,7 +79,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                  !if (!eq (TypeVariantName, "i"),
                       !if (!eq (Size, 128), "v2i64",
                       !if (!eq (Size, 256), "v4i64",
-                      !if (!eq (Size, 512),
+                      !if (!eq (Size, 512),
                           !if (!eq (EltSize, 64), "v8i64", "v16i32"),
                      VTName))), VTName));
@@ -145,6 +145,8 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
// We map scalar types to the smallest (128-bit) vector type
// with the appropriate element type. This allows us to use the same masking logic.
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
@@ -186,7 +188,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
  let isCommutable = IsCommutable in
    def NAME: AVX512<O, F, Outs, Ins,
                     OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
-                               "$dst , "#IntelSrcAsm#"}",
+                               "$dst, "#IntelSrcAsm#"}",
                     Pattern, itin>;

  // Prefer over VMOV*rrk Pat<>
@@ -274,6 +276,22 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                     OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                     (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;

+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, @@ -305,18 +323,16 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, - list<dag> MaskingPattern, - string Round = "", - InstrItinClass itin = NoItinerary> { + list<dag> MaskingPattern> { def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", - Pattern, itin>; + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern, NoItinerary>; def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# - "$dst {${mask}}, "#IntelSrcAsm#Round#"}", - MaskingPattern, itin>, EVEX_K; + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, NoItinerary>, EVEX_K; } multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -324,33 +340,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Ins, dag MaskingIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS, - string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS, dag MaskingRHS> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)], - Round, NoItinerary>; + [(set _.KRC:$dst, MaskingRHS)]>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), - Round, itin>; + (and _.KRCWM:$mask, RHS)>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, - [],[],"", NoItinerary>; + AttSrcAsm, IntelSrcAsm, [],[]>; // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion @@ -471,84 +481,123 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // - -multiclass vinsert_for_size_no_alt<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> { +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), - (From.VT From.RC:$src2), - (iPTR imm)))]>, - EVEX_4V, EVEX_V512; + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - let mayLoad = 1 in - def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - []>, - EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; - } -} - -multiclass vinsert_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> : - vinsert_for_size_no_alt<Opcode, From, To, - vinsert_insert, INSERT_get_vinsert_imm> { - // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for - // vinserti32x4. Only add this if 64x2 and friends are not supported - // natively via AVX512DQ. 
- let Predicates = [NoDQI] in + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { def : Pat<(vinsert_insert:$ins - (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") - VR512:$src1, From.RC:$src2, - (INSERT_get_vinsert_imm VR512:$ins)))>; + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } } multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, ValueType EltVT64, int Opcode256> { - defm NAME # "32x4" : vinsert_for_size<Opcode128, + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 2, EltVT64, VR128X>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>; - let Predicates = [HasDQI] in - defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>, VEX_W; - defm NAME # "64x4" : vinsert_for_size<Opcode256, - X86VectorVTInfo< 4, EltVT64, VR256X>, - X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>, VEX_W; - let Predicates = [HasDQI] in - defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, - X86VectorVTInfo< 8, EltVT32, VR256X>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>; + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } } defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively 
via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen pattern with the alternative types insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen pattern with the alternative types insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
// vinsertps - insert f32 to XMM
def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
      (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
@@ -566,90 +615,158 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 VECTOR EXTRACT
//---

+multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From,
+                                                     X86VectorVTInfo To> {
+  // A subvector extract from the first vector position is
+  // a subregister copy that needs no instruction.
+  def NAME # To.NumElts:
+      Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))),
+          (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>;
+}
+
 multiclass vextract_for_size<int Opcode,
-                             X86VectorVTInfo From, X86VectorVTInfo To,
-                             X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
-                             PatFrag vextract_extract,
-                             SDNodeXForm EXTRACT_get_vextract_imm> {
+                             X86VectorVTInfo From, X86VectorVTInfo To,
+                             PatFrag vextract_extract> :
+  vextract_for_size_first_position_lowering<From, To> {

+  let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    // Use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+    // vextract_extract); we are interested only in patterns without a mask,
+    // as the intrinsic patterns are matched separately below.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, u8imm:$idx), - "vextract" # To.EltTypeName # "x4", + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", - [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)))]>, - AVX512AIi8Base, EVEX, EVEX_V512; - let mayStore = 1 in - def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), - "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" - "$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; - } - - // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for - // vextracti32x4 - def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext)))>; - - // A 128/256-bit subvector extract from the first 512-bit vector position is - // a subregister copy that needs no instruction. - def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), - (To.VT - (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; - - // And for the alternative types. - def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), - (AltTo.VT - (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } // Intrinsic call with masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call with zero-masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call without masking. 
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), - (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; } -multiclass vextract_for_type<ValueType EltVT32, int Opcode32, - ValueType EltVT64, int Opcode64> { - defm NAME # "32x4" : vextract_for_size<Opcode32, +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>; - defm NAME # "64x4" : vextract_for_size<Opcode64, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, VEX_W; + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } } defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), @@ -677,6 +794,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), @@ -694,50 +815,49 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - ValueType svt, X86VectorVTInfo _> { - defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), - "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, - T8PD, EVEX; - let mayLoad = 1 in { - defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src), - "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", - (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, - T8PD, EVEX; - } +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, 
EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; } -multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, - AVX512VLVectorVTInfo _> { - defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, - EVEX_V256; + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; } } let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, - avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; let Predicates = [HasVLX] in { - defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, - v4f32, v4f32x_info>, EVEX_V128, - EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; } // avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. -// Later, we can canonize broadcast instructions before ISel phase and +// Later, we can canonize broadcast instructions before ISel phase and // eliminate additional patterns on ISel. 
// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar // representations of source @@ -834,70 +954,50 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; -multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, - RegisterClass KRC> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (vselect KRC:$mask, - (X86VBroadcast (ld_frag addr:$src)), - (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ; - } -} - -defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, - loadi32, VR512, v16i32, v4i32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, - loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VT1>; +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. 
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _Dst.RC:$dst, - (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - } + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; } defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", @@ -944,10 +1044,45 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", EVEX_V512, EVEX_CD8<32, CD8VT8>; } -def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), - (VPBROADCASTDZrr VR128X:$src)>; -def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), - (VPBROADCASTQZrr VR128X:$src)>; +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : 
avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; @@ -959,21 +1094,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>; - -def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>; - -def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZr VR128X:$src)>; -def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZr VR128X:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), @@ -985,170 +1105,178 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- - -multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { -let Predicates = [HasCDI] in -def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V512; - -let Predicates = [HasCDI, HasVLX] in { -def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V128; -def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V256; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; } + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } } -let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", - VK16>; + avx512vl_i32_info, VK16>; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", - VK8>, VEX_W; -} + avx512vl_i64_info, VK8>, VEX_W; //===----------------------------------------------------------------------===// -// AVX-512 - VPERM -// -// -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, u8imm:$src2), - 
!strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, - EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode (_.LdFrag addr:$src1), - (i8 imm:$src2))))]>, - EVEX, EVEX_CD8<_.EltSize, CD8VF>; +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } } +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, - X86VectorVTInfo Ctrl> : - avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { - let ExeDomain = _.ExeDomain in { - def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT Ctrl.RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, Ctrl.MemOp:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, - EVEX_4V; - } -} -defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, - EVEX_V512; -defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, - EVEX_V512, VEX_W; - -def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - -// -- VPERM2I - 3 source operands form -- -multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, 
VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst" in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.RC:$src3), + (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, AVX5128IBase; let mayLoad = 1 in defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.MemOp:$src3), + (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, EVEX_4V, AVX5128IBase; } } -multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let mayLoad = 1, Constraints = "$src1 = $dst" in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.ScalarMemOp:$src3), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (_.VT (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { - let Predicates = [HasAVX512] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, 
VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; } } -multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { let Predicates = [HasBWI] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - } -} -defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask 
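Before the masked-blend hunks that follow, a note on the VPERMI2*/VPERMT2* rework just above: both mnemonics perform the same two-source, index-driven permute and differ only in which operand is tied to the destination (the index vector for VPERMI2, the first data source for VPERMT2). A hedged user-level sketch through the standard AVX512F intrinsic (the function name interleave_low_dwords is illustrative):

#include <immintrin.h>

// Each index lane selects one element from the concatenation {A, B};
// bit 4 of the index picks the source vector. Whether vpermi2d or
// vpermt2d is emitted depends on which input the register allocator
// allows the instruction to overwrite.
__m512i interleave_low_dwords(__m512i A, __m512i B) {
  const __m512i Idx = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4,
                                       19, 3, 18, 2, 17, 1, 16, 0);
  return _mm512_permutex2var_epi32(A, Idx, B);
}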
@@ -1158,7 +1286,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -1175,7 +1303,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), @@ -1265,41 +1393,85 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), //===----------------------------------------------------------------------===// // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, - PatFrag ld_frag, string Suffix> { - def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. 
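The asm-parser-only `_alt` variants defined next take the predicate as a raw imm8 operand instead of baking it into a vcmpeqss-style mnemonic. A small sketch of the masked scalar compare itself (assumes AVX-512F hardware and <immintrin.h>; `_CMP_LT_OS` is the usual imm8 predicate encoding):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128 a = _mm_set_ss(1.0f);
        __m128 b = _mm_set_ss(2.0f);
        /* vcmpss with predicate LT_OS: the result is a bit in a mask
           register, not an all-ones XMM lane as in pre-AVX-512 forms */
        __mmask8 k = _mm_cmp_ss_mask(a, b, _CMP_LT_OS);
        printf("%u\n", (unsigned)(k & 1));   /* 1, since 1.0f < 2.0f */
        return 0;
    }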
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; - def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; let mayLoad = 1 in - def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, - XD, VEX_W; + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1651,7 +1823,7 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (X86cmpmRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1662,8 +1834,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $cc">, EVEX_B; + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc">, EVEX_B; } } @@ -1700,6 +1872,128 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (v16i32 (SUBREG_TO_REG (i32 0), 
VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst|$dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, 
${src1}"## + _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + //----------------------------------------------------------------- // Mask register copy, including // - copy between mask registers @@ -1786,6 +2080,11 @@ let Predicates = [HasDQI] in { (KMOVBmk addr:$dst, VK8:$src)>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; } let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), @@ -1837,10 +2136,15 @@ let Predicates = [HasAVX512] in { (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + def : Pat<(i64 (zext VK1:$src)), (AND64ri8 (SUBREG_TO_REG (i64 0), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; @@ -1848,17 +2152,19 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; - def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; - def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; - def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; } +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 
(scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. @@ -1955,11 +2261,12 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable> { + SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; + prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, @@ -1974,6 +2281,7 @@ defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2047,59 +2355,49 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; // Mask unpacking -multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { - let Predicates = [HasAVX512] in - def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; -} +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + let hasSideEffects = 0 in + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; -multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { - defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>, - VEX_4V, VEX_L, PD; + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } } -defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; -def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))), - (KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16), - (COPY_TO_REGCLASS VK8:$src1, VK16))>; - - -multiclass avx512_mask_unpck_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { - let 
Predicates = [HasAVX512], Defs = [EFLAGS] in + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; - let Predicates = [HasDQI] in - defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, - VEX, PD; - let Predicates = [HasBWI] in { - defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, - VEX, PS, VEX_W; - defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, - VEX, PD, VEX_W; - } +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2124,7 +2422,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, let Predicates = [HasDQI] in defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, VEX, TAPD; - } + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -2167,24 +2465,52 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; -let Predicates = [HasVLX] in { - def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; - def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; - def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; - def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; - def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; -} +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 
(COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS @@ -2304,23 +2630,21 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore> { - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), - OpcodeStr # "\t{$src, $dst|$dst, $src}", [], - _.ExeDomain>, EVEX; - let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), - OpcodeStr # - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", [], _.ExeDomain>, EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # - "\t{$src, ${dst} {${mask}} {z}|" # + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", [], _.ExeDomain>, EVEX, EVEX_KZ; - } + let mayStore = 1 in { def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -2383,30 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), 
GR16:$mask)), - (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VMOVAPDZrm addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VMOVAPSZrm addr:$ptr)>; - def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -2425,22 +2725,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2502,17 +2786,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // @@ -2520,32 +2793,37 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG; + EVEX; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector GR64:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { -def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))], + [(set 
FR64X:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))], + [(set GR64:$dst, (bitconvert FR64X:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -} -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, EVEX_CD8<64, CD8VT1>; +} // Move Int Doubleword to Single Scalar // @@ -2553,27 +2831,27 @@ let isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move doubleword from xmm register to r/m32 // def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - EVEX, VEX_LIG; + EVEX; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128X:$src), + [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX, EVEX_CD8<32, CD8VT1>; // Move quadword from xmm1 register to r/m64 // @@ -2581,16 +2859,28 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], - IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W, + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Requires<[HasAVX512, In64BitMode]>; -def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs), - (ins i64mem:$dst, VR128X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), - addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>, - Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; // Move Scalar Single to Double Int // 
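The definitions under this heading use `bitconvert` patterns, which select a plain vmovd register move; no value conversion happens. In portable C the same reinterpretation looks like this (nothing here is AVX-specific):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        /* what (bitconvert FR32X:$src) means: the f32 bit pattern is
           moved unchanged into a 32-bit integer register */
        float f = 1.0f;
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);
        printf("0x%08x\n", bits);      /* 0x3f800000 */
        return 0;
    }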
@@ -2599,92 +2889,95 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], - IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; + IIC_SSE_MOVD_ToGP>, EVEX; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move Quadword Int to Packed Quadword Int // -def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, - SDNode OpNode, ValueType vt, - X86MemOperand x86memop, PatFrag mem_pat> { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, 
VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -// For the disassembler -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR32X:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XS, EVEX_4V, VEX_LIG; - def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR64X:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XD, EVEX_4V, VEX_LIG, VEX_W; -} +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -2768,10 +3061,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; // Extract and store. 
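The Pats that follow fold `store (extractelt v, 0)` into a single scalar store. As a reference for the selected behavior, a minimal sketch using the SSE intrinsic that maps to the same movss store (any SSE-capable compiler):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
        float lo;
        _mm_store_ss(&lo, v);          /* stores only element 0 */
        printf("%g\n", lo);            /* 1 */
        return 0;
    }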
- def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; @@ -2835,7 +3128,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; -let AddedComplexity = 20 in +let AddedComplexity = 20 , isCodeGenOnly = 1 in def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -2964,7 +3257,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, OpndItins itins, bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, IsCommutable>, @@ -2972,7 +3265,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), @@ -2986,7 +3279,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, @@ -3058,20 +3351,20 @@ multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, IsCommutable>; - defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, IsCommutable>; } multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, IsCommutable>; - defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, IsCommutable>; } @@ -3086,15 +3379,15 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, } multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, - SDNode OpNode,X86VectorVTInfo _Src, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst, bit IsCommutable = 0> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, 
_Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3106,12 +3399,12 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Dst.BroadcastStr##", $src1", "$src1, ${src2}"##_Dst.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Dst.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast (_Dst.ScalarLdFrag addr:$src2)))))), itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; @@ -3127,24 +3420,24 @@ defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, - SSE_INTALU_ITINS_P, HasBWI, 1>; + SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, - SSE_INTALU_ITINS_P, HasBWI, 0>; -defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, - SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; -defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, - HasBWI, 1>; -defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, - HasBWI, 1>, T8PD; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, - SSE_INTALU_ITINS_P, HasBWI, 1>; - + SSE_INTALU_ITINS_P, HasBWI, 1>; + multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { @@ -3159,7 +3452,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, v4i32x_info, v2i64x_info, IsCommutable>, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } -} +} defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, X86pmuldq, 1>,T8PD; @@ -3170,25 +3463,25 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { let mayLoad = 1 in { defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, 
${src2}"##_Src.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast (_Src.ScalarLdFrag addr:$src2))))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; } } -multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode,X86VectorVTInfo _Src, +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2)))>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; let mayLoad = 1 in { @@ -3229,102 +3522,59 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, v16i8x_info>, EVEX_V128; } } + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + let Predicates = [HasBWI] in { defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm 
VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; - -//===----------------------------------------------------------------------===// -// AVX-512 - Unpack Instructions -//===----------------------------------------------------------------------===// - -multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, - PatFrag mem_frag, RegisterClass RC, - X86MemOperand x86memop, string asm, - Domain d> { - def rr : AVX512PI<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, RC:$src2)))], - d>, EVEX_4V; - def rm : AVX512PI<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, - (bitconvert (mem_frag addr:$src2)))))], - d>, EVEX_4V; -} - -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, - VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, - VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, - VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, - VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - IIC_SSE_UNPCK>, EVEX_4V; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))], - IIC_SSE_UNPCK>, EVEX_4V; -} -defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; -defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// @@ -3362,12 +3612,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, let isCodeGenOnly = 1, isCommutable = IsCommutable, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.FRC:$src2), + (ins _.FRC:$src1, _.FRC:$src2), 
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], itins.rr>; def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))], itins.rr>; @@ -3375,7 +3625,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, } multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode VecNode, OpndItins itins, bit IsCommutable> { + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, @@ -3470,7 +3720,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, EVEX_4V, EVEX_B; } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3514,7 +3764,7 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; @@ -3550,13 +3800,34 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, @@ -3569,7 +3840,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -3586,7 +3857,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), + (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; @@ -3748,12 +4019,12 @@ multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, VTInfo.info256>, EVEX_V256; defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, EVEX_V128; } } -multiclass avx512_shift_rmi_w<bits<8> opcw, +multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3785,8 +4056,8 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; @@ -3846,6 +4117,27 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, avx512vl_i64_info>, VEX_W; } +// Use 512bit version to implement 128/256 bit in case NoVLX. 
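The lowering multiclass below handles the 128/256-bit per-element word shifts on machines with BWI but no VLX: the operands are inserted into a 512-bit register, the Z-form instruction runs, and the low subvector is extracted. The underlying operation gives every 16-bit lane its own shift count; a sketch of the 512-bit form (assumes AVX-512BW hardware and -mavx512bw):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* vpsllvw: each 16-bit element is shifted left by its own count */
        __m512i v = _mm512_set1_epi16(1);
        __m512i c = _mm512_set1_epi16(3);
        __m512i r = _mm512_sllv_epi16(v, c);
        short out[32];
        _mm512_storeu_si512(out, r);
        printf("%d\n", (int)out[0]);   /* 8 == 1 << 3 */
        return 0;
    }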
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3861,11 +4153,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, } defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; @@ -3916,19 +4211,77 @@ defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", X86VPermi, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// + +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, 
EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W; + X86PShufhw>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W; - + X86PShuflw>, EVEX, AVX512XDIi8Base; + multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; @@ -3942,55 +4295,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; //===----------------------------------------------------------------------===// -// AVX-512 - MOVDDUP -//===----------------------------------------------------------------------===// - -multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, - X86MemOperand x86memop, PatFrag memop_frag> { -def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; -def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, - (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; -} - -defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), - (VMOVDDUPZrm addr:$src)>; - -//===---------------------------------------------------------------------===// -// Replicate Single FP - MOVSHDUP and MOVSLDUP -//===---------------------------------------------------------------------===// -multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, - ValueType vt, RegisterClass RC, PatFrag mem_frag, - X86MemOperand x86memop> { - def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX; - let mayLoad = 1 in - def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX; -} - -defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm 
VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))), - (VMOVSHDUPZrm addr:$src)>; -def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))), - (VMOVSLDUPZrm addr:$src)>; - -//===----------------------------------------------------------------------===// // Move Low to High and High to Low packed FP Instructions //===----------------------------------------------------------------------===// def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), @@ -4017,6 +4321,115 @@ let Predicates = [HasAVX512] in { } //===----------------------------------------------------------------------===// +// VMOVHPS/PD VMOVLPS Instructions +// All patterns were taken from the SSE implementation. +//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX,
EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // @@ -4034,7 +4447,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, - AVX512FMA3Base; + AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -4435,50 +4848,55 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - Intrinsic Int, Operand memop, ComplexPattern mem_cpat, - string asm> { -let hasSideEffects = 0 in { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; -} // hasSideEffects = 0 +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + 
EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] } -let Predicates = [HasAVX512] in { + // Convert float/double to signed/unsigned int 32/64 -defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, "cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtss2usi64, ssmem, sse_load_f32, "cvtss2usi">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtsd2usi64, sdmem, sse_load_f64, "cvtsd2usi">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, EVEX_4V; @@ -4495,121 +4913,170 @@ let isCodeGenOnly = 1 in { defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", SSE_CVT_Scalar, 0>, XD, EVEX_4V; -} // isCodeGenOnly = 1 +} // isCodeGenOnly = 1, Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation -let isCodeGenOnly = 1 in { - defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si, - ssmem, sse_load_f32, "cvttss2si">, - XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si, - sdmem, sse_load_f64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttss2usi, ssmem, sse_load_f32, - "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>; - defm 
Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttss2usi64, ssmem, - sse_load_f32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttsd2usi, - sdmem, sse_load_f64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttsd2usi64, sdmem, - sse_load_f64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -} // isCodeGenOnly = 1 - -multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; -} - -defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, 
EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + } // HasAVX512 //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// -let hasSideEffects = 0 in { -def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst), - (ins FR32X:$src1, FR32X:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst), - (ins FR32X:$src1, f32mem:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, - EVEX_CD8<32, CD8VT1>; - -// Convert scalar double to scalar single -def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst), - (ins FR64X:$src1, FR64X:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst), - (ins FR64X:$src1, f64mem:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, - Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>; -} - -def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>, - Requires<[HasAVX512]>; -def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; - -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode> { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + defm rm : 
AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Scalar Conversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; - //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to 
float/double // and from float/double to signed/unsigned integer @@ -4992,7 +5459,7 @@ defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [NoVLX] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -5024,40 +5491,102 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", - []>, EVEX; - let hasSideEffects = 0, mayLoad = 1 in - def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX; -} - -multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX; - let hasSideEffects = 0, mayStore = 1 in - def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; + } -defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; -defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 
FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; +} +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} -def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src), - imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))), - (VCVTPS2PHZrr VR512:$src, imm:$rc)>; +// Unordered/Ordered scalar fp compare with SAE and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, + (i32 FROUND_NO_EXC)))], + IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} -def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src), - (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))), - (VCVTPH2PSZrr VR256X:$src)>; +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, @@ -5067,10 +5596,10 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { "ucomisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; let Pattern = []<dag> in { - defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load, + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, "comiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load, + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, "comisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } @@ -5092,50 +5621,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { 
} /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd -multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; } } } -defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; - -def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; - -def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5183,20 +5693,6 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; -def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRSQRT14PSZr 
VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRSQRT14PDZr VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRCP14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRCP14PDZr VR512:$src)>; - /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { @@ -5232,6 +5728,8 @@ let hasSideEffects = 0, Predicates = [HasERI] in { defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; } + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -5322,67 +5820,6 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, } } -multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int, - OpndItins itins_s, OpndItins itins_d> { - def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst), - (ins FR32X:$src1, FR32X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rr>, XS, EVEX_4V; - let isCodeGenOnly = 1 in - def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XS, EVEX_4V; - let mayLoad = 1 in { - def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst), - (ins FR32X:$src1, f32mem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - let isCodeGenOnly = 1 in - def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, sse_load_f32:$src2))], - itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - } - def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst), - (ins FR64X:$src1, FR64X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W; - let isCodeGenOnly = 1 in - def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XD, EVEX_4V, VEX_W; - let mayLoad = 1 in { - def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst), - (ins FR64X:$src1, f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - let isCodeGenOnly = 1 in - def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, sdmem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - } -} - multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, @@ -5416,93 +5853,77 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, 
string OpcodeStr, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$rc))>, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast<Instruction>(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, + X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, + X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>; +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; let Predicates = [HasAVX512] in { - def : Pat<(f32 (fsqrt FR32X:$src)), - (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f64 (fsqrt FR64X:$src)), - (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f32 (X86frsqrt FR32X:$src)), - (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - def : Pat<(f32 (X86frcp FR32X:$src)), - (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - - def : Pat<(int_x86_sse_sqrt_ss 
VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR32)), - VR128X)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR64)), - VR128X)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; -} - - -multiclass avx512_rndscale<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag, Domain d> { -let ExeDomain = d in { - // Intrinsic operation, reg. - // Vector intrinsic operation, reg - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; - - // Vector intrinsic operation, mem - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; -} // ExeDomain } -defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - loadv16f32, SSEPackedSingle>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (v16f32 VR512:$src1), (i16 -1), - FROUND_CURRENT)), - (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; - - -defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - loadv8f64, SSEPackedDouble>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (v8f64 VR512:$src1), (i8 -1), - FROUND_CURRENT)), - (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; - multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { @@ -5510,20 +5931,20 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, - "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; let mayLoad = 1 in defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; } @@ -5568,109 +5989,238 @@ defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; -let Predicates = [HasAVX512] in { -def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; -def : Pat<(v16f32 (fnearbyint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 
0xC))>; -def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x2))>; -def : Pat<(v16f32 (frint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x4))>; -def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x3))>; - -def : Pat<(v8f64 (ffloor VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x1))>; -def : Pat<(v8f64 (fnearbyint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0xC))>; -def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x2))>; -def : Pat<(v8f64 (frint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; -def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; -} //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- -multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, - RegisterClass dstRC, RegisterClass srcRC, - RegisterClass KRC, X86MemOperand x86memop> { - def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins srcRC:$src), - !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS; + + // for intrinsic pattern matching + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX; - def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, EVEX, EVEX_K; + }//mayStore = 1 +} - def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag > { - def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; - def mrk : AVX512XS8I<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, 
$src}"), - []>, EVEX, EVEX_K; + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, string sat > { + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; - -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; - -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; +multiclass 
avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, 
EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, @@ -5985,163 +6535,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, 
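[Aside, not part of the diff: the VPMOV*, VPMOVS* and VPMOVUS* families defined above are the plain, signed-saturating and unsigned-saturating down-converts, and the mr/mrk forms are their (masked) truncating stores. A minimal C++ sketch of the same operations through the published Intel intrinsics, assuming an AVX-512F-capable compiler; the wrapper names are illustrative only.]

```cpp
#include <immintrin.h>

// Eight i64 lanes narrowed to bytes: VPMOVQB / VPMOVSQB / VPMOVUSQB.
__m128i qb_trunc(__m512i v) { return _mm512_cvtepi64_epi8(v);   } // wrap
__m128i qb_ssat (__m512i v) { return _mm512_cvtsepi64_epi8(v);  } // signed sat
__m128i qb_usat (__m512i v) { return _mm512_cvtusepi64_epi8(v); } // unsigned sat

// The mrk memory form: narrow and store only the lanes selected by the mask,
// matching the masked_truncstore patterns added above.
void qb_store(void *p, __mmask8 k, __m512i v) {
  _mm512_mask_cvtepi64_storeu_epi8(p, k, v); // masked VPMOVQB store
}
```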
MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -//===----------------------------------------------------------------------===// -// VSHUFPS - VSHUFPD Operations - -multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, - ValueType vt, string OpcodeStr, PatFrag mem_frag, - Domain d> { - def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; - def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffle]>; -} - -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, - SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v16i32 (X86Shufp VR512:$src1, - (loadv16i32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; - -def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v8i64 (X86Shufp VR512:$src1, - (loadv8i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; // Helper fragments to match sext vXi1 to vXiY. 
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, - RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, - X86MemOperand x86scalar_mop, string BrdcstStr> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", - BrdcstStr, "}"), - []>, EVEX, EVEX_KZ, EVEX_B; - - let Constraints = "$src1 = $dst" in { - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), - []>, EVEX, EVEX_K, EVEX_B; - } - } -} - -let Predicates = [HasCDI] in { -defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPCONFLICTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPCONFLICTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -let Predicates = [HasCDI] in { -defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPLZCNTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), 
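[Aside, not part of the diff: the v16i1sextv16i32/v8i1sextv8i64 fragments above match the all-ones-per-negative-lane mask produced by an arithmetic shift; they let the VPABS patterns later in this file rewrite the classic branchless-abs xor/add pair to VPABSD/VPABSQ. A scalar sketch of that idiom, for reference only.]

```cpp
#include <cstdint>

// Branchless abs: m is all-ones exactly when x is negative (the "sext vXi1"
// value the PatLeafs above match, i.e. x >> 31 or x >> 63, assuming the
// arithmetic shift x86 provides); then (x + m) ^ m == |x|.
int32_t abs_idiom(int32_t x) {
  int32_t m = x >> 31;   // 0 or -1
  return (x + m) ^ m;    // vectorized, this is what VPABSD/VPABSQ replace
}
```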
- (VPLZCNTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), - (VPLZCNTDrm addr:$src)>; -def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), - (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), - (VPLZCNTQrm addr:$src)>; -def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), - (VPLZCNTQrr VR512:$src)>; - def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; @@ -6197,7 +6595,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, @@ -6230,19 +6628,19 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; let mayStore = 1 in { def mr : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX_CD8<_.EltSize, CD8VT1>; def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(store (_.VT (vselect _.KRCWM:$mask, + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; @@ -6272,7 +6670,7 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; let mayLoad = 1 in @@ -6302,6 +6700,62 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, EVEX, VEX_W; +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1,
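[Aside, not part of the diff: for context on the vector-to-mask (X86cvt2mask) and compress/expand definitions above, a small C++ sketch via the published intrinsics. VPMOVD2M needs AVX512DQ, compress/expand are AVX512F; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPMOVD2M: the sign bit of each dword lane becomes one mask bit.
__mmask16 to_mask(__m512i v) { return _mm512_movepi32_mask(v); }

// VPCOMPRESSD: pack the mask-selected lanes together at the bottom...
__m512i pack(__mmask16 k, __m512i v) {
  return _mm512_maskz_compress_epi32(k, v);
}

// ...or stream them contiguously to memory (the mrk store form above).
void pack_store(void *p, __mmask16 k, __m512i v) {
  _mm512_mask_compressstoreu_epi32(p, k, v);
}

// VPEXPANDD: the inverse, scattering consecutive elements into masked lanes.
__m512i unpack(__mmask16 k, const void *p) {
  return _mm512_maskz_expandloadu_epi32(k, p);
}
```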
i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, {sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) @@ -6309,49 +6763,60 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>, EVEX_B; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + 
+//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (i8 imm:$src3))>; - let mayLoad = 1 in { - defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3))>; + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let mayLoad = 1 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", @@ -6359,7 +6824,6 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), (i8 imm:$src3))>, EVEX_B; - } } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -6369,20 +6833,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let isAsmParserOnly = 1 in { @@ -6398,18 +6862,25 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_NO_EXC))>, EVEX_B; } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { - defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>; + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, @@ -6428,6 +6899,20 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, } } +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string 
OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode>{ let Predicates = [HasAVX512] in { @@ -6447,6 +6932,14 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, } } +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6461,6 +6954,14 @@ defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, 0x55, X86VFixupimm, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6475,6 +6976,19 @@ defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, 0x51, X86VRange, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode = X86Shuf128>{ @@ -6486,6 +7000,29 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; } } +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + 
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; @@ -6496,31 +7033,51 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, - AVX512VLVectorVTInfo VTInfo_FP>{ +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, AVX512AIi8Base, EVEX_4V; - let isCodeGenOnly = 1 in { - defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>, - AVX512AIi8Base, EVEX_4V; - } } -defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>, +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>, +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr##_.Suffix, + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.MemOp:$src1), OpcodeStr##_.Suffix, + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; @@ -6531,7 +7088,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_unary_rm<opc, OpcodeStr, OpNode, _> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src1), 
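[Aside, not part of the diff: the immediates in the VRNDSCALE patterns above encode the rounding override. imm[1:0] selects the mode (0 nearest, 1 down, 2 up, 3 truncate), imm[2] defers to MXCSR, and imm[3] suppresses the precision exception, hence 0x1/0x2/0x3 for floor/ceil/trunc, 0x4 for rint and 0xC for nearbyint. Illustration through the standard intrinsic; wrapper names are mine.]

```cpp
#include <immintrin.h>

// Each call lowers to VRNDSCALEPS with the same immediate the patterns use.
__m512 floor16(__m512 v)     { return _mm512_roundscale_ps(v, 0x1); } // ffloor
__m512 ceil16(__m512 v)      { return _mm512_roundscale_ps(v, 0x2); } // fceil
__m512 trunc16(__m512 v)     { return _mm512_roundscale_ps(v, 0x3); } // ftrunc
__m512 rint16(__m512 v)      { return _mm512_roundscale_ps(v, 0x4); } // frint
__m512 nearbyint16(__m512 v) { return _mm512_roundscale_ps(v, 0xC); } // quiet
```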
OpcodeStr##_.Suffix, + (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, (_.VT (OpNode (X86VBroadcast @@ -6568,15 +7125,16 @@ multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info, + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, prd>, VEX_W; - defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; } multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>; - defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, avx512vl_i8_info, prd>; + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; } multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, @@ -6598,3 +7156,332 @@ def : Pat<(xor (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup 
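[Aside, not part of the diff: VPLZCNT and VPCONFLICT reappear above as ordinary unary ops via avx512_unary_rm_vl_dq, replacing the hand-written register-class versions deleted earlier in this file. Their semantics, sketched through the AVX512CD intrinsics; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPLZCNTD: per-lane count of leading zero bits (the ctlz node above).
__m512i lzcnt32(__m512i v) { return _mm512_lzcnt_epi32(v); }

// VPCONFLICTD: for each lane, a bitmask of earlier lanes holding the same
// value; zero means "no conflict", which is what vectorized histogram
// loops test before combining duplicate updates.
__m512i conflicts(__m512i v) { return _mm512_conflict_epi32(v); }
```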
(loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, 
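[Aside, not part of the diff: a quick semantic reference for the replicate and unpack forms above, using standard intrinsics and assuming AVX-512F; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VMOVDDUP: duplicate the even f64 lane of each 128-bit chunk.
__m512d dup_even(__m512d v) { return _mm512_movedup_pd(v); }

// VMOVSHDUP: duplicate the odd f32 lane of each pair (VMOVSLDUP: the even).
__m512 dup_odd(__m512 v) { return _mm512_movehdup_ps(v); }

// VPUNPCKLDQ: interleave the low dwords of each 128-bit chunk of a and b.
__m512i interleave_lo(__m512i a, __m512i b) {
  return _mm512_unpacklo_epi32(a, b);
}
```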
$dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set 
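[Aside, not part of the diff: the VPEXTR/VPINSR definitions above give EVEX encodings to the familiar element-access operations, so under AVX512BW/DQ a compiler may select them for the classic SSE4.1 intrinsics. A sketch; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPEXTRB / VPINSRB semantics on the 128-bit types these defs use.
int get_byte(__m128i v)            { return _mm_extract_epi8(v, 3); }
__m128i set_byte(__m128i v, int x) { return _mm_insert_epi8(v, x, 3); }

// VPEXTRQ needs the DQ forms (GR64 destination, x86-64 only), matching the
// HasDQI predicate above.
long long get_quad(__m128i v)      { return _mm_extract_epi64(v, 1); }
```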
_.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = 
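[Aside, not part of the diff: VPSADBW above computes sums of absolute byte differences into 64-bit lanes, which is why its destination VTInfo (v8i64) differs from its source (v64i8). Illustrative intrinsic use, assuming AVX512BW; the wrapper name is mine.]

```cpp
#include <immintrin.h>

// For every group of eight bytes: sum |a[i] - b[i]| into one 64-bit lane.
// A classic building block for motion estimation / block matching.
__m512i sad64(__m512i a, __m512i b) { return _mm512_sad_epu8(a, b); }
```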
[HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 5e19ad4..1a2e786 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; -def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem , - Imm8 , i8imm , imm, i8imm , invalid_node, +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, imm, i16i8imm, i16immSExt8, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, imm, i32i8imm, i32immSExt8, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, @@ -928,15 +928,22 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let hasSideEffects = 0; } -// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define // and use EFLAGS. -class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> +class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, IIC_BIN_CARRY_NONMEM> { let Uses = [areg, EFLAGS]; } +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". 
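[Aside, not part of the diff: the VPTERNLOGD/Q definitions earlier in this hunk treat the immediate as an 8-entry truth table indexed by the bit triple (src1,src2,src3), so any three-input boolean function is a single instruction. A sketch computing bitwise majority (table 0xE8); the wrapper name is mine.]

```cpp
#include <immintrin.h>

// imm bit (a<<2 | b<<1 | c) gives the output bit; 0xE8 sets table entries
// 3,5,6,7, i.e. "at least two of the three inputs set" -- bitwise majority.
__m512i majority(__m512i a, __m512i b, __m512i c) {
  return _mm512_ternarylogic_epi32(a, b, c, 0xE8);
}
```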
/// @@ -1092,14 +1099,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Uses = [EFLAGS], Defs = [EFLAGS] - def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1170,14 +1177,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Defs = [EFLAGS] - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1246,14 +1253,14 @@ let isCompare = 1 in { "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] - def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|al, $src}">; - def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|ax, $src}">; - def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|eax, $src}">; - def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|rax, $src}">; + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; } // isCompare //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index 2056056..787f15b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -156,10 +156,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { Flags |= MachineMemOperand::MOLoad; if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); return addOffset(MIB.addFrameIndex(FI), Offset) .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 
315f213..c73c950 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// -// SetCC instructions. +// CMOV instructions. multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteALU] in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 7f850d6..96a29ca 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -132,26 +132,6 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), Requires<[In64BitMode]>; } -// The MSVC runtime contains an _ftol2 routine for converting floating-point -// to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is -// used as a temporary register. No other registers (aside from flags) are -// touched. -// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 -// variant is unnecessary. - -let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { - def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), - "# win32 fptoui", - [(X86WinFTOL RFP32:$src)]>, - Requires<[Not64BitMode]>; - - def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), - "# win32 fptoui", - [(X86WinFTOL RFP64:$src)]>, - Requires<[Not64BitMode]>; -} - //===----------------------------------------------------------------------===// // EH Pseudo Instructions // @@ -172,6 +152,29 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), } +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), @@ -259,18 +262,33 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { let AddedComplexity = 20; } +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 1 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. 
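[Aside, not part of the diff: to see why the MOV32r1/MOV32r_1 pseudos described in the comment above and defined just below are worth having: in 32-bit mode, 1 or -1 can be built from XOR plus INC/DEC in three bytes instead of a five-byte MOV32ri. A sketch of source that benefits; the assembly in the comments is what one would expect, not a guarantee.]

```cpp
// Compiled for 32-bit x86 at -Os (OptForSize + NotSlowIncDec):
int one()       { return 1;  }  // xorl %eax,%eax ; incl %eax  (3 bytes)
int minus_one() { return -1; }  // xorl %eax,%eax ; decl %eax  (3 bytes)
// In 64-bit mode INC/DEC need a two-byte encoding, hence Not64BitMode.
```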
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. -let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1, hasSideEffects = 0 in -def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), - "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0 in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant, and for labels in the +// actually the zero-extension of a 32-bit constant and for labels in the // x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; let AddedComplexity = 1 in def : Pat<(i64 mov64imm32:$src), @@ -509,6 +527,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; @@ -535,8 +554,8 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { // TODO: Get this to fold the constant into the instruction. let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), - "or{l}\t{$zero, $dst|$dst, $zero}", - [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK, + "or{l}\t{$zero, $dst|$dst, $zero}", [], + IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK, Sched<[WriteALULd, WriteRMW]>; let hasSideEffects = 1 in @@ -752,67 +771,111 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. 
* It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) */ -multiclass RELEASE_BINOP_MI<string op> { +multiclass RELEASE_BINOP_MI<SDNode op> { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
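[Aside, not part of the diff: the RELEASE_* pseudos above exist so that the idiom described in the comment folds into a single memory-destination instruction. A minimal C++ illustration; whether the fold actually fires depends on the backend recognizing the acquire-load/release-store pair on the same address.]

```cpp
#include <atomic>

std::atomic<int> counter;

void bump() {
  // With the mi/mr pseudos above this can lower to `addl $5, counter(%rip)`
  // rather than a separate load, register add, and store.
  counter.store(counter.load(std::memory_order_acquire) + 5,
                std::memory_order_release);
}
```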
} -defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; -defm RELEASE_AND : RELEASE_BINOP_MI<"and">; -defm RELEASE_OR : RELEASE_BINOP_MI<"or">; -defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; -// Note: we don't deal with sub, because substractions of constants are -// optimized into additions before this code can run multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } -defm RELEASE_INC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 1)), - (add (atomic_load_16 addr:$dst), (i16 1)), - (add (atomic_load_32 addr:$dst), (i32 1)), - (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; -defm RELEASE_DEC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 -1)), - (add (atomic_load_16 addr:$dst), (i16 -1)), - (add (atomic_load_32 addr:$dst), (i32 -1)), - (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} /* TODO: These don't work because the type inference of TableGen fails. TODO: find a way to fix it. -defm RELEASE_NEG : RELEASE_UNOP< - (ineg (atomic_load_8 addr:$dst)), - (ineg (atomic_load_16 addr:$dst)), - (ineg (atomic_load_32 addr:$dst)), - (ineg (atomic_load_64 addr:$dst))>; +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. 
defm RELEASE_NOT : RELEASE_UNOP< (not (atomic_load_8 addr:$dst)), (not (atomic_load_16 addr:$dst)), @@ -821,42 +884,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// @@ -1077,11 +1140,11 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; // zextload bool -> zextload byte def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; -def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; -def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), - (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1298,7 +1361,6 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), sub_32bit)>; // r & (2^16-1) ==> movz -let AddedComplexity = 1 in // Give priority over i64immZExt32. 
def : Pat<(and GR64:$src, 0xffff), (SUBREG_TO_REG (i64 0), (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 4cd5563..8c351a5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -53,6 +53,19 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>; def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], + IIC_IRET>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [], + IIC_IRET>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>; + } // Unconditional branches. diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index c4b2d6d..af43d9f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -98,22 +98,22 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALU]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>; def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALU]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register @@ -146,18 +146,22 @@ def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler -def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; -def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; +let hasSideEffects = 0 in { +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + 
TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +} // 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a // 32-bit register. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 7cc3b59..fd800cf 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -15,13 +15,31 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst" in { +// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined +// below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). 
+ +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, (OpVT256 (Op VR256:$src2, VR256:$src1, (MemFrag256 addr:$src3))))]>, VEX_L; } -} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - Op>; -let hasSideEffects = 0 in { + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; - // For 231, only the register variant is commutable. - // For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; -} // hasSideEffects = 0 + MemFrag128, MemFrag256, OpTy128, OpTy256>; } // Fused Multiply-Add @@ -126,83 +130,122 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Constraints = "$src1 = $dst" in { -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, - RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. 
In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, - (mem_frag addr:$src3))))]>; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// All of the FMA*_Int opcodes are defined as commutable here. +// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial +// and the corresponding optimizations have been developed. +// Commuting the 1st operand of FMA*_Int requires some additional analysis, +// the commute optimization is legal only if all users of FMA*_Int use only +// the lowest element of the FMA*_Int instruction. Even though such analysis +// may be not implemented yet we allow the routines doing the actual commute +// transformation to decide if one or another instruction is commutable or not. +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, + hasSideEffects = 0 in +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, + Operand memopr, RegisterClass RC> { + def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; + + let mayLoad = 1 in + def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, memopr:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; } -} // Constraints = "$src1 = $dst" multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string PT2, Intrinsic Int, - SDNode OpNode, RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, PatFrag mem_frag, - ComplexPattern mem_cpat> { -let hasSideEffects = 0 in { - defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), - x86memop, RC, OpVT, mem_frag>; - // See the other defm of r231 for the explanation regarding the - // commutable flags. 
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; + string OpStr, string PackTy, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop> { + defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>; + defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC, + OpNode>; + defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>; } -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - OpNode>; +// The FMA 213 form is created for lowering of scalar FMA intrinscis +// to machine instructions. +// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands +// of FMA 213 form. +// The FMA 231 form can be get only by commuting the 1st operand of 213 or 132 +// forms and is possible only after special analysis of all uses of the initial +// instruction. Such analysis do not exist yet and thus introducing the 231 +// form of FMA*_Int instructions is done using an optimistic assumption that +// such analysis will be implemented eventually. +multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, + RegisterClass RC, Operand memop> { + defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), + memop, RC>; + defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, RC>; + defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), + memop, RC>; } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { - defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode, - FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>; - defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode, - FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W; + let ExeDomain = SSEPackedSingle in + defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode, + FR32, f32mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>; + + let ExeDomain = SSEPackedDouble in + defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode, + FR64, f64mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>, + VEX_W; -// These patterns use the 123 ordering, instead of 213, even though -// they match the intrinsic to the 213 version of the instruction. -// This is because src1 is tied to dest, and the scalar intrinsics -// require the pass-through values to come from the first source -// operand, not the second. + // These patterns use the 123 ordering, instead of 213, even though + // they match the intrinsic to the 213 version of the instruction. + // This is because src1 is tied to dest, and the scalar intrinsics + // require the pass-through values to come from the first source + // operand, not the second. 
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SSr213r") - (COPY_TO_REGCLASS $src1, FR32), - (COPY_TO_REGCLASS $src2, FR32), - (COPY_TO_REGCLASS $src3, FR32)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SDr213r") - (COPY_TO_REGCLASS $src1, FR64), - (COPY_TO_REGCLASS $src2, FR64), - (COPY_TO_REGCLASS $src3, FR64)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; } defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, @@ -334,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -379,6 +409,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + 
fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 49068e9..03ae211 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode 
RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<fadd, MRM0m, "add">; defm SUB : FPBinary<fsub, MRM4m, "sub">; -defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary<fmul, MRM1m, "mul">; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary<fdiv, MRM6m, "div">; -defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; } } @@ -306,13 +336,13 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; -def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; -def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; // Floating point cmovs. 
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : @@ -633,16 +663,18 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; -def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; -def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; -def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; -def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] } // SchedRW //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 1f61ffa..6432863 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -58,13 +60,17 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", @@ -74,11 +80,18 @@ def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTypeProfile<1, 2, 
[SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86psadbw : SDNode<"X86ISD::PSADBW", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -86,9 +99,11 @@ def X86psign : SDNode<"X86ISD::PSIGN", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; def X86pinsrb : SDNode<"X86ISD::PINSRB", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; @@ -114,19 +129,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, @@ -136,6 +149,35 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; @@ -159,10 +201,15 @@ def X86CmpMaskCCRound : def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, 
SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; -def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -178,6 +225,32 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; +def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; + +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; @@ -190,6 +263,7 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>, @@ -201,11 +275,15 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86extrqi : SDNode<"X86ISD::EXTRQI", SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, @@ -221,24 +299,30 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; -def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - 
SDTCisVec<2>]>; + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>]>; + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; @@ -250,15 +334,17 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<2>]>; + SDTCisVec<0>, SDTCisVT<2, i32>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>]>; + SDTCisVec<0>, SDTCisVT<3, i32>]>; def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; -def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -281,33 +367,74 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; -def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + 
SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; -def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. +def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; @@ -317,11 +444,13 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -341,9 +470,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; -def X86rcp28s : SDNode<"X86ISD::RCP28", 
STDFp2SrcRm>; -def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -362,7 +493,8 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; @@ -371,9 +503,12 @@ def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; - +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCVecEltisVT<1, i32>, SDTCisInt<2>]>; @@ -392,6 +527,10 @@ def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff @@ -417,17 +556,35 @@ def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, SDTCisOpSmallerThanOp<1, 0>, - SDTCisInt<2>]>>; + SDTCisVT<2, i32>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisInt<2>]>>; + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -436,10 +593,10 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", // These are 'extloads' from a scalar to the 
low element of a vector, zeroing // the top elements. These are used for the SSE 'ss' and 'sd' instruction // forms. -def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; -def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; @@ -490,9 +647,9 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def loadf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; def loadf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -590,9 +747,9 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a @@ -604,32 +761,6 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; -// MOVNT Support -// Like 'store', but requires the non-temporal bit to be set -def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal(); - return false; -}]>; - -def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && !ST->isTruncatingStore() && - ST->getAddressingMode() == ISD::UNINDEXED && - ST->getAlignment() >= 16; - return false; -}]>; - -def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && - ST->getAlignment() < 16; - return false; -}]>; - def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -851,29 +982,59 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), return isa<MaskedLoadSDNode>(N); }]>; +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 16; return false; }]>; def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 32; return false; }]>; def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 64; return false; }]>; def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ return isa<MaskedStoreSDNode>(N); }]>; +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index cf68ef0..246804e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -101,9 +101,11 @@ struct X86MemoryFoldTableEntry { void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo( - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -332,6 +334,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -495,7 +500,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, @@ -605,7 +609,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, @@ -1647,6 +1650,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -1729,11 +1738,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1749,11 +1764,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, 
X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1769,11 +1790,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1789,11 +1816,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, @@ -2282,7 +2315,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: { + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
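// (Illustrative aside, not from this patch, assuming the standard
// TargetInstrInfo contract for this hook: a client such as the register
// allocator asks roughly
//   if (TII->isTriviallyReMaterializable(DefMI, AA))
//     TII->reMaterialize(MBB, InsertPt, NewReg, /*SubIdx=*/0, DefMI, TRI);
// so listing the AVX-512 load opcodes above lets a constant-pool vector
// load be re-executed at its use point instead of being spilled and
// reloaded across a call.)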
if (MI->getOperand(1+X86::AddrBaseReg).isReg() && MI->getOperand(1+X86::AddrScaleAmt).isImm() && @@ -2363,9 +2424,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // It is safe to clobber EFLAGS at the end of a block if no successor has it // live in. if (Iter == E) { - for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), - SE = MBB.succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) return false; return true; } @@ -2411,13 +2471,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - // MOV32r0 is implemented with a xor which clobbers condition code. - // Re-materialize it as movri instructions to avoid side effects. - unsigned Opc = Orig->getOpcode(); - if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + DebugLoc DL = Orig->getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(0); + .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); @@ -2428,7 +2504,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -static bool hasLiveCondCodeDef(MachineInstr *MI) { +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && @@ -2453,7 +2529,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. - // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // A LEA instruction utilizes a SIB byte to encode its scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; @@ -2493,7 +2569,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, ImplicitOp = Src; ImplicitOp.setImplicit(); - NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); @@ -2914,10 +2990,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// We have a few instructions that must be hacked on to commute them. -/// -MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function.
It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case 
X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + 
*IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2944,7 +3172,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2980,7 +3208,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -2995,7 +3223,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3016,7 +3244,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3045,7 +3273,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3124,11 +3352,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. } default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. 
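// (Sketch of the assumed calling protocol, mirroring how the generic
// TargetInstrInfo hooks are driven elsewhere in this patch:
//   unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;
//   unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
//   if (TII->findCommutedOpIndices(MI, Idx1, Idx2))
//     TII->commuteInstruction(MI, /*NewMI=*/false, Idx1, Idx2);
// Passing CommuteAnyOperandIndex leaves the choice of one or both
// operands to the routine below.)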
+ if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of the operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Neither of the operands is fixed. By default set one of the commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of the operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is therefore useless. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commuting the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes (132, 213, 231) in each group.
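// (For reference, the three forms written over the source operands
// (S1, S2, S3), shown for FMADD; the SUB/NMADD/NMSUB variants permute
// identically:
//   132: Result = S1 * S3 + S2
//   213: Result = S2 * S1 + S3
//   231: Result = S2 * S3 + S1
// Worked example: the 132 form over (A, C, b) computes A*b + C. After
// swapping S1 and S2 the sources are (C, A, b), and the 231 form then
// computes S2*S3 + S1 = A*b + C -- the same value, which is why the
// tables below pair 132 with 231 when operands 1 and 2 are exchanged.)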
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } + } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
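// (Hypothetical usage sketch: for an MI with opcode X86::VFMADDSSr213r,
//   unsigned NewOpc = getFMA3OpcodeToCommuteOperands(MI, 2, 3);
// yields X86::VFMADDSSr132r, because swapping S2 and S3 of the 213 form
// "S2*S1 + S3" produces exactly the value the 132 form computes over the
// new operand order; a return value of 0 means no opcode expresses the
// requested swap.)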
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index in SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such an analysis + // is not implemented yet, so just return 0 in that case. + // When such an analysis becomes available, this will be the right place to + // call it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; +} + +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3141,46 +3630,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here.
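// (These four are exactly the symmetric predicates: for example, an
// immediate of 0x00 makes CMPPSrri compute a lane-wise a == b, which is
// unchanged under operand exchange, whereas 0x01 (LT) is not symmetric
// and is deliberately absent from the list above.)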
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -3821,15 +4286,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -inline static bool MaskRegClassContains(unsigned Reg) { +static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { @@ -3837,21 +4345,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if (MaskRegClassContains(DestReg) && - MaskRegClassContains(SrcReg)) + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && - 
(X86::GR32RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg) || - X86::GR8RegClass.contains(SrcReg))) { - SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); return X86::KMOVWkr; } - if ((X86::GR32RegClass.contains(DestReg) || - X86::GR16RegClass.contains(DestReg) || - X86::GR8RegClass.contains(DestReg)) && - MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); return X86::KMOVWrk; } return 0; @@ -3886,7 +4387,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) @@ -3900,34 +4401,91 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - if (SrcReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF64)); - BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } return; } - if (X86::GR32RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF32)); - BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return; + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. 
Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. + // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + // An example of this can be found in + // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and + // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using + // -verify-machineinstrs. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } - } - if (DestReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF64)); - return; + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); } - if (X86::GR32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF32)); - return; + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -4602,9 +5160,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) return false; } @@ -4645,8 +5202,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) - OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); return true; } @@ -4694,8 +5251,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); - if (FoldMI) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4725,6 +5281,38 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -4735,8 +5323,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4753,6 +5341,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -4777,10 +5369,25 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + case X86::MOV32ri64: + MI->setDesc(get(X86::MOV32ri)); + return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. + // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. 
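// (Resulting expansions under the %k0 heuristic, for illustration:
//   %k4 = KSET0D  ->  %k4 = KXORDrr  %k0<undef>, %k0<undef>
//   %k4 = KSET1Q  ->  %k4 = KXNORQrr %k0<undef>, %k0<undef>
// Both inputs are marked 'undef', so no real value of %k0 is read; the
// choice of input register only matters to the hardware's dependence
// tracking.)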
case X86::KSET0B: - case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1B: - case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; @@ -4788,12 +5395,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) { +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); - if (NumAddrOps < 4) // FrameIndex only - addOffset(MIB, 0); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether it's zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. + assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, @@ -4828,7 +5451,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); @@ -4838,7 +5462,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs); + addOperands(MIB, MOs, PtrOffset); } else { MIB.addOperand(MO); } @@ -4860,6 +5484,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of the inserted vector into a folded load + // of a single float.
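// (Sketch of the transform: when operand 2 comes from a full 16-byte
// load but the immediate selects only element SrcIdx of it, the fold
// loads just that float -- the displacement is advanced by SrcIdx*4 and
// the new immediate becomes (DstIdx << 4) | ZMask, since a scalar load
// always supplies element 0.)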
+ if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, @@ -4869,10 +5527,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // For CPUs that favor the register form of a call, - // do not fold loads into calls. - if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) return nullptr; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4886,6 +5547,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. + if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + // Folding a memory location into the two-address part of a two-address // instruction is different than folding it in other places. It requires // replacing the *two* registers with the memory location. @@ -4963,60 +5630,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands is tied to the destination // then we cannot commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute.
- return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. return nullptr; } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; } } @@ -5208,13 +5871,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; + if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = Subtarget.hasAVX(); - unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. 
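// (The classic case handled here, stated informally: an instruction such
// as CVTSI2SSrr writes only the low lanes of its XMM destination and so
// carries a false dependence on the register's previous contents; the
// inserted xorps/vxorps, with both source operands marked 'undef', severs
// that dependence without reading any real value.)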
@@ -5222,21 +5886,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); - } else - return; - MI->addRegisterKilled(Reg, TRI, true); + MI->addRegisterKilled(Reg, TRI, true); + } } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // Check switch flag - if (NoFusing) return nullptr; + if (NoFusing) + return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5303,6 +5966,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: return false; default: return true; @@ -5318,6 +5987,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: return false; default: return true; @@ -5342,10 +6017,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Check switch flag if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + // Avoid partial register update stalls unless optimizing for size. + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. @@ -5460,62 +6133,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // Check switch flag - if (NoFusing) return 0; - - if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { - switch (MI->getOpcode()) { - default: return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - return true; - case X86::ADD32ri: - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
- if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return false; - break; - } - } - - if (Ops.size() != 1) - return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. - const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { - if (Opc == X86::MOV32r0) - return true; - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } - - if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) - return true; - return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); -} - bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr*> &NewMIs) const { @@ -5536,9 +6153,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -5582,20 +6200,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, if (FoldedStore) MIB.addReg(Reg, RegState::Define); - for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) - MIB.addOperand(BeforeOps[i]); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); - for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) - MIB.addOperand(AfterOps[i]); - for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { - MachineOperand &MO = ImpOps[i]; - MIB.addReg(MO.getReg(), - getDefRegState(MO.isDef()) | + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | RegState::Implicit | - getKillRegState(MO.isKill()) | - getDeadRegState(MO.isDead()) | - getUndefRegState(MO.isUndef())); + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { @@ -5686,9 +6303,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5729,9 +6348,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -6192,16 +6813,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } @@ -6347,230 +6968,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, - const MachineBasicBlock *MBB) { - assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); - const MachineOperand &Op1 = Inst.getOperand(1); - const MachineOperand &Op2 = Inst.getOperand(2); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - // We need virtual register definitions. - MachineInstr *MI1 = nullptr; - MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) - MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) - MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - - // And they need to be in the trace (otherwise, they won't have a depth). - if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) - return true; - - return false; -} - -static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { - const MachineBasicBlock *MBB = Inst.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); - MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); - unsigned AssocOpcode = Inst.getOpcode(); - - // If only one operand has the same opcode and it's the second source operand, - // the operands must be commuted. - Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; - if (Commuted) - std::swap(MI1, MI2); - - // 1. The previous instruction must be the same type as Inst. - // 2. The previous instruction must have virtual register definitions for its - // operands in the same basic block as Inst. - // 3. The previous instruction's result must only be used by Inst. 
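The helpers being rewritten in this hunk all serve one transform: turning B = A op X; C = B op Y into T = X op Y; C = A op T, so the leaf operation no longer waits on A. A plain-C++ sketch of the intent (not the pass itself); it also shows why the FP cases that follow are gated on UnsafeFPMath, since FP addition and multiplication are not associative:

#include <cstdio>

// Before: the chain serializes along a -> b -> result.
double serial(double a, double x, double y) {
  double b = a * x; // Prev
  return b * y;     // Root: must wait for b
}

// After reassociation: x * y is independent of a, shortening the critical
// path when a is the late-arriving value.
double reassociated(double a, double x, double y) {
  double t = x * y;
  return a * t;
}

int main() {
  // Reassociation can change FP results, which is why it requires
  // relaxed-math flags:
  float a = 1e20f, x = -1e20f, y = 1.0f;
  std::printf("%g vs %g\n", (a + x) + y, a + (x + y)); // prints: 1 vs 0
  return 0;
}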
-  if (MI1->getOpcode() == AssocOpcode &&
-      hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
-      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
-    return true;
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+                                           const MachineBasicBlock *MBB) const {
+  assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+         "Reassociation needs binary operators");
+
+  // Integer binary math/logic instructions have a third source operand:
+  // the EFLAGS register. That operand must be both defined here and never
+  // used; i.e., it must be dead. If the EFLAGS operand is live, then we
+  // cannot change anything because rearranging the operands could affect other
+  // instructions that depend on the exact status flags (zero, sign, etc.)
+  // that are set by using these particular operands with this operation.
+  if (Inst.getNumOperands() == 4) {
+    assert(Inst.getOperand(3).isReg() &&
+           Inst.getOperand(3).getReg() == X86::EFLAGS &&
+           "Unexpected operand in reassociable instruction");
+    if (!Inst.getOperand(3).isDead())
+      return false;
+  }
 
-  return false;
+  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
 }
 
 // TODO: There are many more machine instruction opcodes to match:
 //       1. Other data types (integer, vectors)
-//       2. Other math / logic operations (and, or)
-static bool isAssociativeAndCommutative(unsigned Opcode) {
-  switch (Opcode) {
+//       2. Other math / logic operations (xor, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  case X86::AND8rr:
+  case X86::AND16rr:
+  case X86::AND32rr:
+  case X86::AND64rr:
+  case X86::OR8rr:
+  case X86::OR16rr:
+  case X86::OR32rr:
+  case X86::OR64rr:
+  case X86::XOR8rr:
+  case X86::XOR16rr:
+  case X86::XOR32rr:
+  case X86::XOR64rr:
+  case X86::IMUL16rr:
+  case X86::IMUL32rr:
+  case X86::IMUL64rr:
+  case X86::PANDrr:
+  case X86::PORrr:
+  case X86::PXORrr:
+  case X86::VPANDrr:
+  case X86::VPANDYrr:
+  case X86::VPORrr:
+  case X86::VPORYrr:
+  case X86::VPXORrr:
+  case X86::VPXORYrr:
+  // Normal min/max instructions are not commutative because of NaN and signed
+  // zero semantics, but these are. Thus, there's no need to check for global
+  // relaxed math; the instructions themselves have the properties we need.
+  case X86::MAXCPDrr:
+  case X86::MAXCPSrr:
+  case X86::MAXCSDrr:
+  case X86::MAXCSSrr:
+  case X86::MINCPDrr:
+  case X86::MINCPSrr:
+  case X86::MINCSDrr:
+  case X86::MINCSSrr:
+  case X86::VMAXCPDrr:
+  case X86::VMAXCPSrr:
+  case X86::VMAXCPDYrr:
+  case X86::VMAXCPSYrr:
+  case X86::VMAXCSDrr:
+  case X86::VMAXCSSrr:
+  case X86::VMINCPDrr:
+  case X86::VMINCPSrr:
+  case X86::VMINCPDYrr:
+  case X86::VMINCPSYrr:
+  case X86::VMINCSDrr:
+  case X86::VMINCSSrr:
+    return true;
+  case X86::ADDPDrr:
+  case X86::ADDPSrr:
   case X86::ADDSDrr:
   case X86::ADDSSrr:
-  case X86::VADDSDrr:
-  case X86::VADDSSrr:
+  case X86::MULPDrr:
+  case X86::MULPSrr:
   case X86::MULSDrr:
   case X86::MULSSrr:
+  case X86::VADDPDrr:
+  case X86::VADDPSrr:
+  case X86::VADDPDYrr:
+  case X86::VADDPSYrr:
+  case X86::VADDSDrr:
+  case X86::VADDSSrr:
+  case X86::VMULPDrr:
+  case X86::VMULPSrr:
+  case X86::VMULPDYrr:
+  case X86::VMULPSYrr:
   case X86::VMULSDrr:
   case X86::VMULSSrr:
-    return true;
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
   default:
     return false;
   }
 }
 
-/// Return true if the input instruction is part of a chain of dependent ops
-/// that are suitable for reassociation, otherwise return false.
-/// If the instruction's operands must be commuted to have a previous -/// instruction of the same type define the first source operand, Commuted will -/// be set to true. -static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { - // 1. The operation must be associative and commutative. - // 2. The instruction must have virtual register definitions for its - // operands in the same basic block. - // 3. The instruction must have a reassociable sibling. - if (isAssociativeAndCommutative(Inst.getOpcode()) && - hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && - hasReassocSibling(Inst, Commuted)) - return true; - - return false; -} - -// FIXME: This has the potential to be expensive (compile time) while not -// improving the code at all. Some ways to limit the overhead: -// 1. Track successful transforms; bail out if hit rate gets too low. -// 2. Only enable at -O3 or some other non-default optimization level. -// 3. Pre-screen pattern candidates here: if an operand of the previous -// instruction is known to not increase the critical path, then don't match -// that pattern. -bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { - if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) - return false; - - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - - // Look for this reassociation pattern: - // B = A op X (Prev) - // C = B op Y (Root) - - bool Commute; - if (isReassocCandidate(Root, Commute)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. Specify each commutation - // possibility for the Prev instruction in the sequence and let the - // machine combiner decide if changing the operands is worthwhile. - if (Commute) { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); - } else { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); - } - return true; - } +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. 
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; - return false; + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. + NewOp1.setIsDead(); + NewOp2.setIsDead(); } -/// Attempt the following reassociation to reduce critical path length: -/// B = A op X (Prev) -/// C = B op Y (Root) -/// ===> -/// B = X op Y -/// C = A op B -static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { - MachineFunction *MF = Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); - - // This array encodes the operand index for each parameter because the - // operands may be commuted. Each row corresponds to a pattern value, - // and each column specifies the index of A, B, X, Y. - unsigned OpIdx[4][4] = { - { 1, 1, 2, 2 }, - { 1, 2, 2, 1 }, - { 2, 1, 1, 2 }, - { 2, 2, 1, 1 } - }; - - MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); - MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); - MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); - MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); - MachineOperand &OpC = Root.getOperand(0); - - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(RegA)) - MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) - MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) - MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) - MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) - MRI.constrainRegClass(RegC, RC); - - // Create a new virtual register for the result of (X op Y) instead of - // recycling RegB because the MachineCombiner's computation of the critical - // path requires a new register definition rather than an existing one. 
- unsigned NewVR = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - - unsigned Opcode = Root.getOpcode(); - bool KillA = OpA.isKill(); - bool KillX = OpX.isKill(); - bool KillY = OpY.isKill(); - - // Create new instructions for insertion. - MachineInstrBuilder MIB1 = - BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) - .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); - InsInstrs.push_back(MIB1); - - MachineInstrBuilder MIB2 = - BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) - .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); - InsInstrs.push_back(MIB2); - - // Record old instructions for deletion. - DelInstrs.push_back(&Prev); - DelInstrs.push_back(&Root); +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); } -void X86InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { - MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); - - // Select the previous instruction in the sequence based on the input pattern. - MachineInstr *Prev = nullptr; - switch (Pattern) { - case MachineCombinerPattern::MC_REASSOC_AX_BY: - case MachineCombinerPattern::MC_REASSOC_XA_BY: - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - break; - case MachineCombinerPattern::MC_REASSOC_AX_YB: - case MachineCombinerPattern::MC_REASSOC_XA_YB: - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - } - assert(Prev && "Unknown pattern for machine combiner"); - - reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); - return; +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); } namespace { diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index bf63336..edd09d6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -26,19 +26,6 @@ namespace llvm { class X86RegisterInfo; class X86Subtarget; - namespace MachineCombinerPattern { - enum MC_PATTERN : int { - // These are commutative variants for reassociating a computation chain - // of the form: - // B = A op X (Prev) - // C = B op Y (Root) - MC_REASSOC_AX_BY = 0, - MC_REASSOC_AX_YB = 1, - MC_REASSOC_XA_BY = 
2,
-      MC_REASSOC_XA_YB = 3,
-    };
-  } // end namespace MachineCombinerPattern
-
   namespace X86 {
     // X86 specific condition code. These correspond to X86_*_COND in
     // X86InstrInfo.td. They must be kept in synch.
@@ -259,14 +246,64 @@ public:
                               MachineBasicBlock::iterator &MBBI,
                               LiveVariables *LV) const override;
 
-  /// commuteInstruction - We have a few instructions that must be hacked on to
-  /// commute them.
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value
+  /// 'CommuteAnyOperandIndex' assigned to them.
+  /// If both indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
   ///
-  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
-
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+  ///     findCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with operand #1.
   bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  /// Returns true if the routine could find two commutable operands
+  /// in the given FMA instruction. Otherwise, returns false.
+  ///
+  /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+  /// The output indices of the commuted operands are returned in these
+  /// arguments. Also, the input values of these arguments may be preset either
+  /// to indices of operands that must be commuted or to the special value
+  /// 'CommuteAnyOperandIndex', which means that the corresponding
+  /// operand index is not set and this method is free to pick any of the
+  /// available commutable operands.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+  /// can be interpreted as a query asking if operand #1 can be swapped
+  /// with any other available operand (e.g. operand #2, operand #3, etc.).
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with the adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  bool findFMA3CommutedOpIndices(MachineInstr *MI,
+                                 unsigned &SrcOpIdx1,
+                                 unsigned &SrcOpIdx2) const;
+
+  /// Returns an adjusted FMA opcode that must be used in an FMA instruction
+  /// that performs the same computations as the given MI but which has the
+  /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given \p MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with the adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2) const;
+
   // Branch analysis.
  bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -342,11 +379,6 @@ public:
                                   MachineBasicBlock::iterator InsertPt,
                                   MachineInstr *LoadMI) const override;
 
-  /// canFoldMemoryOperand - Returns true if the specified load / store is
-  /// folding is possible.
-  bool canFoldMemoryOperand(const MachineInstr *,
-                            ArrayRef<unsigned>) const override;
-
   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
   /// a store or a load and a store into two or more instructions. If this is
   /// possible, returns true as well as the new instructions by reference.
@@ -406,10 +438,9 @@ public:
   bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const;
 
-  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
-    if (!MO.isReg()) return false;
-    return X86II::isX86_64ExtendedReg(MO.getReg());
-  }
+  /// True if MI has a condition code def, e.g. EFLAGS, that is
+  /// not marked dead.
+  bool hasLiveCondCodeDef(MachineInstr *MI) const;
 
   /// getGlobalBaseReg - Return a virtual register initialized with
   /// the global base register value. Output instructions required to
@@ -452,26 +483,19 @@ public:
                             const MachineInstr *DefMI, unsigned DefIdx,
                             const MachineInstr *UseMI,
                             unsigned UseIdx) const override;
 
-  bool useMachineCombiner() const override { return true; }
-
-  /// Return true when there is potentially a faster code sequence
-  /// for an instruction chain ending in <Root>. All potential patterns are
-  /// output in the <Pattern> array.
-  bool getMachineCombinerPatterns(
-      MachineInstr &Root,
-      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
-
-  /// When getMachineCombinerPatterns() finds a pattern, this function generates
-  /// the instructions that could replace the original code sequence.
-  void genAlternativeCodeSequence(
-      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
-      SmallVectorImpl<MachineInstr *> &InsInstrs,
-      SmallVectorImpl<MachineInstr *> &DelInstrs,
-      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+  bool hasReassociableOperands(const MachineInstr &Inst,
+                               const MachineBasicBlock *MBB) const override;
+
+  void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+                             MachineInstr &NewMI1,
+                             MachineInstr &NewMI2) const override;
 
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2 if having two register operands, and the value it
@@ -500,12 +524,42 @@ public:
                              unsigned &FoldAsLoadDefReg,
                              MachineInstr *&DefMI) const override;
 
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+protected:
+  /// Commutes the operands in the given instruction by changing the operands
+  /// order and/or changing the instruction's opcode and/or the immediate value
+  /// operand.
+  ///
+  /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+  /// to be commuted.
+  ///
+  /// Do not call this method for a non-commutable instruction or
+  /// non-commutable operands.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands; a null pointer is returned in such cases.
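The FMA commute helpers declared above must be free to rewrite the opcode because the 132/213/231 suffix encodes which sources feed the multiply and which feeds the add. A standalone C++ check of the scalar semantics (the fma132/fma213/fma231 helpers are illustrative models of the instruction behavior, not LLVM API):

#include <cassert>

// d is the destructive first operand of each form.
static double fma132(double d, double s2, double s3) { return d * s3 + s2; }
static double fma213(double d, double s2, double s3) { return s2 * d + s3; }
static double fma231(double d, double s2, double s3) { return s2 * s3 + d; }

int main() {
  double a = 2.0, b = 3.0, c = 5.0;
  // Operands #1 and #2 both feed the multiply, so swapping them needs no
  // opcode change:
  assert(fma213(a, b, c) == fma213(b, a, c));
  // Swapping #1 and #3 moves a source between the multiply and the add, so
  // the opcode must be rewritten from 213 to 231 to preserve the value:
  assert(fma213(a, b, c) == fma231(c, b, a));
  // Likewise, swapping #2 and #3 rewrites 213 to 132:
  assert(fma213(a, b, c) == fma132(a, c, b));
  return 0;
}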
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 52bab9c..9c8339a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -106,8 +106,6 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; -def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; - def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -158,6 +156,8 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", @@ -250,9 +250,6 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, - [SDNPHasChain, SDNPOutGlue]>; - //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -344,18 +341,21 @@ def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; -// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of -// plain GR64, so that it doesn't potentially require a REX prefix. -def i8mem_NOREX : Operand<i64> { +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { let PrintMethod = "printi8mem"; - let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. // It represents GR32_TC, GR64_TC or GR64_TCW64. -def ptr_rc_tailcall : PointerLikeRegClass<2>; +def ptr_rc_tailcall : PointerLikeRegClass<4>; // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled @@ -697,34 +697,34 @@ def lea64mem : Operand<i64> { // X86 Complex Pattern Definitions. 
// -// Define X86 specific addressing mode. -def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; -def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. -def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. @@ -767,12 +767,21 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">, def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">, AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; @@ -794,6 +803,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -867,20 +877,54 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ }]>; -def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; -def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; -def i64immSExt8 
: ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO: This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2: This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix that
+// issue.
+// TODO3: This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
 
-def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
 
 // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
 // unsigned field.
-def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
 
 def i64immZExt32SExt8 : ImmLeaf<i64, [{
-  return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+  return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
 }]>;
 
 // Helper fragments for loads.
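A concrete view of what the *_su predicates above buy when optimizing for size: re-encoding a wide immediate at every use costs more bytes than materializing it once in a register. Byte counts in the comments are approximate and for illustration only:

// Each "movl $0x12345678, disp(%rdi)" re-encodes the 4-byte immediate
// (roughly 6-7 bytes per store). Hoisting the constant costs one ~5-byte
// "movl $imm, %eax" plus ~3-byte register stores, so repeated uses shrink,
// at the price of a stretched live range.
void fill(int *p) {
  p[0] = 0x12345678;
  p[1] = 0x12345678;
  p[2] = 0x12345678;
}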
@@ -914,11 +958,12 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ return false; }]>; -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; @@ -1020,12 +1065,8 @@ def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], - IIC_PUSH_MEM>, OpSize16; def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], - IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; @@ -1039,6 +1080,40 @@ def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP, EFLAGS] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP, EFLAGS] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; } let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, @@ -1071,15 +1146,18 @@ def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), 
"push{q}\t$src", [], IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; -} // mayStore, SchedRW +} // mayLoad, mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), - "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; @@ -1195,7 +1273,7 @@ def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; @@ -1275,13 +1353,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; @@ -1457,10 +1535,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1894,37 +1974,38 @@ def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } // Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, Sched<[WriteLoad]>; let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, Requires<[Not64BitMode]>; // ASCII Adjust AX Before Division -// sets AL, AH and EFLAGS and uses AL and AH +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; // ASCII Adjust AX After Multiply -// sets AL, AH and EFLAGS and uses AL +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAM8i8 : 
Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
                 "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
 
 // ASCII Adjust AL After Subtraction
-// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
 def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
           Requires<[Not64BitMode]>;
 
 // Decimal Adjust AL after Addition
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
 def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
           Requires<[Not64BitMode]>;
 
 // Decimal Adjust AL after Subtraction
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
 def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
           Requires<[Not64BitMode]>;
 } // SchedRW
@@ -2357,6 +2438,32 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
 } // HasTBM, EFLAGS
 
 //===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [],
+                    IIC_SSE_MONITOR>, TB;
+let Uses = [ECX, EAX, EBX] in
+def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>,
+               TB;
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+      Requires<[Not64BitMode]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+      Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let Uses = [EAX] in
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+
+//===----------------------------------------------------------------------===//
 // Pattern fragments to auto-generate TBM instructions.
//===----------------------------------------------------------------------===// @@ -2498,8 +2605,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; @@ -2532,14 +2639,15 @@ def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; -def : MnemonicAlias<"repe", "rep", "att">; -def : MnemonicAlias<"repz", "rep", "att">; -def : MnemonicAlias<"repnz", "repne", "att">; +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; def : MnemonicAlias<"sall", "shll", "att">; @@ -2579,14 +2687,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; @@ -2594,7 +2702,9 @@ def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; def : MnemonicAlias<"xsaveq", "xsave64", "att">; def : MnemonicAlias<"xrstorq", "xrstor64", "att">; def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; - +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> @@ -2640,61 +2750,61 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// // aad/aam default to base 10 if no operand is specified. -def : InstAlias<"aad", (AAD8i8 10)>; -def : InstAlias<"aam", (AAM8i8 10)>; +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. 
-def : InstAlias<"bt {$imm, $mem|$mem, $imm}", +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btc {$imm, $mem|$mem, $imm}", +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btr {$imm, $mem|$mem, $imm}", +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"bts {$imm, $mem|$mem, $imm}", +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. -def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; +def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // lods aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; -def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; // stos aliases. Accept the source being omitted because it's implicit in // the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the source. 
-def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // scas aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // div and idiv aliases for explicit A register. def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; @@ -2719,8 +2829,10 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; @@ -2780,90 +2892,90 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. 
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; - -def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". 
-def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; -def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; -def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; -def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp -def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. 
-def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
// outb %dx -> outb %al, %dx def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; -def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; -def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; -def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity @@ -2940,3 +3052,34 @@ def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. +def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index eaa7894..83f9b14 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -249,6 +249,7 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs 
GR32:$dst), (ins VR64:$src), (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; +let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (bitconvert GR64:$src))], @@ -262,7 +263,7 @@ def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. -let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], isBitcast = 1 in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", @@ -303,7 +304,7 @@ def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), + (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))))))], IIC_MMX_MOVQ_RR>; @@ -326,6 +327,7 @@ def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), } } // SchedRW +let Predicates = [HasSSE1] in def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], @@ -355,6 +357,7 @@ defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, MMX_INTALU_ITINS, 1>; defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, MMX_INTALUQ_ITINS, 1>; defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, @@ -382,6 +385,7 @@ defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, MMX_INTALUQ_ITINS>; @@ -408,8 +412,10 @@ defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, MMX_PMUL_ITINS, 1>; let isCommutable = 1 in @@ -422,6 +428,7 @@ defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, MMX_MISC_FUNC_ITINS, 1>; defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, @@ -439,6 +446,7 @@ defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, MMX_PSADBW_ITINS, 1>; +} defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, MMX_MISC_FUNC_ITINS>; @@ -594,6 +602,7 @@ let Constraints = "$src1 = $dst" in { } // Extract / Insert +let Predicates = [HasSSE1] in def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -601,6 +610,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = 
"$src1 = $dst" in { +let Predicates = [HasSSE1] in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), @@ -618,8 +628,10 @@ let Constraints = "$src1 = $dst" in { imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } +} // Mask creation +let Predicates = [HasSSE1] in def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", @@ -639,12 +651,12 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. let SchedRW = [WriteShuffle] in { -let Uses = [EDI] in +let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], IIC_MMX_MASKMOV>; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], @@ -653,10 +665,6 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), // 64-bit bit convert. let Predicates = [HasSSE2] in { -def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), - (MMX_MOVD64to64rr GR64:$src)>; -def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), - (MMX_MOVD64from64rr VR64:$src)>; def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td index cf5e2e3..71ab973 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -15,10 +15,10 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, In64BitMode]>; } @@ -26,16 +26,16 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS; @@ -43,28 +43,28 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD; defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def 
BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx \t{$src, $dst|$dst, $src}", []>, TB, + "bndstx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx \t{$src, $dst|$dst, $src}", []>, TB, - Requires<[HasMPX]>;
\ No newline at end of file + "bndldx\t{$src, $dst|$dst, $src}", []>, PS, + Requires<[HasMPX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 99386b0..6a7c456 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -330,9 +330,9 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, //===----------------------------------------------------------------------===// // A vector extract of the first f32/f64 position is a subregister copy -def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; -def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; // A 128-bit subvector extract from the first 256-bit vector position @@ -413,6 +413,8 @@ let Predicates = [HasSSE2] in { def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; } // Bitcasts between 256-bit vector types. Return the original type since @@ -650,10 +652,10 @@ let Predicates = [UseAVX] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; @@ -736,7 +738,7 @@ let Predicates = [UseSSE1] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; @@ -770,7 +772,7 @@ let Predicates = [UseSSE2] in { } // Extract and store. 
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; @@ -935,22 +937,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } -let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -} - - def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), @@ -1172,12 +1158,13 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, string base_opc, InstrItinClass itin> { - defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", itin>, VEX_4V; -let Constraints = "$src1 = $dst" in - defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $dst|$dst, $src2}", itin>; } @@ -1188,29 +1175,31 @@ let AddedComplexity = 20 in { } let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +}// UseAVX def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; @@ -1243,7 +1232,7 @@ let Predicates = [HasAVX] in { let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), 
addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; @@ -1297,31 +1286,33 @@ let AddedComplexity = 20 in { let SchedRW = [WriteStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // VMOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; @@ -1345,7 +1336,7 @@ let Predicates = [HasAVX] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; @@ -1377,7 +1368,7 @@ let Predicates = [UseSSE2] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; @@ -1475,6 +1466,8 @@ def SSE_CVT_SD2SI : OpndItins< IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM >; +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins> { @@ -1498,6 +1491,8 @@ let hasSideEffects = 0 in { } } +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0, Predicates = [UseAVX] in { @@ -1635,6 +1630,8 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s).
+// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, Operand memop, ComplexPattern mem_cpat, string asm, OpndItins itins> { @@ -1811,7 +1808,7 @@ def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; /// SSE 2 Only @@ -2073,14 +2070,16 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), (VCVTDQ2PSrm addr:$src)>; +} - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), @@ -2149,7 +2148,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), @@ -2306,7 +2305,9 @@ let Predicates = [HasAVX] in { (VCVTDQ2PSYrr VR256:$src)>; def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; +} +let Predicates = [HasAVX, NoVLX] in { // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2452,9 +2453,9 @@ let Defs = [EFLAGS] in { defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, "ucomisd">, PD, VEX, VEX_LIG; let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS, VEX, VEX_LIG; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD, VEX, VEX_LIG; } @@ -2475,9 +2476,9 @@ let Defs = [EFLAGS] in { "ucomisd">, PD; let Pattern = []<dag> in { - defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS; - defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD; } @@ -2605,19 +2606,20 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFShuffle]>; } -defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SSEPackedSingle>,
PS, VEX_4V; -defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; -defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SSEPackedDouble>, PD, VEX_4V; -defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; - +} let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -2627,7 +2629,7 @@ let Constraints = "$src1 = $dst" in { memopv2f64, SSEPackedDouble>, PD; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2694,6 +2696,7 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, Sched<[WriteFShuffleLd, ReadAfterLd]>; } +let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, VEX_4V; @@ -2719,7 +2722,7 @@ defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, VEX_4V, VEX_L; - +}// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", @@ -2845,8 +2848,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, - OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX, NoVLX] in + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2854,7 +2857,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2, NoVLX] in +let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2863,13 +2866,13 @@ let Predicates = [HasAVX2, NoVLX] in // These are ordered here for pattern ordering requirements with the fp versions defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 0>; + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; 
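// With the new trailing Predicate, an instantiation such as PAND above is
// expected to yield the bare SSE def plus VEX/VEX_L defs guarded by
// [HasAVX, NoVLX] / [HasAVX2, NoVLX], so that on AVX-512VL targets the
// EVEX-encoded vpandd/vpandq patterns can take over (a sketch of the
// intent, inferred from the PDI_binop_all multiclass above).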
//===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2911,7 +2914,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; @@ -2923,7 +2926,7 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, PS, VEX_4V, VEX_L; - + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, PD, VEX_4V, VEX_L; @@ -3183,7 +3186,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE1] in { // extracted scalar math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3198,7 +3201,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3215,7 +3218,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3241,7 +3244,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE2] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3256,7 +3259,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3271,14 +3274,14 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3390,9 +3393,18 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def : Pat<(Intr (load addr:$src)), (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) addr:$src), VR128))>; - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>(NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } } @@ -3423,34 +3435,43 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, } } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in ExeDepFix or with smarter RA. let Predicates = [UseAVX] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - - def : Pat<(vt (OpNode mem_cpat:$src)), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), - mem_cpat:$src)>; - } let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), VR128:$src)>; - - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) + } + let Predicates = [HasAVX, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } - let Predicates = [UseAVX, OptForSize] in - def : Pat<(ScalarVT (OpNode (load addr:$src))), - (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), - addr:$src)>; + let Predicates = [UseAVX, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + } } /// sse1_fp_unop_p - SSE1 unops in packed form. 
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX] in { + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), @@ -3546,16 +3567,16 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -4018,39 +4039,43 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } // ExeDomain = SSEPackedInt defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 1>; + SSE_INTALUQ_ITINS_P, 1, NoVLX>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 0>; + SSE_INTALUQ_ITINS_P, 0, NoVLX>; defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, - 
SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4067,26 +4092,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; - -let Predicates = [HasAVX2] in - def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPSADBWYrr VR256:$src2, VR256:$src1)>; let Predicates = [HasAVX] in - def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPSADBWrr VR128:$src2, VR128:$src1)>; - -def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PSADBWrr VR128:$src2, VR128:$src1)>; +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, @@ -4105,9 +4122,6 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, //===---------------------------------------------------------------------===// let Predicates = [HasAVX, NoVLX] in { -defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4115,9 +4129,6 @@ defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4125,14 +4136,26 @@ defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] 
in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4147,13 +4170,9 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] let Predicates = [HasAVX2, NoVLX] in { -defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4161,9 +4180,6 @@ defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4171,14 +4187,25 @@ defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // 256-bit logical shifts. 
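// These shift by whole bytes, independently within each 128-bit lane; e.g.
// "vpslldq $3, %ymm1, %ymm0" (AT&T syntax; an illustration) moves both
// lanes of %ymm1 left by three bytes, shifting in zeros.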
def VPSLLDQYri : PDIi8<0x73, MRM7r, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -4193,8 +4220,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX2] +} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, @@ -4247,17 +4273,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { //===---------------------------------------------------------------------===// defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions @@ -4511,40 +4537,43 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, loadv2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, loadv2i64, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V, VEX_L; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, - bc_v8i32>, VEX_4V, VEX_L; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, - bc_v4i64>, VEX_4V, VEX_L; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, 
X86Unpckh, bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, @@ -4600,7 +4629,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4615,7 +4644,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, Sched<[WriteShuffleLd, ReadAfterLd]>; // Insert -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in @@ -4683,7 +4712,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), } // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// -// SSE2 - Move Doubleword +// SSE2 - Move Doubleword/Quadword //===---------------------------------------------------------------------===// //===---------------------------------------------------------------------===// @@ -4770,23 +4799,23 @@ let isCodeGenOnly = 1 in { // def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4808,24 +4837,25 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), let SchedRW = [WriteMove] in { def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))], + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; } //SchedRW let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst), - (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + 
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4883,30 +4913,18 @@ let isCodeGenOnly = 1 in { IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } -//===---------------------------------------------------------------------===// -// Patterns and instructions to describe movd/movq to XMM register zero-extends -// -let isCodeGenOnly = 1, SchedRW = [WriteMove] in { -let AddedComplexity = 15 in { -def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>, - VEX, VEX_W; -def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>; -} -} // isCodeGenOnly, SchedRW - let Predicates = [UseAVX] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. 
let AddedComplexity = 20 in { @@ -4924,16 +4942,16 @@ let Predicates = [UseAVX] in { def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (MOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; @@ -4985,12 +5003,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; } // ExeDomain, SchedRW @@ -5119,7 +5137,7 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", @@ -5134,7 +5152,7 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), @@ -5190,21 +5208,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (v4f64 (X86Movddup - (scalar_to_vector (loadf64 addr:$src)))))]>, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), @@ -5212,16 +5239,6 @@ let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (v2i64 
(scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - - // 256-bit version - def : Pat<(X86Movddup (loadv4f64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (loadv4i64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; } let Predicates = [UseAVX, OptForSize] in { @@ -5791,37 +5808,37 @@ let Predicates = [HasAVX2] in let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palignr<"palignr">; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -6145,7 +6162,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -6170,7 +6187,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in 
defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -6194,7 +6211,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -6217,7 +6234,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, REX_W; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -6285,7 +6302,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -6311,7 +6328,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -6337,7 +6354,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -6543,71 +6560,71 @@ let Predicates = [HasAVX] in { let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; } let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x1))>; + (VROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x2))>; + (VROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc 
VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x3))>; + (VROUNDPSr VR128:$src, (i32 0xB))>; def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x1))>; + (VROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x2))>; + (VROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x3))>; + (VROUNDPDr VR128:$src, (i32 0xB))>; def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x1))>; + (VROUNDYPSr VR256:$src, (i32 0x9))>; def : Pat<(v8f32 (fnearbyint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0xC))>; def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x2))>; + (VROUNDYPSr VR256:$src, (i32 0xA))>; def : Pat<(v8f32 (frint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0x4))>; def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x3))>; + (VROUNDYPSr VR256:$src, (i32 0xB))>; def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x1))>; + (VROUNDYPDr VR256:$src, (i32 0x9))>; def : Pat<(v4f64 (fnearbyint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0xC))>; def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x2))>; + (VROUNDYPDr VR256:$src, (i32 0xA))>; def : Pat<(v4f64 (frint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0x4))>; def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x3))>; + (VROUNDYPDr VR256:$src, (i32 0xB))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6619,47 +6636,47 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x1))>; + (ROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x2))>; + (ROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x3))>; + (ROUNDPSr VR128:$src, (i32 0xB))>; def : 
Pat<(v2f64 (ffloor VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x1))>; + (ROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x2))>; + (ROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x3))>; + (ROUNDPDr VR128:$src, (i32 0xB))>; } //===----------------------------------------------------------------------===// @@ -7815,62 +7832,56 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // VBROADCAST - Load from memory and broadcast to all elements of the // destination operand // -class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : - AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; - -class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, PatFrag ld_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, - Sched<[Sched]>, VEX { - let mayLoad = 1; -} + Sched<[Sched]>, VEX; // AVX2 adds register forms -class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int, SchedWrite Sched> : +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32, loadf32, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32, loadf32, WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64, loadf64, WriteFShuffleLd>, VEX_L; -def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256, - WriteFShuffleLd>, VEX_L; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps, - WriteFShuffle>; - def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256, - WriteFShuffle256>, VEX_L; + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256, - WriteFShuffle256>, VEX_L; +def VBROADCASTSDYrr : 
avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; -let mayLoad = 1, Predicates = [HasAVX2] in +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -7891,7 +7902,7 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, @@ -8080,17 +8091,19 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, (bitconvert (i_frag addr:$src2))))]>, VEX_4V, Sched<[WriteFShuffleLd, ReadAfterLd]>; - def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; - def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] } let ExeDomain = SSEPackedSingle in { @@ -8106,7 +8119,7 @@ let ExeDomain = SSEPackedDouble in { loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), (VPERMILPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), @@ -8244,12 +8257,15 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16 + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16 + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; @@ -8309,97 +8325,62 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, // multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { - def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, 
MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int128 VR128:$src))]>, + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; - def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; - def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; - def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; + v16i8, v32i8, NoVLX_Or_NoBWI>; defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; + v8i16, v16i16, NoVLX_Or_NoBWI>; defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; + v4i32, v8i32, NoVLX>; defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; + v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 
VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; - def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSrr VR128:$src)>; - def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSYrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), - (VBROADCASTSDYrr VR128:$src)>; + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; // Provide aliases for broadcast from the same register class that // automatically does the extract. - def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), - (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), - sub_xmm)))>; - def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), - (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), - sub_xmm)))>; - def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), - (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), - (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), - sub_xmm)))>; def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))>; @@ -8598,7 +8579,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, @@ -8722,16 +8703,16 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), @@ -8776,10 +8757,10 @@ def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0) (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), VR128:$mask)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), 
(VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), @@ -8804,10 +8785,10 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0) (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), VR256:$mask)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), @@ -8865,12 +8846,13 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; } -defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; -defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; -defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; -defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; -defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, @@ -8905,3 +8887,59 @@ let mayLoad = 1, Constraints defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; } } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index caecf70..c1df978 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -31,21 +31,21 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] -def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
-def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -85,19 +85,19 @@ def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), "shl{w}\t{$src, $dst|$dst, $src}", [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), "shl{l}\t{$src, $dst|$dst, $src}", [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -137,18 +137,18 @@ def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } -def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), "shr{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shr{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shr{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -185,19 +185,19 @@ def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), 
addr:$dst)], IIC_SR>; -def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), "shr{w}\t{$src, $dst|$dst, $src}", [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), "shr{l}\t{$src, $dst|$dst, $src}", [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -241,20 +241,20 @@ def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), IIC_SR>; } -def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "sar{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "sar{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "sar{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -298,19 +298,19 @@ def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), "sar{w}\t{$src, $dst|$dst, $src}", [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), "sar{l}\t{$src, $dst|$dst, $src}", [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -342,7 +342,7 @@ let hasSideEffects = 0 in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), @@ 
-350,7 +350,7 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), @@ -358,7 +358,7 @@ def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), @@ -367,7 +367,7 @@ def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -376,7 +376,7 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -384,7 +384,7 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), @@ -392,7 +392,7 @@ def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), @@ -400,7 +400,7 @@ def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), @@ -411,36 +411,36 @@ def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins 
i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in { @@ -482,19 +482,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } -def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -537,19 +537,19 @@ def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], 
IIC_SR>; } -def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; -def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), "rol{w}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize16; -def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), "rol{l}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize32; -def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; @@ -589,19 +589,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } -def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -644,19 +644,19 @@ def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), "ror{w}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), "ror{l}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -727,42 +727,42 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), let isCommutable = 1 in { // These instructions commute to each other. 
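For the double-shift definitions that follow: X86shld shifts $src1 left by the count and fills the vacated low bits from the high end of $src2, which is why the SHLD/SHRD pair with complementary counts commute, per the isCommutable annotation above. A C sketch of the 32-bit operation, assuming a count in 1..31 (0 and 32 would make the second shift undefined in C); Clang and GCC can, but are not guaranteed to, select shld/shrd for this funnel-shift shape:

    /* shld semantics: result = (hi << n) | (lo >> (32 - n)),
       equivalently shrd(lo, hi, 32 - n). */
    unsigned shld32(unsigned hi, unsigned lo, unsigned n) {
        return (hi << n) | (lo >> (32 - n));
    }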
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, @@ -801,14 +801,14 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), } def SHLD16mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], @@ -816,14 +816,14 @@ def SHRD16mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize16; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], @@ -831,14 +831,14 @@ def SHRD32mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize32; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; def SHRD64mri8 : RIi8<0xAC, 
MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], @@ -860,12 +860,12 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let hasSideEffects = 0 in { - def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShiftLd]>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 0350566..a97d1e5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -44,7 +44,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", let SchedRW = [WriteSystem] in { -def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)], IIC_INT>; @@ -60,12 +60,6 @@ def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; - -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>, - OpSize32; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, - Requires<[In64BitMode]>; } // SchedRW def : Pat<(debugtrap), @@ -88,13 +82,13 @@ def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; let Defs = [AL] in -def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in -def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; let Defs = [EAX] in -def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; let Uses = [DX, AL] in @@ -108,13 +102,13 @@ def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; let Uses = [AL] in -def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in -def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; let Uses = [EAX] in -def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; } // SchedRW @@ -478,39 +472,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; 
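The XSAVE rework in the next hunk gates each instruction behind its own feature predicate and gives the save/restore forms real selection patterns: the memory operand moves from (outs) to (ins), and each form is matched to an int_x86_xsave* intrinsic whose 64-bit feature bitmap the compiler splits into the EDX:EAX pair the hardware reads. A minimal C sketch of what these patterns exist to select, assuming Clang/GCC's <immintrin.h> _xsave builtin and compilation with -mxsave:

    #include <immintrin.h>

    /* The save area must be 64-byte aligned and at least 576 bytes
       (512-byte legacy region plus the 64-byte XSAVE header); the
       hardware saves the components in RFBM & XCR0, so ~0ULL requests
       everything currently enabled. */
    void save_ext_state(void *area) {
        _xsave(area, ~0ULL);
    }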
//===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} -let Uses = [RDX, RAX] in { - def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave\t$dst", []>, TB; - def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor\t$dst", []>, TB; + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, PS; - def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; - + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors\t$dst", []>, TB; + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec\t$dst", []>, TB; - def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves\t$dst", []>, TB; - def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; } +} // Uses } // SchedRW //===----------------------------------------------------------------------===// @@ -534,6 +549,19 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def 
MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +//===----------------------------------------------------------------------===// +// PKU - enable protection key +let usesCustomInserter = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} + +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; //===----------------------------------------------------------------------===// // FS/GS Base Instructions diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 8455b8d..4cb2304 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -83,57 +83,64 @@ let ExeDomain = SSEPackedDouble in { defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; } -multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, - XOP_4V, VEX_W; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, - XOP_4VOp3; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; - defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; - defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; - defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; - defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; - defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; - defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; - defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; - defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; - defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; - defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm
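Two things happen in the XSAVE hunk above: each instruction family is gated behind its own predicate (HasXSAVE, HasXSAVEOPT, HasXSAVEC, HasXSAVES), and every save/restore form gains a pattern tying it to its intrinsic with the requested-feature bitmap split across EDX:EAX. That split mirrors how the C-level builtin passes its 64-bit mask; a usage sketch, assuming -mxsave, with the 4096-byte size standing in for the real size reported by CPUID leaf 0Dh:

    #include <immintrin.h>
    #include <cstdlib>
    #include <cstring>

    void snapshot_x87_sse() {
      void *area = std::aligned_alloc(64, 4096); // XSAVE area is 64-byte aligned
      std::memset(area, 0, 4096);                // header area should start zeroed
      _xsave(area, 0x3);                         // mask EDX:EAX = x87 | SSE state
      std::free(area);
    }

The PKU additions pair custom-inserted pseudos for int_x86_rdpkru/int_x86_wrpkru with the real RDPKRU/WRPKRU encodings, whose Uses/Defs pin PKRU traffic to EAX, with ECX (and, for WRPKRU, EDX) expected to be zero. Each protection key owns two PKRU bits, access-disable and write-disable; a hedged sketch of making a key read-only (pkey is a hypothetical key index, e.g. obtained via pkey_alloc() on Linux):

    #include <immintrin.h>

    void make_key_read_only(unsigned pkey) {
      unsigned pkru = _rdpkru_u32();    // should select the RDPKRU pseudo above
      pkru &= ~(1u << (2 * pkey));      // clear the access-disable bit
      pkru |=  (1u << (2 * pkey + 1));  // set the write-disable bit
      _wrpkru(pkru);                    // should select the WRPKRU pseudo
    }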
VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; } let ExeDomain = SSEPackedInt in { - defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; - defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; - defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -170,30 +177,34 @@ let ExeDomain = SSEPackedInt in { } // Instruction where second source can be memory, third must be imm8 -multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - i8immZExt3:$cc))]>, XOP_4V; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; let mayLoad = 1 in def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; @@ -201,14 +212,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { } let ExeDomain = SSEPackedInt in { // SSE integer instructions - defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; - defm VPCOMW : xopvpcom<0xCD, "w", 
int_x86_xop_vpcomw>; - defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; - defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; - defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; - defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; - defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; - defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; } // Instruction where either second or third source can be memory @@ -270,42 +281,52 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { let ExeDomain = SSEPackedInt in defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, VEX_W, MemOp4; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, 
$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 2c8b95b..b525d5e 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -18,14 +18,19 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, - VPERM_3OP_MASKZ, - INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, - EXPAND_FROM_MEM, BLEND + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -138,6 +143,54 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + 
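One change slightly further up deserves a note: the new VPCMOV patterns select the generic bitwise-select DAG directly, with no intrinsic involved. Since X86andnp(x, y) computes ~x & y, the matched expression (or (and $src3, $src1), (X86andnp $src3, $src2)) is the classic per-bit select. A scalar model of mine, not the patch's:

    #include <cstdint>

    // vpcmov: each result bit comes from a where sel is 1, from b where sel is 0.
    uint64_t pcmov(uint64_t a, uint64_t b, uint64_t sel) {
      return (sel & a) | (~sel & b);
    }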
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -209,6 +262,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -241,6 +296,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -274,16 +330,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + 
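Among the AVX2 table entries above, avx2_pavg_b/w now lower through X86ISD::AVG and avx2_psad_bw through X86ISD::PSADBW instead of remaining opaque intrinsic calls. Per-element models (my sketches, widened internally so the arithmetic cannot overflow):

    #include <cstdint>
    #include <cstdlib>

    // PAVGB: unsigned byte average with rounding, (a + b + 1) >> 1.
    uint8_t pavgb(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((unsigned(a) + unsigned(b) + 1) >> 1);
    }

    // PSADBW, one 8-byte lane: sum of absolute differences of the bytes.
    unsigned psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
      unsigned sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += static_cast<unsigned>(std::abs(int(a[i]) - int(b[i])));
      return sum;
    }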
X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -371,6 +467,50 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -388,6 +528,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -415,7 +559,184 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - + 
X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 
X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, @@ -452,6 +773,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, @@ -464,6 +793,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + 
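A reading aid for the cvtt* block above: the extra 't' is "convert with truncation", i.e. round toward zero, which is exactly C's float-to-integer cast. That is why these entries map onto the plain ISD::FP_TO_SINT / ISD::FP_TO_UINT opcodes, while their non-truncating cvt* siblings carry the rounding-mode-aware *_RND nodes:

    // vcvttps2dq-style semantics: (int)x truncates toward zero,
    // so no rounding-mode operand is needed in the DAG node.
    int cvtt_model(float x) { return static_cast<int>(x); }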
X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, @@ -472,10 +857,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -484,10 +869,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - 
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, @@ -554,6 +961,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), @@ -596,6 +1009,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, 
INTR_TYPE_2OP_MASK, ISD::SMAX, 0), @@ -644,6 +1069,186 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), 
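The pmov/pmovs/pmovus triples running through here encode three flavors of vector narrowing, mapped to X86ISD::VTRUNC, VTRUNCS and VTRUNCUS respectively: plain bit truncation, signed saturation, and unsigned saturation. Scalar sketches for the dword-to-byte case (mine, not the patch's):

    #include <cstdint>
    #include <algorithm>

    uint8_t pmov_db(int32_t x) {      // VTRUNC: keep the low 8 bits
      return static_cast<uint8_t>(x);
    }
    int8_t pmovs_db(int32_t x) {      // VTRUNCS: clamp to [-128, 127]
      return static_cast<int8_t>(std::min(127, std::max(-128, x)));
    }
    uint8_t pmovus_db(uint32_t x) {   // VTRUNCUS: source read as unsigned,
      return static_cast<uint8_t>(std::min(x, 255u));  // clamped to 255
    }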
+ X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -680,28 +1285,139 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + 
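The prol/pror versus prolv/prorv split above is immediate-count versus variable-count rotation: the immediate forms keep target nodes (X86ISD::VROTLI/VROTRI) while the variable forms map straight onto the generic ISD::ROTL/ROTR opcodes. For reference, a scalar rotate-left spelled out so the count-zero case stays well defined (C++20's std::rotl computes the same thing):

    #include <cstdint>

    uint32_t rotl32(uint32_t x, unsigned c) {
      c &= 31;                                   // count is taken mod 32
      return (x << c) | (x >> ((32 - c) & 31));  // the final & 31 handles c == 0
    }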
X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, 
ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + 
X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), @@ -728,16 +1444,98 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + 
X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + 
X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -750,6 +1548,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, @@ -758,6 +1588,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, @@ -782,9 +1616,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + 
X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, @@ -821,7 +1700,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, @@ -852,54 +1730,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + 
X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, @@ -910,7 +1790,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), 
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, @@ -959,14 +1850,59 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + 
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1017,6 +1953,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), @@ -1024,6 +1962,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), @@ -1066,12 +2005,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), @@ -1105,7 +2038,31 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, 
X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) }; /* @@ -1128,6 +2085,102 @@ static void verifyIntrinsicTables() { std::is_sorted(std::begin(IntrinsicsWithChain), std::end(IntrinsicsWithChain)) && "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } } } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 3415ced..e1ca558 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -14,6 +14,7 @@ #include "X86AsmPrinter.h" #include "X86RegisterInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "Utils/X86ShuffleDecode.h" @@ -92,7 +93,6 @@ namespace llvm { SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); CurrentShadowSize += Code.size(); if (CurrentShadowSize >= RequiredShadowSize) InShadow = false; // The shadow is big enough. Stop counting. @@ -128,7 +128,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); MCSymbol *Sym = nullptr; @@ -151,7 +151,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); @@ -159,7 +159,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { const GlobalValue *GV = MO.getGlobal(); AsmPrinter.getNameWithPrefix(Name, GV); } else if (MO.isSymbol()) { - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else if (MO.isMBB()) { assert(Suffix.empty()); Sym = MO.getMBB()->getSymbol(); @@ -455,12 +455,9 @@ ReSimplify: "LEA has segment specified!"); break; - case X86::MOV32ri64: - OutMI.setOpcode(X86::MOV32ri); - break; - // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but the other isn't. + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -478,18 +475,19 @@ ReSimplify: unsigned NewOpc; switch (OutMI.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } OutMI.setOpcode(NewOpc); } @@ -532,6 +530,23 @@ ReSimplify: break; } + case X86::CLEANUPRET: { + // Replace CLEANUPRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
case X86::TAILJMPr: case X86::TAILJMPd: @@ -598,17 +613,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; @@ -875,7 +902,10 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, MCInst LoadMI; LoadMI.setOpcode(LoadOpcode); - LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, E = MI.operands_end(); I != E; ++I) @@ -1062,6 +1092,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86ATTInstPrinter::getRegisterName(Reg)); break; } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + case X86::TAILJMPr: case X86::TAILJMPm: case X86::TAILJMPd: @@ -1095,12 +1137,30 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. 
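+ // (Illustrative gloss, not part of the original commit: the CALLpcrel32 emitted above pushes a return address, moving the CFA by one slot; when no frame pointer tracks the frame, the DWARF frame info must be adjusted via the paired EmitCFIAdjustCfaOffset calls below and restored once POP32r pops the PIC base.)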
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + // Emit the label. OutStreamer->EmitLabel(PICBase); // popl $reg EmitAndCountInstruction(MCInstBuilder(X86::POP32r) .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } return; } @@ -1206,19 +1266,48 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Lower PSHUFB and VPERMILP normally but add a comment if we can find - // a constant shuffle mask. We won't be able to do this at the MC layer - // because the mask isn't an immediate. + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. case X86::PSHUFBrm: case X86::VPSHUFBrm: - case X86::VPSHUFBYrm: { + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; @@ -1240,35 +1329,53 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; - DecodeVPERMILPMask(C, Mask); + DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } - // For loads from a constant pool to a vector register, print the constant - // loaded. 
- case X86::MOVAPDrm: - case X86::VMOVAPDrm: - case X86::VMOVAPDYrm: - case X86::MOVUPDrm: - case X86::VMOVUPDrm: - case X86::VMOVUPDYrm: - case X86::MOVAPSrm: - case X86::VMOVAPSrm: - case X86::VMOVAPSYrm: - case X86::MOVUPSrm: - case X86::VMOVUPSrm: - case X86::VMOVUPSYrm: - case X86::MOVDQArm: - case X86::VMOVDQArm: - case X86::VMOVDQAYrm: - case X86::MOVDQUrm: - case X86::VMOVDQUrm: - case X86::VMOVDQUYrm: +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. + CASE_ALL_MOV_RM() if (!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() > 4) @@ -1302,7 +1409,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa<UndefValue>(COp)) { CS << "u"; } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index ac2cdc8c..c9e636f 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index e6db970..00515dd 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -84,14 +84,18 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// of pushes to pass function parameters. bool HasPushSequences = false; - /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill - /// slot for the frame pointer. + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. 
bool HasSEHFramePtrSave = false; /// The frame index of a stack object containing the original frame pointer /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -100,7 +104,7 @@ private: public: X86MachineFunctionInfo() = default; - explicit X86MachineFunctionInfo(MachineFunction &MF) {}; + explicit X86MachineFunctionInfo(MachineFunction &MF) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -160,6 +164,9 @@ public: SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 0000000..45cc0ae --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,503 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does two things: +// 1) If there are two LEA instructions calculating addresses which only differ +// by displacement inside a basic block, one of them is removed. +// 2) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); +STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. 
+ /// A negative result means that the instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in the \p MI instruction. Return the address displacement + /// and the distance between \p MI and the chosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operands are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if the \p Last LEA instruction can be replaced by the + /// \p First. The difference between displacements of the addresses calculated + /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper + /// replacement of the \p Last LEA's uses with the \p First's def register. + bool isReplaceable(const MachineInstr &First, const MachineInstr &Last, + int64_t &AddrDispShift); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. Also, assign position + /// numbers to all instructions in the basic block to speed up calculation of + /// distance between them. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes LEAs which calculate similar addresses. + bool removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List); + + DenseMap<const MachineInstr *, unsigned> InstrPos; + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + // Both instructions must be in the same basic block and they must be + // present in InstrPos. + assert(Last.getParent() == First.getParent() && + "Instructions are in different basic blocks"); + assert(InstrPos.find(&First) != InstrPos.end() && + InstrPos.find(&Last) != InstrPos.end() && + "Instructions' positions are undefined"); + + return InstrPos[&Last] - InstrPos[&First]; +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible.
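+//
+// An illustrative sketch of the substitution these requirements enable
+// (editorial aside, not from the commit; registers and displacements are
+// hypothetical):
+//
+//   leaq 16(%rdi), %rax     ; candidate LEA: address = rdi + 16
+//   movq 24(%rdi), %rcx     ; MI:            address = rdi + 24
+//
+// The addresses differ only by displacement (AddrDispShift = 24 - 16 = 8), so
+// the load can reuse the LEA def as its base: movq 8(%rax), %rcx. Requirement
+// 3 prefers this LEA over one whose shifted displacement would need 4 bytes.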
+bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare the instructions' memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure the address displacement fits in 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that the LEA def register can be used as the MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI; however, since this case is very rare and hard to + // reproduce in a test, it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we take the resulting address displacement into + // account as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. + int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update the returned LEA if the current one provides a displacement + // that fits in 1 byte while the new candidate's does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check that the Last LEA can be replaced by the First LEA. For this to be +// possible, the following requirements must be met: +// 1) Addresses calculated by the LEAs differ only by displacement. +// 2) Def registers of the LEAs belong to the same class. +// 3) All uses of the Last LEA def register are replaceable, i.e. the +// register is used only as an address base. +bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) { + assert(isLEA(First) && isLEA(Last) && + "The function works only with LEA instructions"); + + // Compare the instructions' memory operands. + if (!isSimilarMemOp(Last, 1, First, 1, AddrDispShift)) + return false; + + // Make sure that the LEA def registers belong to the same class. There may be + // instructions (like MOV8mr_NOREX) which allow a limited set of registers to + // be used as their operands, so we must be sure that replacing one LEA + // with another won't lead to putting a wrong register in the instruction.
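+ // (An illustrative case, not from the original comment: if the Last def + // register had been constrained to a NOREX class by a MOV8mr_NOREX user + // while the First def is in a plain GR class, the classes differ and the + // replacement is conservatively rejected below.)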
+ if (MRI->getRegClass(First.getOperand(0).getReg()) != + MRI->getRegClass(Last.getOperand(0).getReg())) + return false; + + // Loop over all uses of the Last LEA to check that its def register is + // used only as an address base for memory accesses. If so, it can be + // replaced; otherwise it cannot. + for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) { + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()); + + // If the use instruction has no memory operand, the LEA is not + // replaceable. + if (MemOpNo < 0) + return false; + + MemOpNo += X86II::getOperandBias(Desc); + + // If the address base of the use instruction is not the LEA def register, + // the LEA is not replaceable. + if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO)) + return false; + + // If the LEA def register is used as any other operand of the use + // instruction, the LEA is not replaceable. + for (unsigned i = 0; i < MI.getNumOperands(); i++) + if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) && + isIdenticalOp(MI.getOperand(i), MO)) + return false; + + // Check that the new address displacement will fit in 4 bytes. + if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() && + !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() + + AddrDispShift)) + return false; + } + + return true; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + unsigned Pos = 0; + for (auto &MI : MBB) { + // Assign a position number to the instruction. Note that we are going to + // move some instructions during the optimization; however, there will never + // be a need to move two instructions before any selected instruction. So, to + // avoid multiple position updates during moves, we just increase the position + // counter by two, leaving a free slot for an instruction that may be moved. + InstrPos[&MI] = Pos += 2; + + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register.
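+// A hypothetical sketch of the rewrite performed below (operand values are +// illustrative, not from the original patch): a store such as +//   MOV32mi %rdi, 4, %rsi, 24, %noreg, 0 +// whose address was already produced by +//   %rax = LEA64r %rdi, 4, %rsi, 16, %noreg +// ends up with base = %rax, scale = 1, index = %noreg, disp = 8 and +// segment = %noreg, i.e. MOV32mi %rax, 1, %noreg, 8, %noreg, 0.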
+bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in the basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // The instruction must be a load or a store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If the instruction has no memory operand, skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace the address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If the LEA occurs before the current instruction, we can freely replace + // the instruction. If the LEA occurs after it, we can lift the LEA above the + // instruction and thereby be able to replace it. Since the LEA and the + // instruction have similar memory operands (and thus the same def + // instructions for those operands), we can always do that without + // worrying about using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + InstrPos[DefMI] = InstrPos[&MI] - 1; + + // Make sure the instructions' position numbers are sane. + assert(((InstrPos[DefMI] == 1 && DefMI == MBB->begin()) || + InstrPos[DefMI] > + InstrPos[std::prev(MachineBasicBlock::iterator(DefMI))]) && + "Instruction positioning is broken"); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change the instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +// Try to find similar LEAs in the list and replace one with another. +bool +OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + // Loop over all LEA pairs. + auto I1 = List.begin(); + while (I1 != List.end()) { + MachineInstr &First = **I1; + auto I2 = std::next(I1); + while (I2 != List.end()) { + MachineInstr &Last = **I2; + int64_t AddrDispShift; + + // LEAs should be in occurrence order in the list, so we can freely + // replace later LEAs with earlier ones. + assert(calcInstrDist(First, Last) > 0 && + "LEAs must be in occurrence order in the list"); + + // Check that the Last LEA instruction can be replaced by the First. + if (!isReplaceable(First, Last, AddrDispShift)) { + ++I2; + continue; + } + + // Loop over all uses of the Last LEA and update their operands. Note that + // the correctness of this has already been checked in the isReplaceable + // function.
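+ // Worked example with made-up values: if First computes base + 8 and + // Last computes base + 24, then AddrDispShift is 16; a use addressing + // 4(%last) is rewritten below to 20(%first), which is the same location + // because 24 + 4 == 8 + 20.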
+ for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()), + UE = MRI->use_end(); + UI != UE;) { + MachineOperand &MO = *UI++; + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + // Update the address base. + MO.setReg(First.getOperand(0).getReg()); + + // Update the address disp. + MachineOperand *Op = &MI.getOperand(MemOpNo + X86::AddrDisp); + if (Op->isImm()) + Op->setImm(Op->getImm() + AddrDispShift); + else if (Op->isGlobal()) + Op->setOffset(Op->getOffset() + AddrDispShift); + else + llvm_unreachable("Invalid address displacement operand"); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(First.getOperand(0).getReg()); + + ++NumRedundantLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump();); + + // At this point, all of the Last LEA's uses have been replaced, so we can + // freely remove it. + assert(MRI->use_empty(Last.getOperand(0).getReg()) && + "The LEA's def register must have no uses"); + Last.eraseFromParent(); + + // Erase the removed LEA from the list. + I2 = List.erase(I2); + + Changed = true; + } + ++I1; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + InstrPos.clear(); + + // Find all LEA instructions in the basic block. + findLEAs(MBB, LEAs); + + // If the current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant LEA instructions. The optimization may have a negative + // effect on performance, so do it only for -Oz. + if (MF.getFunction()->optForMinSize()) + Changed |= removeRedundantLEAs(LEAs); + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 143e70b..0f425e2 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -93,8 +93,7 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { + if (MF.getFunction()->optForSize()) { return false; } @@ -107,7 +106,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); - findReturns(MF.begin()); + findReturns(&MF.front()); bool MadeChange = false; diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index d8495e5..274b566 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -44,12 +43,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" -cl::opt<bool> -ForceStackAlign("force-align-stack", - cl::desc("Force align the stack to the minimum alignment" - " needed for the function."), - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -174,21 +167,34 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; - case 2: // Available for tailcall (not callee-saved GPRs). - const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); } } const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? 
F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -222,6 +228,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->getMMI().callsEHReturn(); @@ -241,6 +248,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ? + CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -254,6 +266,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_Intel_OCL_BI_SaveList; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -264,6 +278,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } default: break; } @@ -280,10 +306,20 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } +const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) + return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -301,6 +337,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; @@ -314,16 +354,30 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_Intel_OCL_BI_RegMask; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - default: - break; case CallingConv::X86_64_Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + else + return CSR_64_AllRegs_RegMask; + } else { + if (HasSSE) + return 
CSR_32_AllRegs_SSE_RegMask; + else + return CSR_32_AllRegs_RegMask; + } + default: + break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -341,6 +395,10 @@ X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } +const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const { + return CSR_64_TLS_Darwin_RegMask; +} + BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); @@ -371,8 +429,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, - false); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -439,6 +496,10 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -451,13 +512,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); - bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); - return CantUseFP && CantUseSP; + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -470,26 +529,11 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { // If a base pointer is necessary. Check that it isn't too late to reserve // it. - if (MFI->hasVarSizedObjects()) + if (CantUseSP(MFI)) return MRI->canReserveReg(BasePtr); return true; } -bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - // If we've requested that we force align the stack do so now. - if (ForceStackAlign) - return canRealignStack(MF); - - return requiresRealignment && canRealignStack(MF); -} - bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true @@ -510,6 +554,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? 
FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) @@ -524,14 +569,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. + unsigned IgnoredFrameReg; if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; - if (IsWinEH) - Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); - else - Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); FI.ChangeToImmediate(Offset); return; } @@ -540,7 +582,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) - BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + BasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. @@ -553,7 +595,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const MachineFrameInfo *MFI = MF.getFrameInfo(); FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else - FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); if (BasePtr == StackPtr) FIOffset += SPAdj; @@ -592,193 +634,11 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) - FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } -namespace llvm { -unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - switch (VT) { - default: return 0; - case MVT::i8: - if (High) { - switch (Reg) { - default: return getX86SubSuperRegister(Reg, MVT::i64); - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AH; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DH; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CH; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BH; - } - } else { - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AL; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DL; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CL; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BL; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return 
X86::SIL; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DIL; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BPL; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SPL; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8B; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9B; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10B; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11B; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12B; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13B; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14B; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15B; - } - } - case MVT::i16: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8W; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9W; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10W; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11W; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12W; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13W; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14W; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15W; - } - case MVT::i32: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::EAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::EDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::ECX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::EBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::ESI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::EDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::EBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::ESP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8D; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9D; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10D; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11D; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12D; - case X86::R13B: case X86::R13W: case X86::R13D: 
case X86::R13: - return X86::R13D; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14D; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15D; - } - case MVT::i64: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::RAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::RDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::RCX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::RBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::RSI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::RDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::RBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::RSP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15; - } - } -} - -unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High); - if (Res == 0) - llvm_unreachable("Unexpected register or VT"); - return Res; -} - -unsigned get512BitSuperRegister(unsigned Reg) { +unsigned llvm::get512BitSuperRegister(unsigned Reg) { if (Reg >= X86::XMM0 && Reg <= X86::XMM31) return X86::ZMM0 + (Reg - X86::XMM0); if (Reg >= X86::YMM0 && Reg <= X86::YMM31) @@ -787,5 +647,3 @@ unsigned get512BitSuperRegister(unsigned Reg) { return Reg; llvm_unreachable("Unexpected SIMD register"); } - -} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 8de1d0b..8d0094c 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -87,6 +87,11 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -94,9 +99,15 @@ public: /// callee-save registers on this target. const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
+ const uint32_t *getDarwinTLSCallPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and @@ -108,9 +119,7 @@ public: bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const override; @@ -128,16 +137,6 @@ public: unsigned getSlotSize() const { return SlotSize; } }; -/// Returns the sub or super register of a specific X86 register. -/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX. -/// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); - -/// Returns the sub or super register of a specific X86 register. -/// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType, - bool High = false); - //get512BitRegister - X86 utility - returns 512-bit super register unsigned get512BitSuperRegister(unsigned Reg); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index cdb151c..56f0d93 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -225,15 +225,15 @@ let SubRegIndices = [sub_ymm] in { } } - // Mask Registers, used by AVX-512 instructions. - def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; - def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; - def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; - def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; - def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; - def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; - def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; - def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, @@ -423,6 +423,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. 
This will cause us to spill @@ -442,10 +444,11 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { } // Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -459,8 +462,8 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, - (sequence "ZMM%u", 0, 31)>; +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -468,10 +471,10 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 31)>; +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; // Mask registers def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} @@ -491,4 +494,4 @@ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers -def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
\ No newline at end of file +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf..b1a0161 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp new file mode 100644 index 0000000..ef16c5b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. +// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecodeConstantPool.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + +#ifndef NDEBUG + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i & ~0xf; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. 
the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure it's an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necessary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit targets. + unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); + } + + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it. + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h new file mode 100644 index 0000000..bcf4632 --- /dev/null +++
b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -0,0 +1,45 @@ +//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H +#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class Constant; +class MVT; + +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index dff3624..8ef08c9 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -44,9 +44,8 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); -/// ClassifyBlockAddressReference - Classify a blockaddress reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. unsigned char X86Subtarget::ClassifyBlockAddressReference() const { if (isPICStyleGOT()) // 32-bit ELF targets. return X86II::MO_GOTOFF; @@ -58,9 +57,8 @@ unsigned char X86Subtarget::ClassifyBlockAddressReference() const { return X86II::MO_NO_FLAG; } -/// ClassifyGlobalReference - Classify a global variable reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. 
unsigned char X86Subtarget:: ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // DLLImport only exists on windows, it is implemented as a load from a @@ -147,9 +145,9 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { } -/// getBZeroEntry - This function returns the name of a function which has an -/// interface like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over memset with zero +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. @@ -166,8 +164,7 @@ bool X86Subtarget::hasSinCos() const { is64Bit(); } -/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls -/// to immediate address. +/// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, @@ -192,9 +189,25 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. 
+ if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -224,13 +237,18 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { } void X86Subtarget::initializeEnvironment() { - X86SSELevel = NoMMXSSE; + X86SSELevel = NoSSE; X863DNowLevel = NoThreeDNow; HasCMov = false; HasX86_64 = false; HasPOPCNT = false; HasSSE4A = false; HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; HasPCLMUL = false; HasFMA = false; HasFMA4 = false; @@ -252,13 +270,15 @@ void X86Subtarget::initializeEnvironment() { HasBWI = false; HasVLX = false; HasADX = false; + HasPKU = false; HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; - IsUAMemFast = false; + IsUAMem16Slow = false; IsUAMem32Slow = false; HasSSEUnalignedMem = false; HasCmpxchg16b = false; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index f026d42..13d1026 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -47,11 +47,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F }; enum X863DNowEnum { - NoThreeDNow, ThreeDNow, ThreeDNowA + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; enum X86ProcFamilyEnum { @@ -64,10 +64,10 @@ protected: /// Which PIC style to use PICStyles::Style PICStyle; - /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// 3DNow, 3DNow Athlon, or none supported. + /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; /// True if this processor has conditional move instructions @@ -86,6 +86,18 @@ protected: /// Target has AES instructions bool HasAES; + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + /// Target has carry-less multiplication bool HasPCLMUL; @@ -140,16 +152,19 @@ protected: /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// True if unaligned memory access is fast. - bool IsUAMemFast; + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; - /// True if unaligned 32-byte memory accesses are slow. + /// True if unaligned memory accesses of 32-bytes are slow. bool IsUAMem32Slow; /// True if SSE operations can have unaligned memory operands. 
@@ -208,6 +223,9 @@ protected: /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX; + /// Processor has PKU extenstions + bool HasPKU; + /// Processot supports MPX - Memory Protection Extensions bool HasMPX; @@ -319,7 +337,6 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasCMov() const { return HasCMov; } - bool hasMMX() const { return X86SSELevel >= MMX; } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -332,14 +349,22 @@ public: bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. + bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } @@ -355,9 +380,10 @@ public: bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } - bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } @@ -375,6 +401,7 @@ public: bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -394,9 +421,11 @@ public: bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetWindowsMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); @@ -406,6 +435,10 @@ public: return TargetTriple.isKnownWindowsMSVCEnvironment(); } + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + bool isTargetWindowsCygwin() const { 
return TargetTriple.isWindowsCygwinEnvironment(); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index fb9cb4b..0e7e4c0 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -28,10 +28,17 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -45,7 +52,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) return make_unique<X86ELFTargetObjectFile>(); - if (TT.isKnownWindowsMSVCEnvironment()) + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) return make_unique<TargetLoweringObjectFileCOFF>(); @@ -175,8 +182,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, //===----------------------------------------------------------------------===// TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); } @@ -246,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 6f900ea..782768d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -152,9 +153,8 @@ static std::string scalarConstantToHexString(const Constant *C) { } } -MCSection * -X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst() && C) { const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | @@ -171,5 +171,5 @@ X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, COFF::IMAGE_COMDAT_SELECT_ANY); } - return TargetLoweringObjectFile::getSectionForConstant(Kind, C); + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 66366b2..6b2448c 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -58,7 +58,7 @@ namespace llvm { /// \brief Given a 
mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7df7260..2e7bbb2 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86tti" @@ -62,8 +63,8 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (ST->is64Bit()) return 64; - return 32; + return 32; } unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { @@ -84,12 +85,12 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned X86TTIImpl::getArithmeticInstrCost( +int X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -101,10 +102,9 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence SRA + SRL + ADD + SRA. // The OperandValue properties many not be same as that of previous // operation;conservatively assume OP_None. - unsigned Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -115,8 +115,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return Cost; } - static const CostTblEntry<MVT::SimpleValueType> - AVX2UniformConstCostTable[] = { + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -127,12 +126,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = { + static const CostTblEntry AVX512CostTable[] = { { ISD::SHL, MVT::v16i32, 1 }, { ISD::SRL, MVT::v16i32, 1 }, { ISD::SRA, MVT::v16i32, 1 }, @@ -141,7 +140,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v8i64, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -154,7 +158,57 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 1 }, { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. @@ -163,7 +217,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. // Vectorizing division is a bad idea. 
See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -176,44 +231,44 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v4i64, 4*20 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX512CostTable[Idx].Cost; - } - // Look for AVX2 lowering tricks. + // Look for AVX2 lowering tricks for custom cases. if (ST->hasAVX2()) { - if (ISD == ISD::SHL && LT.second == MVT::v16i16 && - (Op2Info == TargetTransformInfo::OK_UniformConstantValue || - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) - // On AVX2, a packed v16i16 shift left by a constant build_vector - // is lowered into a vector multiply (vpmullw). - return LT.first; - - int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> + static const CostTblEntry SSE2UniformConstCostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // Constant splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld { ISD::SHL, MVT::v2i64, 1 }, // psllq. + { ISD::SHL, MVT::v4i64, 2 }, // psllq. { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -227,27 +282,34 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } if (ISD == ISD::SHL && Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { - EVT VT = LT.second; + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). if ((VT == MVT::v8i16 && ST->hasSSE2()) || (VT == MVT::v4i32 && ST->hasSSE41())) - // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. 
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. if (VT == MVT::v4i32 && ST->hasSSE2()) - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. ISD = ISD::MUL; } - static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { + static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // For some cases, where the shift amount is a scalar we would be able @@ -257,20 +319,31 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // used for vectorization and we don't want to make vectorized code worse // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. // It is not a good idea to vectorize division. We have to scalarize it and // in the process we will often end up having to spilling regular @@ -289,12 +362,11 @@ unsigned X86TTIImpl::getArithmeticInstrCost( }; if (ST->hasSSE2()) { - int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = { + static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -314,29 +386,21 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // Look for AVX1 lowering tricks. 
if (ST->hasAVX() && !ST->hasAVX2()) { - EVT VT = LT.second; + MVT VT = LT.second; - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) && - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) - ISD = ISD::MUL; - - int Idx = CostTableLookup(AVX1CostTable, ISD, VT); - if (Idx != -1) - return LT.first * AVX1CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; } // Custom lowering of vectors. - static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = { + static const CostTblEntry CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long // multiplies(3), shifts(4) and adds(2). { ISD::MUL, MVT::v2i64, 9 }, { ISD::MUL, MVT::v4i64, 9 }, }; - int Idx = CostTableLookup(CustomLowered, ISD, LT.second); - if (Idx != -1) - return LT.first * CustomLowered[Idx].Cost; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, // 2x pmuludq, 2x shuffle. @@ -348,15 +412,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - unsigned Cost = 1; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -367,14 +431,14 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. if (ST->hasAVX2() && LT.second == MVT::v16i16) return LT.first; - static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = { + static const CostTblEntry AVXAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd @@ -390,13 +454,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} }; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * AVXAltShuffleTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = { + static const CostTblEntry SSE41AltShuffleTbl[] = { // These are lowered into movsd. 
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, @@ -414,13 +477,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} }; - if (ST->hasSSE41()) { - int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSE41AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = { + static const CostTblEntry SSSE3AltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -433,13 +495,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - if (ST->hasSSSE3()) { - int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSSE3AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = { + static const CostTblEntry SSEAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -454,65 +515,47 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, }; // Fall-back (SSE3 and SSE2). - int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSEAltShuffleTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - SSE2ConvTbl[] = { - // These are somewhat magic numbers justified by looking at the output of - // Intel's IACA, running some kernels and making sure when we take - // legalization into account the throughput will be overestimated. - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - // There are faster sequences for float conversions. 
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, }; - if (ST->hasSSE2() && !ST->hasAVX()) { - int Idx = - ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second); - if (Idx != -1) - return LTSrc.first * SSE2ConvTbl[Idx].Cost; - } - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -522,33 +565,49 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, - }; - if (ST->hasAVX512()) { - int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second, - LTSrc.second); - if (Idx != -1) - return AVX512ConversionTbl[Idx].Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); - EVT DstTy = TLI->getValueType(DL, Dst); - - // The function getSimpleVT only handles simple value types. 
- if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX2ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, @@ -579,8 +638,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVXConversionTbl[] = { + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, @@ -650,34 +708,158 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { 
ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { - int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return AVX2ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } if (ST->hasAVX()) { - int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return AVXConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = { + static const CostTblEntry SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -686,7 +868,7 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = { + static const CostTblEntry AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
@@ -696,54 +878,45 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = { + static const CostTblEntry AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = { + static const CostTblEntry AVX512CostTbl[] = { { ISD::SETCC, MVT::v8i64, 1 }, { ISD::SETCC, MVT::v16i32, 1 }, { ISD::SETCC, MVT::v8f64, 1 }, { ISD::SETCC, MVT::v16f32, 1 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX512CostTbl[Idx].Cost; - } + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX2CostTbl[Idx].Cost; - } + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTbl[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -761,10 +934,9 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) { +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; + int Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) @@ -776,9 +948,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, return Cost; } -unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -796,22 +967,21 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume that all other non-power-of-two numbers are scalarized. 
if (!isPowerOf2_32(NumElem)) { - unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), - Alignment, AddressSpace); - unsigned SplitCost = getScalarizationOverhead(Src, - Opcode == Instruction::Load, - Opcode==Instruction::Store); + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); return NumElem * Cost + SplitCost; } } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); // Each load/store unit costs 1. - unsigned Cost = LT.first * 1; + int Cost = LT.first * 1; // On Sandybridge 256bit load/stores are double pumped // (but not on Haswell). @@ -821,9 +991,9 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return Cost; } -unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -832,34 +1002,33 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) || + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization - unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); - unsigned ScalarCompareCost = - getCmpSelInstrCost(Instruction::ICmp, - Type::getInt8Ty(getGlobalContext()), NULL); - unsigned BranchCost = getCFInstrCost(Instruction::Br); - unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - - unsigned ValueSplitCost = - getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load, - Opcode == Instruction::Store); - unsigned MemopCost = + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); - unsigned Cost = 0; - if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. 
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + - getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0); + Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -874,7 +1043,7 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -887,10 +1056,10 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return BaseT::getAddressComputationCost(Ty, IsComplex); } -unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -900,7 +1069,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { + static const CostTblEntry SSE42CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -908,7 +1077,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { + static const CostTblEntry AVX1CostTblPairWise[] = { { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, @@ -919,7 +1088,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i32, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE42CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -927,7 +1096,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 
}; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { + static const CostTblEntry AVX1CostTblNoPairWise[] = { { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, { ISD::FADD, MVT::v8f32, 4 }, @@ -939,29 +1108,21 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, }; if (IsPairwise) { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblNoPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblNoPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); @@ -970,7 +1131,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned X86TTIImpl::getIntImmCost(int64_t Val) { +int X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) return TTI::TCC_Free; @@ -980,7 +1141,7 @@ unsigned X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1004,18 +1165,18 @@ unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1038,6 +1199,26 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::Store: ImmIdx = 0; break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. 
There also other + // similar immediates the backend can use shifts for. + if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1045,10 +1226,8 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: ImmIdx = 1; break; // Always return TCC_Free for the shift value of a shift instruction. @@ -1073,18 +1252,18 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return X86TTIImpl::getIntImmCost(Imm, Ty); } -unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1118,23 +1297,181 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { - int DataWidth = DataTy->getPrimitiveSizeInBits(); +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } - // Todo: AVX512 allows gather/scatter, works with strided and random as well - if ((DataWidth < 32) || (Consecutive == 0)) + // The cost of the scalar loads/stores. 
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of a Gather / Scatter operation.
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
+ // A vector-4 gather/scatter instruction does not exist on KNL. We could
+ // extend it to 8 elements, but zeroing the upper bits of the mask vector
+ // would add more instructions. Right now we give the scalar cost of
+ // vector-4 for KNL. TODO: Check whether the gather/scatter instruction is
+ // better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ return (DataWidth >= 32 && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+ return isLegalMaskedLoad(DataType);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // This function is currently called in two cases: from the Loop Vectorizer
+ // and from the Scalarizer.
+ // When the Loop Vectorizer asks about legality of the feature,
+ // the vectorization factor is not calculated yet. The Loop Vectorizer
+ // sends a scalar type and the decision is based on the width of the
+ // scalar element.
+ // Later on, the cost model will estimate usage of this intrinsic based on
+ // the vector type.
+ // The Scalarizer asks again about legality. It sends a vector type.
+ // In this case we can reject non-power-of-2 vectors.
+ if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
return false;
- if (ST->hasAVX512() || ST->hasAVX2())
- return true;
- return false;
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { - return isLegalMaskedLoad(DataType, Consecutive); +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); } -bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const { +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); // Work this as a subsetting of subtarget features. diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index da3f36c..adb745e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -33,13 +33,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *ST; const X86TargetLowering *TLI; - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } public: - explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -62,38 +62,44 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - - unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex); - - unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); - - unsigned getIntImmCost(int64_t); - - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); - bool isLegalMaskedLoad(Type *DataType, int Consecutive); - bool isLegalMaskedStore(Type *DataType, int Consecutive); - bool hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const; + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + 
unsigned AddressSpace); + int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp index 9190d0b..dce94a9 100644 --- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,8 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" @@ -38,12 +39,16 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehstate" +namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); } + namespace { class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) {} + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &Fn) override; @@ -62,18 +67,13 @@ private: void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); void unlinkExceptionRegistration(IRBuilder<> &Builder); - void addCXXStateStores(Function &F, MachineModuleInfo &MMI); - void addSEHStateStores(Function &F, MachineModuleInfo &MMI); - void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo, - Function &F, int BaseState); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); Function *generateLSDAInEAXThunk(Function *ParentFunc); - int escapeRegNode(Function &F); - // Module-level type getters. 
Type *getEHLinkRegistrationType();
Type *getSEHRegistrationType();
@@ -111,6 +111,9 @@ FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
char WinEHStatePass::ID = 0;
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
bool WinEHStatePass::doInitialization(Module &M) {
TheModule = &M;
FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
@@ -138,14 +141,7 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool WinEHStatePass::runOnFunction(Function &F) {
- // If this is an outlined handler, don't do anything. We'll do state insertion
- // for it in the parent.
- StringRef WinEHParentName =
- F.getFnAttribute("wineh-parent").getValueAsString();
- if (WinEHParentName != F.getName() && !WinEHParentName.empty())
- return false;
-
- // Check the personality. Do nothing if this is not an MSVC personality.
+ // Check the personality. Do nothing if this personality doesn't use funclets.
if (!F.hasPersonalityFn())
return false;
PersonalityFn =
@@ -153,7 +149,19 @@ bool WinEHStatePass::runOnFunction(Function &F) {
if (!PersonalityFn)
return false;
Personality = classifyEHPersonality(PersonalityFn);
- if (!isMSVCEHPersonality(Personality))
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ // Skip this function if there are no EH pads and we aren't using IR-level
+ // outlining.
+ bool HasPads = false;
+ for (BasicBlock &BB : F) {
+ if (BB.isEHPad()) {
+ HasPads = true;
+ break;
+ }
+ }
+ if (!HasPads)
return false;
// Disable frame pointer elimination in this function.
@@ -163,14 +171,13 @@ bool WinEHStatePass::runOnFunction(Function &F) {
emitExceptionRegistrationRecord(&F);
- auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>();
- assert(MMIPtr && "MachineModuleInfo should always be available");
- MachineModuleInfo &MMI = *MMIPtr;
- switch (Personality) {
- default: llvm_unreachable("unexpected personality function");
- case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break;
- case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break;
- }
+ // The state numbers calculated here in IR must agree with what we calculate
+ // later on for the MachineFunction. In particular, if an IR pass deletes an
+ // unreachable EH pad after this point but before machine CFG construction,
+ // we will be in trouble. If this assumption is ever broken, we should turn
+ // the numbers into an immutable analysis pass.
+ WinEHFuncInfo FuncInfo;
+ addStateStores(F, FuncInfo);
// Reset per-function state.
PersonalityFn = nullptr;
@@ -261,7 +268,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -1
StateFieldIndex = 2;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1);
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1);
// Handler = __ehhandler$F
Function *Trampoline = generateLSDAInEAXThunk(F);
Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
@@ -278,7 +285,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -2 / -1
StateFieldIndex = 4;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(),
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(),
UseStackGuard ?
-2 : -1); // ScopeTable = llvm.x86.seh.lsda(F) Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); @@ -347,7 +354,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Value *CastPersonality = Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); - Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++}; + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; CallInst *Call = Builder.CreateCall(CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. Call->setTailCall(true); @@ -391,160 +398,53 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { Builder.CreateStore(Next, FSZero); } -void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - calculateWinCXXEHStateNumbers(&F, FuncInfo); - - // The base state for the parent is -1. - addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1); - - // Set up RegNodeEscapeIndex - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Only insert stores in catch handlers. - Constant *FI8 = - ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext())); - for (auto P : FuncInfo.HandlerBaseState) { - Function *Handler = const_cast<Function *>(P.first); - int BaseState = P.second; - IRBuilder<> Builder(&Handler->getEntryBlock(), - Handler->getEntryBlock().begin()); - // FIXME: Find and reuse such a call if present. - Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)}); - Value *RecoveredRegNode = Builder.CreateCall( - FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)}); - RecoveredRegNode = - Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0)); - addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState); - } -} - -/// Escape RegNode so that we can access it from child handlers. Find the call -/// to localescape, if any, in the entry block and append RegNode to the list -/// of arguments. -int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to localescape and extract its arguments. - IntrinsicInst *EscapeCall = nullptr; - for (Instruction &I : F.getEntryBlock()) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - EscapeCall = II; - break; - } - } - SmallVector<Value *, 8> Args; - if (EscapeCall) { - auto Ops = EscapeCall->arg_operands(); - Args.append(Ops.begin(), Ops.end()); - } - Args.push_back(RegNode); - - // Replace the call (if it exists) with new one. Otherwise, insert at the end - // of the entry block. - Instruction *InsertPt = EscapeCall; - if (!EscapeCall) - InsertPt = F.getEntryBlock().getTerminator(); - IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); - Builder.CreateCall(FrameEscape, Args); - if (EscapeCall) - EscapeCall->eraseFromParent(); - return Args.size() - 1; -} +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. 
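(A rough illustration of what addStateStores emits, with assumed state numbers: before every may-throw call or invoke, the current state is stored into the registration node's TryLevel slot, conceptually

  RegNode->TryLevel = 0;    // about to execute an invoke in state 0
  invoke f();
  RegNode->TryLevel = -1;   // back in the parent function's base state
  call g();

so the runtime can always read the active state out of the in-memory node.)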
+ if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); -void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode, - WinEHFuncInfo &FuncInfo, - Function &F, int BaseState) { // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. + int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + for (Instruction &I : BB) { if (auto *CI = dyn_cast<CallInst>(&I)) { // Possibly throwing call instructions have no actions to take after // an unwind. Ensure they are in the -1 state. if (CI->doesNotThrow()) continue; - insertStateNumberStore(ParentRegNode, CI, BaseState); + insertStateNumberStore(RegNode, CI, BaseState); } else if (auto *II = dyn_cast<InvokeInst>(&I)) { // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - // FIXME: Why does this assertion fail? - //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!"); - int State = FuncInfo.LandingPadStateMap[LPI]; - insertStateNumberStore(ParentRegNode, II, State); - } - } - } -} - -/// Assign every distinct landingpad a unique state number for SEH. Unlike C++ -/// EH, we can use this very simple algorithm while C++ EH cannot because catch -/// handlers aren't outlined and the runtime doesn't have to figure out which -/// catch handler frame to unwind to. -/// FIXME: __finally blocks are outlined, so this approach may break down there. -void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - - // Remember and return the index that we used. We save it in WinEHFuncInfo so - // that we can lower llvm.x86.seh.recoverfp later in filter functions without - // too much trouble. - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Iterate all the instructions and emit state number stores. - int CurState = 0; - SmallPtrSet<BasicBlock *, 4> ExceptBlocks; - for (BasicBlock &BB : F) { - for (auto I = BB.begin(), E = BB.end(); I != E; ++I) { - if (auto *CI = dyn_cast<CallInst>(I)) { - auto *Intrin = dyn_cast<IntrinsicInst>(CI); - if (Intrin) { - // Calls that "don't throw" are considered to be able to throw asynch - // exceptions, but intrinsics cannot. - continue; - } - insertStateNumberStore(RegNode, CI, -1); - } else if (auto *II = dyn_cast<InvokeInst>(I)) { - // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - auto InsertionPair = - FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState)); - auto Iter = InsertionPair.first; - int &State = Iter->second; - bool Inserted = InsertionPair.second; - if (Inserted) { - // Each action consumes a state number. 
- auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - assert(!ActionList.empty()); - CurState += ActionList.size(); - State += ActionList.size() - 1; - - // Remember all the __except block targets. - for (auto &Handler : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { - auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); -#ifndef NDEBUG - for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) - assert(Pred->isLandingPad() && - "WinEHPrepare failed to split block"); -#endif - ExceptBlocks.insert(BA->getBasicBlock()); - } - } - } + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; insertStateNumberStore(RegNode, II, State); } } } - - // Insert llvm.x86.seh.restoreframe() into each __except block. - Function *RestoreFrame = - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe); - for (BasicBlock *ExceptBB : ExceptBlocks) { - IRBuilder<> Builder(ExceptBB->begin()); - Builder.CreateCall(RestoreFrame, {}); - } } void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 2e44ac9..aaf267a 100644 --- a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -224,7 +224,7 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val > 11) return MCDisassembler::Fail; - static unsigned Values[] = { + static const unsigned Values[] = { 32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32 }; Inst.addOperand(MCOperand::createImm(Values[Val])); diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h index 6fd2dec..dc513f7 100644 --- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h +++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class TargetMachine; - class XCoreInstPrinter : public MCInstPrinter { public: XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 702056d..b00cdd5 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -115,14 +115,14 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { EmitSpecialLLVMGlobal(GV)) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); OutStreamer->SwitchSection( getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); - unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType()); - + unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType()); + // Mark the start of the global getTargetStreamer().emitCCTopData(GVSym->getName()); @@ -154,15 +154,15 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GV->isThreadLocal()) { report_fatal_error("TLS is not supported by this target!"); } - unsigned Size = TD->getTypeAllocSize(C->getType()); + unsigned Size = DL.getTypeAllocSize(C->getType()); if (MAI->hasDotTypeDotSizeDirective()) { 
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject); OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym), MCConstantExpr::create(Size, OutContext)); } OutStreamer->EmitLabel(GVSym); - - EmitGlobalConstant(C); + + EmitGlobalConstant(DL, C); // The ABI requires that unsigned scalar types smaller than 32 bits // are padded to 32 bits. if (Size < 4) @@ -208,7 +208,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O, void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: @@ -224,8 +224,8 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, getSymbol(MO.getGlobal())->print(O, MAI); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); break; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 76c3d81..ae493de 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -160,27 +160,26 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, /// As offsets are negative, the largest offsets will be first. static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, MachineFrameInfo *MFI, XCoreFunctionInfo *XFI, + const Constant *PersonalityFn, const TargetLowering *TL) { assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots"); - const int* EHSlot = XFI->getEHSpillSlot(); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[0]), - TL->getExceptionPointerRegister())); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[1]), - TL->getExceptionSelectorRegister())); + const int *EHSlot = XFI->getEHSpillSlot(); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[0]), + TL->getExceptionPointerRegister(PersonalityFn))); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[1]), + TL->getExceptionSelectorRegister(PersonalityFn))); std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset); } - static MachineMemOperand * getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) { MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - flags, MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags, + MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex)); return MMO; } @@ -323,8 +322,11 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF, if (XFI->hasEHSpillSlot()) { // The unwinder requires stack slot & CFI offsets for the exception info. // We do not save/spill these registers. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? 
Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, MF.getSubtarget().getTargetLowering()); assert(SpillList.size()==2 && "Unexpected SpillList size"); EmitCfiOffset(MBB, MBBI, dl, TII, MMI, @@ -355,8 +357,12 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF, if (RetOpcode == XCore::EH_RETURN) { // 'Restore' the exception info the unwinder has placed into the stack // slots. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, MF.getSubtarget().getTargetLowering()); + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, + MF.getSubtarget().getTargetLowering()); RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); // Return to the landing pad. diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 9d4a966..9f61c84 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -151,8 +151,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { MVT::Other, CPIdx, CurDAG->getEntryNode()); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4); + MemOp[0] = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1); return node; } diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp index d62e742..105b2cf 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -79,9 +79,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Compute derived properties from the register classes computeRegisterProperties(Subtarget.getRegisterInfo()); - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(XCore::SP); setSchedulingPreference(Sched::Source); @@ -154,8 +151,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Exception handling setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); - setExceptionPointerRegister(XCore::R0); - setExceptionSelectorRegister(XCore::R1); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); // Atomic operations @@ -839,7 +834,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); return DAG.getLoad( getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN, - MachinePointerInfo::getFixedStack(FI), false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0); } SDValue XCoreTargetLowering:: @@ -1367,8 +1362,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } const ArgDataPair ADP = { ArgIn, Ins[i].Flags }; ArgData.push_back(ADP); @@ -1517,9 +1512,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Create a SelectionDAG node corresponding to a 
store // to this memory location. SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); - MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN, - MachinePointerInfo::getFixedStack(FI), false, false, - 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, OutVals[i], FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } // Transform all store nodes into one single node because @@ -1567,8 +1563,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -1828,9 +1823,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Chain = ST->getChain(); unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits(); - if (StoreBits % 8) { - break; - } + assert((StoreBits % 8) == 0 && + "Store size in bits must be a multiple of 8"); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment( ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext())); unsigned Alignment = ST->getAlignment(); diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h index ddd675c..b6f09ff 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h @@ -125,6 +125,20 @@ namespace llvm { bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return XCore::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return XCore::R1; + } + private: const TargetMachine &TM; const XCoreSubtarget &Subtarget; diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index ee30344..e4129ae 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -368,11 +368,10 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::STWFI)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FrameIndex) @@ -391,11 +390,10 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg) .addFrameIndex(FrameIndex) .addImm(0) diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 996c6f5..f0b7201 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -228,12 +228,9 @@ bool XCoreLowerThreadLocal::runOnModule(Module &M) { // Find thread local globals. 
bool MadeChange = false; SmallVector<GlobalVariable *, 16> ThreadLocalGlobals; - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ++GVI) { - GlobalVariable *GV = GVI; - if (GV->isThreadLocal()) - ThreadLocalGlobals.push_back(GV); - } + for (GlobalVariable &GV : M.globals()) + if (GV.isThreadLocal()) + ThreadLocalGlobals.push_back(&GV); for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) { MadeChange |= lowerGlobal(ThreadLocalGlobals[I]); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index 9ef9752..6c77096 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===// +//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h index 078ffde..cdcc52f 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===// +//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index f420081..4a79dac 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -85,7 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() { } TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(XCoreTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp index b5a9905..aa16ecc 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp @@ -123,18 +123,21 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, if (Kind.isMergeableConst16()) return MergeableConst16Section; } Type *ObjType = GV->getType()->getPointerElementType(); + auto &DL = GV->getParent()->getDataLayout(); if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() || - TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) { + DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) { if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection : DataRelROSection; if (Kind.isBSS() || Kind.isCommon())return BSSSection; - if (Kind.isDataRel()) return DataSection; + if (Kind.isData()) + return DataSection; if (Kind.isReadOnlyWithRel()) return DataRelROSection; } else { if (Kind.isReadOnly()) return UseCPRel? 
ReadOnlySectionLarge : DataRelROSectionLarge; if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge; - if (Kind.isDataRel()) return DataSectionLarge; + if (Kind.isData()) + return DataSectionLarge; if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge; } @@ -142,9 +145,8 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, report_fatal_error("Target does not support TLS or Common sections"); } -MCSection * -XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *XCoreTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst4()) return MergeableConst4Section; if (Kind.isMergeableConst8()) return MergeableConst8Section; if (Kind.isMergeableConst16()) return MergeableConst16Section; diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h index 2a5ac23..6701c66 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h @@ -33,7 +33,7 @@ static const unsigned CodeModelLargeSize = 256; Mangler &Mang, const TargetMachine &TM) const override; - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h index e23aef3..b2cb889 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -37,7 +37,7 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> { const XCoreTargetLowering *getTLI() const { return TLI; } public: - explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F) + explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 4762011..0e05129 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -34,8 +34,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" @@ -63,7 +66,8 @@ namespace { /// struct ArgPromotion : public CallGraphSCCPass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -81,7 +85,8 @@ namespace { bool isDenselyPacked(Type *type, const DataLayout &DL); bool canPaddingBeAccessed(Argument *Arg); CallGraphNode *PromoteArguments(CallGraphNode *CGN); - bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; + bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, + AAResults &AAR) const; CallGraphNode *DoPromotion(Function *F, SmallPtrSetImpl<Argument*> &ArgsToPromote, 
SmallPtrSetImpl<Argument*> &ByValArgsToTransform); @@ -90,15 +95,15 @@ namespace { bool doInitialization(CallGraph &CG) override; /// The maximum number of elements to expand, or 0 for unlimited. unsigned maxElements; - DenseMap<const Function *, DISubprogram *> FunctionDIs; }; } char ArgPromotion::ID = 0; INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) @@ -217,9 +222,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // First check: see if there are any pointer arguments! If not, quick exit. SmallVector<Argument*, 16> PointerArgs; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - if (I->getType()->isPointerTy()) - PointerArgs.push_back(I); + for (Argument &I : F->args()) + if (I.getType()->isPointerTy()) + PointerArgs.push_back(&I); if (PointerArgs.empty()) return nullptr; // Second check: make sure that all callers are direct callers. We can't @@ -237,6 +242,14 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { const DataLayout &DL = F->getParent()->getDataLayout(); + // We need to manually construct BasicAA directly in order to disable its use + // of other function analyses. + BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F)); + + // Construct our own AA results for this function. We do this manually to + // work around the limitations of the legacy pass manager. + AAResults AAR(createLegacyPMAAResults(*this, *F, BAR)); + // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. SmallPtrSet<Argument*, 8> ArgsToPromote; @@ -281,8 +294,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // If all the elements are single-value types, we can promote it. bool AllSimple = true; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - if (!STy->getElementType(i)->isSingleValueType()) { + for (const auto *EltTy : STy->elements()) { + if (!EltTy->isSingleValueType()) { AllSimple = false; break; } @@ -303,8 +316,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { if (isSelfRecursive) { if (StructType *STy = dyn_cast<StructType>(AgTy)) { bool RecursiveType = false; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - if (STy->getElementType(i) == PtrArg->getType()) { + for (const auto *EltTy : STy->elements()) { + if (EltTy == PtrArg->getType()) { RecursiveType = true; break; } @@ -315,7 +328,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { } // Otherwise, see if we can promote the pointer to its value. - if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr())) + if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR)) ArgsToPromote.insert(PtrArg); } @@ -416,7 +429,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark, /// elements of the aggregate in order to avoid exploding the number of /// arguments passed in. 
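(For intuition, an illustrative pair not taken from the patch:

  int f(int *p) { return *p; }                  // load always executes
  int g(int *p, bool c) { return c ? *p : 0; }  // load is guarded by c

f's argument can be promoted to a plain int because the load runs on every call, while promoting g's argument would hoist a load that some callers never perform, so it is only legal when the pointer is otherwise known safe to load unconditionally.)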
bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
- bool isByValOrInAlloca) const {
+ bool isByValOrInAlloca,
+ AAResults &AAR) const {
typedef std::set<IndicesVector> GEPIndicesSet;
// Quick exit for unused arguments
@@ -453,12 +467,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// First, iterate the entry block and mark loads of (geps of) arguments as
// safe.
- BasicBlock *EntryBlock = Arg->getParent()->begin();
+ BasicBlock &EntryBlock = Arg->getParent()->front();
// Declare this here so we can reuse it
IndicesVector Indices;
- for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
- I != E; ++I)
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ for (Instruction &I : EntryBlock)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
Value *V = LI->getPointerOperand();
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
V = GEP->getPointerOperand();
@@ -501,12 +514,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
if (GEP->use_empty()) {
// Dead GEPs cause trouble later. Just remove them if we run into
// them.
- getAnalysis<AliasAnalysis>().deleteValue(GEP);
GEP->eraseFromParent();
// TODO: This runs the above loop over and over again for dead GEPs.
// Couldn't we just increment the UI iterator earlier and erase the
// use?
- return isSafeToPromoteArgument(Arg, isByValOrInAlloca);
+ return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR);
}
// Ensure that all of the indices are constants.
@@ -563,8 +575,6 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// blocks we know to be transparent to the load.
SmallPtrSet<BasicBlock*, 16> TranspBlocks;
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-
for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
// Check to see if the load is invalidated from the start of the block to
// the load itself.
@@ -572,8 +582,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
BasicBlock *BB = Load->getParent();
MemoryLocation Loc = MemoryLocation::get(Load);
- if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc,
- AliasAnalysis::Mod))
+ if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
return false; // Pointer is invalidated!
// Now check every path from the entry block to the load for transparency.
@@ -581,7 +590,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// loading block.
for (BasicBlock *P : predecessors(BB)) {
for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
- if (AA.canBasicBlockModify(*TranspBB, Loc))
+ if (AAR.canBasicBlockModify(*TranspBB, Loc))
return false;
}
}
@@ -637,13 +646,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
unsigned ArgIndex = 1;
for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
++I, ++ArgIndex) {
- if (ByValArgsToTransform.count(I)) {
+ if (ByValArgsToTransform.count(&*I)) {
// Simple byval argument? Just add all the struct element types.
Type *AgTy = cast<PointerType>(I->getType())->getElementType();
StructType *STy = cast<StructType>(AgTy);
Params.insert(Params.end(), STy->element_begin(), STy->element_end());
++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(I)) {
+ } else if (!ArgsToPromote.count(&*I)) {
// Unchanged argument
Params.push_back(I->getType());
AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
@@ -661,7 +670,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// In this table, we will track which indices are loaded from the argument
// (where direct loads are tracked as no indices).
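(Illustrative: for an argument of type 'struct S { int a, b; } *', a direct load of the pointer is recorded as the empty index vector, while a load of 'GEP ptr, 0, 1' is recorded as {0, 1}; each distinct vector collected in this table later becomes one scalar parameter of the rewritten function.)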
- ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; for (User *U : I->users()) { Instruction *UI = cast<Instruction>(U); Type *SrcTy; @@ -687,7 +696,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, else // Take any load, we will use it only to update Alias Analysis OrigLoad = cast<LoadInst>(UI->user_back()); - OriginalLoads[std::make_pair(I, Indices)] = OrigLoad; + OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; } // Add a parameter to the function for each element passed in. @@ -722,15 +731,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, NF->copyAttributesFrom(F); // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(F); - if (DI != FunctionDIs.end()) { - DISubprogram *SP = DI->second; - SP->replaceFunction(NF); - // Ensure the map is updated so it can be reused on subsequent argument - // promotions of the same function. - FunctionDIs.erase(DI); - FunctionDIs[NF] = SP; - } + NF->setSubprogram(F->getSubprogram()); + F->setSubprogram(nullptr); DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); @@ -740,13 +742,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec)); AttributesVec.clear(); - F->getParent()->getFunctionList().insert(F, NF); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - // Get the alias analysis information that we need to update to reflect our - // changes. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - // Get the callgraph information that we need to update to reflect our // changes. CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); @@ -775,7 +773,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, ArgIndex = 1; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++AI, ++ArgIndex) - if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { Args.push_back(*AI); // Unmodified argument if (CallPAL.hasAttributes(ArgIndex)) { @@ -783,7 +781,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, AttributesVec. push_back(AttributeSet::get(F->getContext(), Args.size(), B)); } - } else if (ByValArgsToTransform.count(I)) { + } else if (ByValArgsToTransform.count(&*I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); StructType *STy = cast<StructType>(AgTy); @@ -798,14 +796,14 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } } else if (!I->use_empty()) { // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; // Store the Value* version of the indices in here, but declare it now // for reuse. std::vector<Value*> Ops; for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { Value *V = *AI; - LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)]; + LoadInst *OrigLoad = OriginalLoads[std::make_pair(&*I, SI->second)]; if (!SI->second.empty()) { Ops.reserve(SI->second.size()); Type *ElTy = V->getType(); @@ -873,10 +871,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Args.clear(); AttributesVec.clear(); - // Update the alias analysis implementation to know that we are replacing - // the old call with a new one. 
- AA.replaceWithNewValue(Call, New); - // Update the callgraph to know that the callsite has been transformed. CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); @@ -901,20 +895,19 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), I2 = NF->arg_begin(); I != E; ++I) { - if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { // If this is an unmodified argument, move the name and users over to the // new version. - I->replaceAllUsesWith(I2); - I2->takeName(I); - AA.replaceWithNewValue(I, I2); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); ++I2; continue; } - if (ByValArgsToTransform.count(I)) { + if (ByValArgsToTransform.count(&*I)) { // In the callee, we create an alloca, and store each of the new incoming // arguments into the alloca. - Instruction *InsertPt = NF->begin()->begin(); + Instruction *InsertPt = &NF->begin()->front(); // Just add all the struct element types. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); @@ -929,13 +922,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), InsertPt); I2->setName(I->getName()+"."+Twine(i)); - new StoreInst(I2++, Idx, InsertPt); + new StoreInst(&*I2++, Idx, InsertPt); } // Anything that used the arg should now use the alloca. I->replaceAllUsesWith(TheAlloca); - TheAlloca->takeName(I); - AA.replaceWithNewValue(I, TheAlloca); + TheAlloca->takeName(&*I); // If the alloca is used in a call, we must clear the tail flag since // the callee now uses an alloca from the caller. @@ -948,23 +940,20 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, continue; } - if (I->use_empty()) { - AA.deleteValue(I); + if (I->use_empty()) continue; - } // Otherwise, if we promoted this argument, then all users are load // instructions (or GEPs with only load users), and all loads should be // using the new argument that we added. - ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; while (!I->use_empty()) { if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { assert(ArgIndices.begin()->second.empty() && "Load element should sort to front!"); I2->setName(I->getName()+".val"); - LI->replaceAllUsesWith(I2); - AA.replaceWithNewValue(LI, I2); + LI->replaceAllUsesWith(&*I2); LI->eraseFromParent(); DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() << "' in function '" << F->getName() << "'\n"); @@ -1000,11 +989,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // the argument specified by ArgNo. while (!GEP->use_empty()) { LoadInst *L = cast<LoadInst>(GEP->user_back()); - L->replaceAllUsesWith(TheArg); - AA.replaceWithNewValue(L, TheArg); + L->replaceAllUsesWith(&*TheArg); L->eraseFromParent(); } - AA.deleteValue(GEP); GEP->eraseFromParent(); } } @@ -1013,10 +1000,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, std::advance(I2, ArgIndices.size()); } - // Tell the alias analysis that the old function is about to disappear. - AA.replaceWithNewValue(F, NF); - - NF_CGN->stealCalledFunctionsFrom(CG[F]); // Now that the old function is dead, delete it. 
If there is a dangling @@ -1032,6 +1015,5 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } bool ArgPromotion::doInitialization(CallGraph &CG) { - FunctionDIs = makeSubprogramMap(CG.getModule()); return CallGraphSCCPass::doInitialization(CG); } diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 8ce7646..0aa49d6 100644 --- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -119,7 +119,7 @@ bool ConstantMerge::runOnModule(Module &M) { // First: Find the canonical constants others will be merged with. for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // If this GV is dead, remove it. GV->removeDeadConstantUsers(); @@ -160,7 +160,7 @@ bool ConstantMerge::runOnModule(Module &M) { // invalidating the Constant* pointers in CMap. for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // Only process constants with initializers in the default address space. if (!GV->isConstant() || !GV->hasDefinitiveInitializer() || diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp new file mode 100644 index 0000000..5bbb751 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -0,0 +1,166 @@ +//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass exports all llvm.bitset's found in the module in the form of a +// __cfi_check function, which can be used to verify cross-DSO call targets. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cross-dso-cfi" + +STATISTIC(TypeIds, "Number of unique type identifiers"); + +namespace { + +struct CrossDSOCFI : public ModulePass { + static char ID; + CrossDSOCFI() : ModulePass(ID) { + initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry()); + } + + Module *M; + MDNode *VeryLikelyWeights; + + ConstantInt *extractBitSetTypeId(MDNode *MD); + void buildCFICheck(); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +} // anonymous namespace + +INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, + false) +INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false) +char CrossDSOCFI::ID = 0; + +ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; } + +bool CrossDSOCFI::doInitialization(Module &Mod) { + M = &Mod; + VeryLikelyWeights = + MDBuilder(M->getContext()).createBranchWeights((1U << 20) - 1, 1); + + return false; +} + +/// extractBitSetTypeId - Extracts TypeId from a hash-based bitset MDNode. +ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) { + // This check excludes vtables for classes inside anonymous namespaces. + auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(0)); + if (!TM) + return nullptr; + auto C = dyn_cast_or_null<ConstantInt>(TM->getValue()); + if (!C) return nullptr; + // We are looking for i64 constants. + if (C->getBitWidth() != 64) return nullptr; + + // Sanity check. + auto FM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(1)); + // Can be null if a function was removed by an optimization. + if (FM) { + auto F = dyn_cast<Function>(FM->getValue()); + // But can never be a function declaration. + assert(!F || !F->isDeclaration()); + (void)F; // Suppress unused variable warning in the no-asserts build. + } + return C; +} + +/// buildCFICheck - emits __cfi_check for the current module. +void CrossDSOCFI::buildCFICheck() { + // FIXME: verify that __cfi_check ends up near the end of the code section, + // but before the jump slots created in LowerBitSets. 
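For orientation before the body of buildCFICheck() continues below: the __cfi_check function it assembles has a simple shape once rendered back as source. The following is a hand-written C++ sketch of that control flow; bitsetTest and the 64-bit case value are hypothetical stand-ins for the llvm.bitset.test intrinsic and for one hash-based type id pulled from the llvm.bitsets metadata.

    #include <cstdint>
    #include <cstdlib>

    // Stand-in for the llvm.bitset.test intrinsic: does Addr belong to the
    // bit set named by this type id? Stubbed out purely for illustration.
    static bool bitsetTest(void *Addr, uint64_t TypeId) {
      (void)Addr;
      (void)TypeId;
      return false;
    }

    // Rough shape of the generated function: switch on the caller-supplied
    // type id (unknown ids go straight to the trap), run the membership test
    // for the matching id, and either return ("exit" block) or trap.
    extern "C" void cfi_check_sketch(uint64_t CallSiteTypeId, void *Addr) {
      switch (CallSiteTypeId) {
      case UINT64_C(0x7F0E441D): // one "test" block per unique type id
        if (bitsetTest(Addr, UINT64_C(0x7F0E441D)))
          return; // valid cross-DSO call target
        break;
      default:
        break;
      }
      abort(); // "trap" block: llvm.trap(), does-not-return
    }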
+ llvm::DenseSet<uint64_t> BitSetIds; + NamedMDNode *BitSetNM = M->getNamedMetadata("llvm.bitsets"); + + if (BitSetNM) + for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) + if (ConstantInt *TypeId = extractBitSetTypeId(BitSetNM->getOperand(I))) + BitSetIds.insert(TypeId->getZExtValue()); + + LLVMContext &Ctx = M->getContext(); + Constant *C = M->getOrInsertFunction( + "__cfi_check", + FunctionType::get( + Type::getVoidTy(Ctx), + {Type::getInt64Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))}, + false)); + Function *F = dyn_cast<Function>(C); + F->setAlignment(4096); + auto args = F->arg_begin(); + Argument &CallSiteTypeId = *(args++); + CallSiteTypeId.setName("CallSiteTypeId"); + Argument &Addr = *(args++); + Addr.setName("Addr"); + assert(args == F->arg_end()); + + BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); + + BasicBlock *TrapBB = BasicBlock::Create(Ctx, "trap", F); + IRBuilder<> IRBTrap(TrapBB); + Function *TrapFn = Intrinsic::getDeclaration(M, Intrinsic::trap); + llvm::CallInst *TrapCall = IRBTrap.CreateCall(TrapFn); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + IRBTrap.CreateUnreachable(); + + BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F); + IRBuilder<> IRBExit(ExitBB); + IRBExit.CreateRetVoid(); + + IRBuilder<> IRB(BB); + SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, BitSetIds.size()); + for (uint64_t TypeId : BitSetIds) { + ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); + BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); + IRBuilder<> IRBTest(TestBB); + Function *BitsetTestFn = + Intrinsic::getDeclaration(M, Intrinsic::bitset_test); + + Value *Test = IRBTest.CreateCall( + BitsetTestFn, {&Addr, MetadataAsValue::get( + Ctx, ConstantAsMetadata::get(CaseTypeId))}); + BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB); + BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights); + + SI->addCase(CaseTypeId, TestBB); + ++TypeIds; + } +} + +bool CrossDSOCFI::runOnModule(Module &M) { + if (M.getModuleFlag("Cross-DSO CFI") == nullptr) + return false; + buildCFICheck(); + return true; +} diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d044764..4de3d95 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -35,6 +35,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <map> #include <set> #include <tuple> @@ -121,14 +122,6 @@ namespace { typedef SmallVector<RetOrArg, 5> UseVector; - // Map each LLVM function to corresponding metadata with debug info. If - // the function is replaced with another one, we should patch the pointer - // to LLVM function in metadata. - // As the code generation for module is finished (and DIBuilder is - // finalized) we assume that subprogram descriptors won't be changed, and - // they are stored in map for short duration anyway. - DenseMap<const Function *, DISubprogram *> FunctionDIs; - protected: // DAH uses this to specify a different ID. explicit DAE(char &ID) : ModulePass(ID) {} @@ -198,6 +191,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { if (Fn.hasAddressTaken()) return false; + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. 
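The comment just above (and its two twins later in this file) guards against one hazard: a naked function's body is bare assembly, so an argument can look unused at the IR level while the assembly still reads its register. The guard itself follows immediately below; here is a minimal illustration of why the pass must bail out, a sketch using clang's attribute syntax on x86-64 SysV (the function name is invented for the example).

    // 'X' has no IR-level uses here, so dead-argument elimination would
    // happily drop it -- but the inline assembly reads it out of %edi, the
    // register the SysV ABI assigns to the first integer argument.
    __attribute__((naked)) int identity_of_first_arg(int X) {
      __asm__ volatile("movl %edi, %eax\n\t"
                       "ret");
    }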
+ if (Fn.hasFnAttribute(Attribute::Naked)) { + return false; + } + // Okay, we know we can transform this function if safe. Scan its body // looking for calls marked musttail or calls to llvm.vastart. for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { @@ -229,7 +229,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // Create the new function body and insert it into the module... Function *NF = Function::Create(NFTy, Fn.getLinkage()); NF->copyAttributesFrom(&Fn); - Fn.getParent()->getFunctionList().insert(&Fn, NF); + Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF); NF->takeName(&Fn); // Loop over all of the callers of the function, transforming the call sites @@ -296,20 +296,12 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), I2 = NF->arg_begin(); I != E; ++I, ++I2) { // Move the name and users over to the new version. - I->replaceAllUsesWith(I2); - I2->takeName(I); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); } // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(&Fn); - if (DI != FunctionDIs.end()) { - DISubprogram *SP = DI->second; - SP->replaceFunction(NF); - // Ensure the map is updated so it can be reused on non-varargs argument - // eliminations of the same function. - FunctionDIs.erase(DI); - FunctionDIs[NF] = SP; - } + NF->setSubprogram(Fn.getSubprogram()); // Fix up any BlockAddresses that refer to the function. Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType())); @@ -345,16 +337,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. + if (Fn.hasFnAttribute(Attribute::Naked)) + return false; + if (Fn.use_empty()) return false; SmallVector<unsigned, 8> UnusedArgs; - for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(); - I != E; ++I) { - Argument *Arg = I; - - if (Arg->use_empty() && !Arg->hasByValOrInAllocaAttr()) - UnusedArgs.push_back(Arg->getArgNo()); + for (Argument &Arg : Fn.args()) { + if (Arg.use_empty() && !Arg.hasByValOrInAllocaAttr()) + UnusedArgs.push_back(Arg.getArgNo()); } if (UnusedArgs.empty()) @@ -485,6 +480,10 @@ DAE::Liveness DAE::SurveyUse(const Use *U, if (F) { // Used in a direct call. + // The function argument is live if it is used as a bundle operand. + if (CS.isBundleOperand(U)) + return Live; + // Find the argument number. We know for sure that this use is an // argument, since if it was the function argument this would be an // indirect call and the we know can't be looking at a value of the @@ -543,6 +542,14 @@ void DAE::SurveyFunction(const Function &F) { return; } + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. + if (F.hasFnAttribute(Attribute::Naked)) { + MarkLive(F); + return; + } + unsigned RetCount = NumRetVals(&F); // Assume all return values are dead typedef SmallVector<Liveness, 5> RetVals; @@ -648,7 +655,7 @@ void DAE::SurveyFunction(const Function &F) { } else { // See what the effect of this use is (recording any uses that cause // MaybeLive in MaybeLiveArgUses). - Result = SurveyUses(AI, MaybeLiveArgUses); + Result = SurveyUses(&*AI, MaybeLiveArgUses); } // Mark the result. 
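SurveyFunction and SurveyUses above fold every observation into a two-point lattice: a return slot or argument is either definitely Live or only MaybeLive, pending the liveness of its dependents. A compressed model of that join, a sketch with an invented helper name rather than the pass's real data structures:

    #include <cassert>

    // DAE's liveness lattice: Live absorbs everything, MaybeLive survives a
    // join only while no path has proven the value live.
    enum Liveness { Live, MaybeLive };

    static Liveness join(Liveness A, Liveness B) {
      return (A == Live || B == Live) ? Live : MaybeLive;
    }

    int main() {
      assert(join(MaybeLive, MaybeLive) == MaybeLive); // still deferrable
      assert(join(MaybeLive, Live) == Live);           // one live use wins
      return 0;
    }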
@@ -878,7 +885,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { NF->setAttributes(NewPAL); // Insert the new function before the old function, so we won't be processing // it again. - F->getParent()->getFunctionList().insert(F, NF); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); // Loop over all of the callers of the function, transforming the call sites @@ -946,7 +953,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, "", Call); + Args, "", Call->getParent()); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); cast<InvokeInst>(New)->setAttributes(NewCallPAL); } else { @@ -976,9 +983,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { " must have been a struct or an array!"); Instruction *InsertPt = Call; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - BasicBlock::iterator IP = II->getNormalDest()->begin(); - while (isa<PHINode>(IP)) ++IP; - InsertPt = IP; + BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest()); + InsertPt = &*NewEdge->getFirstInsertionPt(); } // We used to return a struct or array. Instead of doing smart stuff @@ -1026,8 +1032,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { // If this is a live argument, move the name and users over to the new // version. - I->replaceAllUsesWith(I2); - I2->takeName(I); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); ++I2; } else { // If this argument is dead, replace any uses of it with null constants @@ -1079,9 +1085,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(F); - if (DI != FunctionDIs.end()) - DI->second->replaceFunction(NF); + NF->setSubprogram(F->getSubprogram()); // Now that the old function is dead, delete it. F->eraseFromParent(); @@ -1092,9 +1096,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { bool DAE::runOnModule(Module &M) { bool Changed = false; - // Collect debug info descriptors for functions. - FunctionDIs = makeSubprogramMap(M); - // First pass: Do a simple check to see if any functions can have their "..." // removed. We can do this if they never call va_start. This loop cannot be // fused with the next loop, because deleting a function invalidates @@ -1119,7 +1120,7 @@ bool DAE::runOnModule(Module &M) { for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { // Increment now, because the function will probably get removed (ie. // replaced by a new one). 
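The comment closing the hunk above, together with the 'Function *F = &*I++;' change right below it, shows the two iterator idioms this import leans on everywhere: an explicit '&*' because ilist iterators no longer convert implicitly to pointers, and increment-before-use so that erasing the current element cannot invalidate the loop. A standalone sketch, with std::list standing in for LLVM's ilist:

    #include <list>

    int main() {
      std::list<int> Funcs = {1, 2, 3, 4};
      for (std::list<int>::iterator I = Funcs.begin(), E = Funcs.end();
           I != E;) {
        std::list<int>::iterator Cur = I++; // advance first: the '&*I++' idea
        int *F = &*Cur; // explicit dereference-then-address-of, not implicit
        if (*F % 2 == 0)
          Funcs.erase(Cur); // removing Cur leaves I (already past it) valid
      }
      return 0;
    }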
- Function *F = I++; + Function *F = &*I++; Changed |= RemoveDeadStuffFromFunction(F); } diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index 67ba72d..af313a6 100644 --- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -1,4 +1,5 @@ -//===-- ElimAvailExtern.cpp - DCE unreachable internal functions ----------------===// +//===-- ElimAvailExtern.cpp - DCE unreachable internal functions +//----------------===// // // The LLVM Compiler Infrastructure // @@ -15,9 +16,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Pass.h" using namespace llvm; @@ -28,18 +27,18 @@ STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); namespace { - struct EliminateAvailableExternally : public ModulePass { - static char ID; // Pass identification, replacement for typeid - EliminateAvailableExternally() : ModulePass(ID) { - initializeEliminateAvailableExternallyPass( - *PassRegistry::getPassRegistry()); - } +struct EliminateAvailableExternally : public ModulePass { + static char ID; // Pass identification, replacement for typeid + EliminateAvailableExternally() : ModulePass(ID) { + initializeEliminateAvailableExternallyPass( + *PassRegistry::getPassRegistry()); + } - // run - Do the EliminateAvailableExternally pass on the specified module, - // optionally updating the specified callgraph to reflect the changes. - // - bool runOnModule(Module &M) override; - }; + // run - Do the EliminateAvailableExternally pass on the specified module, + // optionally updating the specified callgraph to reflect the changes. + // + bool runOnModule(Module &M) override; +}; } char EliminateAvailableExternally::ID = 0; @@ -54,30 +53,31 @@ bool EliminateAvailableExternally::runOnModule(Module &M) { bool Changed = false; // Drop initializers of available externally global variables. - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - if (!I->hasAvailableExternallyLinkage()) + for (GlobalVariable &GV : M.globals()) { + if (!GV.hasAvailableExternallyLinkage()) continue; - if (I->hasInitializer()) { - Constant *Init = I->getInitializer(); - I->setInitializer(nullptr); + if (GV.hasInitializer()) { + Constant *Init = GV.getInitializer(); + GV.setInitializer(nullptr); if (isSafeToDestroyConstant(Init)) Init->destroyConstant(); } - I->removeDeadConstantUsers(); - I->setLinkage(GlobalValue::ExternalLinkage); + GV.removeDeadConstantUsers(); + GV.setLinkage(GlobalValue::ExternalLinkage); NumVariables++; + Changed = true; } // Drop the bodies of available externally functions. 
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->hasAvailableExternallyLinkage()) + for (Function &F : M) { + if (!F.hasAvailableExternallyLinkage()) continue; - if (!I->isDeclaration()) + if (!F.isDeclaration()) // This will set the linkage to external - I->deleteBody(); - I->removeDeadConstantUsers(); + F.deleteBody(); + F.removeDeadConstantUsers(); NumFunctions++; + Changed = true; } return Changed; diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp index b9462f2..1a3b925 100644 --- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -83,7 +83,7 @@ namespace { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { bool Delete = - deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration(); if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; @@ -103,7 +103,7 @@ namespace { // Visit the Functions. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { bool Delete = - deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration(); if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; @@ -124,7 +124,7 @@ namespace { Module::alias_iterator CurI = I; ++I; - bool Delete = deleteStuff == (bool)Named.count(CurI); + bool Delete = deleteStuff == (bool)Named.count(&*CurI); makeVisible(*CurI, Delete); if (Delete) { @@ -143,7 +143,7 @@ namespace { } CurI->replaceAllUsesWith(Declaration); - delete CurI; + delete &*CurI; } } diff --git a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp new file mode 100644 index 0000000..6df0447 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -0,0 +1,121 @@ +//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "forceattrs" + +static cl::list<std::string> + ForceAttributes("force-attribute", cl::Hidden, + cl::desc("Add an attribute to a function. This should be a " + "pair of 'function-name:attribute-name', for " + "example -force-attribute=foo:noinline. 
This " + "option can be specified multiple times.")); + +static Attribute::AttrKind parseAttrKind(StringRef Kind) { + return StringSwitch<Attribute::AttrKind>(Kind) + .Case("alwaysinline", Attribute::AlwaysInline) + .Case("builtin", Attribute::Builtin) + .Case("cold", Attribute::Cold) + .Case("convergent", Attribute::Convergent) + .Case("inlinehint", Attribute::InlineHint) + .Case("jumptable", Attribute::JumpTable) + .Case("minsize", Attribute::MinSize) + .Case("naked", Attribute::Naked) + .Case("nobuiltin", Attribute::NoBuiltin) + .Case("noduplicate", Attribute::NoDuplicate) + .Case("noimplicitfloat", Attribute::NoImplicitFloat) + .Case("noinline", Attribute::NoInline) + .Case("nonlazybind", Attribute::NonLazyBind) + .Case("noredzone", Attribute::NoRedZone) + .Case("noreturn", Attribute::NoReturn) + .Case("norecurse", Attribute::NoRecurse) + .Case("nounwind", Attribute::NoUnwind) + .Case("optnone", Attribute::OptimizeNone) + .Case("optsize", Attribute::OptimizeForSize) + .Case("readnone", Attribute::ReadNone) + .Case("readonly", Attribute::ReadOnly) + .Case("argmemonly", Attribute::ArgMemOnly) + .Case("returns_twice", Attribute::ReturnsTwice) + .Case("safestack", Attribute::SafeStack) + .Case("sanitize_address", Attribute::SanitizeAddress) + .Case("sanitize_memory", Attribute::SanitizeMemory) + .Case("sanitize_thread", Attribute::SanitizeThread) + .Case("ssp", Attribute::StackProtect) + .Case("sspreq", Attribute::StackProtectReq) + .Case("sspstrong", Attribute::StackProtectStrong) + .Case("uwtable", Attribute::UWTable) + .Default(Attribute::None); +} + +/// If F has any forced attributes given on the command line, add them. +static void addForcedAttributes(Function &F) { + for (auto &S : ForceAttributes) { + auto KV = StringRef(S).split(':'); + if (KV.first != F.getName()) + continue; + + auto Kind = parseAttrKind(KV.second); + if (Kind == Attribute::None) { + DEBUG(dbgs() << "ForcedAttribute: " << KV.second + << " unknown or not handled!\n"); + continue; + } + if (F.hasFnAttribute(Kind)) + continue; + F.addFnAttr(Kind); + } +} + +PreservedAnalyses ForceFunctionAttrsPass::run(Module &M) { + if (ForceAttributes.empty()) + return PreservedAnalyses::all(); + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Just conservatively invalidate analyses, this isn't likely to be important. + return PreservedAnalyses::none(); +} + +namespace { +struct ForceFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ForceFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeForceFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (ForceAttributes.empty()) + return false; + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Conservatively assume we changed something. + return true; + } +}; +} + +char ForceFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs", + "Force set function attributes", false, false) + +Pass *llvm::createForceFunctionAttrsLegacyPass() { + return new ForceFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index bb5e64a..527fdd1 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -6,16 +6,11 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -// This file implements a simple interprocedural pass which walks the -// call-graph, looking for functions which do not access or only read -// non-local memory, and marking them readnone/readonly. It does the -// same with function arguments independently, marking them readonly/ -// readnone/nocapture. Finally, well-known library call declarations -// are marked with all attributes that are consistent with the -// function's standard definition. This pass is implemented as a -// bottom-up traversal of the call-graph. -// +/// +/// \file +/// This file implements interprocedural passes which walk the +/// call-graph deducing and/or propagating function attributes. +/// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO.h" @@ -23,14 +18,21 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -42,230 +44,185 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); -STATISTIC(NumAnnotated, "Number of attributes added to library functions"); +STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); +STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); namespace { - struct FunctionAttrs : public CallGraphSCCPass { - static char ID; // Pass identification, replacement for typeid - FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) { - initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); - } +typedef SmallSetVector<Function *, 8> SCCNodeSet; +} - // runOnSCC - Analyze the SCC, performing the transformation if possible. - bool runOnSCC(CallGraphSCC &SCC) override; +namespace { +struct PostOrderFunctionAttrs : public CallGraphSCCPass { + static char ID; // Pass identification, replacement for typeid + PostOrderFunctionAttrs() : CallGraphSCCPass(ID) { + initializePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry()); + } - // AddReadAttrs - Deduce readonly/readnone attributes for the SCC. - bool AddReadAttrs(const CallGraphSCC &SCC); + bool runOnSCC(CallGraphSCC &SCC) override; - // AddArgumentAttrs - Deduce nocapture attributes for the SCC. - bool AddArgumentAttrs(const CallGraphSCC &SCC); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + CallGraphSCCPass::getAnalysisUsage(AU); + } - // IsFunctionMallocLike - Does this function allocate new memory? 
- bool IsFunctionMallocLike(Function *F, - SmallPtrSet<Function*, 8> &) const; +private: + TargetLibraryInfo *TLI; +}; +} - // AddNoAliasAttrs - Deduce noalias attributes for the SCC. - bool AddNoAliasAttrs(const CallGraphSCC &SCC); +char PostOrderFunctionAttrs::ID = 0; +INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(PostOrderFunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) - // Utility methods used by inferPrototypeAttributes to add attributes - // and maintain annotation statistics. +Pass *llvm::createPostOrderFunctionAttrsPass() { return new PostOrderFunctionAttrs(); } - void setDoesNotAccessMemory(Function &F) { - if (!F.doesNotAccessMemory()) { - F.setDoesNotAccessMemory(); - ++NumAnnotated; - } - } +namespace { +/// The three kinds of memory access relevant to 'readonly' and +/// 'readnone' attributes. +enum MemoryAccessKind { + MAK_ReadNone = 0, + MAK_ReadOnly = 1, + MAK_MayWrite = 2 +}; +} - void setOnlyReadsMemory(Function &F) { - if (!F.onlyReadsMemory()) { - F.setOnlyReadsMemory(); - ++NumAnnotated; - } - } +static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, + const SCCNodeSet &SCCNodes) { + FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); + if (MRB == FMRB_DoesNotAccessMemory) + // Already perfect! + return MAK_ReadNone; + + // Definitions with weak linkage may be overridden at linktime with + // something that writes memory, so treat them like declarations. + if (F.isDeclaration() || F.mayBeOverridden()) { + if (AliasAnalysis::onlyReadsMemory(MRB)) + return MAK_ReadOnly; + + // Conservatively assume it writes to memory. + return MAK_MayWrite; + } - void setDoesNotThrow(Function &F) { - if (!F.doesNotThrow()) { - F.setDoesNotThrow(); - ++NumAnnotated; - } - } + // Scan the function body for instructions that may read or write memory. + bool ReadsMemory = false; + for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { + Instruction *I = &*II; + + // Some instructions can be ignored even if they read or write memory. + // Detect these now, skipping to the next instruction if one is found. + CallSite CS(cast<Value>(I)); + if (CS) { + // Ignore calls to functions in the same SCC. + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) + continue; + FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS); - void setDoesNotCapture(Function &F, unsigned n) { - if (!F.doesNotCapture(n)) { - F.setDoesNotCapture(n); - ++NumAnnotated; - } - } + // If the call doesn't access memory, we're done. + if (!(MRB & MRI_ModRef)) + continue; - void setOnlyReadsMemory(Function &F, unsigned n) { - if (!F.onlyReadsMemory(n)) { - F.setOnlyReadsMemory(n); - ++NumAnnotated; + if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) { + // The call could access any memory. If that includes writes, give up. + if (MRB & MRI_Mod) + return MAK_MayWrite; + // If it reads, note it. + if (MRB & MRI_Ref) + ReadsMemory = true; + continue; } - } - void setDoesNotAlias(Function &F, unsigned n) { - if (!F.doesNotAlias(n)) { - F.setDoesNotAlias(n); - ++NumAnnotated; - } - } + // Check whether all pointer arguments point to local memory, and + // ignore calls that only access local memory. 
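checkFunctionMemoryAccess, in progress here, boils the whole body scan down to a three-point verdict; the per-argument pointer check continues immediately below. The fold itself is just a maximum over the MAK_ lattice introduced earlier, as this minimal model shows (the helper name is invented):

    #include <algorithm>
    #include <vector>

    enum MemoryAccessKind { MAK_ReadNone = 0, MAK_ReadOnly = 1, MAK_MayWrite = 2 };

    // One write anywhere pins the summary at MAK_MayWrite; otherwise any read
    // yields MAK_ReadOnly; only a clean scan stays MAK_ReadNone.
    static MemoryAccessKind
    summarize(const std::vector<MemoryAccessKind> &PerInstruction) {
      MemoryAccessKind Result = MAK_ReadNone;
      for (MemoryAccessKind K : PerInstruction)
        Result = std::max(Result, K);
      return Result;
    }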
+ for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + CI != CE; ++CI) { + Value *Arg = *CI; + if (!Arg->getType()->isPtrOrPtrVectorTy()) + continue; - // inferPrototypeAttributes - Analyze the name and prototype of the - // given function and set any applicable attributes. Returns true - // if any attributes were set and false otherwise. - bool inferPrototypeAttributes(Function &F); + AAMDNodes AAInfo; + I->getAAMetadata(AAInfo); + MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); - // annotateLibraryCalls - Adds attributes to well-known standard library - // call declarations. - bool annotateLibraryCalls(const CallGraphSCC &SCC); + // Skip accesses to local or constant memory as they don't impact the + // externally visible mod/ref behavior. + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - CallGraphSCCPass::getAnalysisUsage(AU); + if (MRB & MRI_Mod) + // Writes non-local memory. Give up. + return MAK_MayWrite; + if (MRB & MRI_Ref) + // Ok, it reads non-local memory. + ReadsMemory = true; + } + continue; + } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + // Ignore non-volatile loads from local memory. (Atomic is okay here.) + if (!LI->isVolatile()) { + MemoryLocation Loc = MemoryLocation::get(LI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + // Ignore non-volatile stores to local memory. (Atomic is okay here.) + if (!SI->isVolatile()) { + MemoryLocation Loc = MemoryLocation::get(SI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { + // Ignore vaargs on local memory. + MemoryLocation Loc = MemoryLocation::get(VI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; } - private: - AliasAnalysis *AA; - TargetLibraryInfo *TLI; - }; -} + // Any remaining instructions need to be taken seriously! Check if they + // read or write memory. + if (I->mayWriteToMemory()) + // Writes memory. Just give up. + return MAK_MayWrite; -char FunctionAttrs::ID = 0; -INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", - "Deduce function attributes", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", - "Deduce function attributes", false, false) - -Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); } - - -/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC. -bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { - SmallPtrSet<Function*, 8> SCCNodes; + // If this instruction may read memory, remember that. + ReadsMemory |= I->mayReadFromMemory(); + } - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - SCCNodes.insert((*I)->getFunction()); + return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone; +} +/// Deduce readonly/readnone attributes for the SCC. +template <typename AARGetterT> +static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { // Check if any of the functions in the SCC read or write memory. 
If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + for (Function *F : SCCNodes) { + // Call the callable parameter to look up AA results for this function. + AAResults &AAR = AARGetter(*F); - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or node we don't want to optimize - assume it may write - // memory and give up. + switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) { + case MAK_MayWrite: return false; - - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F); - if (MRB == AliasAnalysis::DoesNotAccessMemory) - // Already perfect! - continue; - - // Definitions with weak linkage may be overridden at linktime with - // something that writes memory, so treat them like declarations. - if (F->isDeclaration() || F->mayBeOverridden()) { - if (!AliasAnalysis::onlyReadsMemory(MRB)) - // May write memory. Just give up. - return false; - + case MAK_ReadOnly: ReadsMemory = true; - continue; - } - - // Scan the function body for instructions that may read or write memory. - for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { - Instruction *I = &*II; - - // Some instructions can be ignored even if they read or write memory. - // Detect these now, skipping to the next instruction if one is found. - CallSite CS(cast<Value>(I)); - if (CS) { - // Ignore calls to functions in the same SCC. - if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) - continue; - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS); - // If the call doesn't access arbitrary memory, we may be able to - // figure out something. - if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { - // If the call does access argument pointees, check each argument. - if (AliasAnalysis::doesAccessArgPointees(MRB)) - // Check whether all pointer arguments point to local memory, and - // ignore calls that only access local memory. - for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); - CI != CE; ++CI) { - Value *Arg = *CI; - if (Arg->getType()->isPointerTy()) { - AAMDNodes AAInfo; - I->getAAMetadata(AAInfo); - - MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); - if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) { - if (MRB & AliasAnalysis::Mod) - // Writes non-local memory. Give up. - return false; - if (MRB & AliasAnalysis::Ref) - // Ok, it reads non-local memory. - ReadsMemory = true; - } - } - } - continue; - } - // The call could access any memory. If that includes writes, give up. - if (MRB & AliasAnalysis::Mod) - return false; - // If it reads, note it. - if (MRB & AliasAnalysis::Ref) - ReadsMemory = true; - continue; - } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - // Ignore non-volatile loads from local memory. (Atomic is okay here.) - if (!LI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(LI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - // Ignore non-volatile stores to local memory. (Atomic is okay here.) - if (!SI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(SI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { - // Ignore vaargs on local memory. 
- MemoryLocation Loc = MemoryLocation::get(VI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - - // Any remaining instructions need to be taken seriously! Check if they - // read or write memory. - if (I->mayWriteToMemory()) - // Writes memory. Just give up. - return false; - - // If this instruction may read memory, remember that. - ReadsMemory |= I->mayReadFromMemory(); + break; + case MAK_ReadNone: + // Nothing to do! + break; } } // Success! Functions in this SCC do not access memory, or only read memory. // Give them the appropriate attribute. bool MadeChange = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - + for (Function *F : SCCNodes) { if (F->doesNotAccessMemory()) // Already perfect! continue; @@ -278,11 +235,10 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { // Clear out any existing attributes. AttrBuilder B; - B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); - F->removeAttributes(AttributeSet::FunctionIndex, - AttributeSet::get(F->getContext(), - AttributeSet::FunctionIndex, B)); + B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); + F->removeAttributes( + AttributeSet::FunctionIndex, + AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B)); // Add in the new attribute. F->addAttribute(AttributeSet::FunctionIndex, @@ -298,124 +254,140 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { } namespace { - // For a given pointer Argument, this retains a list of Arguments of functions - // in the same SCC that the pointer data flows into. We use this to build an - // SCC of the arguments. - struct ArgumentGraphNode { - Argument *Definition; - SmallVector<ArgumentGraphNode*, 4> Uses; - }; - - class ArgumentGraph { - // We store pointers to ArgumentGraphNode objects, so it's important that - // that they not move around upon insert. - typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy; +/// For a given pointer Argument, this retains a list of Arguments of functions +/// in the same SCC that the pointer data flows into. We use this to build an +/// SCC of the arguments. +struct ArgumentGraphNode { + Argument *Definition; + SmallVector<ArgumentGraphNode *, 4> Uses; +}; + +class ArgumentGraph { + // We store pointers to ArgumentGraphNode objects, so it's important that + // that they not move around upon insert. + typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy; + + ArgumentMapTy ArgumentMap; + + // There is no root node for the argument graph, in fact: + // void f(int *x, int *y) { if (...) f(x, y); } + // is an example where the graph is disconnected. The SCCIterator requires a + // single entry point, so we maintain a fake ("synthetic") root node that + // uses every node. Because the graph is directed and nothing points into + // the root, it will not participate in any SCCs (except for its own). 
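The comment block above is the one subtle piece of the argument graph: SCC iteration wants a single entry node, but argument-to-argument data flow has no natural root. A toy version of the synthetic-root construction that the declaration below implements (type names invented):

    #include <vector>

    struct ToyNode {
      int Id;
      std::vector<ToyNode *> Uses; // edges: this value flows into these nodes
    };

    // A fake root that uses every real node makes the whole (possibly
    // disconnected) graph reachable from one entry point. Nothing points back
    // at the root, so it can only ever form its own singleton SCC.
    struct ToyArgumentGraph {
      std::vector<ToyNode *> Nodes;
      ToyNode SyntheticRoot{-1, {}};

      ToyNode *getEntryNode() {
        SyntheticRoot.Uses.assign(Nodes.begin(), Nodes.end());
        return &SyntheticRoot;
      }
    };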
+ ArgumentGraphNode SyntheticRoot; + +public: + ArgumentGraph() { SyntheticRoot.Definition = nullptr; } + + typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator; + + iterator begin() { return SyntheticRoot.Uses.begin(); } + iterator end() { return SyntheticRoot.Uses.end(); } + ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } + + ArgumentGraphNode *operator[](Argument *A) { + ArgumentGraphNode &Node = ArgumentMap[A]; + Node.Definition = A; + SyntheticRoot.Uses.push_back(&Node); + return &Node; + } +}; - ArgumentMapTy ArgumentMap; +/// This tracker checks whether callees are in the SCC, and if so it does not +/// consider that a capture, instead adding it to the "Uses" list and +/// continuing with the analysis. +struct ArgumentUsesTracker : public CaptureTracker { + ArgumentUsesTracker(const SCCNodeSet &SCCNodes) + : Captured(false), SCCNodes(SCCNodes) {} - // There is no root node for the argument graph, in fact: - // void f(int *x, int *y) { if (...) f(x, y); } - // is an example where the graph is disconnected. The SCCIterator requires a - // single entry point, so we maintain a fake ("synthetic") root node that - // uses every node. Because the graph is directed and nothing points into - // the root, it will not participate in any SCCs (except for its own). - ArgumentGraphNode SyntheticRoot; + void tooManyUses() override { Captured = true; } - public: - ArgumentGraph() { SyntheticRoot.Definition = nullptr; } + bool captured(const Use *U) override { + CallSite CS(U->getUser()); + if (!CS.getInstruction()) { + Captured = true; + return true; + } - typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator; + Function *F = CS.getCalledFunction(); + if (!F || F->isDeclaration() || F->mayBeOverridden() || + !SCCNodes.count(F)) { + Captured = true; + return true; + } - iterator begin() { return SyntheticRoot.Uses.begin(); } - iterator end() { return SyntheticRoot.Uses.end(); } - ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } + // Note: the callee and the two successor blocks *follow* the argument + // operands. This means there is no need to adjust UseIndex to account for + // these. - ArgumentGraphNode *operator[](Argument *A) { - ArgumentGraphNode &Node = ArgumentMap[A]; - Node.Definition = A; - SyntheticRoot.Uses.push_back(&Node); - return &Node; - } - }; + unsigned UseIndex = + std::distance(const_cast<const Use *>(CS.arg_begin()), U); - // This tracker checks whether callees are in the SCC, and if so it does not - // consider that a capture, instead adding it to the "Uses" list and - // continuing with the analysis. - struct ArgumentUsesTracker : public CaptureTracker { - ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes) - : Captured(false), SCCNodes(SCCNodes) {} + assert(UseIndex < CS.data_operands_size() && + "Indirect function calls should have been filtered above!"); - void tooManyUses() override { Captured = true; } + if (UseIndex >= CS.getNumArgOperands()) { + // Data operand, but not an argument operand -- must be a bundle operand + assert(CS.hasOperandBundles() && "Must be!"); - bool captured(const Use *U) override { - CallSite CS(U->getUser()); - if (!CS.getInstruction()) { Captured = true; return true; } + // CaptureTracking told us that we're being captured by an operand bundle + // use. In this case it does not matter if the callee is within our SCC + // or not -- we've been captured in some unknown way, and we have to be + // conservative.
+ Captured = true; + return true; + } - Function *F = CS.getCalledFunction(); - if (!F || !SCCNodes.count(F)) { Captured = true; return true; } - - bool Found = false; - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end(); - PI != PE; ++PI, ++AI) { - if (AI == AE) { - assert(F->isVarArg() && "More params than args in non-varargs call"); - Captured = true; - return true; - } - if (PI == U) { - Uses.push_back(AI); - Found = true; - break; - } - } - assert(Found && "Capturing call-site captured nothing?"); - (void)Found; - return false; + if (UseIndex >= F->arg_size()) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + Captured = true; + return true; } - bool Captured; // True only if certainly captured (used outside our SCC). - SmallVector<Argument*, 4> Uses; // Uses within our SCC. + Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); + return false; + } + + bool Captured; // True only if certainly captured (used outside our SCC). + SmallVector<Argument *, 4> Uses; // Uses within our SCC. - const SmallPtrSet<Function*, 8> &SCCNodes; - }; + const SCCNodeSet &SCCNodes; +}; } namespace llvm { - template<> struct GraphTraits<ArgumentGraphNode*> { - typedef ArgumentGraphNode NodeType; - typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType; +template <> struct GraphTraits<ArgumentGraphNode *> { + typedef ArgumentGraphNode NodeType; + typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; - static inline NodeType *getEntryNode(NodeType *A) { return A; } - static inline ChildIteratorType child_begin(NodeType *N) { - return N->Uses.begin(); - } - static inline ChildIteratorType child_end(NodeType *N) { - return N->Uses.end(); - } - }; - template<> struct GraphTraits<ArgumentGraph*> - : public GraphTraits<ArgumentGraphNode*> { - static NodeType *getEntryNode(ArgumentGraph *AG) { - return AG->getEntryNode(); - } - static ChildIteratorType nodes_begin(ArgumentGraph *AG) { - return AG->begin(); - } - static ChildIteratorType nodes_end(ArgumentGraph *AG) { - return AG->end(); - } - }; + static inline NodeType *getEntryNode(NodeType *A) { return A; } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->Uses.begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->Uses.end(); + } +}; +template <> +struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { + static NodeType *getEntryNode(ArgumentGraph *AG) { + return AG->getEntryNode(); + } + static ChildIteratorType nodes_begin(ArgumentGraph *AG) { + return AG->begin(); + } + static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); } +}; } -// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. +/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument*, 8> &SCCNodes) { - - SmallVector<Use*, 32> Worklist; - SmallSet<Use*, 32> Visited; - int Count = 0; + const SmallPtrSet<Argument *, 8> &SCCNodes) { + + SmallVector<Use *, 32> Worklist; + SmallSet<Use *, 32> Visited; // inalloca arguments are always clobbered by the call. if (A->hasInAllocaAttr()) @@ -425,9 +397,6 @@ determinePointerReadAttrs(Argument *A, // We don't need to track IsWritten. If A is written to, return immediately. 
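Before the use-walk below, it helps to see the verdicts determinePointerReadAttrs is choosing between. A few one-line functions and the attribute each pointer argument would plausibly earn (illustrative only; the real deduction inspects the transitive IR uses):

    int reads(const int *P) { return *P; }     // P: readonly (and nocapture)
    int ignores(int *P) { (void)P; return 7; } // P: readnone, never accessed
    void writes(int *P) { *P = 1; }            // P: written, so no read attr
    int *leaks(int *P) { return P; }           // P: captured, no attributes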
for (Use &U : A->uses()) { - if (Count++ >= 20) - return Attribute::None; - Visited.insert(&U); Worklist.push_back(&U); } @@ -435,7 +404,6 @@ determinePointerReadAttrs(Argument *A, while (!Worklist.empty()) { Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); - Value *V = U->get(); switch (I->getOpcode()) { case Instruction::BitCast: @@ -479,24 +447,44 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; } - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) { - if (A->get() == V) { - if (AI == AE) { - assert(F->isVarArg() && - "More params than args in non-varargs call."); - return Attribute::None; - } - Captures &= !CS.doesNotCapture(A - B); - if (SCCNodes.count(AI)) - continue; - if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B)) - return Attribute::None; - if (!CS.doesNotAccessMemory(A - B)) - IsRead = true; - } + // Note: the callee and the two successor blocks *follow* the argument + // operands. This means there is no need to adjust UseIndex to account + // for these. + + unsigned UseIndex = std::distance(CS.arg_begin(), U); + + // U cannot be the callee operand use: since we're exploring the + // transitive uses of an Argument, having such a use be a callee would + // imply the CallSite is an indirect call or invoke; and we'd take the + // early exit above. + assert(UseIndex < CS.data_operands_size() && + "Data operand use expected!"); + + bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands(); + + if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + return Attribute::None; } + + Captures &= !CS.doesNotCapture(UseIndex); + + // Since the optimizer (by design) cannot see the data flow corresponding + // to an operand bundle use, these cannot participate in the optimistic SCC + // analysis. Instead, we model the operand bundle uses as arguments in + // a call to a function external to the SCC. + if (!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex)) || + IsOperandBundleUse) { + + // The accessors used on CallSite here do the right thing for calls and + // invokes with operand bundles. + + if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex)) + return Attribute::None; + if (!CS.doesNotAccessMemory(UseIndex)) + IsRead = true; + } + AddUsersToWorklistIfCapturing(); break; } @@ -517,21 +505,10 @@ determinePointerReadAttrs(Argument *A, return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; } -/// AddArgumentAttrs - Deduce nocapture attributes for the SCC. -bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { +/// Deduce nocapture attributes for the SCC. +static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - if (F && !F->isDeclaration() && !F->mayBeOverridden() && - !F->hasFnAttribute(Attribute::OptimizeNone)) - SCCNodes.insert(F); - } - ArgumentGraph AG; AttrBuilder B; @@ -539,14 +516,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Check each function in turn, determining which pointer arguments are not // captured.
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or function we're trying not to optimize - only a problem - // for arguments that we pass to it. - continue; - + for (Function *F : SCCNodes) { // Definitions with weak linkage may be overridden at linktime with // something that captures pointers, so treat them like declarations. if (F->isDeclaration() || F->mayBeOverridden()) @@ -556,8 +526,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // a value can't capture arguments. Don't analyze them. if (F->onlyReadsMemory() && F->doesNotThrow() && F->getReturnType()->isVoidTy()) { - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; @@ -567,26 +537,30 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { continue; } - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { - if (!A->getType()->isPointerTy()) continue; + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { + if (!A->getType()->isPointerTy()) + continue; bool HasNonLocalUses = false; if (!A->hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); - PointerMayBeCaptured(A, &Tracker); + PointerMayBeCaptured(&*A, &Tracker); if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B)); + A->addAttr( + AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. - ArgumentGraphNode *Node = AG[A]; - for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(), - UE = Tracker.Uses.end(); UI != UE; ++UI) { + ArgumentGraphNode *Node = AG[&*A]; + for (SmallVectorImpl<Argument *>::iterator + UI = Tracker.Uses.begin(), + UE = Tracker.Uses.end(); + UI != UE; ++UI) { Node->Uses.push_back(AG[*UI]); if (*UI != A) HasNonLocalUses = true; @@ -600,9 +574,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Note that we don't allow any calls at all here, or else our result // will be dependent on the iteration order through the functions in the // SCC. - SmallPtrSet<Argument*, 8> Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerReadAttrs(A, Self); + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) { AttrBuilder B; B.addAttribute(R); @@ -621,10 +595,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // made. If the definition doesn't have a 'nocapture' attribute by now, it // captures. - for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG); !I.isAtEnd(); ++I) { + for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) { const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I; if (ArgumentSCC.size() == 1) { - if (!ArgumentSCC[0]->Definition) continue; // synthetic root node + if (!ArgumentSCC[0]->Definition) + continue; // synthetic root node // eg. 
"void f(int* x) { if (...) f(x); }" if (ArgumentSCC[0]->Uses.size() == 1 && @@ -646,9 +621,10 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { SCCCaptured = true; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; - SmallPtrSet<Argument*, 8> ArgumentSCCNodes; + SmallPtrSet<Argument *, 8> ArgumentSCCNodes; // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for // quickly looking up whether a given Argument is in this ArgumentSCC. for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) { @@ -658,8 +634,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { ArgumentGraphNode *N = *I; - for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(), - UE = N->Uses.end(); UI != UE; ++UI) { + for (SmallVectorImpl<ArgumentGraphNode *>::iterator UI = N->Uses.begin(), + UE = N->Uses.end(); + UI != UE; ++UI) { Argument *A = (*UI)->Definition; if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) continue; @@ -667,7 +644,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { break; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; @@ -704,8 +682,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { if (ReadAttr != Attribute::None) { AttrBuilder B, R; B.addAttribute(ReadAttr); - R.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; // Clear out existing readonly/readnone attributes @@ -720,10 +697,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { return Changed; } -/// IsFunctionMallocLike - A function is malloc-like if it returns either null -/// or a pointer that doesn't alias any other pointer visible to the caller. -bool FunctionAttrs::IsFunctionMallocLike(Function *F, - SmallPtrSet<Function*, 8> &SCCNodes) const { +/// Tests whether a function is "malloc-like". +/// +/// A function is "malloc-like" if it returns either null or a pointer that +/// doesn't alias any other pointer visible to the caller. +static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { SmallSetVector<Value *, 8> FlowsToReturn; for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) @@ -744,39 +722,38 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, if (Instruction *RVI = dyn_cast<Instruction>(RetVal)) switch (RVI->getOpcode()) { - // Extend the analysis by looking upwards. - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::AddrSpaceCast: - FlowsToReturn.insert(RVI->getOperand(0)); - continue; - case Instruction::Select: { - SelectInst *SI = cast<SelectInst>(RVI); - FlowsToReturn.insert(SI->getTrueValue()); - FlowsToReturn.insert(SI->getFalseValue()); - continue; - } - case Instruction::PHI: { - PHINode *PN = cast<PHINode>(RVI); - for (Value *IncValue : PN->incoming_values()) - FlowsToReturn.insert(IncValue); - continue; - } + // Extend the analysis by looking upwards. 
+ case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (Value *IncValue : PN->incoming_values()) + FlowsToReturn.insert(IncValue); + continue; + } - // Check whether the pointer came from an allocation. - case Instruction::Alloca: + // Check whether the pointer came from an allocation. + case Instruction::Alloca: + break; + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + if (CS.paramHasAttr(0, Attribute::NoAlias)) + break; + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) break; - case Instruction::Call: - case Instruction::Invoke: { - CallSite CS(RVI); - if (CS.paramHasAttr(0, Attribute::NoAlias)) - break; - if (CS.getCalledFunction() && - SCCNodes.count(CS.getCalledFunction())) - break; - } // fall-through - default: - return false; // Did not come from an allocation. + } // fall-through + default: + return false; // Did not come from an allocation. } if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false)) @@ -786,24 +763,11 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, return true; } -/// AddNoAliasAttrs - Deduce noalias attributes for the SCC. -bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - SCCNodes.insert((*I)->getFunction()); - +/// Deduce noalias attributes for the SCC. +static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // Check each function in turn, determining which functions return noalias // pointers. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or node we don't want to optimize - skip it; - return false; - + for (Function *F : SCCNodes) { // Already noalias. if (F->doesNotAlias(0)) continue; @@ -813,18 +777,17 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { if (F->isDeclaration() || F->mayBeOverridden()) return false; - // We annotate noalias return values, which are only applicable to + // We annotate noalias return values, which are only applicable to // pointer types. if (!F->getReturnType()->isPointerTy()) continue; - if (!IsFunctionMallocLike(F, SCCNodes)) + if (!isFunctionMallocLike(F, SCCNodes)) return false; } bool MadeChange = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + for (Function *F : SCCNodes) { if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) continue; @@ -836,880 +799,308 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { return MadeChange; } -/// inferPrototypeAttributes - Analyze the name and prototype of the -/// given function and set any applicable attributes. Returns true -/// if any attributes were set and false otherwise. -bool FunctionAttrs::inferPrototypeAttributes(Function &F) { - if (F.hasFnAttribute(Attribute::OptimizeNone)) - return false; +/// Tests whether this function is known to not return null. 
+/// +/// Requires that the function returns a pointer. +/// +/// Returns true if it believes the function will not return a null, and sets +/// \p Speculative based on whether the returned conclusion is a speculative +/// conclusion due to SCC calls. +static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI, bool &Speculative) { + assert(F->getReturnType()->isPointerTy() && + "nonnull only meaningful on pointer types"); + Speculative = false; - FunctionType *FTy = F.getFunctionType(); - LibFunc::Func TheLibFunc; - if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc))) - return false; + SmallSetVector<Value *, 8> FlowsToReturn; + for (BasicBlock &BB : *F) + if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) + FlowsToReturn.insert(Ret->getReturnValue()); - switch (TheLibFunc) { - case LibFunc::strlen: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strchr: - case LibFunc::strrchr: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isIntegerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::strtol: - case LibFunc::strtod: - case LibFunc::strtof: - case LibFunc::strtoul: - case LibFunc::strtoll: - case LibFunc::strtold: - case LibFunc::strtoull: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::strcpy: - case LibFunc::stpcpy: - case LibFunc::strcat: - case LibFunc::strncat: - case LibFunc::strncpy: - case LibFunc::stpncpy: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strxfrm: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strcmp: //0,1 - case LibFunc::strspn: // 0,1 - case LibFunc::strncmp: // 0,1 - case LibFunc::strcspn: //0,1 - case LibFunc::strcoll: //0,1 - case LibFunc::strcasecmp: // 0,1 - case LibFunc::strncasecmp: // - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::strstr: - case LibFunc::strpbrk: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::strtok: - case LibFunc::strtok_r: - if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::scanf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::setbuf: - case LibFunc::setvbuf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strdup: - case 
LibFunc::strndup: - if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat: - case LibFunc::statvfs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::sscanf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::sprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::snprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::setitimer: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::system: - if (FTy->getNumParams() != 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "system" is a valid pthread cancellation point. 
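An aside for readers tracing the removed switch: every case follows the same pattern, a prototype sanity check followed by per-index attribute setters defined earlier in this file. A minimal sketch of the likely shape of two such helpers, assuming this version's Function API (index 0 is the return value, argument indices are 1-based); the real helpers also bump statistics counters:

static void setDoesNotCapture(Function &F, unsigned N) {
  if (!F.doesNotCapture(N))
    F.setDoesNotCapture(N); // attaches Attribute::NoCapture at index N
}
static void setOnlyReadsMemory(Function &F, unsigned N) {
  if (!F.onlyReadsMemory(N))
    F.setOnlyReadsMemory(N); // attaches Attribute::ReadOnly at index N
}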
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::malloc: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::memcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::memchr: - case LibFunc::memrchr: - if (FTy->getNumParams() != 3) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::modf: - case LibFunc::modff: - case LibFunc::modfl: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::memcpy: - case LibFunc::memccpy: - case LibFunc::memmove: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::memalign: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotAlias(F, 0); - break; - case LibFunc::mkdir: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::mktime: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::realloc: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - break; - case LibFunc::read: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "read" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::rewind: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::rmdir: - case LibFunc::remove: - case LibFunc::realpath: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::rename: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::readlink: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::write: - if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "write" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::bcopy: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::bcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::bzero: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::calloc: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::chmod: - case LibFunc::chown: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ctermid: - case LibFunc::clearerr: - case LibFunc::closedir: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::atoi: - case LibFunc::atol: - case LibFunc::atof: - case LibFunc::atoll: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::access: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fdopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::feof: - case LibFunc::free: - case LibFunc::fseek: - case LibFunc::ftell: - case LibFunc::fgetc: - case LibFunc::fseeko: - case LibFunc::ftello: - case LibFunc::fileno: - case LibFunc::fflush: - case LibFunc::fclose: - case LibFunc::fsetpos: - case LibFunc::flockfile: - case LibFunc::funlockfile: - case LibFunc::ftrylockfile: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::ferror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F); - break; - case LibFunc::fputc: - case LibFunc::fstat: - case LibFunc::frexp: - case LibFunc::frexpf: - case LibFunc::frexpl: - case LibFunc::fstatvfs: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::fgets: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - 
!FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 3); - break; - case LibFunc::fread: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fwrite: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fputs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fscanf: - case LibFunc::fprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fgetpos: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::getc: - case LibFunc::getlogin_r: - case LibFunc::getc_unlocked: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::getenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::gets: - case LibFunc::getchar: - setDoesNotThrow(F); - break; - case LibFunc::getitimer: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::getpwnam: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ungetc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::uname: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::unlink: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::unsetenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::utime: - case LibFunc::utimes: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::puts: - case LibFunc::printf: - case 
LibFunc::perror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::pread: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pread" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::pwrite: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pwrite" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putchar: - setDoesNotThrow(F); - break; - case LibFunc::popen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::pclose: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::vscanf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vsscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vfscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::valloc: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::vprintf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vfprintf: - case LibFunc::vsprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vsnprintf: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::open: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "open" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::opendir: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::tmpfile: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::times: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::htonl: - case LibFunc::htons: - case LibFunc::ntohl: - case LibFunc::ntohs: - setDoesNotThrow(F); - setDoesNotAccessMemory(F); - break; - case LibFunc::lstat: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::lchown: - if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::qsort: - if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) - return false; - // May throw; places call through function pointer. - setDoesNotCapture(F, 4); - break; - case LibFunc::dunder_strdup: - case LibFunc::dunder_strndup: - if (FTy->getNumParams() < 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_strtok_r: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::under_IO_getc: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::under_IO_putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::dunder_isoc99_scanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat64: - case LibFunc::lstat64: - case LibFunc::statvfs64: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_isoc99_sscanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fopen64: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case 
LibFunc::fseeko64: - case LibFunc::ftello64: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::tmpfile64: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::fstat64: - case LibFunc::fstatvfs64: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::open64: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { + Value *RetVal = FlowsToReturn[i]; + + // If this value is locally known to be non-null, we're good + if (isKnownNonNull(RetVal, &TLI)) + continue; + + // Otherwise, we need to look upwards since we can't make any local + // conclusions. + Instruction *RVI = dyn_cast<Instruction>(RetVal); + if (!RVI) return false; - // May throw; "open" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::gettimeofday: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) + switch (RVI->getOpcode()) { + // Extend the analysis by looking upwards. + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + FlowsToReturn.insert(PN->getIncomingValue(i)); + continue; + } + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + Function *Callee = CS.getCalledFunction(); + // A call to a node within the SCC is assumed to return null until + // proven otherwise + if (Callee && SCCNodes.count(Callee)) { + Speculative = true; + continue; + } return false; - // Currently some platforms have the restrict keyword on the arguments to - // gettimeofday. To be conservative, do not add noalias to gettimeofday's - // arguments. - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - default: - // Didn't mark any attributes. - return false; + } + default: + return false; // Unknown source, may be null + }; + llvm_unreachable("should have either continued or returned"); } return true; } -/// annotateLibraryCalls - Adds attributes to well-known standard library -/// call declarations. -bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { +/// Deduce nonnull attributes for the SCC. +static bool addNonNullAttrs(const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI) { + // Speculative that all functions in the SCC return only nonnull + // pointers. We may refute this as we analyze functions. + bool SCCReturnsNonNull = true; + bool MadeChange = false; - // Check each function in turn annotating well-known library function - // declarations with attributes. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + // Check each function in turn, determining which functions return nonnull + // pointers. + for (Function *F : SCCNodes) { + // Already nonnull. 
+    if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                        Attribute::NonNull))
+      continue;
+
+    // Definitions with weak linkage may be overridden at link time, so
+    // treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      return false;
+
+    // We annotate nonnull return values, which are only applicable to
+    // pointer types.
+    if (!F->getReturnType()->isPointerTy())
+      continue;
+
+    bool Speculative = false;
+    if (isReturnNonNull(F, SCCNodes, TLI, Speculative)) {
+      if (!Speculative) {
+        // Mark the function eagerly since we may discover a function
+        // which prevents us from speculating about the entire SCC.
+        DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
+        F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+        ++NumNonNullReturn;
+        MadeChange = true;
+      }
+      continue;
+    }
+    // At least one function returns something which could be null; we can't
+    // speculate any more.
+    SCCReturnsNonNull = false;
+  }
+
+  if (SCCReturnsNonNull) {
+    for (Function *F : SCCNodes) {
+      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                          Attribute::NonNull) ||
+          !F->getReturnType()->isPointerTy())
+        continue;
-    if (F && F->isDeclaration())
-      MadeChange |= inferPrototypeAttributes(*F);
+      DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+      F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+      ++NumNonNullReturn;
+      MadeChange = true;
+    }
  }

  return MadeChange;
}

-bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
-  AA = &getAnalysis<AliasAnalysis>();
+static bool setDoesNotRecurse(Function &F) {
+  if (F.doesNotRecurse())
+    return false;
+  F.setDoesNotRecurse();
+  ++NumNoRecurse;
+  return true;
+}
+
+static bool addNoRecurseAttrs(const CallGraphSCC &SCC) {
+  // Try to identify functions that do not recurse.
+
+  // If the SCC contains multiple nodes we know for sure there is recursion.
+  if (!SCC.isSingular())
+    return false;
+
+  const CallGraphNode *CGN = *SCC.begin();
+  Function *F = CGN->getFunction();
+  if (!F || F->isDeclaration() || F->doesNotRecurse())
+    return false;
+
+  // If all of the calls in F are identifiable and are to norecurse functions, F
+  // is norecurse. This check also detects self-recursion, as F is not currently
+  // marked norecurse, so any call from F to F will not be treated as norecurse.
+  if (std::all_of(CGN->begin(), CGN->end(),
+                  [](const CallGraphNode::CallRecord &CR) {
+                    Function *F = CR.second->getFunction();
+                    return F && F->doesNotRecurse();
+                  }))
+    // Every call in F resolves to a norecurse function, so F cannot recurse.
+    return setDoesNotRecurse(*F);
+
+  // Nothing else we can deduce usefully during the postorder traversal.
+  return false;
+}
+
+bool PostOrderFunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  bool Changed = false;
+
+  // We compute dedicated AA results for each function in the SCC as needed. We
+  // use a lambda referencing external objects so that they live long enough to
+  // be queried, but we re-use them each time.
+  Optional<BasicAAResult> BAR;
+  Optional<AAResults> AAR;
+  auto AARGetter = [&](Function &F) -> AAResults & {
+    BAR.emplace(createLegacyPMBasicAAResult(*this, F));
+    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
+    return *AAR;
+  };
+
+  // Fill SCCNodes with the elements of the SCC. Used for quickly looking up
+  // whether a given CallGraphNode is in this SCC. Also track whether there are
+  // any external or opt-none nodes that will prevent us from optimizing any
+  // part of the SCC.
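To make the Speculative flag in addNonNullAttrs above concrete, here is a hypothetical input (illustration only, not from the patch) that only the SCC-level speculation can prove returns nonnull:

// While analyzing the SCC {pick}, the recursive call's result is
// optimistically assumed nonnull, and the base case returns a global's
// address, which isKnownNonNull proves locally; the attribute is
// committed only once every return in the SCC checks out.
static int G;
int *pick(int N) {
  return N > 0 ? pick(N - 1) : &G;
}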
+  SCCNodeSet SCCNodes;
+  bool ExternalNode = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
+      // External node or function we're trying not to optimize - we both avoid
+      // transforming them and avoid leveraging the information they provide.
+      ExternalNode = true;
+      continue;
+    }
+
+    SCCNodes.insert(F);
+  }
+
+  Changed |= addReadAttrs(SCCNodes, AARGetter);
+  Changed |= addArgumentAttrs(SCCNodes);
+
+  // If we have no external nodes participating in the SCC, we can deduce some
+  // more precise attributes as well.
+  if (!ExternalNode) {
+    Changed |= addNoAliasAttrs(SCCNodes);
+    Changed |= addNonNullAttrs(SCCNodes, *TLI);
+  }
+
+  Changed |= addNoRecurseAttrs(SCC);
+  return Changed;
+}
+
+namespace {
+/// A pass to do RPO deduction and propagation of function attributes.
+///
+/// This pass provides a general RPO or "top down" propagation of
+/// function attributes. For a few (rare) cases, we can deduce significantly
+/// more about function attributes by working in RPO, so this pass
+/// provides the complement to the post-order pass above where the majority of
+/// deduction is performed.
+// FIXME: Currently there is no RPO CGSCC pass structure to slide into and so
+// this is a boring module pass, but eventually it should be an RPO CGSCC pass
+// when such infrastructure is available.
+struct ReversePostOrderFunctionAttrs : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  ReversePostOrderFunctionAttrs() : ModulePass(ID) {
+    initializeReversePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<CallGraphWrapperPass>();
+  }
+};
+}
+
+char ReversePostOrderFunctionAttrs::ID = 0;
+INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+                      "Deduce function attributes in RPO", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+                    "Deduce function attributes in RPO", false, false)
+
+Pass *llvm::createReversePostOrderFunctionAttrsPass() {
+  return new ReversePostOrderFunctionAttrs();
+}
+
+static bool addNoRecurseAttrsTopDown(Function &F) {
+  // We check the preconditions for the function prior to calling this to avoid
+  // the cost of building up a reversible post-order list. We assert them here
+  // to make sure none of the invariants this relies on were violated.
+  assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
+  assert(!F.doesNotRecurse() &&
+         "This function has already been deduced as norecurse!");
+  assert(F.hasInternalLinkage() &&
+         "Can only do top-down deduction for internal linkage functions!");
+
+  // If F is internal and all of its uses are calls from non-recursive
+  // functions, then none of its calls could in fact recurse without going
+  // through a function marked norecurse, and so we can mark this function too
+  // as norecurse. Note that the uses must actually be calls -- otherwise
+  // a pointer to this function could be returned from a norecurse function but
+  // this function could be recursively (indirectly) called. Note that this
+  // also detects whether F is directly recursive, as F is not yet marked as
+  // a norecurse function.
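A hypothetical source-level picture of the condition the loop below enforces (illustration only, not from the patch):

// 'leaf' has internal linkage and its only use is a direct call from
// 'driver'; if 'driver' is already marked norecurse, every path into
// 'leaf' goes through a norecurse caller, so 'leaf' can be marked too.
static int leaf(int X) { return X + 1; } // internal linkage
int driver(int X) { return leaf(X); }    // assume already norecurse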
+  for (auto *U : F.users()) {
+    auto *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+    CallSite CS(I);
+    if (!CS || !CS.getParent()->getParent()->doesNotRecurse())
+      return false;
+  }
+  return setDoesNotRecurse(F);
+}
+
+bool ReversePostOrderFunctionAttrs::runOnModule(Module &M) {
+  // We only have a post-order SCC traversal (because SCCs are inherently
+  // discovered in post-order), so we accumulate them in a vector and then walk
+  // it in reverse. This is simpler than using the RPO iterator infrastructure
+  // because we need to combine SCC detection and the PO walk of the call
+  // graph. We can also cheat egregiously because we're primarily interested in
+  // synthesizing norecurse, and so we need only save the singleton SCCs; SCCs
+  // with multiple functions in them will clearly be recursive.
+  auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  SmallVector<Function *, 16> Worklist;
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    if (I->size() != 1)
+      continue;
+
+    Function *F = I->front()->getFunction();
+    if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
+        F->hasInternalLinkage())
+      Worklist.push_back(F);
+  }
+
+  bool Changed = false;
+  for (auto *F : reverse(Worklist))
+    Changed |= addNoRecurseAttrsTopDown(*F);
-  bool Changed = annotateLibraryCalls(SCC);
-  Changed |= AddReadAttrs(SCC);
-  Changed |= AddArgumentAttrs(SCC);
-  Changed |= AddNoAliasAttrs(SCC);
  return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
new file mode 100644
index 0000000..5e0df95
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -0,0 +1,445 @@
+//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Function import based on summaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionImport.h"
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Object/FunctionIndexObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include <map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "function-import"
+
+/// Limit on instruction count of imported functions.
+static cl::opt<unsigned> ImportInstrLimit(
+    "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
+    cl::desc("Only import functions with fewer than N instructions"));
+
+// Lazily load a module from \p FileName in \p Context.
+static std::unique_ptr<Module> loadFile(const std::string &FileName,
+                                        LLVMContext &Context) {
+  SMDiagnostic Err;
+  DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+  // Metadata isn't loaded or linked until after all functions are
+  // imported, after which it will be materialized and linked.
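A usage sketch for loadFile (the file name is hypothetical), mirroring the ModuleLoader lambda that FunctionImportPass builds near the end of this file:

LLVMContext Context;
auto ModuleLoader = [&Context](StringRef Identifier) {
  return loadFile(Identifier.str(), Context);
};
std::unique_ptr<Module> M = ModuleLoader("other_module.bc");
// Function bodies and metadata remain unparsed until materialized.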
+  std::unique_ptr<Module> Result =
+      getLazyIRFileModule(FileName, Err, Context,
+                          /* ShouldLazyLoadMetadata = */ true);
+  if (!Result) {
+    Err.print("function-import", errs());
+    return nullptr;
+  }
+
+  return Result;
+}
+
+namespace {
+/// Helper to load a Module from a file on demand and cache it for subsequent
+/// queries. It can be used with the FunctionImporter.
+class ModuleLazyLoaderCache {
+  /// Cache of lazily loaded modules for import.
+  StringMap<std::unique_ptr<Module>> ModuleMap;
+
+  /// Retrieve a Module from the cache or lazily load it on demand.
+  std::function<std::unique_ptr<Module>(StringRef FileName)> createLazyModule;
+
+public:
+  /// Create the loader; the Modules will be initialized in \p Context.
+  ModuleLazyLoaderCache(std::function<
+      std::unique_ptr<Module>(StringRef FileName)> createLazyModule)
+      : createLazyModule(createLazyModule) {}
+
+  /// Retrieve a Module from the cache or lazily load it on demand.
+  Module &operator()(StringRef FileName);
+
+  std::unique_ptr<Module> takeModule(StringRef FileName) {
+    auto I = ModuleMap.find(FileName);
+    assert(I != ModuleMap.end());
+    std::unique_ptr<Module> Ret = std::move(I->second);
+    ModuleMap.erase(I);
+    return Ret;
+  }
+};
+
+// Get a Module for \p FileName from the cache, or load it lazily.
+Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) {
+  auto &Module = ModuleMap[Identifier];
+  if (!Module)
+    Module = createLazyModule(Identifier);
+  return *Module;
+}
+} // anonymous namespace
+
+/// Walk through the instructions in \p F looking for external
+/// calls not already in the \p CalledFunctions set. If any are
+/// found, they are added to the \p Worklist for importing.
+static void findExternalCalls(const Module &DestModule, Function &F,
+                              const FunctionInfoIndex &Index,
+                              StringSet<> &CalledFunctions,
+                              SmallVector<StringRef, 64> &Worklist) {
+  // We need to suffix internal function calls imported from other modules;
+  // prepare the suffix ahead of time.
+  std::string Suffix;
+  if (F.getParent() != &DestModule)
+    Suffix =
+        (Twine(".llvm.") +
+         Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str();
+
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (isa<CallInst>(I)) {
+        auto CalledFunction = cast<CallInst>(I).getCalledFunction();
+        // Insert any new external calls that have not already been
+        // added to the set/worklist.
+        if (!CalledFunction || !CalledFunction->hasName())
+          continue;
+        // Ignore intrinsics early.
+        if (CalledFunction->isIntrinsic()) {
+          assert(CalledFunction->getIntrinsicID() != 0);
+          continue;
+        }
+        auto ImportedName = CalledFunction->getName();
+        auto Renamed = (ImportedName + Suffix).str();
+        // Rename internal functions.
+        if (CalledFunction->hasInternalLinkage()) {
+          ImportedName = Renamed;
+        }
+        auto It = CalledFunctions.insert(ImportedName);
+        if (!It.second) {
+          // This is a call to a function we already considered, skip.
+          continue;
+        }
+        // Ignore functions already present in the destination module.
+        auto *SrcGV = DestModule.getNamedValue(ImportedName);
+        if (SrcGV) {
+          if (GlobalAlias *SGA = dyn_cast<GlobalAlias>(SrcGV))
+            SrcGV = SGA->getBaseObject();
+          assert(isa<Function>(SrcGV) && "Name collision during import");
+          if (!cast<Function>(SrcGV)->isDeclaration()) {
+            DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring "
+                         << ImportedName << " already in DestinationModule\n");
+            continue;
+          }
+        }
+
+        Worklist.push_back(It.first->getKey());
+        DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                     << ": Adding callee for : " << ImportedName << " : "
+                     << F.getName() << "\n");
+      }
+    }
+  }
+}
+
+// Helper function: given a worklist and an index, processes the entire
+// worklist and decides what to import based on the summary information.
+//
+// Nothing is actually imported; functions are materialized in their source
+// module and analyzed there.
+//
+// \p ModuleToFunctionsToImportMap is filled with the set of Functions to
+// import per Module.
+static void GetImportList(Module &DestModule,
+                          SmallVector<StringRef, 64> &Worklist,
+                          StringSet<> &CalledFunctions,
+                          std::map<StringRef, DenseSet<const GlobalValue *>>
+                              &ModuleToFunctionsToImportMap,
+                          const FunctionInfoIndex &Index,
+                          ModuleLazyLoaderCache &ModuleLoaderCache) {
+  while (!Worklist.empty()) {
+    auto CalledFunctionName = Worklist.pop_back_val();
+    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for "
+                 << CalledFunctionName << "\n");
+
+    // Try to get a summary for this function call.
+    auto InfoList = Index.findFunctionInfoList(CalledFunctionName);
+    if (InfoList == Index.end()) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for "
+                   << CalledFunctionName << " Ignoring.\n");
+      continue;
+    }
+    assert(!InfoList->second.empty() && "No summary, error at import?");
+
+    // A comdat can have multiple entries. FIXME: what do we do with them?
+    auto &Info = InfoList->second[0];
+    assert(Info && "Nullptr in list, error importing summaries?\n");
+
+    auto *Summary = Info->functionSummary();
+    if (!Summary) {
+      // FIXME: in case we are lazy-loading summaries, we can do it now.
+      DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                   << ": Missing summary for " << CalledFunctionName
+                   << ", error at import?\n");
+      llvm_unreachable("Missing summary");
+    }
+
+    if (Summary->instCount() > ImportInstrLimit) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of "
+                   << CalledFunctionName << " with " << Summary->instCount()
+                   << " instructions (limit " << ImportInstrLimit << ")\n");
+      continue;
+    }
+
+    // Get the module path from the summary.
+    auto ModuleIdentifier = Summary->modulePath();
+    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing "
+                 << CalledFunctionName << " from " << ModuleIdentifier << "\n");
+
+    auto &SrcModule = ModuleLoaderCache(ModuleIdentifier);
+
+    // The function that we will import!
+    GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName);
+
+    if (!SGV) {
+      // The destination module references functions by their renamed names
+      // when importing a function that was originally local in the source
+      // module. The source module we have might not have been renamed, so we
+      // try to remove the suffix added during the renaming to recover the
+      // original name in the source module.
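A concrete, hypothetical instance of the renaming scheme described above:

// An internal "foo" promoted out of a module whose id is 42 is referenced
// as "foo.llvm.42" by importers; on a lookup miss the suffix is split back
// off to find the original symbol in the source module.
StringRef Renamed("foo.llvm.42");
std::pair<StringRef, StringRef> Split = Renamed.split(".llvm.");
// Split.first == "foo", Split.second == "42"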
+      std::pair<StringRef, StringRef> Split =
+          CalledFunctionName.split(".llvm.");
+      SGV = SrcModule.getNamedValue(Split.first);
+      assert(SGV && "Can't find function to import in source module");
+    }
+    if (!SGV) {
+      report_fatal_error(Twine("Can't load function '") + CalledFunctionName +
+                         "' in Module '" + SrcModule.getModuleIdentifier() +
+                         "', error in the summary?\n");
+    }
+
+    Function *F = dyn_cast<Function>(SGV);
+    if (!F && isa<GlobalAlias>(SGV)) {
+      auto *SGA = dyn_cast<GlobalAlias>(SGV);
+      F = dyn_cast<Function>(SGA->getBaseObject());
+      CalledFunctionName = F->getName();
+    }
+    assert(F && "Imported Function is ... not a Function");
+
+    // We cannot import weak_any functions/aliases without possibly affecting
+    // the order they are seen and selected by the linker, changing program
+    // semantics.
+    if (SGV->hasWeakAnyLinkage()) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                   << ": Ignoring import request for weak-any "
+                   << (isa<Function>(SGV) ? "function " : "alias ")
+                   << CalledFunctionName << " from "
+                   << SrcModule.getModuleIdentifier() << "\n");
+      continue;
+    }
+
+    // Add the function to the import list.
+    auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()];
+    Entry.insert(F);
+
+    // Process the newly imported function and add its callees to the worklist.
+    F->materialize();
+    findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist);
+  }
+}
+
+// Automatically import functions in Module \p DestModule based on the
+// summaries index.
+//
+// The current implementation imports every called function that exists in the
+// summaries index.
+bool FunctionImporter::importFunctions(Module &DestModule) {
+  DEBUG(dbgs() << "Starting import for Module "
+               << DestModule.getModuleIdentifier() << "\n");
+  unsigned ImportedCount = 0;
+
+  /// First step is collecting the called external functions.
+  StringSet<> CalledFunctions;
+  SmallVector<StringRef, 64> Worklist;
+  for (auto &F : DestModule) {
+    if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone))
+      continue;
+    findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist);
+  }
+  if (Worklist.empty())
+    return false;
+
+  /// Second step: for every call to an external function, try to import it.
+
+  // Linker that will be used for importing functions.
+  Linker TheLinker(DestModule);
+
+  // Map of Module -> List of Functions to import from that Module.
+  std::map<StringRef, DenseSet<const GlobalValue *>>
+      ModuleToFunctionsToImportMap;
+
+  // Analyze the summaries and get the list of functions to import by
+  // populating ModuleToFunctionsToImportMap.
+  ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader);
+  GetImportList(DestModule, Worklist, CalledFunctions,
+                ModuleToFunctionsToImportMap, Index, ModuleLoaderCache);
+  assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList");
+
+  StringMap<std::unique_ptr<DenseMap<unsigned, MDNode *>>>
+      ModuleToTempMDValsMap;
+
+  // Do the actual import of functions now, one Module at a time.
+  for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) {
+    // Get the module for the import.
+    auto &FunctionsToImport = FunctionsToImportPerModule.second;
+    std::unique_ptr<Module> SrcModule =
+        ModuleLoaderCache.takeModule(FunctionsToImportPerModule.first);
+    assert(&DestModule.getContext() == &SrcModule->getContext() &&
+           "Context mismatch");
+
+    // Save the mapping of value ids to temporary metadata created when
+    // importing this function.
If we have already imported from this module, + // add new temporary metadata to the existing mapping. + auto &TempMDVals = ModuleToTempMDValsMap[SrcModule->getModuleIdentifier()]; + if (!TempMDVals) + TempMDVals = llvm::make_unique<DenseMap<unsigned, MDNode *>>(); + + // Link in the specified functions. + if (TheLinker.linkInModule(std::move(SrcModule), Linker::Flags::None, + &Index, &FunctionsToImport, TempMDVals.get())) + report_fatal_error("Function Import: link error"); + + ImportedCount += FunctionsToImport.size(); + } + + // Now link in metadata for all modules from which we imported functions. + for (StringMapEntry<std::unique_ptr<DenseMap<unsigned, MDNode *>>> &SME : + ModuleToTempMDValsMap) { + // Load the specified source module. + auto &SrcModule = ModuleLoaderCache(SME.getKey()); + // The modules were created with lazy metadata loading. Materialize it + // now, before linking it. + SrcModule.materializeMetadata(); + UpgradeDebugInfo(SrcModule); + + // Link in all necessary metadata from this module. + if (TheLinker.linkInMetadata(SrcModule, SME.getValue().get())) + return false; + } + + DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " + << DestModule.getModuleIdentifier() << "\n"); + return ImportedCount; +} + +/// Summary file to use for function importing when using -function-import from +/// the command line. +static cl::opt<std::string> + SummaryFile("summary-file", + cl::desc("The summary file to use for function importing.")); + +static void diagnosticHandler(const DiagnosticInfo &DI) { + raw_ostream &OS = errs(); + DiagnosticPrinterRawOStream DP(OS); + DI.print(DP); + OS << '\n'; +} + +/// Parse the function index out of an IR file and return the function +/// index object if found, or nullptr if not. +static std::unique_ptr<FunctionInfoIndex> +getFunctionIndexForFile(StringRef Path, std::string &Error, + DiagnosticHandlerFunction DiagnosticHandler) { + std::unique_ptr<MemoryBuffer> Buffer; + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFile(Path); + if (std::error_code EC = BufferOrErr.getError()) { + Error = EC.message(); + return nullptr; + } + Buffer = std::move(BufferOrErr.get()); + ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr = + object::FunctionIndexObjectFile::create(Buffer->getMemBufferRef(), + DiagnosticHandler); + if (std::error_code EC = ObjOrErr.getError()) { + Error = EC.message(); + return nullptr; + } + return (*ObjOrErr)->takeIndex(); +} + +namespace { +/// Pass that performs cross-module function import provided a summary file. +class FunctionImportPass : public ModulePass { + /// Optional function summary index to use for importing, otherwise + /// the summary-file option must be specified. 
+ const FunctionInfoIndex *Index; + +public: + /// Pass identification, replacement for typeid + static char ID; + + /// Specify pass name for debug output + const char *getPassName() const override { + return "Function Importing"; + } + + explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr) + : ModulePass(ID), Index(Index) {} + + bool runOnModule(Module &M) override { + if (SummaryFile.empty() && !Index) + report_fatal_error("error: -function-import requires -summary-file or " + "file from frontend\n"); + std::unique_ptr<FunctionInfoIndex> IndexPtr; + if (!SummaryFile.empty()) { + if (Index) + report_fatal_error("error: -summary-file and index from frontend\n"); + std::string Error; + IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); + if (!IndexPtr) { + errs() << "Error loading file '" << SummaryFile << "': " << Error + << "\n"; + return false; + } + Index = IndexPtr.get(); + } + + // First we need to promote to global scope and rename any local values that + // are potentially exported to other modules. + if (renameModuleForThinLTO(M, Index)) { + errs() << "Error renaming module\n"; + return false; + } + + // Perform the import now. + auto ModuleLoader = [&M](StringRef Identifier) { + return loadFile(Identifier, M.getContext()); + }; + FunctionImporter Importer(*Index, ModuleLoader); + return Importer.importFunctions(M); + } +}; +} // anonymous namespace + +char FunctionImportPass::ID = 0; +INITIALIZE_PASS_BEGIN(FunctionImportPass, "function-import", + "Summary Based Function Import", false, false) +INITIALIZE_PASS_END(FunctionImportPass, "function-import", + "Summary Based Function Import", false, false) + +namespace llvm { +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) { + return new FunctionImportPass(Index); +} +} diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 61d0ff9..9b276ed 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -92,33 +92,28 @@ bool GlobalDCE::runOnModule(Module &M) { ComdatMembers.insert(std::make_pair(C, &GA)); // Loop over the module, adding globals which are obviously necessary. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (Function &F : M) { + Changed |= RemoveUnusedGlobalValue(F); // Functions with external linkage are needed if they have a body - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!F.isDeclaration() && !F.hasAvailableExternallyLinkage()) + if (!F.isDiscardableIfUnused()) + GlobalIsNeeded(&F); } - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalVariable &GV : M.globals()) { + Changed |= RemoveUnusedGlobalValue(GV); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!GV.isDeclaration() && !GV.hasAvailableExternallyLinkage()) + if (!GV.isDiscardableIfUnused()) + GlobalIsNeeded(&GV); } - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalAlias &GA : M.aliases()) { + Changed |= RemoveUnusedGlobalValue(GA); // Externally visible aliases are needed. 
- if (!I->isDiscardableIfUnused()) { - GlobalIsNeeded(I); - } + if (!GA.isDiscardableIfUnused()) + GlobalIsNeeded(&GA); } // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -126,52 +121,50 @@ bool GlobalDCE::runOnModule(Module &M) { // // The first pass is to drop initializers of global variables which are dead. - std::vector<GlobalVariable*> DeadGlobalVars; // Keep track of dead globals - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadGlobalVars.push_back(I); // Keep track of dead globals - if (I->hasInitializer()) { - Constant *Init = I->getInitializer(); - I->setInitializer(nullptr); + std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals + for (GlobalVariable &GV : M.globals()) + if (!AliveGlobals.count(&GV)) { + DeadGlobalVars.push_back(&GV); // Keep track of dead globals + if (GV.hasInitializer()) { + Constant *Init = GV.getInitializer(); + GV.setInitializer(nullptr); if (isSafeToDestroyConstant(Init)) Init->destroyConstant(); } } // The second pass drops the bodies of functions which are dead... - std::vector<Function*> DeadFunctions; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadFunctions.push_back(I); // Keep track of dead globals - if (!I->isDeclaration()) - I->deleteBody(); + std::vector<Function *> DeadFunctions; + for (Function &F : M) + if (!AliveGlobals.count(&F)) { + DeadFunctions.push_back(&F); // Keep track of dead globals + if (!F.isDeclaration()) + F.deleteBody(); } // The third pass drops targets of aliases which are dead... std::vector<GlobalAlias*> DeadAliases; - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; - ++I) - if (!AliveGlobals.count(I)) { - DeadAliases.push_back(I); - I->setAliasee(nullptr); + for (GlobalAlias &GA : M.aliases()) + if (!AliveGlobals.count(&GA)) { + DeadAliases.push_back(&GA); + GA.setAliasee(nullptr); } if (!DeadFunctions.empty()) { // Now that all interferences have been dropped, delete the actual objects // themselves. - for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadFunctions[i]); - M.getFunctionList().erase(DeadFunctions[i]); + for (Function *F : DeadFunctions) { + RemoveUnusedGlobalValue(*F); + M.getFunctionList().erase(F); } NumFunctions += DeadFunctions.size(); Changed = true; } if (!DeadGlobalVars.empty()) { - for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadGlobalVars[i]); - M.getGlobalList().erase(DeadGlobalVars[i]); + for (GlobalVariable *GV : DeadGlobalVars) { + RemoveUnusedGlobalValue(*GV); + M.getGlobalList().erase(GV); } NumVariables += DeadGlobalVars.size(); Changed = true; @@ -179,9 +172,9 @@ bool GlobalDCE::runOnModule(Module &M) { // Now delete any dead aliases. if (!DeadAliases.empty()) { - for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadAliases[i]); - M.getAliasList().erase(DeadAliases[i]); + for (GlobalAlias *GA : DeadAliases) { + RemoveUnusedGlobalValue(*GA); + M.getAliasList().erase(GA); } NumAliases += DeadAliases.size(); Changed = true; @@ -222,21 +215,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { // any globals used will be marked as needed. 
Function *F = cast<Function>(G); - if (F->hasPrefixData()) - MarkUsedGlobalsAsNeeded(F->getPrefixData()); - - if (F->hasPrologueData()) - MarkUsedGlobalsAsNeeded(F->getPrologueData()); + for (Use &U : F->operands()) + MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); - if (F->hasPersonalityFn()) - MarkUsedGlobalsAsNeeded(F->getPersonalityFn()); - - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) - if (GlobalValue *GV = dyn_cast<GlobalValue>(*U)) + for (BasicBlock &BB : *F) + for (Instruction &I : BB) + for (Use &U : I.operands()) + if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) GlobalIsNeeded(GV); - else if (Constant *C = dyn_cast<Constant>(*U)) + else if (Constant *C = dyn_cast<Constant>(U)) MarkUsedGlobalsAsNeeded(C); } } @@ -247,9 +234,9 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // Loop over all of the operands of the constant, adding any globals they // use to the list of needed globals. - for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) { + for (Use &U : C->operands()) { // If we've already processed this constant there's no need to do it again. - Constant *Op = dyn_cast<Constant>(*I); + Constant *Op = dyn_cast<Constant>(U); if (Op && SeenConstants.insert(Op).second) MarkUsedGlobalsAsNeeded(Op); } @@ -262,7 +249,8 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // might make it deader. // bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) { - if (GV.use_empty()) return false; + if (GV.use_empty()) + return false; GV.removeDeadConstantUsers(); return GV.use_empty(); } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 5ffe15d..fd77369 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -54,7 +55,6 @@ STATISTIC(NumSRA , "Number of aggregate globals broken into scalars"); STATISTIC(NumHeapSRA , "Number of heap objects SRA'd"); STATISTIC(NumSubstitute,"Number of globals with initializers stored into them"); STATISTIC(NumDeleted , "Number of globals deleted"); -STATISTIC(NumFnDeleted , "Number of functions deleted"); STATISTIC(NumGlobUses , "Number of global uses devirtualized"); STATISTIC(NumLocalized , "Number of globals localized"); STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans"); @@ -69,6 +69,7 @@ namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -81,11 +82,14 @@ namespace { bool OptimizeFunctions(Module &M); bool OptimizeGlobalVars(Module &M); bool OptimizeGlobalAliases(Module &M); - bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); - bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, - const GlobalStatus &GS); + bool deleteIfDead(GlobalValue &GV); + bool processGlobal(GlobalValue &GV); + bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS); bool 
OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); + bool isPointerValueDeadOnEntryToFunction(const Function *F, + GlobalValue *GV); + TargetLibraryInfo *TLI; SmallSet<const Comdat *, 8> NotDiscardableComdats; }; @@ -95,13 +99,14 @@ char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } -/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker -/// as a root? If so, we might not really want to eliminate the stores to it. +/// Is this global variable possibly used by a leak checker as a root? If so, +/// we might not really want to eliminate the stores to it. static bool isLeakCheckerRoot(GlobalVariable *GV) { // A global variable is a root if it is a pointer, or could plausibly contain // a pointer. There are two challenges; one is that we could have a struct @@ -176,10 +181,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { } while (1); } -/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users -/// of the global and clean up any that obviously don't assign the global a -/// value that isn't dynamically allocated. -/// +/// This GV is a pointer root. Loop over all users of the global and clean up +/// any that obviously don't assign the global a value that isn't dynamically +/// allocated. static bool CleanupPointerRootUsers(GlobalVariable *GV, const TargetLibraryInfo *TLI) { // A brief explanation of leak checkers. The goal is to find bugs where @@ -263,10 +267,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, return Changed; } -/// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all -/// users of the global, cleaning up the obvious ones. This is largely just a -/// quick scan over the use list to clean up the easy and obvious cruft. This -/// returns true if it made a change. +/// We just marked GV constant. Loop over all users of the global, cleaning up +/// the obvious ones. This is largely just a quick scan over the use list to +/// clean up the easy and obvious cruft. This returns true if it made a change. static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, const DataLayout &DL, TargetLibraryInfo *TLI) { @@ -353,8 +356,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, return Changed; } -/// isSafeSROAElementUse - Return true if the specified instruction is a safe -/// user of a derived expression from a global that we want to SROA. +/// Return true if the specified instruction is a safe user of a derived +/// expression from a global that we want to SROA. static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast<Constant>(V)) @@ -385,9 +388,8 @@ static bool isSafeSROAElementUse(Value *V) { } -/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value. -/// Look at it and its uses and decide whether it is safe to SROA this global. -/// +/// U is a direct user of the specified global value. Look at it and its uses +/// and decide whether it is safe to SROA this global. static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { // The user of the global must be a GEP Inst or a ConstantExpr GEP. 
if (!isa<GetElementPtrInst>(U) && @@ -452,9 +454,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { return true; } -/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it -/// is safe for us to perform this transformation. -/// +/// Look at all uses of the global and decide whether it is safe for us to +/// perform this transformation. static bool GlobalUsersSafeToSRA(GlobalValue *GV) { for (User *U : GV->users()) if (!IsUserOfGlobalSafeForSRA(U, GV)) @@ -464,10 +465,10 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { } -/// SRAGlobal - Perform scalar replacement of aggregates on the specified global -/// variable. This opens the door for other optimizations by exposing the -/// behavior of the program in a more fine-grained way. We have determined that -/// this transformation is safe already. We return the first global variable we +/// Perform scalar replacement of aggregates on the specified global variable. +/// This opens the door for other optimizations by exposing the behavior of the +/// program in a more fine-grained way. We have determined that this +/// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. @@ -497,7 +498,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - Globals.insert(GV, NGV); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. If the original aggregate @@ -530,7 +532,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - Globals.insert(GV, NGV); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. If the original aggregate @@ -545,7 +548,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { if (NewGlobals.empty()) return nullptr; - DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV); + DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -610,9 +613,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; } -/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified -/// value will trap if the value is dynamically null. PHIs keeps track of any -/// phi nodes we've seen to avoid reprocessing them. +/// Return true if all users of the specified value will trap if the value is +/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid +/// reprocessing them. static bool AllUsesOfValueWillTrapIfNull(const Value *V, SmallPtrSetImpl<const PHINode*> &PHIs) { for (const User *U : V->users()) @@ -653,9 +656,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, return true; } -/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads -/// from GV will trap if the loaded value is null. 
Note that this also permits -/// comparisons of the loaded value against null, as a special case. +/// Return true if all uses of any loads from GV will trap if the loaded value +/// is null. Note that this also permits comparisons of the loaded value +/// against null, as a special case. static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { for (const User *U : GV->users()) if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { @@ -735,10 +738,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { } -/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null -/// value stored into it. If there are uses of the loaded value that would trap -/// if the loaded value is dynamically null, then we know that they cannot be -/// reachable with a null optimize away the load. +/// The specified global has only one non-null value stored into it. If there +/// are uses of the loaded value that would trap if the loaded value is +/// dynamically null, then we know that they cannot be reachable with a null, +/// so we can optimize away the load. static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, const DataLayout &DL, TargetLibraryInfo *TLI) { @@ -778,7 +781,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, } if (Changed) { - DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV); + DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n"); ++NumGlobUses; } @@ -801,8 +804,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, return Changed; } -/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the -/// instructions that are foldable. +/// Walk the use list of V, constant folding all of the instructions that are +/// foldable. static void ConstantPropUsersOf(Value *V, const DataLayout &DL, TargetLibraryInfo *TLI) { for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; ) @@ -818,11 +821,11 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, } } -/// OptimizeGlobalAddressOfMalloc - This function takes the specified global -/// variable, and transforms the program as if it always contained the result of -/// the specified malloc. Because it is always the result of the specified -/// malloc, there is no reason to actually DO the malloc. Instead, turn the -/// malloc into a global, and any loads of GV as uses of the new global. +/// This function takes the specified global variable, and transforms the +/// program as if it always contained the result of the specified malloc. +/// Because it is always the result of the specified malloc, there is no reason +/// to actually DO the malloc. Instead, turn the malloc into a global, and any +/// loads of GV as uses of the new global. static GlobalVariable * OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, ConstantInt *NElements, const DataLayout &DL, @@ -838,13 +841,10 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Create the new global variable. The contents of the malloc'd memory is // undefined, so initialize with an undef value.
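For a concrete picture of the code this transformation targets, consider the following hypothetical source-level example (the pass itself operates on IR, and the small-size limit is enforced by the caller later in this file):

    #include <cstdlib>

    // An internal global stored to exactly once, with the result of a
    // malloc of statically known small size.
    static int *Buf;

    void init() { Buf = static_cast<int *>(std::malloc(16 * sizeof(int))); }
    int get(unsigned i) { return Buf[i]; }

    // GlobalOpt can replace Buf with a 16-element global array "Buf.body",
    // delete the malloc entirely, and rewrite loads of Buf into uses of
    // the new global, tracking "has init run" in a bool only if the
    // pointer is ever tested against null.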
- GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(), - GlobalType, false, - GlobalValue::InternalLinkage, - UndefValue::get(GlobalType), - GV->getName()+".body", - GV, - GV->getThreadLocalMode()); + GlobalVariable *NewGV = new GlobalVariable( + *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage, + UndefValue::get(GlobalType), GV->getName() + ".body", nullptr, + GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update @@ -935,7 +935,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, cast<StoreInst>(InitBool->user_back())->eraseFromParent(); delete InitBool; } else - GV->getParent()->getGlobalList().insert(GV, InitBool); + GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool); // Now the GV is dead, nuke it and the malloc.. GV->eraseFromParent(); @@ -951,10 +951,9 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, return NewGV; } -/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking -/// to make sure that there are no complex uses of V. We permit simple things -/// like dereferencing the pointer, but not storing through the address, unless -/// it is to the specified global. +/// Scan the use-list of V checking to make sure that there are no complex uses +/// of V. We permit simple things like dereferencing the pointer, but not +/// storing through the address, unless it is to the specified global. static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, const GlobalVariable *GV, SmallPtrSetImpl<const PHINode*> &PHIs) { @@ -998,10 +997,9 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, return true; } -/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV -/// somewhere. Transform all uses of the allocation into loads from the -/// global and uses of the resultant pointer. Further, delete the store into -/// GV. This assumes that these value pass the +/// The Alloc pointer is stored into GV somewhere. Transform all uses of the +/// allocation into loads from the global and uses of the resultant pointer. +/// Further, delete the store into GV. This assumes that these values pass the /// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate. static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, GlobalVariable *GV) { @@ -1043,9 +1041,9 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, } } -/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi -/// of a load) are simple enough to perform heap SRA on. This permits GEP's -/// that index through the array and struct field, icmps of null, and PHIs. +/// Verify that all uses of V (a load, or a phi of a load) are simple enough to +/// perform heap SRA on. This permits GEP's that index through the array and +/// struct field, icmps of null, and PHIs. static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs, SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) { @@ -1096,8 +1094,8 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, } -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from -/// GV are simple enough to perform HeapSRA, return true. +/// If all users of values loaded from GV are simple enough to perform HeapSRA, +/// return true.
static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { SmallPtrSet<const PHINode*, 32> LoadUsingPHIs; @@ -1186,8 +1184,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, return FieldVals[FieldNo] = Result; } -/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from -/// the load, rewrite the derived value to use the HeapSRoA'd load. +/// Given a load instruction and a value derived from the load, rewrite the +/// derived value to use the HeapSRoA'd load. static void RewriteHeapSROALoadUser(Instruction *LoadUser, DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1248,10 +1246,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, } } -/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global. Ptr -/// is a value loaded from the global. Eliminate all uses of Ptr, making them -/// use FieldGlobals instead. All uses of loaded values satisfy -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA. +/// We are performing Heap SRoA on a global. Ptr is a value loaded from the +/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead. +/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA. static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1266,8 +1263,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, } } -/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break -/// it up into multiple allocations of arrays of the fields. +/// CI is an allocation of an array of structures. Break it up into multiple +/// allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Value *NElems, const DataLayout &DL, const TargetLibraryInfo *TLI) { @@ -1291,12 +1288,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Type *FieldTy = STy->getElementType(FieldNo); PointerType *PFieldTy = PointerType::get(FieldTy, AS); - GlobalVariable *NGV = - new GlobalVariable(*GV->getParent(), - PFieldTy, false, GlobalValue::InternalLinkage, - Constant::getNullValue(PFieldTy), - GV->getName() + ".f" + Twine(FieldNo), GV, - GV->getThreadLocalMode()); + GlobalVariable *NGV = new GlobalVariable( + *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage, + Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), + nullptr, GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = DL.getTypeAllocSize(FieldTy); @@ -1336,7 +1331,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // Split the basic block at the old malloc. BasicBlock *OrigBB = CI->getParent(); - BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont"); + BasicBlock *ContBB = + OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont"); // Create the block to check the first condition. Put all these blocks at the // end of the function as they are unlikely to be executed. @@ -1376,9 +1372,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // CI is no longer needed, remove it. CI->eraseFromParent(); - /// InsertedScalarizedLoads - As we process loads, if we can't immediately - /// update all uses of the load, keep track of what scalarized loads are - /// inserted for a given load. 
+ /// As we process loads, if we can't immediately update all uses of the load, + /// keep track of what scalarized loads are inserted for a given load. DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues; InsertedScalarizedValues[GV] = FieldGlobals; @@ -1454,13 +1449,11 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, return cast<GlobalVariable>(FieldGlobals[0]); } -/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a -/// pointer global variable with a single value stored it that is a malloc or -/// cast of malloc. -static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, +/// This function is called when we see a pointer global variable with a single +/// value stored into it that is a malloc or cast of malloc. +static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, Type *AllocTy, AtomicOrdering Ordering, - Module::global_iterator &GVI, const DataLayout &DL, TargetLibraryInfo *TLI) { // If this is a malloc of an abstract type, don't touch it. @@ -1499,7 +1492,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // (2048 bytes currently), as we don't want to introduce a 16M global or // something. if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { - GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); + OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); return true; } @@ -1544,19 +1537,18 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, CI = cast<CallInst>(Malloc); } - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), - DL, TLI); + PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL, + TLI); return true; } return false; } -// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge -// that only one value (besides its initializer) is ever stored to the global. -static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, +// Try to optimize globals based on the knowledge that only one value (besides +// its initializer) is ever stored to the global. +static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, - Module::global_iterator &GVI, const DataLayout &DL, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. @@ -1577,9 +1569,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return true; } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { Type *MallocType = getMallocAllocatedType(CI, TLI); - if (MallocType && - TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, - DL, TLI)) + if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, + Ordering, DL, TLI)) return true; } } @@ -1587,10 +1578,10 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return false; } -/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only -/// two values ever stored into GV are its initializer and OtherVal. See if we -/// can shrink the global into a boolean and select between the two values -/// whenever it is used. This exposes the values to other scalar optimizations. +/// At this point, we have learned that the only two values ever stored into GV +/// are its initializer and OtherVal. See if we can shrink the global into a +/// boolean and select between the two values whenever it is used.
This exposes +/// the values to other scalar optimizations. static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { Type *GVElType = GV->getType()->getElementType(); @@ -1610,7 +1601,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; - DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV); + DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); // Create the new global, initializing it to false. GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()), @@ -1620,7 +1611,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GV->getName()+".b", GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - GV->getParent()->getGlobalList().insert(GV, NewGV); + GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV); Constant *InitVal = GV->getInitializer(); assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && @@ -1688,61 +1679,213 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { return true; } +bool GlobalOpt::deleteIfDead(GlobalValue &GV) { + GV.removeDeadConstantUsers(); -/// ProcessGlobal - Analyze the specified global variable and optimize it if -/// possible. If we make a change, return true. -bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, - Module::global_iterator &GVI) { - // Do more involved optimizations if the global is internal. - GV->removeDeadConstantUsers(); + if (!GV.isDiscardableIfUnused()) + return false; - if (GV->use_empty()) { - DEBUG(dbgs() << "GLOBAL DEAD: " << *GV); - GV->eraseFromParent(); - ++NumDeleted; - return true; - } + if (const Comdat *C = GV.getComdat()) + if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C)) + return false; - if (!GV->hasLocalLinkage()) + bool Dead; + if (auto *F = dyn_cast<Function>(&GV)) + Dead = F->isDefTriviallyDead(); + else + Dead = GV.use_empty(); + if (!Dead) + return false; + + DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n"); + GV.eraseFromParent(); + ++NumDeleted; + return true; +} + +/// Analyze the specified global variable and optimize it if possible. If we +/// make a change, return true. +bool GlobalOpt::processGlobal(GlobalValue &GV) { + // Do more involved optimizations if the global is internal. + if (!GV.hasLocalLinkage()) return false; GlobalStatus GS; - if (GlobalStatus::analyzeGlobal(GV, GS)) + if (GlobalStatus::analyzeGlobal(&GV, GS)) return false; - if (!GS.IsCompared && !GV->hasUnnamedAddr()) { - GV->setUnnamedAddr(true); + bool Changed = false; + if (!GS.IsCompared && !GV.hasUnnamedAddr()) { + GV.setUnnamedAddr(true); NumUnnamed++; + Changed = true; } - if (GV->isConstant() || !GV->hasInitializer()) + auto *GVar = dyn_cast<GlobalVariable>(&GV); + if (!GVar) + return Changed; + + if (GVar->isConstant() || !GVar->hasInitializer()) + return Changed; + + return processInternalGlobal(GVar, GS) || Changed; +} + +bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { + // Find all uses of GV. We expect them all to be in F, and if we can't + // identify any of the uses we bail out. + // + // On each of these uses, identify if the memory that GV points to is + // used/required/live at the start of the function. If it is not, for example + // if the first thing the function does is store to the GV, the GV can + // possibly be demoted. 
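For example, a global of the following shape satisfies this condition (hypothetical source; per the caller below, the single accessing function must also be known non-recursive):

    // 'scratch' is accessed only from compute(), and is written before it
    // is ever read, so its value on entry to compute() is dead and the
    // global can be demoted to a local alloca.
    static int scratch;

    int compute(int x) {
      scratch = x * 2;    // this store dominates...
      return scratch + 1; // ...this load, and stores at least as many bits
    }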
+ // + // We don't do an exhaustive search for memory operations - simply look + // through bitcasts as they're quite common and benign. + const DataLayout &DL = GV->getParent()->getDataLayout(); + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + for (auto *U : GV->users()) { + if (Operator::getOpcode(U) == Instruction::BitCast) { + for (auto *UU : U->users()) { + if (auto *LI = dyn_cast<LoadInst>(UU)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(UU)) + Stores.push_back(SI); + else + return false; + } + continue; + } + + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + return false; + assert(I->getParent()->getParent() == F); + + if (auto *LI = dyn_cast<LoadInst>(I)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(I)) + Stores.push_back(SI); + else + return false; + } + + // We have identified all uses of GV into loads and stores. Now check if all + // of them are known not to depend on the value of the global at the function + // entry point. We do this by ensuring that every load is dominated by at + // least one store. + auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F)) + .getDomTree(); + + // The below check is quadratic. Check we're not going to do too many tests. + // FIXME: Even though this will always have worst-case quadratic time, we + // could put effort into minimizing the average time by putting stores that + // have been shown to dominate at least one load at the beginning of the + // Stores array, making subsequent dominance checks more likely to succeed + // early. + // + // The threshold here is fairly large because global->local demotion is a + // very powerful optimization should it fire. + const unsigned Threshold = 100; + if (Loads.size() * Stores.size() > Threshold) return false; - return ProcessInternalGlobal(GV, GVI, GS); + for (auto *L : Loads) { + auto *LTy = L->getType(); + if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { + auto *STy = S->getValueOperand()->getType(); + // The load is only dominated by the store if DomTree says so + // and the number of bits loaded in L is less than or equal to + // the number of bits stored in S. + return DT.dominates(S, L) && + DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); + })) + return false; + } + // All loads have known dependences inside F, so the global can be localized. + return true; +} + +/// C may have non-instruction users. Can all of those users be turned into +/// instructions? +static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) { + // We don't do this exhaustively. The most common pattern that we really need + // to care about is a constant GEP or constant bitcast - so just looking + // through one single ConstantExpr. + // + // The set of constants that this function returns true for must be able to be + // handled by makeAllConstantUsesInstructions. + for (auto *U : C->users()) { + if (isa<Instruction>(U)) + continue; + if (!isa<ConstantExpr>(U)) + // Non instruction, non-constantexpr user; cannot convert this. + return false; + for (auto *UU : U->users()) + if (!isa<Instruction>(UU)) + // A constantexpr used by another constant. We don't try and recurse any + // further but just bail out at this point. + return false; + } + + return true; +} + +/// C may have non-instruction users, and +/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the +/// non-instruction users to instructions. 
+static void makeAllConstantUsesInstructions(Constant *C) { + SmallVector<ConstantExpr*,4> Users; + for (auto *U : C->users()) { + if (isa<ConstantExpr>(U)) + Users.push_back(cast<ConstantExpr>(U)); + else + // We should never get here; allNonInstructionUsersCanBeMadeInstructions + // should not have returned true for C. + assert( + isa<Instruction>(U) && + "Can't transform non-constantexpr non-instruction to instruction!"); + } + + SmallVector<Value*,4> UUsers; + for (auto *U : Users) { + UUsers.clear(); + for (auto *UU : U->users()) + UUsers.push_back(UU); + for (auto *UU : UUsers) { + Instruction *UI = cast<Instruction>(UU); + Instruction *NewU = U->getAsInstruction(); + NewU->insertBefore(UI); + UI->replaceUsesOfWith(U, NewU); + } + U->dropAllReferences(); + } } -/// ProcessInternalGlobal - Analyze the specified global variable and optimize +/// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. -bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, - Module::global_iterator &GVI, +bool GlobalOpt::processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS) { auto &DL = GV->getParent()->getDataLayout(); - // If this is a first class global and has only one accessing function - // and this function is main (which we know is not recursive), we replace - // the global with a local alloca in this function. + // If this is a first class global and has only one accessing function and + // this function is non-recursive, we replace the global with a local alloca + // in this function. // // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. // // If the global is in different address space, don't bring it to stack. if (!GS.HasMultipleAccessingFunctions && - GS.AccessingFunction && !GS.HasNonInstructionUser && + GS.AccessingFunction && GV->getType()->getElementType()->isSingleValueType() && - GS.AccessingFunction->getName() == "main" && - GS.AccessingFunction->hasExternalLinkage() && - GV->getType()->getAddressSpace() == 0) { - DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV); + GV->getType()->getAddressSpace() == 0 && + !GV->isExternallyInitialized() && + allNonInstructionUsersCanBeMadeInstructions(GV) && + GS.AccessingFunction->doesNotRecurse() && + isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { + DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getType()->getElementType(); @@ -1752,6 +1895,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); + makeAllConstantUsesInstructions(GV); + GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); ++NumLocalized; @@ -1761,7 +1906,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // If the global is never loaded (but may be stored to), it is dead. // Delete it now. if (!GS.IsLoaded) { - DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); + DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n"); bool Changed; if (isLeakCheckerRoot(GV)) { @@ -1800,11 +1945,9 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { const DataLayout &DL = GV->getParent()->getDataLayout(); - if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { - GVI = FirstNewGV; // Don't skip the newly produced globals! 
+ if (SRAGlobal(GV, DL)) return true; - } - } else if (GS.StoredType == GlobalStatus::StoredOnce) { + } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the @@ -1822,8 +1965,6 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; - } else { - GVI = GV; } ++NumSubstitute; return true; @@ -1831,8 +1972,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI, - DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -1850,8 +1990,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return false; } -/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified -/// function, changing them to FastCC. +/// Walk all of the direct calls of the specified function, changing them to +/// FastCC. static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) { if (isa<BlockAddress>(U)) @@ -1898,38 +2038,38 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { bool Changed = false; // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { - Function *F = FI++; + Function *F = &*FI++; // Functions without names cannot be referenced outside this module. if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); - const Comdat *C = F->getComdat(); - bool inComdat = C && NotDiscardableComdats.count(C); - F->removeDeadConstantUsers(); - if ((!inComdat || F->hasLocalLinkage()) && F->isDefTriviallyDead()) { - F->eraseFromParent(); + if (deleteIfDead(*F)) { Changed = true; - ++NumFnDeleted; - } else if (F->hasLocalLinkage()) { - if (isProfitableToMakeFastCC(F) && !F->isVarArg() && - !F->hasAddressTaken()) { - // If this function has a calling convention worth changing, is not a - // varargs function, and is only called directly, promote it to use the - // Fast calling convention. - F->setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(F); - ++NumFastCallFns; - Changed = true; - } + continue; + } - if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - !F->hasAddressTaken()) { - // The function is not used by a trampoline intrinsic, so it is safe - // to remove the 'nest' attribute. - RemoveNestAttribute(F); - ++NumNestRemoved; - Changed = true; - } + Changed |= processGlobal(*F); + + if (!F->hasLocalLinkage()) + continue; + if (isProfitableToMakeFastCC(F) && !F->isVarArg() && + !F->hasAddressTaken()) { + // If this function has a calling convention worth changing, is not a + // varargs function, and is only called directly, promote it to use the + // Fast calling convention. + F->setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(F); + ++NumFastCallFns; + Changed = true; + } + + if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && + !F->hasAddressTaken()) { + // The function is not used by a trampoline intrinsic, so it is safe + // to remove the 'nest' attribute. 
+ RemoveNestAttribute(F); + ++NumNestRemoved; + Changed = true; } } return Changed; @@ -1940,7 +2080,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // Global variables without names cannot be referenced outside this module. if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); @@ -1953,12 +2093,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { GV->setInitializer(New); } - if (GV->isDiscardableIfUnused()) { - if (const Comdat *C = GV->getComdat()) - if (NotDiscardableComdats.count(C) && !GV->hasLocalLinkage()) - continue; - Changed |= ProcessGlobal(GV, GVI); + if (deleteIfDead(*GV)) { + Changed = true; + continue; } + + Changed |= processGlobal(*GV); } return Changed; } @@ -1968,8 +2108,8 @@ isSimpleEnoughValueToCommit(Constant *C, SmallPtrSetImpl<Constant *> &SimpleConstants, const DataLayout &DL); -/// isSimpleEnoughValueToCommit - Return true if the specified constant can be -/// handled by the code generator. We don't want to generate something like: +/// Return true if the specified constant can be handled by the code generator. +/// We don't want to generate something like: /// void *X = &X/42; /// because the code generator doesn't have a relocation that can handle that. /// @@ -2044,11 +2184,11 @@ isSimpleEnoughValueToCommit(Constant *C, } -/// isSimpleEnoughPointerToCommit - Return true if this constant is simple -/// enough for us to understand. In particular, if it is a cast to anything -/// other than from one pointer type to another pointer type, we punt. -/// We basically just support direct accesses to globals and GEP's of -/// globals. This should be kept up to date with CommitValueTo. +/// Return true if this constant is simple enough for us to understand. In +/// particular, if it is a cast to anything other than from one pointer type to +/// another pointer type, we punt. We basically just support direct accesses to +/// globals and GEP's of globals. This should be kept up to date with +/// CommitValueTo. static bool isSimpleEnoughPointerToCommit(Constant *C) { // Conservatively, avoid aggregate types. This is because we don't // want to worry about them partially overlapping other stores. @@ -2095,9 +2235,9 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; } -/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global -/// initializer. This returns 'Init' modified to reflect 'Val' stored into it. -/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into. +/// Evaluate a piece of a constantexpr store into a global initializer. This +/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the +/// GEP operands of Addr [0, OpNo) have been stepped into. static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, ConstantExpr *Addr, unsigned OpNo) { // Base case of the recursion. @@ -2144,7 +2284,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, return ConstantVector::get(Elts); } -/// CommitValueTo - We have decided that Addr (which satisfies the predicate +/// We have decided that Addr (which satisfies the predicate /// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. 
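The recursion in EvaluateStoreInto bottoms out by rebuilding one aggregate level with a single element swapped out. A minimal sketch of that one level, assuming the era's Constant API and ignoring ConstantAggregateZero/UndefValue aggregates (which the real code must also expand); the helper name is hypothetical:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // Return a copy of Agg with element Idx replaced by Val. One GEP index
    // of the store's address is consumed per level of recursion.
    static Constant *replaceAggregateElement(Constant *Agg, unsigned Idx,
                                             Constant *Val) {
      SmallVector<Constant *, 8> Elts;
      for (unsigned i = 0, e = Agg->getNumOperands(); i != e; ++i)
        Elts.push_back(i == Idx ? Val : cast<Constant>(Agg->getOperand(i)));
      if (auto *STy = dyn_cast<StructType>(Agg->getType()))
        return ConstantStruct::get(STy, Elts);
      return ConstantArray::get(cast<ArrayType>(Agg->getType()), Elts);
    }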
static void CommitValueTo(Constant *Val, Constant *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { @@ -2160,10 +2300,10 @@ static void CommitValueTo(Constant *Val, Constant *Addr) { namespace { -/// Evaluator - This class evaluates LLVM IR, producing the Constant -/// representing each SSA instruction. Changes to global variables are stored -/// in a mapping that can be iterated over after the evaluation is complete. -/// Once an evaluation call fails, the evaluation object should not be reused. +/// This class evaluates LLVM IR, producing the Constant representing each SSA +/// instruction. Changes to global variables are stored in a mapping that can +/// be iterated over after the evaluation is complete. Once an evaluation call +/// fails, the evaluation object should not be reused. class Evaluator { public: Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) @@ -2180,15 +2320,15 @@ public: Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); } - /// EvaluateFunction - Evaluate a call to function F, returning true if - /// successful, false if we can't evaluate it. ActualArgs contains the formal - /// arguments for the function. + /// Evaluate a call to function F, returning true if successful, false if we + /// can't evaluate it. ActualArgs contains the formal arguments for the + /// function. bool EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs); - /// EvaluateBlock - Evaluate all instructions in block BB, returning true if - /// successful, false if we can't evaluate it. NewBB returns the next BB that - /// control flows into, or null upon return. + /// Evaluate all instructions in block BB, returning true if successful, false + /// if we can't evaluate it. NewBB returns the next BB that control flows + /// into, or null upon return. bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB); Constant *getVal(Value *V) { @@ -2213,32 +2353,31 @@ public: private: Constant *ComputeLoadResult(Constant *P); - /// ValueStack - As we compute SSA register values, we store their contents - /// here. The back of the deque contains the current function and the stack - /// contains the values in the calling frames. + /// As we compute SSA register values, we store their contents here. The back + /// of the deque contains the current function and the stack contains the + /// values in the calling frames. std::deque<DenseMap<Value*, Constant*>> ValueStack; - /// CallStack - This is used to detect recursion. In pathological situations - /// we could hit exponential behavior, but at least there is nothing - /// unbounded. + /// This is used to detect recursion. In pathological situations we could hit + /// exponential behavior, but at least there is nothing unbounded. SmallVector<Function*, 4> CallStack; - /// MutatedMemory - For each store we execute, we update this map. Loads - /// check this to get the most up-to-date value. If evaluation is successful, - /// this state is committed to the process. + /// For each store we execute, we update this map. Loads check this to get + /// the most up-to-date value. If evaluation is successful, this state is + /// committed to the process. DenseMap<Constant*, Constant*> MutatedMemory; - /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable - /// to represent its body. This vector is needed so we can delete the - /// temporary globals when we are done. 
+ /// To 'execute' an alloca, we create a temporary global variable to represent + /// its body. This vector is needed so we can delete the temporary globals + /// when we are done. SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps; - /// Invariants - These global variables have been marked invariant by the - /// static constructor. + /// These global variables have been marked invariant by the static + /// constructor. SmallPtrSet<GlobalVariable*, 8> Invariants; - /// SimpleConstants - These are constants we have checked and know to be - /// simple enough to live in a static initializer of a global. + /// These are constants we have checked and know to be simple enough to live + /// in a static initializer of a global. SmallPtrSet<Constant*, 8> SimpleConstants; const DataLayout &DL; @@ -2247,9 +2386,8 @@ private: } // anonymous namespace -/// ComputeLoadResult - Return the value that would be computed by a load from -/// P after the stores reflected by 'memory' have been performed. If we can't -/// decide, return null. +/// Return the value that would be computed by a load from P after the stores +/// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P) { // If this memory location has been recently stored, use the stored value: it // is the most up-to-date. @@ -2275,9 +2413,9 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { return nullptr; // don't know how to evaluate. } -/// EvaluateBlock - Evaluate all instructions in block BB, returning true if -/// successful, false if we can't evaluate it. NewBB returns the next BB that -/// control flows into, or null upon return. +/// Evaluate all instructions in block BB, returning true if successful, false +/// if we can't evaluate it. NewBB returns the next BB that control flows into, +/// or null upon return. bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. @@ -2438,7 +2576,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { - CallSite CS(CurInst); + CallSite CS(&*CurInst); // Debug info can safely be ignored here. if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { @@ -2504,6 +2642,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // Continue even if we do nothing. ++CurInst; continue; + } else if (II->getIntrinsicID() == Intrinsic::assume) { + DEBUG(dbgs() << "Skipping assume intrinsic.\n"); + ++CurInst; + continue; } DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); @@ -2600,7 +2742,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) InstResult = ConstantFoldConstantExpression(CE, DL, TLI); - setVal(CurInst, InstResult); + setVal(&*CurInst, InstResult); } // If we just processed an invoke, we finished evaluating the block. @@ -2615,9 +2757,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, } } -/// EvaluateFunction - Evaluate a call to function F, returning true if -/// successful, false if we can't evaluate it. ActualArgs contains the formal -/// arguments for the function. +/// Evaluate a call to function F, returning true if successful, false if we +/// can't evaluate it. ActualArgs contains the formal arguments for the +/// function. 
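The payoff of this interpreter is folding dynamic initializers at compile time. A hypothetical C++ input it can handle:

    // The initializer for G requires a function call, so the front end
    // emits a static constructor. The evaluator executes square(7)
    // symbolically, commits 49 as G's initializer, and the constructor
    // is removed.
    static int square(int x) { return x * x; }
    int G = square(7);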
bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs) { // Check to see if this function is already executing (recursion). If so, @@ -2631,7 +2773,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, unsigned ArgNo = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++ArgNo) - setVal(AI, ActualArgs[ArgNo]); + setVal(&*AI, ActualArgs[ArgNo]); // ExecutedBlocks - We only handle non-looping, non-recursive code. As such, // we can only evaluate any one basic block at most once. This set keeps @@ -2639,7 +2781,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, SmallPtrSet<BasicBlock*, 32> ExecutedBlocks; // CurBB - The current basic block we're evaluating. - BasicBlock *CurBB = F->begin(); + BasicBlock *CurBB = &F->front(); BasicBlock::iterator CurInst = CurBB->begin(); @@ -2679,8 +2821,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, } } -/// EvaluateStaticConstructor - Evaluate static constructors in the function, if -/// we can. Return true if we can, false otherwise. +/// Evaluate static constructors in the function, if we can. Return true if we +/// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Call the function. @@ -2708,7 +2850,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - return (*A)->getName().compare((*B)->getName()); + return (*A)->stripPointerCasts()->getName().compare( + (*B)->stripPointerCasts()->getName()); } static void setUsedInitializer(GlobalVariable &V, @@ -2742,7 +2885,7 @@ static void setUsedInitializer(GlobalVariable &V, } namespace { -/// \brief An easy to access representation of llvm.used and llvm.compiler.used. +/// An easy to access representation of llvm.used and llvm.compiler.used. class LLVMUsed { SmallPtrSet<GlobalValue *, 8> Used; SmallPtrSet<GlobalValue *, 8> CompilerUsed; @@ -2861,10 +3004,17 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { - Module::alias_iterator J = I++; + GlobalAlias *J = &*I++; + // Aliases without names cannot be referenced outside this module. if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); + + if (deleteIfDead(*J)) { + Changed = true; + continue; + } + // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; @@ -2889,15 +3039,15 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. - Target->takeName(J); + Target->takeName(&*J); Target->setLinkage(J->getLinkage()); Target->setVisibility(J->getVisibility()); Target->setDLLStorageClass(J->getDLLStorageClass()); - if (Used.usedErase(J)) + if (Used.usedErase(&*J)) Used.usedInsert(Target); - if (Used.compilerUsedErase(J)) + if (Used.compilerUsedErase(&*J)) Used.compilerUsedInsert(Target); } else if (mayHaveOtherReferences(*J, Used)) continue; @@ -2936,8 +3086,8 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return Fn; } -/// cxxDtorIsEmpty - Returns whether the given function is an empty C++ -/// destructor and can therefore be eliminated. +/// Returns whether the given function is an empty C++ destructor and can +/// therefore be eliminated. 
/// Note that we assume that other optimization passes have already simplified /// the code so we only look for a function with a single basic block, where /// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and @@ -3081,3 +3231,4 @@ bool GlobalOpt::runOnModule(Module &M) { return Changed; } + diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 50f56b0..89629cf0 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements the common infrastructure (including C bindings) for -// libLLVMIPO.a, which implements several transformations over the LLVM +// This file implements the common infrastructure (including C bindings) for +// libLLVMIPO.a, which implements several transformations over the LLVM // intermediate representation. // //===----------------------------------------------------------------------===// @@ -24,14 +24,16 @@ using namespace llvm; void llvm::initializeIPO(PassRegistry &Registry) { initializeArgPromotionPass(Registry); initializeConstantMergePass(Registry); + initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); initializeDAHPass(Registry); - initializeFunctionAttrsPass(Registry); + initializeForceFunctionAttrsLegacyPassPass(Registry); initializeGlobalDCEPass(Registry); initializeGlobalOptPass(Registry); initializeIPCPPass(Registry); initializeAlwaysInlinerPass(Registry); initializeSimpleInlinerPass(Registry); + initializeInferFunctionAttrsLegacyPassPass(Registry); initializeInternalizePassPass(Registry); initializeLoopExtractorPass(Registry); initializeBlockExtractorPassPass(Registry); @@ -39,14 +41,18 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeLowerBitSetsPass(Registry); initializeMergeFunctionsPass(Registry); initializePartialInlinerPass(Registry); + initializePostOrderFunctionAttrsPass(Registry); + initializeReversePostOrderFunctionAttrsPass(Registry); initializePruneEHPass(Registry); - initializeStripDeadPrototypesPassPass(Registry); + initializeStripDeadPrototypesLegacyPassPass(Registry); initializeStripSymbolsPass(Registry); initializeStripDebugDeclarePass(Registry); initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); initializeBarrierNoopPass(Registry); initializeEliminateAvailableExternallyPass(Registry); + initializeSampleProfileLoaderPass(Registry); + initializeFunctionImportPassPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { @@ -66,7 +72,7 @@ void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) { } void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createFunctionAttrsPass()); + unwrap(PM)->add(createPostOrderFunctionAttrsPass()); } void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp new file mode 100644 index 0000000..4295a75 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -0,0 +1,988 @@ +//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "inferattrs" + +STATISTIC(NumReadNone, "Number of functions inferred as readnone"); +STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly"); +STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); +STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); +STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); +STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); +STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns"); + +static bool setDoesNotAccessMemory(Function &F) { + if (F.doesNotAccessMemory()) + return false; + F.setDoesNotAccessMemory(); + ++NumReadNone; + return true; +} + +static bool setOnlyReadsMemory(Function &F) { + if (F.onlyReadsMemory()) + return false; + F.setOnlyReadsMemory(); + ++NumReadOnly; + return true; +} + +static bool setOnlyAccessesArgMemory(Function &F) { + if (F.onlyAccessesArgMemory()) + return false; + F.setOnlyAccessesArgMemory(); + ++NumArgMemOnly; + return true; +} + + +static bool setDoesNotThrow(Function &F) { + if (F.doesNotThrow()) + return false; + F.setDoesNotThrow(); + ++NumNoUnwind; + return true; +} + +static bool setDoesNotCapture(Function &F, unsigned n) { + if (F.doesNotCapture(n)) + return false; + F.setDoesNotCapture(n); + ++NumNoCapture; + return true; +} + +static bool setOnlyReadsMemory(Function &F, unsigned n) { + if (F.onlyReadsMemory(n)) + return false; + F.setOnlyReadsMemory(n); + ++NumReadOnlyArg; + return true; +} + +static bool setDoesNotAlias(Function &F, unsigned n) { + if (F.doesNotAlias(n)) + return false; + F.setDoesNotAlias(n); + ++NumNoAlias; + return true; +} + +static bool setNonNull(Function &F, unsigned n) { + assert((n != AttributeSet::ReturnIndex || + F.getReturnType()->isPointerTy()) && + "nonnull applies only to pointers"); + if (F.getAttributes().hasAttribute(n, Attribute::NonNull)) + return false; + F.addAttribute(n, Attribute::NonNull); + ++NumNonNull; + return true; +} + +/// Analyze the name and prototype of the given function and set any applicable +/// attributes. +/// +/// Returns true if any attributes were set and false otherwise.
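The body of this function is one large switch over the LibFunc enum: each recognized prototype is validated and then annotated with the setters above. A trimmed sketch of that dispatch for a single entry (the function name here is hypothetical; the helpers are the ones defined earlier in this file):

    // Argument attribute indices are 1-based; index 0 is the return value.
    static bool inferStrlenAttrs(Function &F, const TargetLibraryInfo &TLI) {
      LibFunc::Func TheLibFunc;
      if (!TLI.getLibFunc(F.getName(), TheLibFunc) || !TLI.has(TheLibFunc) ||
          TheLibFunc != LibFunc::strlen)
        return false;
      FunctionType *FTy = F.getFunctionType();
      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
        return false;                     // not the expected prototype
      bool Changed = false;
      Changed |= setOnlyReadsMemory(F);   // strlen only reads memory
      Changed |= setDoesNotThrow(F);
      Changed |= setDoesNotCapture(F, 1); // the string pointer doesn't escape
      return Changed;
    }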
+static bool inferPrototypeAttributes(Function &F, + const TargetLibraryInfo &TLI) { + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + + FunctionType *FTy = F.getFunctionType(); + LibFunc::Func TheLibFunc; + if (!(TLI.getLibFunc(F.getName(), TheLibFunc) && TLI.has(TheLibFunc))) + return false; + + bool Changed = false; + switch (TheLibFunc) { + case LibFunc::strlen: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strchr: + case LibFunc::strrchr: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::strtol: + case LibFunc::strtod: + case LibFunc::strtof: + case LibFunc::strtoul: + case LibFunc::strtoll: + case LibFunc::strtold: + case LibFunc::strtoull: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::strcpy: + case LibFunc::stpcpy: + case LibFunc::strcat: + case LibFunc::strncat: + case LibFunc::strncpy: + case LibFunc::stpncpy: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strxfrm: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strcmp: // 0,1 + case LibFunc::strspn: // 0,1 + case LibFunc::strncmp: // 0,1 + case LibFunc::strcspn: // 0,1 + case LibFunc::strcoll: // 0,1 + case LibFunc::strcasecmp: // 0,1 + case LibFunc::strncasecmp: // + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strstr: + case LibFunc::strpbrk: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strtok: + case LibFunc::strtok_r: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::scanf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::setbuf: + case LibFunc::setvbuf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strdup: + case LibFunc::strndup: + if (FTy->getNumParams() < 1 || 
!FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::stat: + case LibFunc::statvfs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::sscanf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::sprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::snprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::setitimer: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::system: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "system" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::malloc: + if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::memcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memchr: + case LibFunc::memrchr: + if (FTy->getNumParams() != 3) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::modf: + case LibFunc::modff: + case LibFunc::modfl: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memcpy: + case LibFunc::memccpy: + case LibFunc::memmove: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::memalign: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::mkdir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::mktime: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::realloc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::read: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "read" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::rewind: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::rmdir: + case LibFunc::remove: + case LibFunc::realpath: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::rename: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::readlink: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::write: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "write" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::bcopy: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::bcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::bzero: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::calloc: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::chmod: + case LibFunc::chown: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ctermid: + case LibFunc::clearerr: + case LibFunc::closedir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::atoi: + case LibFunc::atol: + case LibFunc::atof: + case LibFunc::atoll: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::access: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= 
setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fdopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::feof: + case LibFunc::free: + case LibFunc::fseek: + case LibFunc::ftell: + case LibFunc::fgetc: + case LibFunc::fseeko: + case LibFunc::ftello: + case LibFunc::fileno: + case LibFunc::fflush: + case LibFunc::fclose: + case LibFunc::fsetpos: + case LibFunc::flockfile: + case LibFunc::funlockfile: + case LibFunc::ftrylockfile: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::ferror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F); + return Changed; + case LibFunc::fputc: + case LibFunc::fstat: + case LibFunc::frexp: + case LibFunc::frexpf: + case LibFunc::frexpl: + case LibFunc::fstatvfs: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::fgets: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 3); + return Changed; + case LibFunc::fread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fputs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fscanf: + case LibFunc::fprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fgetpos: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= 
setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getc: + case LibFunc::getlogin_r: + case LibFunc::getc_unlocked: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::getenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::gets: + case LibFunc::getchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::getitimer: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getpwnam: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ungetc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::uname: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::unlink: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::unsetenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::utime: + case LibFunc::utimes: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::puts: + case LibFunc::printf: + case LibFunc::perror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::pread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pread" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::pwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pwrite" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::popen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::pclose: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::vscanf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vsscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vfscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::valloc: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::vprintf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vfprintf: + case LibFunc::vsprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vsnprintf: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::open: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. 
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::opendir:
+    if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::tmpfile:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::times:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::htonl:
+  case LibFunc::htons:
+  case LibFunc::ntohl:
+  case LibFunc::ntohs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAccessMemory(F);
+    return Changed;
+  case LibFunc::lstat:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::lchown:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::qsort:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
+      return false;
+    // May throw; places call through function pointer.
+    Changed |= setDoesNotCapture(F, 4);
+    return Changed;
+  case LibFunc::dunder_strdup:
+  case LibFunc::dunder_strndup:
+    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_strtok_r:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::under_IO_getc:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::under_IO_putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::dunder_isoc99_scanf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::stat64:
+  case LibFunc::lstat64:
+  case LibFunc::statvfs64:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_isoc99_sscanf:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return
false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fopen64: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fseeko64: + case LibFunc::ftello64: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::tmpfile64: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::fstat64: + case LibFunc::fstatvfs64: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::open64: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::gettimeofday: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + // Currently some platforms have the restrict keyword on the arguments to + // gettimeofday. To be conservative, do not add noalias to gettimeofday's + // arguments. + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + + case LibFunc::Znwj: // new(unsigned int) + case LibFunc::Znwm: // new(unsigned long) + case LibFunc::Znaj: // new[](unsigned int) + case LibFunc::Znam: // new[](unsigned long) + case LibFunc::msvc_new_int: // new(unsigned int) + case LibFunc::msvc_new_longlong: // new(unsigned long long) + case LibFunc::msvc_new_array_int: // new[](unsigned int) + case LibFunc::msvc_new_array_longlong: // new[](unsigned long long) + if (FTy->getNumParams() != 1) + return false; + // Operator new always returns a nonnull noalias pointer + Changed |= setNonNull(F, AttributeSet::ReturnIndex); + Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex); + return Changed; + + //TODO: add LibFunc entries for: + //case LibFunc::memset_pattern4: + //case LibFunc::memset_pattern8: + case LibFunc::memset_pattern16: + if (FTy->isVarArg() || FTy->getNumParams() != 3 || + !isa<PointerType>(FTy->getParamType(0)) || + !isa<PointerType>(FTy->getParamType(1)) || + !isa<IntegerType>(FTy->getParamType(2))) + return false; + + Changed |= setOnlyAccessesArgMemory(F); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + + default: + // FIXME: It'd be really nice to cover all the library functions we're + // aware of here. + return false; + } +} + +static bool inferAllPrototypeAttributes(Module &M, + const TargetLibraryInfo &TLI) { + bool Changed = false; + + for (Function &F : M.functions()) + // We only infer things using the prototype if the definition isn't around + // to analyze directly. 
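Every case in the switch above follows the same two-step shape: validate that the prototype matches the expected libc signature, then annotate through the idempotent setters, OR-ing the results into Changed. As a hedged, self-contained illustration, a hypothetical handler for strnlen (which this switch does not cover) would look like:

// Illustrative only; strnlen(const char *s, size_t n) reads memory, cannot
// unwind, and does not let its pointer argument escape.
static bool inferStrnlenAttrs(Function &F) {
  FunctionType *FTy = F.getFunctionType();
  if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
    return false; // prototype mismatch: infer nothing
  bool Changed = false;
  Changed |= setOnlyReadsMemory(F);
  Changed |= setDoesNotThrow(F);
  Changed |= setDoesNotCapture(F, 1); // argument indices are 1-based here
  return Changed;
}
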
+ if (F.isDeclaration()) + Changed |= inferPrototypeAttributes(F, TLI); + + return Changed; +} + +PreservedAnalyses InferFunctionAttrsPass::run(Module &M, + AnalysisManager<Module> *AM) { + auto &TLI = AM->getResult<TargetLibraryAnalysis>(M); + + if (!inferAllPrototypeAttributes(M, TLI)) + // If we didn't infer anything, preserve all analyses. + return PreservedAnalyses::all(); + + // Otherwise, we may have changed fundamental function attributes, so clear + // out all the passes. + return PreservedAnalyses::none(); +} + +namespace { +struct InferFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InferFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeInferFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override { + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return inferAllPrototypeAttributes(M, TLI); + } +}; +} + +char InferFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) + +Pass *llvm::createInferFunctionAttrsLegacyPass() { + return new InferFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index dc56a02..1704bfe 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -14,10 +14,10 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -35,17 +35,15 @@ namespace { /// \brief Inliner pass which only handles "always inline" functions. class AlwaysInliner : public Inliner { - InlineCostAnalysis *ICA; public: // Use extremely low threshold. 
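InferFunctionAttrsPass::run above returns PreservedAnalyses::all() when nothing was inferred and PreservedAnalyses::none() otherwise, since newly inferred attributes can invalidate almost any cached analysis. A rough usage sketch against the interim pointer-taking run() signature declared above (analysis registration is elided and assumed to be set up elsewhere):

ModuleAnalysisManager MAM; // assumed: TargetLibraryAnalysis registered
InferFunctionAttrsPass P;
PreservedAnalyses PA = P.run(M, &MAM);
if (PA.areAllPreserved()) {
  // No declaration gained attributes; cached analysis results remain valid.
}
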
- AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), - ICA(nullptr) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } AlwaysInliner(bool InsertLifetime) - : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { + : Inliner(ID, -2000000000, InsertLifetime) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } @@ -53,9 +51,6 @@ public: InlineCost getInlineCost(CallSite CS) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnSCC(CallGraphSCC &SCC) override; - using llvm::Pass::doFinalization; bool doFinalization(CallGraph &CG) override { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); @@ -67,10 +62,9 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) @@ -99,19 +93,8 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { // that are viable for inlining. FIXME: We shouldn't even get here for // declarations. if (Callee && !Callee->isDeclaration() && - CS.hasFnAttr(Attribute::AlwaysInline) && - ICA->isInlineViable(*Callee)) + CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee)) return InlineCost::getAlways(); return InlineCost::getNever(); } - -bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); - return Inliner::runOnSCC(SCC); -} - -void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<InlineCostAnalysis>(); - Inliner::getAnalysisUsage(AU); -} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 9b01d81..45609f8 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; @@ -37,26 +38,30 @@ namespace { /// inliner pass and the always inliner pass. The two passes use different cost /// analyses to determine when to inline. 
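The -2000000000 threshold above is effectively negative infinity: the always-inliner never inlines on cost grounds, only when a call site carries always_inline and the callee is viable to inline. A minimal legacy-pass-manager sketch using the factory from llvm/Transforms/IPO.h (module setup assumed):

legacy::PassManager PM;
PM.add(createAlwaysInlinerPass(/*InsertLifetime=*/true));
PM.run(M); // inlines only always_inline call sites, regardless of cost
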
class SimpleInliner : public Inliner { - InlineCostAnalysis *ICA; public: - SimpleInliner() : Inliner(ID), ICA(nullptr) { + SimpleInliner() : Inliner(ID) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } SimpleInliner(int Threshold) - : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { + : Inliner(ID, Threshold, /*InsertLifetime*/ true) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) override { - return ICA->getInlineCost(CS, getInlineThreshold(CS)); + Function *Callee = CS.getCalledFunction(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + return llvm::getInlineCost(CS, getInlineThreshold(CS), TTI, ACT); } bool runOnSCC(CallGraphSCC &SCC) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; }; static int computeThresholdFromOptLevels(unsigned OptLevel, @@ -75,10 +80,10 @@ static int computeThresholdFromOptLevels(unsigned OptLevel, char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) @@ -95,11 +100,11 @@ Pass *llvm::createFunctionInliningPass(unsigned OptLevel, } bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); return Inliner::runOnSCC(SCC); } void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<InlineCostAnalysis>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); Inliner::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 5273c3d..bbe5f876 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -64,20 +65,22 @@ ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225), // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; -Inliner::Inliner(char &ID) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {} +Inliner::Inliner(char &ID) + : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) { +} Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? - InlineLimit : Threshold), - InsertLifetime(InsertLifetime) {} + : CallGraphSCCPass(ID), + InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? InlineLimit + : Threshold), + InsertLifetime(InsertLifetime) {} /// For this class, we declare that we require and preserve the call graph. /// If the derived class implements this method, it should /// always explicitly call the implementation here. 
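With InlineCostAnalysis removed, the simple inliner above queries the free llvm::getInlineCost(), parameterized by the callee's TargetTransformInfo and the assumption cache tracker. A sketch of consuming the result (CS, TTIWP, and ACT as in the class above):

InlineCost IC = llvm::getInlineCost(CS, getInlineThreshold(CS),
                                    TTIWP->getTTI(*CS.getCalledFunction()),
                                    ACT);
if (IC) {
  // explicit operator bool: the computed cost is under the threshold, so
  // this call site is profitable (or required, for always-inline) to inline.
}
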
void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -85,39 +88,6 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { typedef DenseMap<ArrayType*, std::vector<AllocaInst*> > InlinedArrayAllocasTy; -/// \brief If the inlined function had a higher stack protection level than the -/// calling function, then bump up the caller's stack protection level. -static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { - // If upgrading the SSP attribute, clear out the old SSP Attributes first. - // Having multiple SSP attributes doesn't actually hurt, but it adds useless - // clutter to the IR. - AttrBuilder B; - B.addAttribute(Attribute::StackProtect) - .addAttribute(Attribute::StackProtectStrong) - .addAttribute(Attribute::StackProtectReq); - AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), - AttributeSet::FunctionIndex, - B); - - if (Callee->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::SafeStack); - } else if (Callee->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectReq); - } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectStrong); - } else if (Callee->hasFnAttribute(Attribute::StackProtect) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::StackProtectStrong)) - Caller->addFnAttr(Attribute::StackProtect); -} - /// If it is possible to inline the specified call site, /// do so and update the CallGraph for this operation. /// @@ -126,18 +96,26 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { /// available from other functions inlined into the caller. If we are able to /// inline this call site we attempt to reuse already available allocas or add /// any new allocas to the set if not possible. -static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, +static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory, bool InsertLifetime) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); + // We need to manually construct BasicAA directly in order to disable + // its use of other function analyses. + BasicAAResult BAR(createLegacyPMBasicAAResult(P, *Callee)); + + // Construct our own AA results for this function. We do this manually to + // work around the limitations of the legacy pass manager. + AAResults AAR(createLegacyPMAAResults(P, *Callee, BAR)); + // Try to inline the function. Get the list of static allocas that were // inlined. - if (!InlineFunction(CS, IFI, InsertLifetime)) + if (!InlineFunction(CS, IFI, &AAR, InsertLifetime)) return false; - AdjustCallerSSPLevel(Caller, Callee); + AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee); // Look at all of the allocas that we inlined through this call site. 
If we
   // have already inlined other allocas through other calls into this function,
@@ -219,6 +197,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
       DEBUG(dbgs() << "    ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "
                    << *AvailableAlloca << '\n');
 
+      // Move affected dbg.declare calls immediately after the new alloca to
+      // avoid the situation when a dbg.declare precedes its alloca.
+      if (auto *L = LocalAsMetadata::getIfExists(AI))
+        if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+          for (User *U : MDV->users())
+            if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+              DDI->moveBefore(AvailableAlloca->getNextNode());
+
       AI->replaceAllUsesWith(AvailableAlloca);
 
       if (Align1 != Align2) {
@@ -258,39 +244,64 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
 }
 
 unsigned Inliner::getInlineThreshold(CallSite CS) const {
-  int thres = InlineThreshold; // -inline-threshold or else selected by
-                               // overall opt level
+  int Threshold = InlineThreshold; // -inline-threshold or else selected by
+                                   // overall opt level
 
   // If -inline-threshold is not given, listen to the optsize attribute when it
   // would decrease the threshold.
   Function *Caller = CS.getCaller();
   bool OptSize = Caller && !Caller->isDeclaration() &&
+                 // FIXME: Use Function::optForSize().
                  Caller->hasFnAttribute(Attribute::OptimizeForSize);
   if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
-      OptSizeThreshold < thres)
-    thres = OptSizeThreshold;
+      OptSizeThreshold < Threshold)
+    Threshold = OptSizeThreshold;
 
-  // Listen to the inlinehint attribute when it would increase the threshold
-  // and the caller does not need to minimize its size.
   Function *Callee = CS.getCalledFunction();
-  bool InlineHint = Callee && !Callee->isDeclaration() &&
-                    Callee->hasFnAttribute(Attribute::InlineHint);
-  if (InlineHint && HintThreshold > thres &&
-      !Caller->hasFnAttribute(Attribute::MinSize))
-    thres = HintThreshold;
+  if (!Callee || Callee->isDeclaration())
+    return Threshold;
+
+  // If profile information is available, use that to adjust the threshold of
+  // hot and cold functions.
+  // FIXME: The heuristics used below for determining hotness and coldness are
+  // based on preliminary SPEC tuning and may not be optimal. Replace this with
+  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
+  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
+  bool HasPGOCounts = false;
+  if (Callee->getEntryCount() &&
+      Callee->getParent()->getMaximumFunctionCount()) {
+    HasPGOCounts = true;
+    FunctionCount = Callee->getEntryCount().getValue();
+    MaxFunctionCount =
+        Callee->getParent()->getMaximumFunctionCount().getValue();
+  }
 
-  // Listen to the cold attribute when it would decrease the threshold.
-  bool ColdCallee = Callee && !Callee->isDeclaration() &&
-                    Callee->hasFnAttribute(Attribute::Cold);
+  // Listen to the inlinehint attribute or profile-based hotness information
+  // when it would increase the threshold and the caller does not need to
+  // minimize its size.
+  bool InlineHint =
+      Callee->hasFnAttribute(Attribute::InlineHint) ||
+      (HasPGOCounts &&
+       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
+  if (InlineHint && HintThreshold > Threshold &&
+      !Caller->hasFnAttribute(Attribute::MinSize))
+    Threshold = HintThreshold;
+
+  // Listen to the cold attribute or profile-based coldness information
+  // when it would decrease the threshold.
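Concretely, the 0.3 and 0.01 cutoffs used here work out as follows (illustrative numbers, same arithmetic as the code):

// With a module-wide maximum entry count of 10000, a callee entered 3000 or
// more times (>= 30%) counts as hot and may be raised to HintThreshold; one
// entered at most 100 times (<= 1%) counts as cold and may be lowered to
// ColdThreshold.
uint64_t MaxFunctionCount = 10000, FunctionCount = 3000;
bool Hot = FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount);   // true
bool Cold = FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount); // false
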
+ bool ColdCallee = + Callee->hasFnAttribute(Attribute::Cold) || + (HasPGOCounts && + FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount)); // Command line argument for InlineLimit will override the default // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, // do not use the default cold threshold even if it is smaller. if ((InlineLimit.getNumOccurrences() == 0 || ColdThreshold.getNumOccurrences() > 0) && ColdCallee && - ColdThreshold < thres) - thres = ColdThreshold; + ColdThreshold < Threshold) + Threshold = ColdThreshold; - return thres; + return Threshold; } static void emitAnalysis(CallSite CS, const Twine &Msg) { @@ -430,10 +441,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + ACT = &getAnalysis<AssumptionCacheTracker>(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SmallPtrSet<Function*, 8> SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); @@ -469,8 +478,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // If this is a direct call to an external function, we can never inline // it. If it is an indirect call, inlining may resolve it to be a // direct call, so we keep it. - if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) - continue; + if (Function *Callee = CS.getCalledFunction()) + if (Callee->isDeclaration()) + continue; CallSites.push_back(std::make_pair(CS, -1)); } @@ -492,7 +502,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, AA, ACT); + InlineFunctionInfo InlineInfo(&CG, ACT); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -513,7 +523,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // just delete the call instead of trying to inline it, regardless of // size. This happens because IPSCCP propagates the result out of the // call and then we're left with the dead call. - if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) { + if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) { DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() << "\n"); // Update the call graph by deleting the edge from Callee to Caller. @@ -550,7 +560,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { } // Attempt to inline the function. - if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, + if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime)) { emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + @@ -647,8 +657,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. 
-  for (auto I : CG) {
-    CallGraphNode *CGN = I.second;
+  for (const auto &I : CG) {
+    CallGraphNode *CGN = I.second.get();
     Function *F = CGN->getFunction();
     if (!F || F->isDeclaration())
       continue;
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
index 7950163..21bb5d0 100644
--- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -60,6 +60,10 @@ namespace {
     explicit InternalizePass();
     explicit InternalizePass(ArrayRef<const char *> ExportList);
     void LoadFile(const char *Filename);
+    bool maybeInternalize(GlobalValue &GV,
+                          const std::set<const Comdat *> &ExternalComdats);
+    void checkComdatVisibility(GlobalValue &GV,
+                               std::set<const Comdat *> &ExternalComdats);
     bool runOnModule(Module &M) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -105,40 +109,85 @@ void InternalizePass::LoadFile(const char *Filename) {
   }
 }
 
-static bool shouldInternalize(const GlobalValue &GV,
-                              const std::set<std::string> &ExternalNames) {
+static bool isExternallyVisible(const GlobalValue &GV,
+                                const std::set<std::string> &ExternalNames) {
   // Function must be defined here
   if (GV.isDeclaration())
-    return false;
+    return true;
 
   // Available externally is really just a "declaration with a body".
   if (GV.hasAvailableExternallyLinkage())
-    return false;
+    return true;
 
   // Assume that dllexported symbols are referenced elsewhere
   if (GV.hasDLLExportStorageClass())
-    return false;
-
-  // Already has internal linkage
-  if (GV.hasLocalLinkage())
-    return false;
+    return true;
 
   // Marked to keep external?
-  if (ExternalNames.count(GV.getName()))
-    return false;
+  if (!GV.hasLocalLinkage() && ExternalNames.count(GV.getName()))
+    return true;
+
+  return false;
+}
 
+// Internalize GV if it is possible to do so, i.e. it is not externally visible
+// and is not a member of an externally visible comdat.
+bool InternalizePass::maybeInternalize(
+    GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) {
+  if (Comdat *C = GV.getComdat()) {
+    if (ExternalComdats.count(C))
+      return false;
+
+    // If a comdat is not externally visible we can drop it.
+    if (auto GO = dyn_cast<GlobalObject>(&GV))
+      GO->setComdat(nullptr);
+
+    if (GV.hasLocalLinkage())
+      return false;
+  } else {
+    if (GV.hasLocalLinkage())
+      return false;
+
+    if (isExternallyVisible(GV, ExternalNames))
+      return false;
+  }
+
+  GV.setVisibility(GlobalValue::DefaultVisibility);
+  GV.setLinkage(GlobalValue::InternalLinkage);
   return true;
 }
 
+// If GV is part of a comdat and is externally visible, keep track of its
+// comdat so that we don't internalize any of its members.
+void InternalizePass::checkComdatVisibility(
+    GlobalValue &GV, std::set<const Comdat *> &ExternalComdats) {
+  Comdat *C = GV.getComdat();
+  if (!C)
+    return;
+
+  if (isExternallyVisible(GV, ExternalNames))
+    ExternalComdats.insert(C);
+}
+
 bool InternalizePass::runOnModule(Module &M) {
   CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>();
   CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
   CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
-  bool Changed = false;
 
   SmallPtrSet<GlobalValue *, 8> Used;
   collectUsedGlobalVariables(M, Used, false);
 
+  // Collect comdat visibility information for the module.
+ std::set<const Comdat *> ExternalComdats; + if (!M.getComdatSymbolTable().empty()) { + for (Function &F : M) + checkComdatVisibility(F, ExternalComdats); + for (GlobalVariable &GV : M.globals()) + checkComdatVisibility(GV, ExternalComdats); + for (GlobalAlias &GA : M.aliases()) + checkComdatVisibility(GA, ExternalComdats); + } + // We must assume that globals in llvm.used have a reference that not even // the linker can see, so we don't internalize them. // For llvm.compiler.used the situation is a bit fuzzy. The assembler and @@ -153,20 +202,16 @@ bool InternalizePass::runOnModule(Module &M) { } // Mark all functions not in the api as internal. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + for (Function &I : M) { + if (!maybeInternalize(I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - if (ExternalNode) // Remove a callgraph edge from the external node to this function. - ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); + ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); - Changed = true; ++NumFunctions; - DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); + DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); } // Never internalize the llvm.used symbol. It is used to implement @@ -191,12 +236,9 @@ bool InternalizePass::runOnModule(Module &M) { // internal as well. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumGlobals; DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); } @@ -204,17 +246,20 @@ bool InternalizePass::runOnModule(Module &M) { // Mark all aliases that are not in the api as internal as well. for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumAliases; DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); } - return Changed; + // We do not keep track of whether this pass changed the module because + // it adds unnecessary complexity: + // 1) This pass will generally be near the start of the pass pipeline, so + // there will be no analyses to invalidate. + // 2) This pass will most likely end up changing the module and it isn't worth + // worrying about optimizing the case where the module is unchanged. 
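The comdat handling introduced above is two-phase: first record every comdat that has an externally visible member, then internalize only values that are neither externally visible themselves nor members of such a comdat (dropping the comdat from anything that does get internalized). A condensed sketch of the phase ordering; GVs is a hypothetical unified range standing in for the pass's three separate walks over functions, globals, and aliases:

std::set<const Comdat *> ExternalComdats;
for (GlobalValue *GV : GVs)                    // hypothetical unified range
  checkComdatVisibility(*GV, ExternalComdats); // phase 1: mark live comdats
for (GlobalValue *GV : GVs)
  maybeInternalize(*GV, ExternalComdats);      // phase 2: internalize the rest
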
+ return true; } ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 41334ca..3c6a7bb 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -38,17 +38,18 @@ namespace { static char ID; // Pass identification, replacement for typeid unsigned NumLoops; - explicit LoopExtractor(unsigned numLoops = ~0) + explicit LoopExtractor(unsigned numLoops = ~0) : LoopPass(ID), NumLoops(numLoops) { initializeLoopExtractorPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); } }; } @@ -79,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", // Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } -bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -92,6 +93,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); bool Changed = false; // If there is more than one top-level loop in this function, extract all of @@ -120,14 +122,14 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { } if (ShouldExtractLoop) { - // We must omit landing pads. Landing pads must accompany the invoke + // We must omit EH pads. EH pads must accompany the invoke // instruction. But this would result in a loop in the extracted // function. An infinite cycle occurs when it tries to extract that loop as // well. SmallVector<BasicBlock*, 8> ExitBlocks; L->getExitBlocks(ExitBlocks); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (ExitBlocks[i]->isLandingPad()) { + if (ExitBlocks[i]->isEHPad()) { ShouldExtractLoop = false; break; } @@ -141,7 +143,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. - LPM.deleteLoopFromQueue(L); + LI.markAsRemoved(L); } ++NumExtracted; } @@ -259,7 +261,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { // Figure out which index the basic block is in its function. 
Function::iterator BBI = MF->begin(); std::advance(BBI, std::distance(F->begin(), Function::iterator(BB))); - TranslatedBlocksToNotExtract.insert(BBI); + TranslatedBlocksToNotExtract.insert(&*BBI); } while (!BlocksToNotExtractByName.empty()) { @@ -278,7 +280,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { BasicBlock &BB = *BI; if (BB.getName() != BlockName) continue; - TranslatedBlocksToNotExtract.insert(BI); + TranslatedBlocksToNotExtract.insert(&*BI); } } @@ -291,8 +293,8 @@ bool BlockExtractorPass::runOnModule(Module &M) { for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { SplitLandingPadPreds(&*F); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (!TranslatedBlocksToNotExtract.count(BB)) - BlocksToExtract.push_back(BB); + if (!TranslatedBlocksToNotExtract.count(&*BB)) + BlocksToExtract.push_back(&*BB); } for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) { diff --git a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp index c6795c6..7b51574 100644 --- a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +28,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -59,9 +63,9 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { bool BitSetInfo::containsValue( const DataLayout &DL, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V, uint64_t COffset) const { - if (auto GV = dyn_cast<GlobalVariable>(V)) { + if (auto GV = dyn_cast<GlobalObject>(V)) { auto I = GlobalLayout.find(GV); if (I == GlobalLayout.end()) return false; @@ -90,6 +94,21 @@ bool BitSetInfo::containsValue( return false; } +void BitSetInfo::print(raw_ostream &OS) const { + OS << "offset " << ByteOffset << " size " << BitSize << " align " + << (1 << AlignLog2); + + if (isAllOnes()) { + OS << " all-ones\n"; + return; + } + + OS << " { "; + for (uint64_t B : Bits) + OS << B << ' '; + OS << "}\n"; +} + BitSetInfo BitSetBuilder::build() { if (Min > Max) Min = 0; @@ -193,34 +212,48 @@ struct LowerBitSets : public ModulePass { Module *M; bool LinkerSubsectionsViaSymbols; + Triple::ArchType Arch; + Triple::ObjectFormatType ObjectFormat; IntegerType *Int1Ty; IntegerType *Int8Ty; IntegerType *Int32Ty; Type *Int32PtrTy; IntegerType *Int64Ty; - Type *IntPtrTy; + IntegerType *IntPtrTy; // The llvm.bitsets named metadata. NamedMDNode *BitSetNM; - // Mapping from bitset mdstrings to the call sites that test them. - DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; + // Mapping from bitset identifiers to the call sites that test them. 
+ DenseMap<Metadata *, std::vector<CallInst *>> BitSetTestCallSites; std::vector<ByteArrayInfo> ByteArrayInfos; BitSetInfo - buildBitSet(MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + buildBitSet(Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, Value *BitOffset); + void lowerBitSetCalls(ArrayRef<Metadata *> BitSets, + Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); Value * lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); - void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals); + Constant *CombinedGlobal, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); + void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalVariable *> Globals); + unsigned getJumpTableEntrySize(); + Type *getJumpTableEntryType(); + Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance); + void verifyBitSetMDNode(MDNode *Op); + void buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions); + void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalObject *> Globals); bool buildBitSets(); bool eraseBitSetMetadata(); @@ -228,7 +261,7 @@ struct LowerBitSets : public ModulePass { bool runOnModule(Module &M) override; }; -} // namespace +} // anonymous namespace INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets", "Lower bitset metadata", false, false) @@ -244,6 +277,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { Triple TargetTriple(M->getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); + Arch = TargetTriple.getArch(); + ObjectFormat = TargetTriple.getObjectFormat(); Int1Ty = Type::getInt1Ty(M->getContext()); Int8Ty = Type::getInt8Ty(M->getContext()); @@ -262,8 +297,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { /// Build a bit set for BitSet using the object layouts in /// GlobalLayout. BitSetInfo LowerBitSets::buildBitSet( - MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { BitSetBuilder BSB; // Compute the byte offset of each element of this bitset. 
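BitSetInfo models membership as one bit per aligned offset: ByteOffset is where the covered region starts, 1 << AlignLog2 is the stride, and BitSize is the number of bit positions. A hedged sketch of how member offsets become a bitset via BitSetBuilder (per the LowerBitSets.h header; values are illustrative):

BitSetBuilder BSB;
BSB.addOffset(0);  // member at byte offset 0 of the combined global
BSB.addOffset(64); // member at byte offset 64
BitSetInfo BSI = BSB.build();
// Expected shape: ByteOffset = 0, AlignLog2 = 6 (both offsets are 64-aligned),
// BitSize = 2, with both bits set, so the isAllOnes() fast path applies:
BSI.print(dbgs()); // prints "offset 0 size 2 align 64 all-ones"
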
@@ -271,8 +306,11 @@ BitSetInfo LowerBitSets::buildBitSet( for (MDNode *Op : BitSetNM->operands()) { if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + Constant *OpConst = + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue(); + if (auto GA = dyn_cast<GlobalAlias>(OpConst)) + OpConst = GA->getAliasee(); + auto OpGlobal = dyn_cast<GlobalObject>(OpConst); if (!OpGlobal) continue; uint64_t Offset = @@ -360,9 +398,8 @@ void LowerBitSets::allocateByteArrays() { if (LinkerSubsectionsViaSymbols) { BAI->ByteArray->replaceAllUsesWith(GEP); } else { - GlobalAlias *Alias = - GlobalAlias::create(PointerType::getUnqual(Int8Ty), - GlobalValue::PrivateLinkage, "bits", GEP, M); + GlobalAlias *Alias = GlobalAlias::create( + Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M); BAI->ByteArray->replaceAllUsesWith(Alias); } BAI->ByteArray->eraseFromParent(); @@ -404,7 +441,7 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. - ByteArray = GlobalAlias::create(BAI->ByteArray->getType(), + ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, GlobalValue::PrivateLinkage, "bits_use", ByteArray, M); } @@ -421,17 +458,16 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, /// replace the call with. Value *LowerBitSets::lowerBitSetCall( CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M->getDataLayout(); if (BSI.containsValue(DL, GlobalLayout, Ptr)) - return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext()); + return ConstantInt::getTrue(M->getContext()); - Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy); Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); + CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); BasicBlock *InitialBB = CI->getParent(); @@ -490,18 +526,19 @@ Value *LowerBitSets::lowerBitSetCall( /// Given a disjoint set of bitsets and globals, layout the globals, build the /// bit sets and lower the llvm.bitset.test calls. -void LowerBitSets::buildBitSetsFromGlobals( - const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals) { +void LowerBitSets::buildBitSetsFromGlobalVariables( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalVariable *> Globals) { // Build a new global with the combined contents of the referenced globals. + // This global is a struct whose even-indexed elements contain the original + // contents of the referenced globals and whose odd-indexed elements contain + // any padding required to align the next element to the next power of 2. 
std::vector<Constant *> GlobalInits; const DataLayout &DL = M->getDataLayout(); for (GlobalVariable *G : Globals) { GlobalInits.push_back(G->getInitializer()); - uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType()); + uint64_t InitSize = DL.getTypeAllocSize(G->getValueType()); - // Compute the amount of padding required to align the next element to the - // next power of 2. + // Compute the amount of padding required. uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; // Cap at 128 was found experimentally to have a good data/instruction @@ -515,34 +552,20 @@ void LowerBitSets::buildBitSetsFromGlobals( if (!GlobalInits.empty()) GlobalInits.pop_back(); Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits); - auto CombinedGlobal = + auto *CombinedGlobal = new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); - const StructLayout *CombinedGlobalLayout = - DL.getStructLayout(cast<StructType>(NewInit->getType())); + StructType *NewTy = cast<StructType>(NewInit->getType()); + const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); // Compute the offsets of the original globals within the new global. - DenseMap<GlobalVariable *, uint64_t> GlobalLayout; + DenseMap<GlobalObject *, uint64_t> GlobalLayout; for (unsigned I = 0; I != Globals.size(); ++I) // Multiply by 2 to account for padding elements. GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - // For each bitset in this disjoint set... - for (MDString *BS : BitSets) { - // Build the bitset. - BitSetInfo BSI = buildBitSet(BS, GlobalLayout); - - ByteArrayInfo *BAI = 0; - - // Lower each call to llvm.bitset.test for this bitset. - for (CallInst *CI : BitSetTestCallSites[BS]) { - ++NumBitSetCallsLowered; - Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout); - CI->replaceAllUsesWith(Lowered); - CI->eraseFromParent(); - } - } + lowerBitSetCalls(BitSets, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each // global from which we built the combined global, and replace references @@ -556,9 +579,11 @@ void LowerBitSets::buildBitSetsFromGlobals( if (LinkerSubsectionsViaSymbols) { Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr); } else { - GlobalAlias *GAlias = - GlobalAlias::create(Globals[I]->getType(), Globals[I]->getLinkage(), - "", CombinedGlobalElemPtr, M); + assert(Globals[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0, + Globals[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Globals[I]->getVisibility()); GAlias->takeName(Globals[I]); Globals[I]->replaceAllUsesWith(GAlias); } @@ -566,6 +591,331 @@ void LowerBitSets::buildBitSetsFromGlobals( } } +void LowerBitSets::lowerBitSetCalls( + ArrayRef<Metadata *> BitSets, Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr = + ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); + + // For each bitset in this disjoint set... + for (Metadata *BS : BitSets) { + // Build the bitset. + BitSetInfo BSI = buildBitSet(BS, GlobalLayout); + DEBUG({ + if (auto BSS = dyn_cast<MDString>(BS)) + dbgs() << BSS->getString() << ": "; + else + dbgs() << "<unnamed>: "; + BSI.print(dbgs()); + }); + + ByteArrayInfo *BAI = nullptr; + + // Lower each call to llvm.bitset.test for this bitset. 
+ for (CallInst *CI : BitSetTestCallSites[BS]) { + ++NumBitSetCallsLowered; + Value *Lowered = + lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); + } + } +} + +void LowerBitSets::verifyBitSetMDNode(MDNode *Op) { + if (Op->getNumOperands() != 3) + report_fatal_error( + "All operands of llvm.bitsets metadata must have 3 elements"); + if (!Op->getOperand(1)) + return; + + auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); + if (!OpConstMD) + report_fatal_error("Bit set element must be a constant"); + auto OpGlobal = dyn_cast<GlobalObject>(OpConstMD->getValue()); + if (!OpGlobal) + return; + + if (OpGlobal->isThreadLocal()) + report_fatal_error("Bit set element may not be thread-local"); + if (OpGlobal->hasSection()) + report_fatal_error("Bit set element may not have an explicit section"); + + if (isa<GlobalVariable>(OpGlobal) && OpGlobal->isDeclarationForLinker()) + report_fatal_error("Bit set global var element must be a definition"); + + auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); + if (!OffsetConstMD) + report_fatal_error("Bit set element offset must be a constant"); + auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); + if (!OffsetInt) + report_fatal_error("Bit set element offset must be an integer constant"); +} + +static const unsigned kX86JumpTableEntrySize = 8; + +unsigned LowerBitSets::getJumpTableEntrySize() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return kX86JumpTableEntrySize; +} + +// Create a constant representing a jump table entry for the target. This +// consists of an instruction sequence containing a relative branch to Dest. The +// constant will be laid out at address Src+(Len*Distance) where Len is the +// target-specific jump table entry size. +Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance) { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + const unsigned kJmpPCRel32Code = 0xe9; + const unsigned kInt3Code = 0xcc; + + ConstantInt *Jmp = ConstantInt::get(Int8Ty, kJmpPCRel32Code); + + // Build a constant representing the displacement between the constant's + // address and Dest. This will resolve to a PC32 relocation referring to Dest. + Constant *DestInt = ConstantExpr::getPtrToInt(Dest, IntPtrTy); + Constant *SrcInt = ConstantExpr::getPtrToInt(Src, IntPtrTy); + Constant *Disp = ConstantExpr::getSub(DestInt, SrcInt); + ConstantInt *DispOffset = + ConstantInt::get(IntPtrTy, Distance * kX86JumpTableEntrySize + 5); + Constant *OffsetedDisp = ConstantExpr::getSub(Disp, DispOffset); + OffsetedDisp = ConstantExpr::getTruncOrBitCast(OffsetedDisp, Int32Ty); + + ConstantInt *Int3 = ConstantInt::get(Int8Ty, kInt3Code); + + Constant *Fields[] = { + Jmp, OffsetedDisp, Int3, Int3, Int3, + }; + return ConstantStruct::getAnon(Fields, /*Packed=*/true); +} + +Type *LowerBitSets::getJumpTableEntryType() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return StructType::get(M->getContext(), + {Int8Ty, Int32Ty, Int8Ty, Int8Ty, Int8Ty}, + /*Packed=*/true); +} + +/// Given a disjoint set of bitsets and functions, build a jump table for the +/// functions, build the bit sets and lower the llvm.bitset.test calls. 
+void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions) { + // Unlike the global bitset builder, the function bitset builder cannot + // re-arrange functions in a particular order and base its calculations on the + // layout of the functions' entry points, as we have no idea how large a + // particular function will end up being (the size could even depend on what + // this pass does!) Instead, we build a jump table, which is a block of code + // consisting of one branch instruction for each of the functions in the bit + // set that branches to the target function, and redirect any taken function + // addresses to the corresponding jump table entry. In the object file's + // symbol table, the symbols for the target functions also refer to the jump + // table entries, so that addresses taken outside the module will pass any + // verification done inside the module. + // + // In more concrete terms, suppose we have three functions f, g, h which are + // members of a single bitset, and a function foo that returns their + // addresses: + // + // f: + // mov 0, %eax + // ret + // + // g: + // mov 1, %eax + // ret + // + // h: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // To create a jump table for these functions, we instruct the LLVM code + // generator to output a jump table in the .text section. This is done by + // representing the instructions in the jump table as an LLVM constant and + // placing them in a global variable in the .text section. The end result will + // (conceptually) look like this: + // + // f: + // jmp .Ltmp0 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // g: + // jmp .Ltmp1 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // h: + // jmp .Ltmp2 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // .Ltmp0: + // mov 0, %eax + // ret + // + // .Ltmp1: + // mov 1, %eax + // ret + // + // .Ltmp2: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // Because the addresses of f, g, h are evenly spaced at a power of 2, in the + // normal case the check can be carried out using the same kind of simple + // arithmetic that we normally use for globals. + + assert(!Functions.empty()); + + // Build a simple layout based on the regular layout of jump tables. + DenseMap<GlobalObject *, uint64_t> GlobalLayout; + unsigned EntrySize = getJumpTableEntrySize(); + for (unsigned I = 0; I != Functions.size(); ++I) + GlobalLayout[Functions[I]] = I * EntrySize; + + // Create a constant to hold the jump table. + ArrayType *JumpTableType = + ArrayType::get(getJumpTableEntryType(), Functions.size()); + auto JumpTable = new GlobalVariable(*M, JumpTableType, + /*isConstant=*/true, + GlobalValue::PrivateLinkage, nullptr); + JumpTable->setSection(ObjectFormat == Triple::MachO + ? "__TEXT,__text,regular,pure_instructions" + : ".text"); + lowerBitSetCalls(BitSets, JumpTable, GlobalLayout); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. 
+ for (unsigned I = 0; I != Functions.size(); ++I) { + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getGetElementPtr( + JumpTableType, JumpTable, + ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + Functions[I]->getType()); + if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) { + Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr); + } else { + assert(Functions[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(Functions[I]->getValueType(), 0, + Functions[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Functions[I]->getVisibility()); + GAlias->takeName(Functions[I]); + Functions[I]->replaceAllUsesWith(GAlias); + } + if (!Functions[I]->isDeclarationForLinker()) + Functions[I]->setLinkage(GlobalValue::PrivateLinkage); + } + + // Build and set the jump table's initializer. + std::vector<Constant *> JumpTableEntries; + for (unsigned I = 0; I != Functions.size(); ++I) + JumpTableEntries.push_back( + createJumpTableEntry(JumpTable, Functions[I], I)); + JumpTable->setInitializer( + ConstantArray::get(JumpTableType, JumpTableEntries)); +} + +void LowerBitSets::buildBitSetsFromDisjointSet( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalObject *> Globals) { + llvm::DenseMap<Metadata *, uint64_t> BitSetIndices; + llvm::DenseMap<GlobalObject *, uint64_t> GlobalIndices; + for (unsigned I = 0; I != BitSets.size(); ++I) + BitSetIndices[BitSets[I]] = I; + for (unsigned I = 0; I != Globals.size(); ++I) + GlobalIndices[Globals[I]] = I; + + // For each bitset, build a set of indices that refer to globals referenced by + // the bitset. + std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + // Op = { bitset name, global, offset } + if (!Op->getOperand(1)) + continue; + auto I = BitSetIndices.find(Op->getOperand(0)); + if (I == BitSetIndices.end()) + continue; + + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + if (!OpGlobal) + continue; + BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); + } + } + + // Order the sets of indices by size. The GlobalLayoutBuilder works best + // when given small index sets first. + std::stable_sort( + BitSetMembers.begin(), BitSetMembers.end(), + [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { + return O1.size() < O2.size(); + }); + + // Create a GlobalLayoutBuilder and provide it with index sets as layout + // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as + // close together as possible. + GlobalLayoutBuilder GLB(Globals.size()); + for (auto &&MemSet : BitSetMembers) + GLB.addFragment(MemSet); + + // Build the bitsets from this disjoint set. + if (Globals.empty() || isa<GlobalVariable>(Globals[0])) { + // Build a vector of global variables with the computed layout. + std::vector<GlobalVariable *> OrderedGVs(Globals.size()); + auto OGI = OrderedGVs.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + auto GV = dyn_cast<GlobalVariable>(Globals[Offset]); + if (!GV) + report_fatal_error( + "Bit set may not contain both global variables and functions"); + *OGI++ = GV; + } + } + + buildBitSetsFromGlobalVariables(BitSets, OrderedGVs); + } else { + // Build a vector of functions with the computed layout. 
+ std::vector<Function *> OrderedFns(Globals.size()); + auto OFI = OrderedFns.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + auto Fn = dyn_cast<Function>(Globals[Offset]); + if (!Fn) + report_fatal_error( + "Bit set may not contain both global variables and functions"); + *OFI++ = Fn; + } + } + + buildBitSetsFromFunctions(BitSets, OrderedFns); + } +} + /// Lower all bit sets in this module. bool LowerBitSets::buildBitSets() { Function *BitSetTestFunc = @@ -576,24 +926,36 @@ bool LowerBitSets::buildBitSets() { // Equivalence class set containing bitsets and the globals they reference. // This is used to partition the set of bitsets in the module into disjoint // sets. - typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>> + typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>> GlobalClassesTy; GlobalClassesTy GlobalClasses; + // Verify the bitset metadata and build a mapping from bitset identifiers to + // their last observed index in BitSetNM. This will be used later to + // deterministically order the list of bitset identifiers. + llvm::DenseMap<Metadata *, unsigned> BitSetIdIndices; + if (BitSetNM) { + for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) { + MDNode *Op = BitSetNM->getOperand(I); + verifyBitSetMDNode(Op); + BitSetIdIndices[Op->getOperand(0)] = I; + } + } + for (const Use &U : BitSetTestFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); - if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata())) + if (!BitSetMDVal) report_fatal_error( - "Second argument of llvm.bitset.test must be metadata string"); - auto BitSet = cast<MDString>(BitSetMDVal->getMetadata()); + "Second argument of llvm.bitset.test must be metadata"); + auto BitSet = BitSetMDVal->getMetadata(); // Add the call site to the list of call sites for this bit set. We also use // BitSetTestCallSites to keep track of whether we have seen this bit set // before. If we have, we don't need to re-add the referenced globals to the // equivalence class. - std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator, + std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool> Ins = BitSetTestCallSites.insert( std::make_pair(BitSet, std::vector<CallInst *>())); @@ -608,31 +970,16 @@ bool LowerBitSets::buildBitSets() { if (!BitSetNM) continue; - // Verify the bitset metadata and add the referenced globals to the bitset's - // equivalence class. + // Add the referenced globals to the bitset's equivalence class.
for (MDNode *Op : BitSetNM->operands()) { - if (Op->getNumOperands() != 3) - report_fatal_error( - "All operands of llvm.bitsets metadata must have 3 elements"); - if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); - if (!OpConstMD) - report_fatal_error("Bit set element must be a constant"); - auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); if (!OpGlobal) continue; - auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); - if (!OffsetConstMD) - report_fatal_error("Bit set element offset must be a constant"); - auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); - if (!OffsetInt) - report_fatal_error( - "Bit set element offset must be an integer constant"); - CurSet = GlobalClasses.unionSets( CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal))); } @@ -641,79 +988,51 @@ bool LowerBitSets::buildBitSets() { if (GlobalClasses.empty()) return false; - // For each disjoint set we found... + // Build a list of disjoint sets ordered by their maximum BitSetNM index + // for determinism. + std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets; for (GlobalClassesTy::iterator I = GlobalClasses.begin(), E = GlobalClasses.end(); I != E; ++I) { if (!I->isLeader()) continue; - ++NumBitSetDisjointSets; - // Build the list of bitsets and referenced globals in this disjoint set. - std::vector<MDString *> BitSets; - std::vector<GlobalVariable *> Globals; - llvm::DenseMap<MDString *, uint64_t> BitSetIndices; - llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; + unsigned MaxIndex = 0; for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); MI != GlobalClasses.member_end(); ++MI) { - if ((*MI).is<MDString *>()) { - BitSetIndices[MI->get<MDString *>()] = BitSets.size(); - BitSets.push_back(MI->get<MDString *>()); - } else { - GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); - Globals.push_back(MI->get<GlobalVariable *>()); - } + if ((*MI).is<Metadata *>()) + MaxIndex = std::max(MaxIndex, BitSetIdIndices[MI->get<Metadata *>()]); } + Sets.emplace_back(I, MaxIndex); + } + std::sort(Sets.begin(), Sets.end(), + [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1, + const std::pair<GlobalClassesTy::iterator, unsigned> &S2) { + return S1.second < S2.second; + }); - // For each bitset, build a set of indices that refer to globals referenced - // by the bitset. - std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); - if (BitSetNM) { - for (MDNode *Op : BitSetNM->operands()) { - // Op = { bitset name, global, offset } - if (!Op->getOperand(1)) - continue; - auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); - if (I == BitSetIndices.end()) - continue; - - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); - if (!OpGlobal) - continue; - BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); - } + // For each disjoint set we found... + for (const auto &S : Sets) { + // Build the list of bitsets in this disjoint set. 
+ std::vector<Metadata *> BitSets; + std::vector<GlobalObject *> Globals; + for (GlobalClassesTy::member_iterator MI = + GlobalClasses.member_begin(S.first); + MI != GlobalClasses.member_end(); ++MI) { + if ((*MI).is<Metadata *>()) + BitSets.push_back(MI->get<Metadata *>()); + else + Globals.push_back(MI->get<GlobalObject *>()); } - // Order the sets of indices by size. The GlobalLayoutBuilder works best - // when given small index sets first. - std::stable_sort( - BitSetMembers.begin(), BitSetMembers.end(), - [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { - return O1.size() < O2.size(); - }); - - // Create a GlobalLayoutBuilder and provide it with index sets as layout - // fragments. The GlobalLayoutBuilder tries to lay out members of fragments - // as close together as possible. - GlobalLayoutBuilder GLB(Globals.size()); - for (auto &&MemSet : BitSetMembers) - GLB.addFragment(MemSet); - - // Build a vector of globals with the computed layout. - std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); - auto OGI = OrderedGlobals.begin(); - for (auto &&F : GLB.Fragments) - for (auto &&Offset : F) - *OGI++ = Globals[Offset]; - - // Order bitsets by name for determinism. - std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { - return S1->getString() < S2->getString(); + // Order bitsets by BitSetNM index for determinism. This ordering is stable + // as there is a one-to-one mapping between metadata and indices. + std::sort(BitSets.begin(), BitSets.end(), [&](Metadata *M1, Metadata *M2) { + return BitSetIdIndices[M1] < BitSetIdIndices[M2]; }); - // Build the bitsets from this disjoint set. - buildBitSetsFromGlobals(BitSets, OrderedGlobals); + // Lower the bitsets in this disjoint set. + buildBitSetsFromDisjointSet(BitSets, Globals); } allocateByteArrays(); diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 2e3519e..8a209a1 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -27,6 +27,14 @@ // -- We define Function* container class with custom "operator<" (FunctionPtr). // -- "FunctionPtr" instances are stored in std::set collection, so every // std::set::insert operation will give you result in log(N) time. +// +// As an optimization, a hash of the function structure is calculated first, and +// two functions are only compared if they have the same hash. This hash is +// cheap to compute, and has the property that if function F == G according to +// the comparison function, then hash(F) == hash(G). This consistency property +// is critical to ensuring all possible merging opportunities are exploited. +// Collisions in the hash affect the speed of the pass but not the correctness +// or determinism of the resulting transformation. // // When a match is found the functions are folded. 
If both functions are // overridable, we move the functionality into a new internal function and @@ -87,6 +95,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Hashing.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -97,12 +106,14 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mergefunc" @@ -121,21 +132,64 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck( namespace { +/// GlobalNumberState assigns an integer to each global value in the program, +/// which is used by the comparison routine to order references to globals. This +/// state must be preserved throughout the pass, because Functions and other +/// globals need to maintain their relative order. Globals are assigned a number +/// when they are first visited. This order is deterministic, and so the +/// assigned numbers are as well. When two functions are merged, neither number +/// is updated. If the symbols are weak, this would be incorrect. If they are +/// strong, then one will be replaced at all references to the other, and so +/// direct callsites will now see one or the other symbol, and no update is +/// necessary. Note that if we were guaranteed unique names, we could just +/// compare those, but this would not work for stripped bitcodes or for those +/// few symbols without a name. +class GlobalNumberState { + struct Config : ValueMapConfig<GlobalValue*> { + enum { FollowRAUW = false }; + }; + // Each GlobalValue is mapped to an identifier. The Config ensures when RAUW + // occurs, the mapping does not change. Tracking changes is unnecessary, and + // also problematic for weak symbols (which may be overwritten). + typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap; + ValueNumberMap GlobalNumbers; + // The next unused serial number to assign to a global. + uint64_t NextNumber; + public: + GlobalNumberState() : GlobalNumbers(), NextNumber(0) {} + uint64_t getNumber(GlobalValue* Global) { + ValueNumberMap::iterator MapIter; + bool Inserted; + std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber}); + if (Inserted) + NextNumber++; + return MapIter->second; + } + void clear() { + GlobalNumbers.clear(); + } +}; + /// FunctionComparator - Compares two functions to determine whether or not /// they will generate machine code with the same behaviour. DataLayout is /// used if available. The comparator always fails conservatively (erring on the /// side of claiming that two functions are different). class FunctionComparator { public: - FunctionComparator(const Function *F1, const Function *F2) - : FnL(F1), FnR(F2) {} + FunctionComparator(const Function *F1, const Function *F2, + GlobalNumberState* GN) + : FnL(F1), FnR(F2), GlobalNumbers(GN) {} /// Test whether the two functions have equivalent behaviour. int compare(); + /// Hash a function. Equivalent functions will have the same hash, and unequal + /// functions will have different hashes with high probability. + typedef uint64_t FunctionHash; + static FunctionHash functionHash(Function &); private: /// Test whether two basic blocks have equivalent behaviour. 
- int compare(const BasicBlock *BBL, const BasicBlock *BBR); + int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR); /// Constants comparison. /// It's analogous to lexicographical comparison between hypothetical numbers @@ -241,6 +295,10 @@ private: /// If these properties are equal - compare their contents. int cmpConstants(const Constant *L, const Constant *R); + /// Compares two global values by number. Uses the GlobalNumberState to + /// identify the same globals across function calls. + int cmpGlobalValues(GlobalValue *L, GlobalValue *R); + /// Assign or look up previously assigned numbers for the two values, and /// return whether the numbers are equal. Numbers are assigned in the order /// visited. @@ -320,8 +378,9 @@ private: /// /// 1. If types are of different kind (different type IDs). /// Return result of type IDs comparison, treating them as numbers. - /// 2. If types are vectors or integers, compare Type* values as numbers. - /// 3. Types has same ID, so check whether they belongs to the next group: + /// 2. If types are integers, check that they have the same width. If they + /// are vectors, check that they have the same count and subtype. + /// 3. Types have the same ID, so check whether they are one of: /// * Void /// * Float /// * Double @@ -330,8 +389,7 @@ private: /// * PPC_FP128 /// * Label /// * Metadata - /// If so - return 0, yes - we can treat these types as equal only because - /// their IDs are same. + /// We can treat these types as equal whenever their IDs are the same. /// 4. If Left and Right are pointers, return result of address space /// comparison (numbers comparison). We can treat pointer types of same /// address space as equal. @@ -343,11 +401,13 @@ private: int cmpTypes(Type *TyL, Type *TyR) const; int cmpNumbers(uint64_t L, uint64_t R) const; - int cmpAPInts(const APInt &L, const APInt &R) const; int cmpAPFloats(const APFloat &L, const APFloat &R) const; - int cmpStrings(StringRef L, StringRef R) const; + int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const; + int cmpMem(StringRef L, StringRef R) const; int cmpAttrs(const AttributeSet L, const AttributeSet R) const; + int cmpRangeMetadata(const MDNode* L, const MDNode* R) const; + int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const; // The two functions undergoing comparison. const Function *FnL, *FnR; @@ -386,30 +446,30 @@ private: /// could be operands from further BBs we didn't scan yet. /// So it's impossible to use dominance properties in general. DenseMap<const Value*, int> sn_mapL, sn_mapR; + + // The global state we will use + GlobalNumberState* GlobalNumbers; }; class FunctionNode { mutable AssertingVH<Function> F; - + FunctionComparator::FunctionHash Hash; public: - FunctionNode(Function *F) : F(F) {} + // Note the hash is recalculated potentially multiple times, but it is cheap. + FunctionNode(Function *F) + : F(F), Hash(FunctionComparator::functionHash(*F)) {} Function *getFunc() const { return F; } + FunctionComparator::FunctionHash getHash() const { return Hash; } /// Replace the reference to the function F by the function G, assuming their /// implementations are equal.
void replaceBy(Function *G) const { - assert(!(*this < FunctionNode(G)) && !(FunctionNode(G) < *this) && - "The two functions must be equal"); - F = G; } - void release() { F = 0; } - bool operator<(const FunctionNode &RHS) const { - return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; - } + void release() { F = nullptr; } }; -} +} // end anonymous namespace int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { if (L < R) return -1; @@ -426,13 +486,25 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const { } int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const { - if (int Res = cmpNumbers((uint64_t)&L.getSemantics(), - (uint64_t)&R.getSemantics())) + // Floats are ordered first by semantics (i.e. float, double, half, etc.), + // then by value interpreted as a bitstring (aka APInt). + const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics(); + if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL), + APFloat::semanticsPrecision(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL), + APFloat::semanticsMaxExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL), + APFloat::semanticsMinExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL), + APFloat::semanticsSizeInBits(SR))) return Res; return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt()); } -int FunctionComparator::cmpStrings(StringRef L, StringRef R) const { +int FunctionComparator::cmpMem(StringRef L, StringRef R) const { // Prevent heavy comparison, compare sizes first. if (int Res = cmpNumbers(L.size(), R.size())) return Res; @@ -466,6 +538,59 @@ int FunctionComparator::cmpAttrs(const AttributeSet L, return 0; } +int FunctionComparator::cmpRangeMetadata(const MDNode* L, + const MDNode* R) const { + if (L == R) + return 0; + if (!L) + return -1; + if (!R) + return 1; + // Range metadata is a sequence of numbers. Make sure they are the same + // sequence. + // TODO: Note that as this is metadata, it is possible to drop and/or merge + // this data when considering functions to merge. Thus this comparison would + // return 0 (i.e. equivalent), but merging would become more complicated + // because the ranges would need to be unioned. It is not likely that + // functions differ ONLY in this metadata if they are actually the same + // function semantically. + if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) + return Res; + for (size_t I = 0; I < L->getNumOperands(); ++I) { + ConstantInt* LLow = mdconst::extract<ConstantInt>(L->getOperand(I)); + ConstantInt* RLow = mdconst::extract<ConstantInt>(R->getOperand(I)); + if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue())) + return Res; + } + return 0; +} + +int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L, + const Instruction *R) const { + ImmutableCallSite LCS(L); + ImmutableCallSite RCS(R); + + assert(LCS && RCS && "Must be calls or invokes!"); + assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!"); + + if (int Res = + cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles())) + return Res; + + for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) { + auto OBL = LCS.getOperandBundleAt(i); + auto OBR = RCS.getOperandBundleAt(i); + + if (int Res = OBL.getTagName().compare(OBR.getTagName())) + return Res; + + if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size())) + return Res; + } + + return 0; +} + /// Constants comparison: /// 1. 
Check whether type of L constant could be losslessly bitcasted to R /// type. @@ -500,9 +625,9 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { unsigned TyLWidth = 0; unsigned TyRWidth = 0; - if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL)) + if (auto *VecTyL = dyn_cast<VectorType>(TyL)) TyLWidth = VecTyL->getBitWidth(); - if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR)) + if (auto *VecTyR = dyn_cast<VectorType>(TyR)) TyRWidth = VecTyR->getBitWidth(); if (TyLWidth != TyRWidth) @@ -538,11 +663,29 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { if (!L->isNullValue() && R->isNullValue()) return -1; + auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L)); + auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R)); + if (GlobalValueL && GlobalValueR) { + return cmpGlobalValues(GlobalValueL, GlobalValueR); + } + if (int Res = cmpNumbers(L->getValueID(), R->getValueID())) return Res; + if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) { + const auto *SeqR = cast<ConstantDataSequential>(R); + // This handles ConstantDataArray and ConstantDataVector. Note that we + // compare the two raw data arrays, which might differ depending on the host + // endianness. This isn't a problem though, because the endianness of a module + // will affect the order of the constants, but this order is the same + // for a given input module and host platform. + return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues()); + } + switch (L->getValueID()) { - case Value::UndefValueVal: return TypesRes; + case Value::UndefValueVal: + case Value::ConstantTokenNoneVal: + return TypesRes; case Value::ConstantIntVal: { const APInt &LInt = cast<ConstantInt>(L)->getValue(); const APInt &RInt = cast<ConstantInt>(R)->getValue(); @@ -609,19 +752,55 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { } return 0; } - case Value::FunctionVal: - case Value::GlobalVariableVal: - case Value::GlobalAliasVal: - default: // Unknown constant, cast L and R pointers to numbers and compare. - return cmpNumbers((uint64_t)L, (uint64_t)R); + case Value::BlockAddressVal: { + const BlockAddress *LBA = cast<BlockAddress>(L); + const BlockAddress *RBA = cast<BlockAddress>(R); + if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction())) + return Res; + if (LBA->getFunction() == RBA->getFunction()) { + // They are BBs in the same function. Order by which comes first in the + // BB order of the function. This order is deterministic. + Function* F = LBA->getFunction(); + BasicBlock *LBB = LBA->getBasicBlock(); + BasicBlock *RBB = RBA->getBasicBlock(); + if (LBB == RBB) + return 0; + for (BasicBlock &BB : F->getBasicBlockList()) { + if (&BB == LBB) { + assert(&BB != RBB); + return -1; + } + if (&BB == RBB) + return 1; + } + llvm_unreachable("Basic Block Address does not point to a basic block in " + "its function."); + return -1; + } else { + // cmpValues said the functions are the same. So because they aren't + // literally the same pointer, they must respectively be the left and + // right functions. + assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR); + // cmpValues will tell us if these are equivalent BasicBlocks, in the + // context of their respective functions. + return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock()); + } } + default: // Unknown constant, abort.
+ DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n"); + llvm_unreachable("Constant ValueID not recognized."); + return -1; + } +} + +int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue* R) { + return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R)); } /// cmpType - compares two types, /// defines total ordering among the types set. /// See method declaration comments for more details. int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { - PointerType *PTyL = dyn_cast<PointerType>(TyL); PointerType *PTyR = dyn_cast<PointerType>(TyR); @@ -642,10 +821,15 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { llvm_unreachable("Unknown type!"); // Fall through in Release mode. case Type::IntegerTyID: - case Type::VectorTyID: - // TyL == TyR would have returned true earlier. - return cmpNumbers((uint64_t)TyL, (uint64_t)TyR); - + return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(), + cast<IntegerType>(TyR)->getBitWidth()); + case Type::VectorTyID: { + VectorType *VTyL = cast<VectorType>(TyL), *VTyR = cast<VectorType>(TyR); + if (int Res = cmpNumbers(VTyL->getNumElements(), VTyR->getNumElements())) + return Res; + return cmpTypes(VTyL->getElementType(), VTyR->getElementType()); + } + // TyL == TyR would have returned true earlier, because types are uniqued. case Type::VoidTyID: case Type::FloatTyID: case Type::DoubleTyID: @@ -654,6 +838,7 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: + case Type::TokenTyID: return 0; case Type::PointerTyID: { @@ -759,8 +944,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope())) return Res; - return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range), + cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const StoreInst *SI = dyn_cast<StoreInst>(L)) { if (int Res = @@ -783,20 +968,24 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes())) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); + if (int Res = cmpOperandBundlesSchema(CI, R)) + return Res; + return cmpRangeMetadata( + CI->getMetadata(LLVMContext::MD_range), + cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); } - if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) { - if (int Res = cmpNumbers(CI->getCallingConv(), + if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) { + if (int Res = cmpNumbers(II->getCallingConv(), cast<InvokeInst>(R)->getCallingConv())) return Res; if (int Res = - cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + return Res; + if (int Res = cmpOperandBundlesSchema(II, R)) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata( + II->getMetadata(LLVMContext::MD_range), + cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) { ArrayRef<unsigned> LIndices = IVI->getIndices(); @@ -876,9 +1065,8 @@ int 
FunctionComparator::cmpGEPs(const GEPOperator *GEPL, if (GEPL->accumulateConstantOffset(DL, OffsetL) && GEPR->accumulateConstantOffset(DL, OffsetR)) return cmpAPInts(OffsetL, OffsetR); - - if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), - (uint64_t)GEPR->getPointerOperand()->getType())) + if (int Res = cmpTypes(GEPL->getSourceElementType(), + GEPR->getSourceElementType())) return Res; if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) @@ -892,6 +1080,28 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL, return 0; } +int FunctionComparator::cmpInlineAsm(const InlineAsm *L, + const InlineAsm *R) const { + // InlineAsm's are uniqued. If they are the same pointer, obviously they are + // the same, otherwise compare the fields. + if (L == R) + return 0; + if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType())) + return Res; + if (int Res = cmpMem(L->getAsmString(), R->getAsmString())) + return Res; + if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString())) + return Res; + if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects())) + return Res; + if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack())) + return Res; + if (int Res = cmpNumbers(L->getDialect(), R->getDialect())) + return Res; + llvm_unreachable("InlineAsm blocks were not uniqued."); + return 0; +} + /// Compare two values used by the two functions under pair-wise comparison. If /// this is the first time the values are seen, they're added to the mapping so /// that we will detect mismatches on next use. @@ -926,7 +1136,7 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R); if (InlineAsmL && InlineAsmR) - return cmpNumbers((uint64_t)L, (uint64_t)R); + return cmpInlineAsm(InlineAsmL, InlineAsmR); if (InlineAsmL) return 1; if (InlineAsmR) @@ -938,12 +1148,13 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { return cmpNumbers(LeftSN.first->second, RightSN.first->second); } // Test whether two basic blocks have equivalent behaviour. -int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { +int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL, + const BasicBlock *BBR) { BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); do { - if (int Res = cmpValues(InstL, InstR)) + if (int Res = cmpValues(&*InstL, &*InstR)) return Res; const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL); @@ -961,7 +1172,7 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { if (int Res = cmpGEPs(GEPL, GEPR)) return Res; } else { - if (int Res = cmpOperations(InstL, InstR)) + if (int Res = cmpOperations(&*InstL, &*InstR)) return Res; assert(InstL->getNumOperands() == InstR->getNumOperands()); @@ -970,11 +1181,8 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { Value *OpR = InstR->getOperand(i); if (int Res = cmpValues(OpL, OpR)) return Res; - if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) - return Res; - // TODO: Already checked in cmpOperation - if (int Res = cmpTypes(OpL->getType(), OpR->getType())) - return Res; + // cmpValues should ensure this is true. 
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0); } } @@ -990,7 +1198,6 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { // Test whether the two functions have equivalent behaviour. int FunctionComparator::compare() { - sn_mapL.clear(); sn_mapR.clear(); @@ -1001,7 +1208,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasGC()) { - if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) + if (int Res = cmpMem(FnL->getGC(), FnR->getGC())) return Res; } @@ -1009,7 +1216,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasSection()) { - if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) + if (int Res = cmpMem(FnL->getSection(), FnR->getSection())) return Res; } @@ -1033,7 +1240,7 @@ int FunctionComparator::compare() { ArgRI = FnR->arg_begin(), ArgLE = FnL->arg_end(); ArgLI != ArgLE; ++ArgLI, ++ArgRI) { - if (cmpValues(ArgLI, ArgRI) != 0) + if (cmpValues(&*ArgLI, &*ArgRI) != 0) llvm_unreachable("Arguments repeat!"); } @@ -1055,7 +1262,7 @@ int FunctionComparator::compare() { if (int Res = cmpValues(BBL, BBR)) return Res; - if (int Res = compare(BBL, BBR)) + if (int Res = cmpBasicBlocks(BBL, BBR)) return Res; const TerminatorInst *TermL = BBL->getTerminator(); @@ -1074,6 +1281,68 @@ int FunctionComparator::compare() { } namespace { +// Accumulate the hash of a sequence of 64-bit integers. This is similar to a +// hash of a sequence of 64bit ints, but the entire input does not need to be +// available at once. This interface is necessary for functionHash because it +// needs to accumulate the hash as the structure of the function is traversed +// without saving these values to an intermediate buffer. This form of hashing +// is not often needed, as usually the object to hash is just read from a +// buffer. +class HashAccumulator64 { + uint64_t Hash; +public: + // Initialize to random constant, so the state isn't zero. + HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; } + void add(uint64_t V) { + Hash = llvm::hashing::detail::hash_16_bytes(Hash, V); + } + // No finishing is required, because the entire hash value is used. + uint64_t getHash() { return Hash; } +}; +} // end anonymous namespace + +// A function hash is calculated by considering only the number of arguments and +// whether a function is varargs, the order of basic blocks (given by the +// successors of each basic block in depth first order), and the order of +// opcodes of each instruction within each of these basic blocks. This mirrors +// the strategy compare() uses to compare functions by walking the BBs in depth +// first order and comparing each instruction in sequence. Because this hash +// does not look at the operands, it is insensitive to things such as the +// target of calls and the constants used in the function, which makes it useful +// when possibly merging functions which are the same modulo constants and call +// targets. +FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) { + HashAccumulator64 H; + H.add(F.isVarArg()); + H.add(F.arg_size()); + + SmallVector<const BasicBlock *, 8> BBs; + SmallSet<const BasicBlock *, 16> VisitedBBs; + + // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(), + // accumulating the hash of the function "structure." 
(BB and opcode sequence) + BBs.push_back(&F.getEntryBlock()); + VisitedBBs.insert(BBs[0]); + while (!BBs.empty()) { + const BasicBlock *BB = BBs.pop_back_val(); + // This random value acts as a block header, as otherwise the partition of + // opcodes into BBs wouldn't affect the hash, only the order of the opcodes + H.add(45798); + for (auto &Inst : *BB) { + H.add(Inst.getOpcode()); + } + const TerminatorInst *Term = BB->getTerminator(); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(Term->getSuccessor(i)).second) + continue; + BBs.push_back(Term->getSuccessor(i)); + } + } + return H.getHash(); +} + + +namespace { /// MergeFunctions finds functions which will generate identical machine code, /// by considering all pointer types to be equivalent. Once identified, @@ -1084,14 +1353,31 @@ class MergeFunctions : public ModulePass { public: static char ID; MergeFunctions() - : ModulePass(ID), HasGlobalAliases(false) { + : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(), + HasGlobalAliases(false) { initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override; private: - typedef std::set<FunctionNode> FnTreeType; + // The function comparison operator is provided here so that FunctionNodes do + // not need to become larger with another pointer. + class FunctionNodeCmp { + GlobalNumberState* GlobalNumbers; + public: + FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {} + bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const { + // Order first by hashes, then full function comparison. + if (LHS.getHash() != RHS.getHash()) + return LHS.getHash() < RHS.getHash(); + FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers); + return FCmp.compare() == -1; + } + }; + typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType; + + GlobalNumberState GlobalNumbers; /// A work queue of functions that may have been modified and should be /// analyzed again. @@ -1133,17 +1419,23 @@ private: void writeAlias(Function *F, Function *G); /// Replace function F with function G in the function tree. - void replaceFunctionInTree(FnTreeType::iterator &IterToF, Function *G); + void replaceFunctionInTree(const FunctionNode &FN, Function *G); /// The set of all distinct functions. Use the insert() and remove() methods - /// to modify it. + /// to modify it. The map allows efficient lookup and deferring of Functions. FnTreeType FnTree; + // Map functions to the iterators of the FunctionNode which contains them + // in the FnTree. This must be updated carefully whenever the FnTree is + // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid + // dangling iterators into FnTree. The invariant that preserves this is that + // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree. + ValueMap<Function*, FnTreeType::iterator> FNodesInTree; /// Whether or not the target supports global aliases. 
bool HasGlobalAliases; }; -} // end anonymous namespace +} // end anonymous namespace char MergeFunctions::ID = 0; INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) @@ -1166,8 +1458,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { Function *F1 = cast<Function>(*I); Function *F2 = cast<Function>(*J); - int Res1 = FunctionComparator(F1, F2).compare(); - int Res2 = FunctionComparator(F2, F1).compare(); + int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare(); + int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare(); // If F1 <= F2, then F2 >= F1, otherwise report failure. if (Res1 != -Res2) { @@ -1188,8 +1480,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { continue; Function *F3 = cast<Function>(*K); - int Res3 = FunctionComparator(F1, F3).compare(); - int Res4 = FunctionComparator(F2, F3).compare(); + int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare(); + int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare(); bool Transitive = true; @@ -1227,11 +1519,33 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) - Deferred.push_back(WeakVH(I)); + // All functions in the module, ordered by hash. Functions with a unique + // hash value are easily eliminated. + std::vector<std::pair<FunctionComparator::FunctionHash, Function *>> + HashedFuncs; + for (Function &Func : M) { + if (!Func.isDeclaration() && !Func.hasAvailableExternallyLinkage()) { + HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func}); + } } + std::stable_sort( + HashedFuncs.begin(), HashedFuncs.end(), + [](const std::pair<FunctionComparator::FunctionHash, Function *> &a, + const std::pair<FunctionComparator::FunctionHash, Function *> &b) { + return a.first < b.first; + }); + + auto S = HashedFuncs.begin(); + for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) { + // If the hash value matches the previous value or the next one, we must + // consider merging it. Otherwise it is dropped and never considered again. + if ((I != S && std::prev(I)->first == I->first) || + (std::next(I) != IE && std::next(I)->first == I->first) ) { + Deferred.push_back(WeakVH(I->second)); + } + } + do { std::vector<WeakVH> Worklist; Deferred.swap(Worklist); @@ -1270,6 +1584,7 @@ bool MergeFunctions::runOnModule(Module &M) { } while (!Deferred.empty()); FnTree.clear(); + GlobalNumbers.clear(); return Changed; } @@ -1282,6 +1597,32 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { + // Transfer the called function's attributes to the call site. Due to the + // bitcast we will 'lose' ABI changing attributes because the 'called + // function' is no longer a Function* but the bitcast. Code that looks up + // the attributes from the called function will fail. + + // FIXME: This is not actually true, at least not anymore. The callsite + // will always have the same ABI affecting attributes as the callee, + // because otherwise the original input has UB. Note that Old and New + // always have matching ABI, so no attributes need to be changed. 
+ // Transferring other attributes may help other optimizations, but that + // should be done uniformly and not in this ad-hoc way. + auto &Context = New->getContext(); + auto NewFuncAttrs = New->getAttributes(); + auto CallSiteAttrs = CS.getAttributes(); + + CallSiteAttrs = CallSiteAttrs.addAttributes( + Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); + + for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { + AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); + if (Attrs.getNumSlots()) + CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); + } + + CS.setAttributes(CallSiteAttrs); + remove(CS.getInstruction()->getParent()->getParent()); U->set(BitcastNew); } @@ -1352,15 +1693,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); - AI != AE; ++AI) { - Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); + for (Argument & AI : NewG->args()) { + Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } CallInst *CI = Builder.CreateCall(F, Args); CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { @@ -1379,8 +1720,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. void MergeFunctions::writeAlias(Function *F, Function *G) { - PointerType *PTy = G->getType(); - auto *GA = GlobalAlias::create(PTy, G->getLinkage(), "", F); + auto *GA = GlobalAlias::create(G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); @@ -1425,19 +1765,24 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { ++NumFunctionsMerged; } -/// Replace function F for function G in the map. -void MergeFunctions::replaceFunctionInTree(FnTreeType::iterator &IterToF, +/// Replace function F by function G. +void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN, Function *G) { - Function *F = IterToF->getFunc(); - - // A total order is already guaranteed otherwise because we process strong - // functions before weak functions. - assert(((F->mayBeOverridden() && G->mayBeOverridden()) || - (!F->mayBeOverridden() && !G->mayBeOverridden())) && - "Only change functions if both are strong or both are weak"); - (void)F; - - IterToF->replaceBy(G); + Function *F = FN.getFunc(); + assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 && + "The two functions must be equal"); + + auto I = FNodesInTree.find(F); + assert(I != FNodesInTree.end() && "F should be in FNodesInTree"); + assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G"); + + FnTreeType::iterator IterToFNInFnTree = I->second; + assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree."); + // Remove F -> FN and insert G -> FN + FNodesInTree.erase(I); + FNodesInTree.insert({G, IterToFNInFnTree}); + // Replace F with G in FN, which is stored inside the FnTree. 
+ FN.replaceBy(G); } // Insert a ComparableFunction into the FnTree, or merge it away if equal to one @@ -1447,6 +1792,8 @@ bool MergeFunctions::insert(Function *NewFunction) { FnTree.insert(FunctionNode(NewFunction)); if (Result.second) { + assert(FNodesInTree.count(NewFunction) == 0); + FNodesInTree.insert({NewFunction, Result.first}); DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n'); return false; } @@ -1476,7 +1823,7 @@ bool MergeFunctions::insert(Function *NewFunction) { if (OldF.getFunc()->getName() > NewFunction->getName()) { // Swap the two functions. Function *F = OldF.getFunc(); - replaceFunctionInTree(Result.first, NewFunction); + replaceFunctionInTree(*Result.first, NewFunction); NewFunction = F; assert(OldF.getFunc() != F && "Must have swapped the functions."); } @@ -1495,18 +1842,13 @@ bool MergeFunctions::insert(Function *NewFunction) { // Remove a function from FnTree. If it was already in FnTree, add // it to Deferred so that we'll look at it in the next round. void MergeFunctions::remove(Function *F) { - // We need to make sure we remove F, not a function "equal" to F per the - // function equality comparator. - FnTreeType::iterator found = FnTree.find(FunctionNode(F)); - size_t Erased = 0; - if (found != FnTree.end() && found->getFunc() == F) { - Erased = 1; - FnTree.erase(found); - } - - if (Erased) { - DEBUG(dbgs() << "Removed " << F->getName() - << " from set and deferred it.\n"); + auto I = FNodesInTree.find(F); + if (I != FNodesInTree.end()) { + DEBUG(dbgs() << "Deferred " << F->getName() << ".\n"); + FnTree.erase(I->second); + // I->second has been invalidated; remove it from the FNodesInTree map to + // preserve the invariant. + FNodesInTree.erase(I); Deferred.emplace_back(F); } } @@ -1516,6 +1858,8 @@ void MergeFunctions::remove(Function *F) { void MergeFunctions::removeUsers(Value *V) { std::vector<Value *> Worklist; Worklist.push_back(V); + SmallSet<Value*, 8> Visited; + Visited.insert(V); while (!Worklist.empty()) { Value *V = Worklist.back(); Worklist.pop_back(); @@ -1526,8 +1870,10 @@ void MergeFunctions::removeUsers(Value *V) { } else if (isa<GlobalValue>(U)) { // do nothing } else if (Constant *C = dyn_cast<Constant>(U)) { - for (User *UU : C->users()) - Worklist.push_back(UU); + for (User *UU : C->users()) { + if (Visited.insert(UU).second) + Worklist.push_back(UU); + } } } } diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index 4a7cb7b..0c5c84b 100644 --- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -50,7 +50,7 @@ ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); } Function* PartialInliner::unswitchFunction(Function* F) { // First, verify that this function is an unswitching candidate... - BasicBlock* entryBlock = F->begin(); + BasicBlock *entryBlock = &F->front(); BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator()); if (!BR || BR->isUnconditional()) return nullptr; @@ -89,18 +89,18 @@ Function* PartialInliner::unswitchFunction(Function* F) { // of which will go outside. 
BasicBlock* preReturn = newReturnBlock; newReturnBlock = newReturnBlock->splitBasicBlock( - newReturnBlock->getFirstNonPHI()); + newReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = preReturn->begin(); - BasicBlock::iterator Ins = newReturnBlock->begin(); + Instruction *Ins = &newReturnBlock->front(); while (I != preReturn->end()) { PHINode* OldPhi = dyn_cast<PHINode>(I); if (!OldPhi) break; - - PHINode* retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); + + PHINode *retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); OldPhi->replaceAllUsesWith(retPhi); Ins = newReturnBlock->getFirstNonPHI(); - - retPhi->addIncoming(I, preReturn); + + retPhi->addIncoming(&*I, preReturn); retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock), newEntryBlock); OldPhi->removeIncomingValue(newEntryBlock); @@ -116,8 +116,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { FE = duplicateFunction->end(); FI != FE; ++FI) if (&*FI != newEntryBlock && &*FI != newReturnBlock && &*FI != newNonReturnBlock) - toExtract.push_back(FI); - + toExtract.push_back(&*FI); + // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*duplicateFunction); diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 909baae..faada9c 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -12,19 +12,26 @@ // //===----------------------------------------------------------------------===// - #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm-c/Transforms/PassManagerBuilder.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" @@ -89,11 +96,21 @@ static cl::opt<bool> EnableLoopDistribute( "enable-loop-distribute", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass")); +static cl::opt<bool> EnableNonLTOGlobalsModRef( + "enable-non-lto-gmr", cl::init(true), cl::Hidden, + cl::desc( + "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); + +static cl::opt<bool> EnableLoopLoadElim( + "enable-loop-load-elim", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopLoadElimination Pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; + FunctionIndex = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -143,10 +160,9 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses( // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. 
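For orientation, PassManagerBuilder is driven from embedder code such as the sketch below. A minimal example assuming LLVM 3.8-era legacy pass-manager headers: the builder fields and populate calls are real API, while the surrounding function is hypothetical.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Populate both pipelines roughly the way a -O2 frontend would.
static void buildPipelines(llvm::legacy::PassManager &MPM,
                           llvm::legacy::FunctionPassManager &FPM) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;         // -O2
  PMB.SizeLevel = 0;        // not -Os / -Oz
  PMB.LoopVectorize = true; // toggles consulted by the pipeline code here
  PMB.SLPVectorize = true;
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);
}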
if (UseCFLAA) - PM.add(createCFLAliasAnalysisPass()); - PM.add(createTypeBasedAliasAnalysisPass()); - PM.add(createScopedNoAliasAAPass()); - PM.add(createBasicAliasAnalysisPass()); + PM.add(createCFLAAWrapperPass()); + PM.add(createTypeBasedAAWrapperPass()); + PM.add(createScopedNoAliasAAWrapperPass()); } void PassManagerBuilder::populateFunctionPassManager( @@ -172,6 +188,9 @@ void PassManagerBuilder::populateFunctionPassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { + // Allow forcing function attributes as a debugging and tuning aid. + MPM.add(createForceFunctionAttrsLegacyPass()); + // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { @@ -201,10 +220,15 @@ void PassManagerBuilder::populateModulePassManager( addInitialAliasAnalysisPasses(MPM); if (!DisableUnitAtATime) { + // Infer attributes about declarations if possible. + MPM.add(createInferFunctionAttrsLegacyPass()); + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createGlobalOptimizerPass()); // Optimize out global vars + // Promote any localized global vars + MPM.add(createPromoteMemoryToRegisterPass()); MPM.add(createDeadArgEliminationPass()); // Dead argument elimination @@ -213,6 +237,12 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } + if (EnableNonLTOGlobalsModRef) + // We add a module alias analysis pass here. In part due to bugs in the + // analysis infrastructure this "works" in that the analysis stays alive + // for the entire SCC pass run below. + MPM.add(createGlobalsAAWrapperPass()); + // Start of CallGraph SCC passes. if (!DisableUnitAtATime) MPM.add(createPruneEHPass()); // Remove dead EH info @@ -221,7 +251,7 @@ void PassManagerBuilder::populateModulePassManager( Inliner = nullptr; } if (!DisableUnitAtATime) - MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs + MPM.add(createPostOrderFunctionAttrsPass()); if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -245,6 +275,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createCFGSimplificationPass()); MPM.add(createInstructionCombiningPass()); MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. @@ -315,9 +346,45 @@ void PassManagerBuilder::populateModulePassManager( // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (!DisableUnitAtATime) + MPM.add(createReversePostOrderFunctionAttrsPass()); + + if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO) { + // Remove avail extern fns and globals definitions if we aren't + // compiling an object file for later LTO. For LTO we want to preserve + // these so they are eligible for inlining at link-time. Note if they + // are unreferenced they will be removed by GlobalDCE later, so + // this only impacts referenced available externally globals. + // Eventually they will be suppressed during codegen, but eliminating + // here enables more opportunity for GlobalDCE as it may make + // globals referenced by available external functions dead + // and saves running remaining passes on the eliminated functions. 
+ MPM.add(createEliminateAvailableExternallyPass()); + } + + if (EnableNonLTOGlobalsModRef) + // We add a fresh GlobalsModRef run at this point. This is particularly + // useful as the above will have inlined, DCE'ed, and function-attr + // propagated everything. We should at this point have a reasonably minimal + // and richly annotated call graph. By computing aliasing and mod/ref + // information for all local globals here, the late loop passes and notably + // the vectorizer will be able to use them to help recognize vectorizable + // memory operations. + // + // Note that this relies on a bug in the pass manager which preserves + // a module analysis into a function pass pipeline (and throughout it) so + // long as the first function pass doesn't invalidate the module analysis. + // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for + // this to work. Fortunately, it is trivial to preserve AliasAnalysis + // (doing nothing preserves it as it is required to be conservatively + // correct in the face of IR changes). + MPM.add(createGlobalsAAWrapperPass()); + if (RunFloat2Int) MPM.add(createFloat2IntPass()); + addExtensionsToPM(EP_VectorizerStart, MPM); + // Re-rotate loops in all our loop nests. These may have fallen out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. Disable header duplication at -Oz. @@ -329,6 +396,12 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + if (EnableLoopLoadElim) + MPM.add(createLoopLoadEliminationPass()); + // FIXME: Because of #pragma vectorize enable, the passes below are always // inserted in the pipeline, even when the vectorizer doesn't run (ex. when // on -O1 and no #pragma is found). Would be good to have these two passes @@ -402,17 +475,6 @@ void PassManagerBuilder::populateModulePassManager( // GlobalOpt already deletes dead functions and globals; at -O2 try a // late pass of GlobalDCE. It is capable of deleting dead cycles. if (OptLevel > 1) { - if (!PrepareForLTO) { - // Remove avail extern fns and globals definitions if we aren't - // compiling an object file for later LTO. For LTO we want to preserve - // these so they are eligible for inlining at link-time. Note if they - // are unreferenced they will be removed by GlobalDCE below, so - // this only impacts referenced available externally globals. - // Eventually they will be suppressed during codegen, but eliminating - // here enables more opportunity for GlobalDCE as it may make - // globals referenced by available external functions dead. - MPM.add(createEliminateAvailableExternallyPass()); - } MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. MPM.add(createConstantMergePass()); // Merge dup global constants } @@ -428,13 +490,26 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); + if (FunctionIndex) + PM.add(createFunctionImportPass(FunctionIndex)); + + // Allow forcing function attributes as a debugging and tuning aid. + PM.add(createForceFunctionAttrsLegacyPass()); + + // Infer attributes about declarations if possible. + PM.add(createInferFunctionAttrsLegacyPass()); + + // Propagate constants at call sites into the functions they call.
This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. PM.add(createIPSCCPPass()); // Now that we internalized some globals, see if we can hack on them! + PM.add(createPostOrderFunctionAttrsPass()); + PM.add(createReversePostOrderFunctionAttrsPass()); PM.add(createGlobalOptimizerPass()); + // Promote any localized global vars. + PM.add(createPromoteMemoryToRegisterPass()); // Linking modules together can lead to duplicated global constants, only // keep one copy of each constant. @@ -480,8 +555,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createScalarReplAggregatesPass()); // Run a few AA driven optimizations here and now, to cleanup the code. - PM.add(createFunctionAttrsPass()); // Add nocapture. - PM.add(createGlobalsModRefPass()); // IP alias analysis. + PM.add(createPostOrderFunctionAttrsPass()); // Add nocapture. + PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. if (EnableMLSM) @@ -500,6 +575,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createLoopVectorizePass(true, LoopVectorize)); + // Now that we've optimized loops (in particular loop induction variables), + // we may have exposed more scalar opportunities. Run parts of the scalar + // optimizer again at this point. + PM.add(createInstructionCombiningPass()); // Initial cleanup + PM.add(createCFGSimplificationPass()); // if-convert + PM.add(createSCCPPass()); // Propagate exposed constants + PM.add(createInstructionCombiningPass()); // Clean up again + PM.add(createBitTrackingDCEPass()); + // More scalar chains could be vectorized due to more alias information if (RunSLPAfterLoopVectorization) if (SLPVectorize) @@ -524,6 +608,9 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass()); + // Drop bodies of available externally objects to improve GlobalDCE. + PM.add(createEliminateAvailableExternallyPass()); + // Now that we have optimized the program, discard unreachable functions. PM.add(createGlobalDCEPass()); @@ -543,6 +630,10 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel > 1) addLTOOptimizationPasses(PM); + // Create a function that performs CFI checks for cross-DSO calls with targets + // in the current module. + PM.add(createCrossDSOCFIPass()); + // Lower bit sets to globals. This pass supports Clang's control flow // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI // is enabled. The pass does nothing if CFI is disabled. diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index b2f1010..3af4afb 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -153,21 +153,16 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // If the SCC doesn't unwind or doesn't throw, note this fact. 
if (!SCCMightUnwind || !SCCMightReturn) for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - AttrBuilder NewAttributes; - - if (!SCCMightUnwind) - NewAttributes.addAttribute(Attribute::NoUnwind); - if (!SCCMightReturn) - NewAttributes.addAttribute(Attribute::NoReturn); - Function *F = (*I)->getFunction(); - const AttributeSet &PAL = F->getAttributes().getFnAttributes(); - const AttributeSet &NPAL = AttributeSet::get( - F->getContext(), AttributeSet::FunctionIndex, NewAttributes); - if (PAL != NPAL) { + if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) { + F->addFnAttr(Attribute::NoUnwind); + MadeChange = true; + } + + if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) { + F->addFnAttr(Attribute::NoReturn); MadeChange = true; - F->addAttributes(AttributeSet::FunctionIndex, NPAL); } } @@ -191,9 +186,13 @@ bool PruneEH::SimplifyFunction(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); + CallInst *Call = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); Call->takeName(II); Call->setCallingConv(II->getCallingConv()); Call->setAttributes(II->getAttributes()); @@ -233,7 +232,7 @@ bool PruneEH::SimplifyFunction(Function *F) { // Remove the uncond branch and add an unreachable. BB->getInstList().pop_back(); - new UnreachableInst(BB->getContext(), BB); + new UnreachableInst(BB->getContext(), &*BB); DeleteBasicBlock(New); // Delete the new BB. 
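The invoke-to-call conversion in the hunk above follows a standard lowering pattern; below it is shown in isolation together with the control-flow cleanup the pass performs around it. This is a sketch assuming LLVM 3.8-era APIs (the calls mirror those visible in the hunk), not a drop-in replacement for the pass code:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

// Replace a nounwind invoke with a plain call, preserving arguments,
// operand bundles, name, calling convention, and attributes.
static void lowerInvokeToCall(llvm::InvokeInst *II) {
  using namespace llvm;
  SmallVector<Value *, 8> Args(II->arg_begin(), II->arg_end());
  SmallVector<OperandBundleDef, 1> OpBundles;
  II->getOperandBundlesAsDefs(OpBundles);

  CallInst *Call =
      CallInst::Create(II->getCalledValue(), Args, OpBundles, "", II);
  Call->takeName(II);
  Call->setCallingConv(II->getCallingConv());
  Call->setAttributes(II->getAttributes());
  Call->setDebugLoc(II->getDebugLoc());

  // The call cannot unwind, so the normal destination is the only successor:
  // branch there unconditionally, detach the unwind edge, drop the invoke.
  II->replaceAllUsesWith(Call);
  BranchInst::Create(II->getNormalDest(), II);
  II->getUnwindDest()->removePredecessor(II->getParent());
  II->eraseFromParent();
}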
MadeChange = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp index c8dfa54..928d92e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -22,7 +22,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -44,7 +43,11 @@ #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" #include <cctype> using namespace llvm; @@ -61,27 +64,51 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations( "sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG.")); +static cl::opt<unsigned> SampleProfileRecordCoverage( + "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of records in the input profile " + "are matched to the IR.")); +static cl::opt<unsigned> SampleProfileSampleCoverage( + "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of samples in the input profile " + "are matched to the IR.")); +static cl::opt<double> SampleProfileHotThreshold( + "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), + cl::desc("Inlined functions that account for more than N% of all samples " + "collected in the parent function will be inlined again.")); +static cl::opt<double> SampleProfileGlobalHotThreshold( + "sample-profile-global-hot-threshold", cl::init(30), cl::value_desc("N"), + cl::desc("Top-level functions that account for more than N% of all samples " + "collected in the profile will be marked as hot for the inliner " + "to consider.")); +static cl::opt<double> SampleProfileGlobalColdThreshold( + "sample-profile-global-cold-threshold", cl::init(0.5), cl::value_desc("N"), + cl::desc("Top-level functions that account for less than N% of all samples " + "collected in the profile will be marked as cold for the inliner " + "to consider.")); namespace { -typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; -typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; -typedef std::pair<BasicBlock *, BasicBlock *> Edge; -typedef DenseMap<Edge, unsigned> EdgeWeightMap; -typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; +typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; +typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; +typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; +typedef DenseMap<Edge, uint64_t> EdgeWeightMap; +typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> + BlockEdgeMap; /// \brief Sample profile pass. /// /// This pass reads profile data from the file specified by /// -sample-profile-file and annotates every affected function with the /// profile information found in that file. 
-class SampleProfileLoader : public FunctionPass { +class SampleProfileLoader : public ModulePass { public: // Class identification, replacement for typeinfo static char ID; SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), - Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { + : ModulePass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Reader(), + Samples(nullptr), Filename(Name), ProfileIsValid(false), + TotalCollectedSamples(0) { initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); } @@ -91,36 +118,37 @@ public: const char *getPassName() const override { return "Sample profile pass"; } - bool runOnFunction(Function &F) override; + bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); } protected: + bool runOnFunction(Function &F); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); - unsigned getInstWeight(Instruction &I); - unsigned getBlockWeight(BasicBlock *BB); + ErrorOr<uint64_t> getInstWeight(const Instruction &I) const; + ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB) const; + const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const; + const FunctionSamples *findFunctionSamples(const Instruction &I) const; + bool inlineHotFunctions(Function &F); + bool emitInlineHints(Function &F); void printEdgeWeight(raw_ostream &OS, Edge E); - void printBlockWeight(raw_ostream &OS, BasicBlock *BB); - void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); + void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; + void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); void findEquivalencesFor(BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree); void propagateWeights(Function &F); - unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); + uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); bool propagateThroughEdges(Function &F); - - /// \brief Line number for the function header. Used to compute absolute - /// line numbers from the relative line numbers found in the profile. - unsigned HeaderLineno; + void computeDominanceAndLoopInfo(Function &F); + unsigned getOffset(unsigned L, unsigned H) const; + void clearFunctionData(); /// \brief Map basic blocks to their computed weights. /// @@ -135,7 +163,7 @@ protected: EdgeWeightMap EdgeWeights; /// \brief Set of visited blocks during propagation. - SmallPtrSet<BasicBlock *, 128> VisitedBlocks; + SmallPtrSet<const BasicBlock *, 128> VisitedBlocks; /// \brief Set of visited edges during propagation. SmallSet<Edge, 128> VisitedEdges; @@ -149,9 +177,9 @@ protected: EquivalenceClassMap EquivalenceClass; /// \brief Dominance, post-dominance and loop information. - DominatorTree *DT; - PostDominatorTree *PDT; - LoopInfo *LI; + std::unique_ptr<DominatorTree> DT; + std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT; + std::unique_ptr<LoopInfo> LI; /// \brief Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -159,9 +187,6 @@ protected: /// \brief Successors for each basic block in the CFG. 
BlockEdgeMap Successors; - /// \brief LLVM context holding the debug data we need. - LLVMContext *Ctx; - /// \brief Profile reader object. std::unique_ptr<SampleProfileReader> Reader; @@ -173,7 +198,207 @@ protected: /// \brief Flag indicating whether the profile input loaded successfully. bool ProfileIsValid; + + /// \brief Total number of samples collected in this profile. + /// + /// This is the sum of all the samples collected in all the functions executed + /// at runtime. + uint64_t TotalCollectedSamples; }; + +class SampleCoverageTracker { +public: + SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {} + + bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, + uint32_t Discriminator, uint64_t Samples); + unsigned computeCoverage(unsigned Used, unsigned Total) const; + unsigned countUsedRecords(const FunctionSamples *FS) const; + unsigned countBodyRecords(const FunctionSamples *FS) const; + uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } + uint64_t countBodySamples(const FunctionSamples *FS) const; + void clear() { + SampleCoverage.clear(); + TotalUsedSamples = 0; + } + +private: + typedef std::map<LineLocation, unsigned> BodySampleCoverageMap; + typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap> + FunctionSamplesCoverageMap; + + /// Coverage map for sampling records. + /// + /// This map keeps a record of sampling records that have been matched to + /// an IR instruction. This is used to detect some form of staleness in + /// profiles (see flag -sample-profile-check-coverage). + /// + /// Each entry in the map corresponds to a FunctionSamples instance. This is + /// another map that counts how many times the sample record at the + /// given location has been used. + FunctionSamplesCoverageMap SampleCoverage; + + /// Number of samples used from the profile. + /// + /// When a sampling record is used for the first time, the samples from + /// that record are added to this accumulator. Coverage is later computed + /// based on the total number of samples available in this function and + /// its callsites. + /// + /// Note that this accumulator tracks samples used from a single function + /// and all the inlined callsites. Strictly, we should have a map of counters + /// keyed by FunctionSamples pointers, but these stats are cleared after + /// every function, so we just need to keep a single counter. + uint64_t TotalUsedSamples; +}; + +SampleCoverageTracker CoverageTracker; + +/// Return true if the given callsite is hot with respect to its caller. +/// +/// Functions that were inlined in the original binary will be represented +/// in the inline stack in the sample profile. If the profile shows that +/// the original inline decision was "good" (i.e., the callsite is executed +/// frequently), then we will recreate the inline decision and apply the +/// profile from the inlined callsite. +/// +/// To decide whether an inlined callsite is hot, we compute the fraction +/// of samples used by the callsite with respect to the total number of samples +/// collected in the caller. +/// +/// If that fraction is larger than the default given by +/// SampleProfileHotThreshold, the callsite will be inlined again. +bool callsiteIsHot(const FunctionSamples *CallerFS, + const FunctionSamples *CallsiteFS) { + if (!CallsiteFS) + return false; // The callsite was not inlined in the original binary. + + uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); + if (ParentTotalSamples == 0) + return false; // Avoid division by zero. 
+ + uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); + if (CallsiteTotalSamples == 0) + return false; // Callsite is trivially cold. + + double PercentSamples = + (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; + return PercentSamples >= SampleProfileHotThreshold; +} + +} + +/// Mark as used the sample record for the given function samples at +/// (LineOffset, Discriminator). +/// +/// \returns true if this is the first time we mark the given record. +bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, + uint32_t LineOffset, + uint32_t Discriminator, + uint64_t Samples) { + LineLocation Loc(LineOffset, Discriminator); + unsigned &Count = SampleCoverage[FS][Loc]; + bool FirstTime = (++Count == 1); + if (FirstTime) + TotalUsedSamples += Samples; + return FirstTime; +} + +/// Return the number of sample records that were applied from this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { + auto I = SampleCoverage.find(FS); + + // The size of the coverage map for FS represents the number of records + // that were marked used at least once. + unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0; + + // If there are inlined callsites in this function, count the records found + // in the respective bodies. However, do not bother counting callees with 0 + // total samples; these are callees that were never invoked at runtime. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countUsedRecords(CalleeSamples); + } + + return Count; +} + +/// Return the number of sample records in the body of this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { + unsigned Count = FS->getBodySamples().size(); + + // Only count records in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countBodyRecords(CalleeSamples); + } + + return Count; } + +/// Return the number of samples collected in the body of this profile. +/// +/// This count does not include samples from cold inlined callsites. +uint64_t +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { + uint64_t Total = 0; + for (const auto &I : FS->getBodySamples()) + Total += I.second.getSamples(); + + // Only count samples in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Total += countBodySamples(CalleeSamples); + } + + return Total; } + +/// Return the fraction of sample records used in this profile. +/// +/// The returned value is an unsigned integer in the range 0-100 indicating +/// the percentage of sample records that were used while applying this +/// profile to the associated function. +unsigned SampleCoverageTracker::computeCoverage(unsigned Used, + unsigned Total) const { + assert(Used <= Total && + "number of used records cannot exceed the total number of records"); + return Total > 0 ? Used * 100 / Total : 100; +} + +/// Clear all the per-function data used to load samples and propagate weights. 
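The hot-callsite test in callsiteIsHot above reduces to simple percentage arithmetic. A minimal sketch with hypothetical sample counts; the helper and the numbers are illustrative only, and the 0.1 default comes from the -sample-profile-inline-hot-threshold option introduced earlier:

#include <cstdint>

// True when the callsite's share of its caller's samples meets the
// threshold, expressed as a percentage.
static bool isHotFraction(uint64_t CallsiteSamples, uint64_t CallerSamples,
                          double ThresholdPercent) {
  if (CallerSamples == 0 || CallsiteSamples == 0)
    return false; // mirrors the zero checks in callsiteIsHot
  double Percent = (double)CallsiteSamples / (double)CallerSamples * 100.0;
  return Percent >= ThresholdPercent;
}

// Example: 50 of 40,000 caller samples is 0.125% >= 0.1%, so the callsite
// is hot and its original inlining is recreated; 20 samples (0.05%) stay cold.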
+void SampleProfileLoader::clearFunctionData() { + BlockWeights.clear(); + EdgeWeights.clear(); + VisitedBlocks.clear(); + VisitedEdges.clear(); + EquivalenceClass.clear(); + DT = nullptr; + PDT = nullptr; + LI = nullptr; + Predecessors.clear(); + Successors.clear(); + CoverageTracker.clear(); +} + +/// \brief Returns the offset of lineno \p L to head_lineno \p H +/// +/// \param L Lineno +/// \param H Header lineno of the function +/// +/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// We assume that a single function will not exceed 65535 LOC. +unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { + return (L - H) & 0xffff; } /// \brief Print the weight of edge \p E on stream \p OS. @@ -190,8 +415,8 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { /// \param OS Stream to emit the output to. /// \param BB Block to print. void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, - BasicBlock *BB) { - BasicBlock *Equiv = EquivalenceClass[BB]; + const BasicBlock *BB) { + const BasicBlock *Equiv = EquivalenceClass[BB]; OS << "equivalence[" << BB->getName() << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; } @@ -200,8 +425,11 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, /// /// \param OS Stream to emit the output to. /// \param BB Block to print. -void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { - OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +void SampleProfileLoader::printBlockWeight(raw_ostream &OS, + const BasicBlock *BB) const { + const auto &I = BlockWeights.find(BB); + uint64_t W = (I == BlockWeights.end() ? 0 : I->second); + OS << "weight[" << BB->getName() << "]: " << W << "\n"; } /// \brief Get the weight for an instruction. @@ -214,51 +442,67 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { /// /// \param Inst Instruction to query. /// -/// \returns The profiled weight of I. -unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { +/// \returns the weight of \p Inst. +ErrorOr<uint64_t> +SampleProfileLoader::getInstWeight(const Instruction &Inst) const { DebugLoc DLoc = Inst.getDebugLoc(); if (!DLoc) - return 0; + return std::error_code(); - unsigned Lineno = DLoc.getLine(); - if (Lineno < HeaderLineno) - return 0; + const FunctionSamples *FS = findFunctionSamples(Inst); + if (!FS) + return std::error_code(); const DILocation *DIL = DLoc; - int LOffset = Lineno - HeaderLineno; - unsigned Discriminator = DIL->getDiscriminator(); - unsigned Weight = Samples->samplesAt(LOffset, Discriminator); - DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst - << " (line offset: " << LOffset << "." << Discriminator - << " - weight: " << Weight << ")\n"); - return Weight; + unsigned Lineno = DLoc.getLine(); + unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); + + uint32_t LineOffset = getOffset(Lineno, HeaderLineno); + uint32_t Discriminator = DIL->getDiscriminator(); + ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); + if (R) { + bool FirstMark = + CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); + if (FirstMark) { + const Function *F = Inst.getParent()->getParent(); + LLVMContext &Ctx = F->getContext(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, *F, DLoc, + Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + + Twine(LineOffset) + + ((Discriminator) ? 
Twine(".") + Twine(Discriminator) : "") + ")"); + } + DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":" + << Inst << " (line offset: " << Lineno - HeaderLineno << "." + << DIL->getDiscriminator() << " - weight: " << R.get() + << ")\n"); + } + return R; } /// \brief Compute the weight of a basic block. /// /// The weight of basic block \p BB is the maximum weight of all the -/// instructions in BB. The weight of \p BB is computed and cached in -/// the BlockWeights map. +/// instructions in BB. /// /// \param BB The basic block to query. /// -/// \returns The computed weight of BB. -unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { - // If we've computed BB's weight before, return it. - std::pair<BlockWeightMap::iterator, bool> Entry = - BlockWeights.insert(std::make_pair(BB, 0)); - if (!Entry.second) - return Entry.first->second; - - // Otherwise, compute and cache BB's weight. - unsigned Weight = 0; +/// \returns the weight for \p BB. +ErrorOr<uint64_t> +SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const { + bool Found = false; + uint64_t Weight = 0; for (auto &I : BB->getInstList()) { - unsigned InstWeight = getInstWeight(I); - if (InstWeight > Weight) - Weight = InstWeight; + const ErrorOr<uint64_t> &R = getInstWeight(I); + if (R && R.get() >= Weight) { + Weight = R.get(); + Found = true; + } } - Entry.first->second = Weight; - return Weight; + if (Found) + return Weight; + else + return std::error_code(); } /// \brief Compute and store the weights of every basic block. @@ -270,15 +514,199 @@ unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { bool SampleProfileLoader::computeBlockWeights(Function &F) { bool Changed = false; DEBUG(dbgs() << "Block weights\n"); - for (auto &BB : F) { - unsigned Weight = getBlockWeight(&BB); - Changed |= (Weight > 0); + for (const auto &BB : F) { + ErrorOr<uint64_t> Weight = getBlockWeight(&BB); + if (Weight) { + BlockWeights[&BB] = Weight.get(); + VisitedBlocks.insert(&BB); + Changed = true; + } DEBUG(printBlockWeight(dbgs(), &BB)); } return Changed; } +/// \brief Get the FunctionSamples for a call instruction. +/// +/// The FunctionSamples of a call instruction \p Inst is the inlined +/// instance in which that call instruction is calling to. It contains +/// all samples that resides in the inlined instance. We first find the +/// inlined instance in which the call instruction is from, then we +/// traverse its children to find the callsite with the matching +/// location and callee function name. +/// +/// \param Inst Call instruction to query. +/// +/// \returns The FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const { + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return nullptr; + } + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + + Function *CalleeFunc = Inst.getCalledFunction(); + if (!CalleeFunc) { + return nullptr; + } + + StringRef CalleeName = CalleeFunc->getName(); + const FunctionSamples *FS = findFunctionSamples(Inst); + if (FS == nullptr) + return nullptr; + + return FS->findFunctionSamplesAt( + CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); +} + +/// \brief Get the FunctionSamples for an instruction. +/// +/// The FunctionSamples of an instruction \p Inst is the inlined instance +/// in which that instruction is coming from. 
We traverse the inline stack +/// of that instruction, and match it with the tree nodes in the profile. +/// +/// \param Inst Instruction to query. +/// +/// \returns the FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { + SmallVector<CallsiteLocation, 10> S; + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return Samples; + } + StringRef CalleeName; + for (const DILocation *DIL = Inst.getDebugLoc(); DIL; + DIL = DIL->getInlinedAt()) { + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + if (!CalleeName.empty()) { + S.push_back(CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); + } + CalleeName = SP->getLinkageName(); + } + if (S.size() == 0) + return Samples; + const FunctionSamples *FS = Samples; + for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { + FS = FS->findFunctionSamplesAt(S[i]); + } + return FS; +} + +/// \brief Emit an inline hint if \p F is globally hot or cold. +/// +/// If \p F consumes a significant fraction of samples (indicated by +/// SampleProfileGlobalHotThreshold), apply the InlineHint attribute for the +/// inliner to consider the function hot. +/// +/// If \p F consumes a small fraction of samples (indicated by +/// SampleProfileGlobalColdThreshold), apply the Cold attribute for the inliner +/// to consider the function cold. +/// +/// FIXME - This setting of inline hints is sub-optimal. Instead of marking a +/// function globally hot or cold, we should be annotating individual callsites. +/// This is not currently possible, but work on the inliner will eventually +/// provide this ability. See http://reviews.llvm.org/D15003 for details and +/// discussion. +/// +/// \returns True if either attribute was applied to \p F. +bool SampleProfileLoader::emitInlineHints(Function &F) { + if (TotalCollectedSamples == 0) + return false; + + uint64_t FunctionSamples = Samples->getTotalSamples(); + double SamplesPercent = + (double)FunctionSamples / (double)TotalCollectedSamples * 100.0; + + // If the function collected more samples than the hot threshold, mark + // it globally hot. + if (SamplesPercent >= SampleProfileGlobalHotThreshold) { + F.addFnAttr(llvm::Attribute::InlineHint); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied inline hint to globally hot function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalHotThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + // If the function collected fewer samples than the cold threshold, mark + // it globally cold. + if (SamplesPercent <= SampleProfileGlobalColdThreshold) { + F.addFnAttr(llvm::Attribute::Cold); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied cold hint to globally cold function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalColdThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + return false; +} + +/// \brief Iteratively inline hot callsites of a function. +/// +/// Iteratively traverse all callsites of the function \p F, and find if +/// the corresponding inlined instance exists and is hot in profile. 
If +/// it is hot enough, inline the callsites and add new callsites of the +/// callee into the caller. +/// +/// TODO: investigate the possibility of not invoking InlineFunction directly. +/// +/// \param F function to perform iterative inlining. +/// +/// \returns True if any inlining happened. +bool SampleProfileLoader::inlineHotFunctions(Function &F) { + bool Changed = false; + LLVMContext &Ctx = F.getContext(); + while (true) { + bool LocalChanged = false; + SmallVector<CallInst *, 10> CIS; + for (auto &BB : F) { + for (auto &I : BB.getInstList()) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (CI && callsiteIsHot(Samples, findCalleeFunctionSamples(*CI))) + CIS.push_back(CI); + } + } + for (auto CI : CIS) { + InlineFunctionInfo IFI; + Function *CalledFunction = CI->getCalledFunction(); + DebugLoc DLoc = CI->getDebugLoc(); + uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples(); + if (InlineFunction(CI, IFI)) { + LocalChanged = true; + emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, + Twine("inlined hot callee '") + + CalledFunction->getName() + "' with " + + Twine(NumSamples) + " samples into '" + + F.getName() + "'"); + } + } + if (LocalChanged) { + Changed = true; + } else { + break; + } + } + return Changed; +} + /// \brief Find equivalence classes for the given block. /// /// This finds all the blocks that are guaranteed to execute the same @@ -305,12 +733,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) { void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree) { - for (auto *BB2 : Descendants) { + const BasicBlock *EC = EquivalenceClass[BB1]; + uint64_t Weight = BlockWeights[EC]; + for (const auto *BB2 : Descendants) { bool IsDomParent = DomTree->dominates(BB2, BB1); bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); - if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent && - IsInSameLoop) { - EquivalenceClass[BB2] = BB1; + if (BB1 != BB2 && IsDomParent && IsInSameLoop) { + EquivalenceClass[BB2] = EC; // If BB2 is heavier than BB1, make BB2 have the same weight // as BB1. @@ -320,11 +749,10 @@ void SampleProfileLoader::findEquivalencesFor( // during the propagation phase. Right now, we just want to // make sure that BB1 has the largest weight of all the // members of its equivalence set. - unsigned &BB1Weight = BlockWeights[BB1]; - unsigned &BB2Weight = BlockWeights[BB2]; - BB1Weight = std::max(BB1Weight, BB2Weight); + Weight = std::max(Weight, BlockWeights[BB2]); } } + BlockWeights[EC] = Weight; } /// \brief Find equivalence classes. @@ -364,19 +792,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { // class by making BB2's equivalence class be BB1. DominatedBBs.clear(); DT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, PDT->DT); - - // Repeat the same logic for all the blocks post-dominated by BB1. - // We are looking for every basic block BB2 such that: - // - // 1- BB1 post-dominates BB2. - // 2- BB2 dominates BB1. - // 3- BB1 and BB2 are in the same loop nest. - // - // If all those conditions hold, BB2's equivalence class is BB1. - DominatedBBs.clear(); - PDT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, DT); + findEquivalencesFor(BB1, DominatedBBs, PDT.get()); DEBUG(printBlockEquivalence(dbgs(), BB1)); } @@ -389,8 +805,8 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { // to all the blocks in that equivalence class. 
DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); for (auto &BI : F) { - BasicBlock *BB = &BI; - BasicBlock *EquivBB = EquivalenceClass[BB]; + const BasicBlock *BB = &BI; + const BasicBlock *EquivBB = EquivalenceClass[BB]; if (BB != EquivBB) BlockWeights[BB] = BlockWeights[EquivBB]; DEBUG(printBlockWeight(dbgs(), BB)); @@ -407,7 +823,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { /// \param UnknownEdge Set if E has not been visited before. /// /// \returns E's weight, if known. Otherwise, return 0. -unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, +uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge) { if (!VisitedEdges.count(E)) { (*NumUnknownEdges)++; @@ -432,8 +848,9 @@ unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, bool SampleProfileLoader::propagateThroughEdges(Function &F) { bool Changed = false; DEBUG(dbgs() << "\nPropagation through edges\n"); - for (auto &BI : F) { - BasicBlock *BB = &BI; + for (const auto &BI : F) { + const BasicBlock *BB = &BI; + const BasicBlock *EC = EquivalenceClass[BB]; // Visit all the predecessor and successor edges to determine // which ones have a weight assigned already. Note that it doesn't @@ -441,7 +858,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // only case we are interested in handling is when only a single // edge is unknown (see setEdgeOrBlockWeight). for (unsigned i = 0; i < 2; i++) { - unsigned TotalWeight = 0; + uint64_t TotalWeight = 0; unsigned NumUnknownEdges = 0; Edge UnknownEdge, SelfReferentialEdge; @@ -485,7 +902,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // all edges will get a weight, or iteration will stop when // it reaches SampleProfileMaxPropagateIterations. if (NumUnknownEdges <= 1) { - unsigned &BBWeight = BlockWeights[BB]; + uint64_t &BBWeight = BlockWeights[EC]; if (NumUnknownEdges == 0) { // If we already know the weight of all edges, the weight of the // basic block can be computed. It should be no larger than the sum @@ -497,9 +914,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { << " known. Set weight for block: "; printBlockWeight(dbgs(), BB);); } - if (VisitedBlocks.insert(BB).second) + if (VisitedBlocks.insert(EC).second) Changed = true; - } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { + } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) { // If there is a single unknown edge and the block has been // visited, then we can compute E's weight. if (BBWeight >= TotalWeight) @@ -511,8 +928,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { DEBUG(dbgs() << "Set weight for edge: "; printEdgeWeight(dbgs(), UnknownEdge)); } - } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { - unsigned &BBWeight = BlockWeights[BB]; + } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) { + uint64_t &BBWeight = BlockWeights[BB]; // We have a self-referential edge and the weight of BB is known. if (BBWeight >= TotalWeight) EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; @@ -578,7 +995,7 @@ void SampleProfileLoader::buildEdges(Function &F) { /// known). void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; - unsigned i = 0; + unsigned I = 0; // Add an entry count to the function using the samples gathered // at the function entry. 
@@ -592,14 +1009,15 @@ void SampleProfileLoader::propagateWeights(Function &F) { buildEdges(F); // Propagate until we converge or we go past the iteration limit. - while (Changed && i++ < SampleProfileMaxPropagateIterations) { + while (Changed && I++ < SampleProfileMaxPropagateIterations) { Changed = propagateThroughEdges(F); } // Generate MD_prof metadata for every branch instruction using the // edge weights computed during propagation. DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); - MDBuilder MDB(F.getContext()); + LLVMContext &Ctx = F.getContext(); + MDBuilder MDB(Ctx); for (auto &BI : F) { BasicBlock *BB = &BI; TerminatorInst *TI = BB->getTerminator(); @@ -610,24 +1028,44 @@ void SampleProfileLoader::propagateWeights(Function &F) { DEBUG(dbgs() << "\nGetting weights for branch at line " << TI->getDebugLoc().getLine() << ".\n"); - SmallVector<unsigned, 4> Weights; - bool AllWeightsZero = true; + SmallVector<uint32_t, 4> Weights; + uint32_t MaxWeight = 0; + DebugLoc MaxDestLoc; for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); - unsigned Weight = EdgeWeights[E]; + uint64_t Weight = EdgeWeights[E]; DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); - Weights.push_back(Weight); - if (Weight != 0) - AllWeightsZero = false; + // Use uint32_t saturated arithmetic to adjust the incoming weights, + // if needed. Sample counts in profiles are 64-bit unsigned values, + // but internally branch weights are expressed as 32-bit values. + if (Weight > std::numeric_limits<uint32_t>::max()) { + DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); + Weight = std::numeric_limits<uint32_t>::max(); + } + Weights.push_back(static_cast<uint32_t>(Weight)); + if (Weight != 0) { + if (Weight > MaxWeight) { + MaxWeight = Weight; + MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc(); + } + } } // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. - if (!AllWeightsZero) { + if (MaxWeight > 0) { DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + DebugLoc BranchLoc = TI->getDebugLoc(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, F, MaxDestLoc, + Twine("most popular destination for conditional branches at ") + + ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" + + Twine(BranchLoc.getLine()) + ":" + + Twine(BranchLoc.getCol())) + : Twine("<UNKNOWN LOCATION>"))); } else { DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); } @@ -649,7 +1087,7 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { if (DISubprogram *S = getDISubprogram(&F)) return S->getLine(); - // If could not find the start of \p F, emit a diagnostic to inform the user + // If the start of \p F is missing, emit a diagnostic to inform the user // about the missed opportunity. F.getContext().diagnose(DiagnosticInfoSampleProfile( "No debug information found in function " + F.getName() + @@ -658,6 +1096,17 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { return 0; } +void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { + DT.reset(new DominatorTree); + DT->recalculate(F); + + PDT.reset(new DominatorTreeBase<BasicBlock>(true)); + PDT->recalculate(F); + + LI.reset(new LoopInfo); + LI->analyze(*DT); +} + /// \brief Generate branch weight metadata for all branches in \p F. 
/// /// Branch weights are computed out of instruction samples using a @@ -710,18 +1159,23 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { bool SampleProfileLoader::emitAnnotations(Function &F) { bool Changed = false; - // Initialize invariants used during computation and propagation. - HeaderLineno = getFunctionLoc(F); - if (HeaderLineno == 0) + if (getFunctionLoc(F) == 0) return false; DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() - << ": " << HeaderLineno << "\n"); + << ": " << getFunctionLoc(F) << "\n"); + + Changed |= emitInlineHints(F); + + Changed |= inlineHotFunctions(F); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Compute dominance and loop info needed for propagation. + computeDominanceAndLoopInfo(F); + // Find equivalence classes. findEquivalenceClasses(F); @@ -729,24 +1183,48 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { propagateWeights(F); } + // If coverage checking was requested, compute it now. + if (SampleProfileRecordCoverage) { + unsigned Used = CoverageTracker.countUsedRecords(Samples); + unsigned Total = CoverageTracker.countBodyRecords(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileRecordCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile records (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } + + if (SampleProfileSampleCoverage) { + uint64_t Used = CoverageTracker.getTotalUsedSamples(); + uint64_t Total = CoverageTracker.countBodySamples(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileSampleCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile samples (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } return Changed; } char SampleProfileLoader::ID = 0; INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) bool SampleProfileLoader::doInitialization(Module &M) { - auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); + auto &Ctx = M.getContext(); + auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); - M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); @@ -754,22 +1232,32 @@ bool SampleProfileLoader::doInitialization(Module &M) { return true; } -FunctionPass *llvm::createSampleProfileLoaderPass() { +ModulePass *llvm::createSampleProfileLoaderPass() { return new SampleProfileLoader(SampleProfileFile); } -FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { +ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoader(Name); } -bool SampleProfileLoader::runOnFunction(Function &F) { +bool 
SampleProfileLoader::runOnModule(Module &M) { if (!ProfileIsValid) return false; - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - Ctx = &F.getParent()->getContext(); + // Compute the total number of samples collected in this profile. + for (const auto &I : Reader->getProfiles()) + TotalCollectedSamples += I.second.getTotalSamples(); + + bool retval = false; + for (auto &F : M) + if (!F.isDeclaration()) { + clearFunctionData(); + retval |= runOnFunction(F); + } + return retval; +} + +bool SampleProfileLoader::runOnFunction(Function &F) { Samples = Reader->getSamplesFor(F); if (!Samples->empty()) return emitAnnotations(F); diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 956991a..c94cc7c 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -7,47 +7,31 @@ // //===----------------------------------------------------------------------===// // -// This pass loops over all of the functions in the input module, looking for +// This pass loops over all of the functions in the input module, looking for // dead declarations and removes them. Dead declarations are declarations of // functions for which no implementation is available (i.e., declarations for // unused library functions). // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" + using namespace llvm; #define DEBUG_TYPE "strip-dead-prototypes" STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); -namespace { - -/// @brief Pass to remove unused function declarations. -class StripDeadPrototypesPass : public ModulePass { -public: - static char ID; // Pass identification, replacement for typeid - StripDeadPrototypesPass() : ModulePass(ID) { - initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; -}; - -} // end anonymous namespace - -char StripDeadPrototypesPass::ID = 0; -INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes", - "Strip Unused Function Prototypes", false, false) - -bool StripDeadPrototypesPass::runOnModule(Module &M) { +static bool stripDeadPrototypes(Module &M) { bool MadeChange = false; - + // Erase dead function prototypes. for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function *F = I++; + Function *F = &*I++; // Function must be a prototype and unused. if (F->isDeclaration() && F->use_empty()) { F->eraseFromParent(); @@ -59,16 +43,42 @@ bool StripDeadPrototypesPass::runOnModule(Module &M) { // Erase dead global var prototypes. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ) { - GlobalVariable *GV = I++; + GlobalVariable *GV = &*I++; // Global must be a prototype and unused. if (GV->isDeclaration() && GV->use_empty()) GV->eraseFromParent(); } - + // Return an indication of whether we changed anything or not. 
return MadeChange; } +PreservedAnalyses StripDeadPrototypesPass::run(Module &M) { + if (stripDeadPrototypes(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { + +class StripDeadPrototypesLegacyPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + StripDeadPrototypesLegacyPass() : ModulePass(ID) { + initializeStripDeadPrototypesLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + return stripDeadPrototypes(M); + } +}; + +} // end anonymous namespace + +char StripDeadPrototypesLegacyPass::ID = 0; +INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes", + "Strip Unused Function Prototypes", false, false) + ModulePass *llvm::createStripDeadPrototypesPass() { - return new StripDeadPrototypesPass(); + return new StripDeadPrototypesLegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index a4f30c5..46f352f 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -211,13 +211,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage } for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); @@ -305,6 +305,12 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { SmallVector<Metadata *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; + std::set<DISubprogram *> LiveSPs; + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + LiveSPs.insert(SP); + } + for (DICompileUnit *DIC : F.compile_units()) { // Create our live subprogram list. bool SubprogramChange = false; @@ -314,7 +320,7 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { continue; // If the function referenced by DISP is not null, the function is live. 
- if (DISP->getFunction()) + if (LiveSPs.count(DISP)) LiveSubprograms.push_back(DISP); else SubprogramChange = true; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 2d2c109f..6f49399 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1,4 +1,4 @@ -//===- InstCombineAddSub.cpp ----------------------------------------------===// +//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" + using namespace llvm; using namespace PatternMatch; @@ -67,17 +68,17 @@ namespace { private: bool insaneIntVal(int V) { return V > 4 || V < -4; } - APFloat *getFpValPtr(void) + APFloat *getFpValPtr() { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } - const APFloat *getFpValPtr(void) const + const APFloat *getFpValPtr() const { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); } - const APFloat &getFpVal(void) const { + const APFloat &getFpVal() const { assert(IsFp && BufHasFpVal && "Incorrect state"); return *getFpValPtr(); } - APFloat &getFpVal(void) { + APFloat &getFpVal() { assert(IsFp && BufHasFpVal && "Incorrect state"); return *getFpValPtr(); } @@ -92,8 +93,8 @@ namespace { // TODO: We should get rid of this function when APFloat can be constructed // from a *SIGNED* integer. APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); - private: + private: bool IsFp; // True iff FpValBuf contains an instance of APFloat. @@ -114,10 +115,10 @@ namespace { /// class FAddend { public: - FAddend() { Val = nullptr; } + FAddend() : Val(nullptr) {} - Value *getSymVal (void) const { return Val; } - const FAddendCoef &getCoef(void) const { return Coeff; } + Value *getSymVal() const { return Val; } + const FAddendCoef &getCoef() const { return Coeff; } bool isConstant() const { return Val == nullptr; } bool isZero() const { return Coeff.isZero(); } @@ -182,7 +183,6 @@ namespace { InstCombiner::BuilderTy *Builder; Instruction *Instr; - private: // Debugging stuff is clustered here. #ifndef NDEBUG unsigned CreateInstrNum; @@ -193,7 +193,8 @@ namespace { void incCreateInstNum() {} #endif }; -} + +} // anonymous namespace //===----------------------------------------------------------------------===// // @@ -602,7 +603,6 @@ Value *FAddCombine::simplify(Instruction *I) { } Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { - unsigned AddendNum = Addends.size(); assert(AddendNum <= 4 && "Too many addends"); @@ -886,7 +886,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero, return Op0ZeroPosition >= Op1OnePosition; } -/// WillNotOverflowSignedAdd - Return true if we can prove that: +/// Return true if we can prove that: /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out.
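The no-signed-overflow condition above can be checked concretely. Below is a minimal standalone C++ sketch (an illustration only, not part of the patch; i8AddOverflows is an invented helper) of why sign extension distributes over addition exactly when the narrow add does not overflow, assuming the usual two's-complement wrap on narrowing:

    #include <cassert>
    #include <cstdint>

    // Returns true when the i8 add of L and R overflows, i.e. when the
    // wide i16 sum does not survive a round trip through i8.
    static bool i8AddOverflows(int8_t L, int8_t R) {
      int16_t Wide = int16_t(L) + int16_t(R);
      return Wide != int16_t(int8_t(Wide));
    }

    int main() {
      // 100 + 27 = 127 fits in i8: sext(add i8) == add(sext, sext).
      assert(!i8AddOverflows(100, 27));
      assert(int16_t(int8_t(100 + 27)) == int16_t(100) + int16_t(27));
      // 100 + 28 = 128 overflows i8: the narrow add wraps to -128, so
      // sext(add i8) gives -128 while add(sext, sext) gives 128.
      assert(i8AddOverflows(100, 28));
    }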
@@ -1118,8 +1118,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, // transform them into (X + (signbit ^ C)) if (XorRHS->getValue().isSignBit()) - return BinaryOperator::CreateAdd(XorLHS, - ConstantExpr::getXor(XorRHS, CI)); + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1421,7 +1421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return Changed ? &I : nullptr; } - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -1589,7 +1588,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y @@ -1611,32 +1609,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } - // (sub (select (a, c, b)), (select (a, d, b))) -> (select (a, (sub c, d), 0)) - // (sub (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (sub c, d))) - if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { - if (auto *SI1 = dyn_cast<SelectInst>(Op1)) { - if (SI0->getCondition() == SI1->getCondition()) { - if (Value *V = SimplifySubInst( - SI0->getFalseValue(), SI1->getFalseValue(), I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), - Builder->CreateSub(SI0->getTrueValue(), SI1->getTrueValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap()), - V); - if (Value *V = SimplifySubInst(SI0->getTrueValue(), SI1->getTrueValue(), - I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), V, - Builder->CreateSub(SI0->getFalseValue(), SI1->getFalseValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap())); - } - } - } - if (Op0->hasOneUse()) { Value *Y = nullptr; // ((X | Y) - X) --> (~X & Y) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 15e0889..95c50d3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -37,9 +37,9 @@ static inline Value *dyn_castNotVal(Value *V) { return nullptr; } -/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp -/// predicate into a three bit mask. It also returns whether it is an ordered -/// predicate by reference. +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a three bit mask. It also returns whether it is an ordered predicate by +/// reference. static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { isOrdered = false; switch (CC) { @@ -64,10 +64,10 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { } } -/// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand -/// new ICmp instruction. The sign is passed in to determine which kind -/// of predicate to use in the new icmp instruction. +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. 
The sign is passed in to determine which kind of predicate to +/// use in the new icmp instruction. static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { ICmpInst::Predicate NewPred; @@ -76,9 +76,9 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, return Builder->CreateICmp(NewPred, LHS, RHS); } -/// getFCmpValue - This is the complement of getFCmpCode, which turns an -/// opcode and two operands into either a FCmp instruction. isordered is passed -/// in to determine which kind of predicate to use in the new fcmp instruction. +/// This is the complement of getFCmpCode, which turns an opcode and two +/// operands into an FCmp instruction. isordered is passed in to determine +/// which kind of predicate to use in the new fcmp instruction. static Value *getFCmpValue(bool isordered, unsigned code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { @@ -150,14 +150,13 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) { else //if (Op == Instruction::Xor) BinOp = Builder->CreateXor(NewLHS, NewRHS); - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy); return Builder->CreateCall(F, BinOp); } -// OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where -// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is -// guaranteed to be a binary operator. +/// This handles expressions of the form ((val OP C1) & C2), where +/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is +/// guaranteed to be a binary operator. Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, @@ -341,10 +340,10 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, return Builder->CreateICmpUGT(Add, LowerBound); } -// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with -// any number of 0s on either side. The 1s are allowed to wrap from LSB to -// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is -// not, since all 1s are not contiguous. +/// Returns true iff Val consists of one contiguous run of 1s with any number +/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB, +/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is +/// not, since all 1s are not contiguous. static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { const APInt& V = Val->getValue(); uint32_t BitWidth = Val->getType()->getBitWidth(); @@ -357,9 +356,8 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { return true; } -/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, -/// where isSub determines whether the operator is a sub. If we can fold one of -/// the following xforms: +/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines +/// whether the operator is a sub. If we can fold one of the following xforms: /// /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 @@ -449,8 +447,8 @@ enum MaskedICmpType { FoldMskICmp_BMask_NotMixed = 512 }; -/// return the set of pattern classes (from MaskedICmpType) -/// that (icmp SCC (A & B), C) satisfies +/// Return the set of pattern classes (from MaskedICmpType) +/// that (icmp SCC (A & B), C) satisfies.
static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { @@ -538,8 +536,8 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) -/// if possible. The returned predicate is either == or !=. Returns false if +/// Decompose an icmp into the form ((X & Y) pred Z) if possible. +/// The returned predicate is either == or !=. Returns false if /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { @@ -585,10 +583,9 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, return true; } -/// foldLogOpOfMaskedICmpsHelper: -/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// return the set of pattern classes (from MaskedICmpType) -/// that both LHS and RHS satisfy +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// Return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy. static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, @@ -700,9 +697,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); return left_type & right_type; } -/// foldLogOpOfMaskedICmps: -/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// into a single (icmp(A & X) ==/!= Y) + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy *Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; @@ -879,7 +876,7 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, return Builder->CreateICmp(NewPred, Input, RangeEnd); } -/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. +/// Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1123,9 +1120,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { return nullptr; } -/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { @@ -1203,6 +1199,54 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { + auto Opcode = I.getOpcode(); + assert((Opcode == Instruction::And || Opcode == Instruction::Or) && + "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. + if (Opcode == Instruction::And) + Opcode = Instruction::Or; + else + Opcode = Instruction::And; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // TODO: Use pattern matchers instead of dyn_cast. 
+ if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, + I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(LogicOp); + } + + // De Morgan's Law in disguise: + // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) + // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) + Value *A = nullptr; + Value *B = nullptr; + ConstantInt *C1 = nullptr; + if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && + match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { + // TODO: This check could be loosened to handle different type sizes. + // Alternatively, we could fix the definition of m_Not to recognize a not + // operation hidden by a zext? + if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && + C1->isOne()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, + I.getName() + ".demorgan"); + Value *Not = Builder->CreateNot(LogicOp); + return CastInst::CreateZExtOrBitCast(Not, I.getType()); + } + } + + return nullptr; +} + Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -1273,6 +1317,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) return BinaryOperator::CreateAnd(V, AndRHS); + // -x & 1 -> x & 1 + if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) + return BinaryOperator::CreateAnd(Op0RHS, AndRHS); + // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS // has 1's for all bits that the subtraction with A might affect. if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { @@ -1329,15 +1377,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return NV; } - - // (~A & ~B) == (~(A | B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(Or); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; @@ -1446,14 +1487,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return ReplaceInstUsesWith(I, Res); - // fold (and (cast A), (cast B)) -> (cast (and A, B)) - if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + Value *Op0COp = Op0C->getOperand(0); + Type *SrcTy = Op0COp->getType(); + // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { - Type *SrcTy = Op0C->getOperand(0)->getType(); if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { - Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + Value *Op1COp = Op1C->getOperand(0); // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && @@ -1478,6 +1520,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } + // If we are masking off the sign bit of a floating-point value, convert + // this to the canonical fabs intrinsic call and cast back to integer. + // The backend should know how to optimize fabs(). 
+ // TODO: This transform should also apply to vectors. + ConstantInt *CI; + if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() && + match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) { + Module *M = I.getModule(); + Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy); + Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs"); + return CastInst::CreateBitOrPointerCast(Call, I.getType()); + } + } + { Value *X = nullptr; bool OpsSwapped = false; @@ -1509,163 +1565,195 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// CollectBSwapParts - Analyze the specified subexpression and see if it is -/// capable of providing pieces of a bswap. The subexpression provides pieces -/// of a bswap if it is proven that each of the non-zero bytes in the output of -/// the expression came from the corresponding "byte swapped" byte in some other -/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then -/// we know that the expression deposits the low byte of %X into the high byte -/// of the bswap result and that all other bytes are zero. This expression is -/// accepted, the high byte of ByteValues is set to X to indicate a correct -/// match. + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// (value, bitnumber) to bitnumber. It is the caller's responsibility to +/// validate that all `value`s are identical and that the bitnumber to bitnumber +/// mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted, +/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7]. /// /// This function returns true if the match was unsuccessful and false otherwise. /// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bytes that the subexpression is later shifted. For + indicating the number of bits that the subexpression is later shifted. For /// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -2 on entry. This is used to specify which -/// byte of ByteValues is actually being set. +/// OverallLeftShift value would be -16 on entry. This is used to specify which +/// bits of BitValues are actually being set. /// -/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding -/// byte is masked to zero by a user. For example, in (X & 255), X will be -/// processed with a bytemask of 1. Because bytemask is 32-bits, this limits -/// this function to working on up to 32-byte (256 bit) values. ByteMask is -/// always in the local (OverallLeftShift) coordinate space. +/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding +/// bit is masked to zero by a user. For example, in (X & 255), X will be +/// processed with a bitmask of 255. BitMask is always in the local +/// (OverallLeftShift) coordinate space.
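To make the new bookkeeping concrete, here is a tiny standalone sketch (an illustration only; the 8-bit width and the local array are assumptions, not the patch's actual types) of how a left shift populates the provenance map described above:

    #include <array>
    #include <cassert>

    int main() {
      // Model "X << 4" on an 8-bit value: input bit i lands at output bit
      // i + 4, so the provenance of output bits [4,7] is input bits [0,3].
      constexpr unsigned BW = 8, Shift = 4;
      std::array<int, BW> BitProvenance;
      BitProvenance.fill(-1); // -1: output bit is known zero / unclaimed
      for (unsigned i = 0; i + Shift < BW; ++i)
        BitProvenance[i + Shift] = static_cast<int>(i);
      assert(BitProvenance[4] == 0 && BitProvenance[7] == 3);
      assert(BitProvenance[0] == -1); // low bits are zero-filled by the shift
    }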
/// -static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, - SmallVectorImpl<Value *> &ByteValues) { +static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, + SmallVectorImpl<Value *> &BitValues, + SmallVectorImpl<int> &BitProvenance) { if (Instruction *I = dyn_cast<Instruction>(V)) { // If this is an or instruction, it may be an inner node of the bswap. - if (I->getOpcode() == Instruction::Or) { - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues) || - CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, - ByteValues); - } - - // If this is a logical shift by a constant multiple of 8, recurse with - // OverallLeftShift and ByteMask adjusted. + if (I->getOpcode() == Instruction::Or) + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance) || + CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, + BitValues, BitProvenance); + + // If this is a logical shift by a constant, recurse with OverallLeftShift + // and BitMask adjusted. if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { unsigned ShAmt = - cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); - // Ensure the shift amount is defined and of a byte value. - if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) + cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined. + if (ShAmt > BitValues.size()) return true; - unsigned ByteShift = ShAmt >> 3; + unsigned BitShift = ShAmt; if (I->getOpcode() == Instruction::Shl) { - // X << 2 -> collect(X, +2) - OverallLeftShift += ByteShift; - ByteMask >>= ByteShift; + // X << C -> collect(X, +C) + OverallLeftShift += BitShift; + BitMask = BitMask.lshr(BitShift); } else { - // X >>u 2 -> collect(X, -2) - OverallLeftShift -= ByteShift; - ByteMask <<= ByteShift; - ByteMask &= (~0U >> (32-ByteValues.size())); + // X >>u C -> collect(X, -C) + OverallLeftShift -= BitShift; + BitMask = BitMask.shl(BitShift); } - if (OverallLeftShift >= (int)ByteValues.size()) return true; - if (OverallLeftShift <= -(int)ByteValues.size()) return true; + if (OverallLeftShift >= (int)BitValues.size()) + return true; + if (OverallLeftShift <= -(int)BitValues.size()) + return true; - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } - // If this is a logical 'and' with a mask that clears bytes, clear the - // corresponding bytes in ByteMask. + // If this is a logical 'and' with a mask that clears bits, clear the + // corresponding bits in BitMask. if (I->getOpcode() == Instruction::And && isa<ConstantInt>(I->getOperand(1))) { - // Scan every byte of the and mask, seeing if the byte is either 0 or 255. - unsigned NumBytes = ByteValues.size(); - APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); + unsigned NumBits = BitValues.size(); + APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { - // If this byte is masked out by a later operation, we don't care what + for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { + // If this bit is masked out by a later operation, we don't care what // the and mask is. - if ((ByteMask & (1 << i)) == 0) + if (BitMask[i] == 0) continue; - // If the AndMask is all zeros for this byte, clear the bit. 
- APInt MaskB = AndMask & Byte; + // If the AndMask is zero for this bit, clear the bit. + APInt MaskB = AndMask & Bit; if (MaskB == 0) { - ByteMask &= ~(1U << i); + BitMask.clearBit(i); continue; } - // If the AndMask is not all ones for this byte, it's not a bytezap. - if (MaskB != Byte) - return true; - - // Otherwise, this byte is kept. + // Otherwise, this bit is kept. } - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } } // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be - // the input value to the bswap. Some observations: 1) if more than one byte - // is demanded from this input, then it could not be successfully assembled - // into a byteswap. At least one of the two bytes would not be aligned with - // their ultimate destination. - if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = countTrailingZeros(ByteMask); - - // 2) The input and ultimate destinations must line up: if byte 3 of an i32 - // is demanded, it needs to go into byte 0 of the result. This means that the - // byte needs to be shifted until it lands in the right byte bucket. The - // shift amount depends on the position: if the byte is coming from the high - // part of the value (e.g. byte 3) then it must be shifted right. If from the - // low part, it must be shifted left. - unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (ByteValues.size()-1-DestByteNo != InputByteNo) + // the input value to the bswap/bitreverse. To be part of a bswap or + // bitreverse we must be demanding a contiguous range of bits from it. + unsigned InputBitLen = BitMask.countPopulation(); + unsigned InputBitNo = BitMask.countTrailingZeros(); + if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != + InputBitLen) + // Not a contiguous range of set bits! return true; - // If the destination byte value is already defined, the values are or'd - // together, which isn't a bswap (unless it's an or of the same bits). - if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) + // We know we're moving a contiguous range of bits from the input to the + // output. Record which bits in the output came from which bits in the input. + unsigned DestBitNo = InputBitNo + OverallLeftShift; + for (unsigned I = 0; I < InputBitLen; ++I) + BitProvenance[DestBitNo + I] = InputBitNo + I; + + // If the destination bit value is already defined, the values are or'd + // together, which isn't a bswap/bitreverse (unless it's an or of the same + // bits). + if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) return true; - ByteValues[DestByteNo] = V; + for (unsigned I = 0; I < InputBitLen; ++I) + BitValues[DestBitNo + I] = V; + return false; } -/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. -/// If so, insert the new bswap intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { - IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); - if (!ITy || ITy->getBitWidth() % 16 || - // ByteMask only allows up to 32-byte values. - ITy->getBitWidth() > 32*8) - return nullptr; // Can only bswap pairs of bytes. Can't do vectors. +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, + unsigned BitWidth) { + if (From % 8 != To % 8) + return false; + // Convert from bit indices to byte indices and check for a byte reversal.
+ From >>= 3; + To >>= 3; + BitWidth >>= 3; + return From == BitWidth - To - 1; +} - /// ByteValues - For each byte of the result, we keep track of which value - /// defines each byte. - SmallVector<Value*, 8> ByteValues; - ByteValues.resize(ITy->getBitWidth()/8); +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, + unsigned BitWidth) { + return From == BitWidth - To - 1; +} +/// Given an OR instruction, check to see if this is a bswap or bitreverse +/// idiom. If so, insert the new intrinsic and return it. +Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { + IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); + if (!ITy) + return nullptr; // Can't do vectors. + unsigned BW = ITy->getBitWidth(); + + /// We keep track of which bit (BitProvenance) inside which value (BitValues) + /// defines each bit in the result. + SmallVector<Value *, 8> BitValues(BW, nullptr); + SmallVector<int, 8> BitProvenance(BW, -1); + // Try to find all the pieces corresponding to the bswap. - uint32_t ByteMask = ~0U >> (32-ByteValues.size()); - if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) + APInt BitMask = APInt::getAllOnesValue(BitValues.size()); + if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance)) return nullptr; - // Check to see if all of the bytes come from the same value. - Value *V = ByteValues[0]; - if (!V) return nullptr; // Didn't find a byte? Must be zero. + // Check to see if all of the bits come from the same value. + Value *V = BitValues[0]; + if (!V) return nullptr; // Didn't find a bit? Must be zero. - // Check to make sure that all of the bytes come from the same value. - for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) - if (ByteValues[i] != V) - return nullptr; - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + if (!std::all_of(BitValues.begin(), BitValues.end(), + [&](const Value *X) { return X == V; })) + return nullptr; + + // Now, is the bit permutation correct for a bswap or a bitreverse? We can + // only byteswap values with an even number of bytes. + bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true; + for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { + OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); + OKForBitReverse &= + bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); + } + + Intrinsic::ID Intrin; + if (OKForBSwap) + Intrin = Intrinsic::bswap; + else if (OKForBitReverse) + Intrin = Intrinsic::bitreverse; + else + return nullptr; + + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy); return CallInst::Create(F, V); } -/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check -/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then -/// we can simplify this expression to "cond ? C : D or B". +/// We have an expression of the form (A&C)|(B&D). If A is (cond?-1:0) and +/// either B or D is ~(cond?-1:0) or (cond?0:-1), then we can simplify this +/// expression to "cond ? C : D or B". static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. @@ -1688,7 +1776,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return nullptr; } -/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. +/// Fold (icmp)|(icmp) if possible.
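The two predicates above reduce to a simple statement about bit positions: bswap preserves a bit's offset within its byte while reversing byte order, and bitreverse mirrors the whole bit index. A standalone restatement (an illustration only; okForBSwap and okForBitReverse are local stand-ins for the static helpers in the hunk):

    #include <cassert>

    static bool okForBSwap(unsigned From, unsigned To, unsigned BW) {
      if (From % 8 != To % 8)
        return false; // bswap never moves a bit within its byte
      return From / 8 == BW / 8 - To / 8 - 1; // bytes are reversed
    }
    static bool okForBitReverse(unsigned From, unsigned To, unsigned BW) {
      return From == BW - To - 1; // the whole bit index is mirrored
    }

    int main() {
      // In a 32-bit bswap, input bit 0 (bit 0 of byte 0) must land at bit 24
      // (bit 0 of byte 3); a bitreverse would send it to bit 31 instead.
      assert(okForBSwap(0, 24, 32) && !okForBSwap(0, 31, 32));
      assert(okForBitReverse(0, 31, 32) && !okForBitReverse(0, 24, 32));
    }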
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1905,14 +1993,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: - // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(RHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { - Value *NegCst = Builder->getInt(~Xor); - Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); - return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + Value *Cst = Builder->getInt(Xor); + Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } @@ -2020,9 +2108,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return nullptr; } -/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_UNO && RHS->getPredicate() == FCmpInst::FCMP_UNO && @@ -2080,7 +2167,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } -/// FoldOrWithConstants - This helper function folds: +/// This helper function folds: /// /// ((A | B) & C1) | (B & C2) /// @@ -2199,14 +2286,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. + bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || + match(Op1, m_Or(m_Value(), m_Value())); // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible. - if (match(Op0, m_Or(m_Value(), m_Value())) || - match(Op1, m_Or(m_Value(), m_Value())) || - (match(Op0, m_LogicalShift(m_Value(), m_Value())) && - match(Op1, m_LogicalShift(m_Value(), m_Value())))) { - if (Instruction *BSwap = MatchBSwap(I)) + bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value())); + // (A & B) | (C & D) -> bswap if possible. + bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && + match(Op1, m_And(m_Value(), m_Value())); + + if (OrOfOrs || OrOfShifts || OrOfAnds) + if (Instruction *BSwap = MatchBSwapOrBitReverse(I)) return BSwap; - } // (X^C)|Y -> (X|Y)^C iff Y&C == 0 if (Op0->hasOneUse() && @@ -2360,14 +2451,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); - // (~A | ~B) == (~(A & B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(And); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; // Canonicalize xor to the RHS.
bool SwappedForXor = false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6de380b..090245d 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -67,8 +67,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned CopyAlign = MI->getAlignment(); if (CopyAlign < MinAlign) { - MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), - MinAlign, false)); + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); return MI; } @@ -198,12 +197,140 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +static Value *SimplifyX86immshift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + LogicalShift = false; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + LogicalShift = true; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + LogicalShift = true; ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); + auto CDV = dyn_cast<ConstantDataVector>(Arg1); + auto CInt = dyn_cast<ConstantInt>(Arg1); + if (!CAZ && !CDV && !CInt) + return nullptr; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast<VectorType>(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. 
+ for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *SimplifyX86extend(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, + bool SignExtend) { + VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(II.getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), + UndefValue::get(SrcTy), ShuffleMask); + return SignExtend ? Builder.CreateSExt(SV, DstTy) + : Builder.CreateZExt(SV, DstTy); +} + static Value *SimplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - + // The immediate permute control byte looks like this: // [3:0] - zero mask for each 32-bit lane // [5:4] - select one 32-bit destination lane @@ -248,12 +375,202 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II, // Replace the selected destination lane with the selected source lane. ShuffleMask[DestLane] = SourceLane + 4; } - + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); } return nullptr; } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." 
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt = Elt.lshr(Index).zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->equalsInt(0)) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. 
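A scalar model of both folds (an illustration only; extrqFold and insertqFold are invented names) using the same "length 0 means 64" convention; the Index + Length > 64 case is left unhandled because the hardware result is undefined there:

    #include <cassert>
    #include <cstdint>

    static uint64_t extrqFold(uint64_t V, unsigned Index, unsigned Length) {
      Length = Length ? Length : 64; // a length of zero encodes 64
      uint64_t Mask = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
      return (V >> Index) & Mask; // shift field down to bit 0, keep Length bits
    }

    static uint64_t insertqFold(uint64_t Dst, uint64_t Src, unsigned Index,
                                unsigned Length) {
      Length = Length ? Length : 64;
      uint64_t Low = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
      uint64_t Mask = Low << Index;
      // Clear the destination field, then drop the low Length bits of Src in.
      return (Dst & ~Mask) | ((Src << Index) & Mask);
    }

    int main() {
      assert(extrqFold(0xABCD, 8, 8) == 0xAB);
      assert(insertqFold(0xFFFF, 0x12, 8, 8) == 0x12FF);
    }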
+ // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit /// source vectors, unless a zero bit is set. If a zero bit is set, /// then ignore that half of the mask and clear that half of the vector. @@ -289,7 +606,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // The high bit of the selection field chooses the 1st or 2nd operand. bool LowInputSelect = Imm & 0x02; bool HighInputSelect = Imm & 0x20; - + // The low bit of the selection field chooses the low or high half // of the selected operand. bool LowHalfSelect = Imm & 0x01; @@ -298,11 +615,11 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // Determine which operand(s) are actually in use for this instruction. Value *V0 = LowInputSelect ? 
II.getArgOperand(1) : II.getArgOperand(0); Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); - + // If needed, replace operands based on zero mask. V0 = LowHalfZero ? ZeroVector : V0; V1 = HighHalfZero ? ZeroVector : V1; - + // Permute low half of result. unsigned StartIndex = LowHalfSelect ? HalfSize : 0; for (unsigned i = 0; i < HalfSize; ++i) @@ -319,6 +636,43 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, return nullptr; } +/// Decode XOP integer vector comparison intrinsics. +static Value *SimplifyX86vpcom(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + uint64_t Imm = CInt->getZExtValue() & 0x7; + VectorType *VecTy = cast<VectorType>(II.getType()); + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + switch (Imm) { + case 0x0: + Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + break; + case 0x1: + Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + break; + case 0x2: + Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + break; + case 0x3: + Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; + break; + case 0x4: + Pred = ICmpInst::ICMP_EQ; break; + case 0x5: + Pred = ICmpInst::ICMP_NE; break; + case 0x6: + return ConstantInt::getSigned(VecTy, 0); // FALSE + case 0x7: + return ConstantInt::getSigned(VecTy, -1); // TRUE + } + + if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1))) + return Builder.CreateSExtOrTrunc(Cmp, VecTy); + } + return nullptr; +} + /// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. For normal calls, it allows visitCallSite to do /// the heavy lifting. @@ -371,7 +725,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) if (GVSrc->isConstant()) { - Module *M = CI.getParent()->getParent()->getParent(); + Module *M = CI.getModule(); Intrinsic::ID MemCpyID = Intrinsic::memcpy; Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), @@ -400,6 +754,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) + { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { @@ -427,6 +788,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::bitreverse: { + Value *IIOperand = II->getArgOperand(0); + Value *X = nullptr; + + // bitreverse(bitreverse(x)) -> x + if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); + break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -669,6 +1040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return new StoreInst(II->getArgOperand(0), Ptr); } break; + case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: @@ -682,6 +1054,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; + case Intrinsic::x86_vcvtph2ps_128: + case Intrinsic::x86_vcvtph2ps_256: { + auto Arg = 
II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + auto RetType = cast<VectorType>(II->getType()); + unsigned ArgWidth = ArgType->getNumElements(); + unsigned RetWidth = RetType->getNumElements(); + assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); + assert(ArgType->isIntOrIntVectorTy() && + ArgType->getScalarSizeInBits() == 16 && + "CVTPH2PS input type should be 16-bit integer vector"); + assert(RetType->getScalarType()->isFloatTy() && + "CVTPH2PS output type should be 32-bit float vector"); + + // Constant folding: Convert to generic half to single conversion. + if (isa<ConstantAggregateZero>(Arg)) + return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + + if (isa<ConstantDataVector>(Arg)) { + auto VectorHalfAsShorts = Arg; + if (RetWidth < ArgWidth) { + SmallVector<int, 8> SubVecMask; + for (unsigned i = 0; i != RetWidth; ++i) + SubVecMask.push_back((int)i); + VectorHalfAsShorts = Builder->CreateShuffleVector( + Arg, UndefValue::get(ArgType), SubVecMask); + } + + auto VectorHalfType = + VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); + auto VectorHalfs = + Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); + auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); + return ReplaceInstUsesWith(*II, VectorFloats); + } + + // We only use the lowest lanes of the argument. + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { + II->setArgOperand(0, V); + return II; + } + break; + } + case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: @@ -692,194 +1108,229 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_cvttsd2si64: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - APInt DemandedElts(VWidth, 1); - APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), - DemandedElts, UndefElts)) { + Value *Arg = II->getArgOperand(0); + unsigned VWidth = Arg->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { II->setArgOperand(0, V); return II; } break; } - // Constant fold <A x Bi> << Ci. - // FIXME: We don't handle _dq because it's a shift of an i128, but is - // represented in the IR as <2 x i64>. A per element shift is wrong. - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). 
+ case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); - auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); - if (!CDV && !CInt) - break; - ConstantInt *Count; - if (CDV) - Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); - else - Count = CInt; - - auto Vec = II->getArgOperand(0); - auto VT = cast<VectorType>(Vec->getType()); - if (Count->getZExtValue() > - VT->getElementType()->getPrimitiveSizeInBits() - 1) - return ReplaceInstUsesWith( - CI, ConstantAggregateZero::get(Vec->getType())); - - bool isPackedShiftLeft = true; - switch (II->getIntrinsicID()) { - default : break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; - } - - unsigned VWidth = VT->getNumElements(); - // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); - if (isPackedShiftLeft) - return BinaryOperator::CreateShl(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); - - return BinaryOperator::CreateLShr(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: { + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
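As a reader aid, the demanded-elements note above is easier to see with a scalar model of what SimplifyX86immshift is folding. The sketch below is an illustration only (assumed 32-bit elements, and it assumes the usual arithmetic right shift for signed values): the key point is that hardware packed shifts saturate on out-of-range counts, while IR shl/lshr/ashr are undefined there, so the fold must treat counts >= the element width specially.

    #include <cstdint>

    // Assumed scalar model of an x86 packed shift (not LLVM code). The
    // non-immediate forms read the count from the low 64 bits of the second
    // vector operand, which is why only those lanes are demanded above.
    uint32_t psrlElt(uint32_t V, uint64_t Count) {
      return Count >= 32 ? 0 : V >> unsigned(Count); // logical: shifts in zeros
    }
    int32_t psraElt(int32_t V, uint64_t Count) {
      return V >> (Count >= 32 ? 31u : unsigned(Count)); // arithmetic: keeps sign
    }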
+ Value *Arg1 = II->getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + II->setArgOperand(1, V); + return II; + } + break; } - case Intrinsic::x86_sse41_pmovsxbw: - case Intrinsic::x86_sse41_pmovsxwd: - case Intrinsic::x86_sse41_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxbd: + case Intrinsic::x86_avx2_pmovsxbq: + case Intrinsic::x86_avx2_pmovsxbw: + case Intrinsic::x86_avx2_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxwd: + case Intrinsic::x86_avx2_pmovsxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_pmovzxbd: + case Intrinsic::x86_sse41_pmovzxbq: case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxdq: case Intrinsic::x86_sse41_pmovzxwd: - case Intrinsic::x86_sse41_pmovzxdq: { - // pmov{s|z}x ignores the upper half of their input vectors. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - unsigned LowHalfElts = VWidth / 2; - APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); - APInt UndefElts(VWidth, 0); - if (Value *TmpV = SimplifyDemandedVectorElts( - II->getArgOperand(0), InputDemandedElts, UndefElts)) { - II->setArgOperand(0, TmpV); + case Intrinsic::x86_sse41_pmovzxwq: + case Intrinsic::x86_avx2_pmovzxbd: + case Intrinsic::x86_avx2_pmovzxbq: + case Intrinsic::x86_avx2_pmovzxbw: + case Intrinsic::x86_avx2_pmovzxdq: + case Intrinsic::x86_avx2_pmovzxwd: + case Intrinsic::x86_avx2_pmovzxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_insertps: + if (Value *V = SimplifyX86insertps(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + II->setArgOperand(1, V); return II; } break; } - case Intrinsic::x86_sse41_insertps: - if (Value *V = SimplifyX86insertps(*II, *Builder)) + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. 
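A standalone sketch of the EXTRQI semantics just described, using plain 64-bit integers instead of APInt (a hypothetical helper, not LLVM code); per the AMD documentation a length field of zero means 64, and an out-of-range field pair gives an undefined result:

    #include <cassert>
    #include <cstdint>

    // Pull 'Length' bits starting at bit 'Index' out of the low 64 bits of
    // the source and zero-extend them, mirroring what SimplifyX86extrq models.
    uint64_t extrqiBits(uint64_t Src, unsigned Index, unsigned Length) {
      if (Length == 0)
        Length = 64; // a zero length field means 64 per the ISA
      assert(Index + Length <= 64 && "result undefined per the ISA");
      uint64_t Lo = Src >> Index;
      return Length == 64 ? Lo : (Lo & ((1ULL << Length) - 1));
    }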
+ Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) return ReplaceInstUsesWith(*II, V); + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. + if (CI11) { + APInt V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } break; - + } + case Intrinsic::x86_sse4a_insertqi: { - // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top - // ones undef - // TODO: eventually we should lower this intrinsic to IR - if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { - if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { - unsigned Index = CIStart->getZExtValue(); - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if ((Index + Length) > 64) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (Length == 64 && Index == 0) { - Value *Vec = II->getArgOperand(1); - Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = { 0, 2 }; - return ReplaceInstUsesWith( - CI, - Builder->CreateShuffleVector( - Vec, Undef, ConstantDataVector::get( - II->getContext(), makeArrayRef(Mask)))); - - } else if (auto Source = - dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { - if (Source->hasOneUse() && - Source->getArgOperand(1) == II->getArgOperand(1)) { - // If the source of the insert has only one use and it's another - // insert (and they're both inserting from the same vector), try to - // bundle both together. 
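The constant-fold branch of the INSERTQ/INSERTQI handling above computes a masked bit insertion. A minimal standalone model of it, assuming plain 64-bit integers in place of APInt (hypothetical helper, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Insert the low 'Length' bits of Src into Dst starting at bit 'Index'.
    // A length field of zero means 64, and Index + Length > 64 is undefined,
    // matching the checks in the code above.
    uint64_t insertqiBits(uint64_t Dst, uint64_t Src, unsigned Index,
                          unsigned Length) {
      if (Length == 0)
        Length = 64;
      assert(Index + Length <= 64 && "result undefined per the ISA");
      uint64_t Mask = Length == 64 ? ~0ULL : (((1ULL << Length) - 1) << Index);
      return (Dst & ~Mask) | ((Src << Index) & Mask);
    }

For example, insertqiBits(~0ULL, 0, 8, 8) clears exactly byte 1, giving 0xFFFFFFFFFFFF00FF.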
- auto CISourceWidth = - dyn_cast<ConstantInt>(Source->getArgOperand(2)); - auto CISourceStart = - dyn_cast<ConstantInt>(Source->getArgOperand(3)); - if (CISourceStart && CISourceWidth) { - unsigned Start = CIStart->getZExtValue(); - unsigned Width = CIWidth->getZExtValue(); - unsigned End = Start + Width; - unsigned SourceStart = CISourceStart->getZExtValue(); - unsigned SourceWidth = CISourceWidth->getZExtValue(); - unsigned SourceEnd = SourceStart + SourceWidth; - unsigned NewStart, NewWidth; - bool ShouldReplace = false; - if (Start <= SourceStart && SourceStart <= End) { - NewStart = Start; - NewWidth = std::max(End, SourceEnd) - NewStart; - ShouldReplace = true; - } else if (SourceStart <= Start && Start <= SourceEnd) { - NewStart = SourceStart; - NewWidth = std::max(SourceEnd, End) - NewStart; - ShouldReplace = true; - } - - if (ShouldReplace) { - Constant *ConstantWidth = ConstantInt::get( - II->getArgOperand(2)->getType(), NewWidth, false); - Constant *ConstantStart = ConstantInt::get( - II->getArgOperand(3)->getType(), NewStart, false); - Value *Args[4] = { Source->getArgOperand(0), - II->getArgOperand(1), ConstantWidth, - ConstantStart }; - Module *M = CI.getParent()->getParent()->getParent(); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); - } - } - } - } - } + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + II->setArgOperand(1, V); + return II; } break; } @@ -894,7 +1345,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // This optimization is convoluted because the intrinsic is defined as // getting a vector of floats or doubles for the ps and pd versions. // FIXME: That should be changed. + + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); Value *Mask = II->getArgOperand(2); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return ReplaceInstUsesWith(CI, Op0); + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) + return ReplaceInstUsesWith(CI, Op0); + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 
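Per lane, the blend being folded to an IR select works like the sketch below (assumed 32-bit lanes, illustration only): only the top bit of each mask lane matters, which is why a constant mask can be collapsed into an i1 selector vector.

    #include <cstdint>

    // Assumed scalar model of one BLENDV lane (not LLVM code): a set top
    // bit in the mask lane selects the second operand.
    uint32_t blendvLane(uint32_t Op0, uint32_t Op1, uint32_t MaskLane) {
      return (MaskLane & 0x80000000u) ? Op1 : Op0;
    }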
if (auto C = dyn_cast<ConstantDataVector>(Mask)) { auto Tyi1 = Builder->getInt1Ty(); auto SelectorType = cast<VectorType>(Mask->getType()); @@ -917,11 +1381,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); } auto NewSelector = ConstantVector::get(Selectors); - return SelectInst::Create(NewSelector, II->getArgOperand(1), - II->getArgOperand(0), "blendv"); - } else { - break; + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: { + // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant. + auto *V = II->getArgOperand(1); + auto *VTy = cast<VectorType>(V->getType()); + unsigned NumElts = VTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32) && + "Unexpected number of elements in shuffle mask!"); + // Initialize the resulting shuffle mask to all zeroes. + uint32_t Indexes[32] = {0}; + + if (auto *Mask = dyn_cast<ConstantDataVector>(V)) { + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + int8_t Index = Mask->getElementAsInteger(I); + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index is the least significant 4 bits of the + // shuffle control byte. + Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; + } + } else if (!isa<ConstantAggregateZero>(V)) + break; + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + for (unsigned I = 16; I < NumElts; ++I) + Indexes[I] += I & 0xF0; + + auto NewC = ConstantDataVector::get(V->getContext(), + makeArrayRef(Indexes, NumElts)); + auto V1 = II->getArgOperand(0); + auto V2 = Constant::getNullValue(II->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); } case Intrinsic::x86_avx_vpermilvar_ps: @@ -972,6 +1475,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(*II, V); break; + case Intrinsic::x86_xop_vpcomb: + case Intrinsic::x86_xop_vpcomd: + case Intrinsic::x86_xop_vpcomq: + case Intrinsic::x86_xop_vpcomw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_xop_vpcomub: + case Intrinsic::x86_xop_vpcomud: + case Intrinsic::x86_xop_vpcomuq: + case Intrinsic::x86_xop_vpcomuw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -1115,15 +1634,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // happen when variable allocas are DCE'd. if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { if (SS->getIntrinsicID() == Intrinsic::stacksave) { - BasicBlock::iterator BI = SS; - if (&*++BI == II) + if (&*++SS->getIterator() == II) return EraseInstFromFunction(CI); } } // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. 
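Looking back at the PSHUFB fold earlier in this hunk, a minimal model of the byte-shuffle semantics it decodes may help (assumed illustration, not LLVM code):

    #include <array>
    #include <cstdint>

    // Assumed model of 128-bit PSHUFB: each control byte selects a source
    // byte through its low 4 bits, or produces zero when its sign bit is
    // set, which is why the fold shuffles against a zero vector.
    std::array<uint8_t, 16> pshufb128(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Ctl) {
      std::array<uint8_t, 16> R{};
      for (int I = 0; I != 16; ++I)
        R[I] = (Ctl[I] & 0x80) ? 0 : Src[Ctl[I] & 0x0F];
      return R;
    }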
- BasicBlock::iterator BI = II; + BasicBlock::iterator BI(II); TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { @@ -1153,6 +1671,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return EraseInstFromFunction(CI); break; } + case Intrinsic::lifetime_start: { + // Remove trivially empty lifetime_start/end ranges, i.e. a start + // immediately followed by an end (ignoring debuginfo or other + // lifetime markers in between). + BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end(); + for (++BI; BI != BE; ++BI) { + if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) { + if (isa<DbgInfoIntrinsic>(LTE) || + LTE->getIntrinsicID() == Intrinsic::lifetime_start) + continue; + if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) { + if (II->getOperand(0) == LTE->getOperand(0) && + II->getOperand(1) == LTE->getOperand(1)) { + EraseInstFromFunction(*LTE); + return EraseInstFromFunction(*II); + } + continue; + } + } + break; + } + break; + } case Intrinsic::assume: { // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by @@ -1206,8 +1747,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to // preserve relocation semantics. - GCRelocateOperands Operands(II); - Value *DerivedPtr = Operands.getDerivedPtr(); + Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); auto *GCRelocateType = cast<PointerType>(II->getType()); // Remove the relocation if unused, note that this check is required @@ -1233,7 +1773,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } // isKnownNonNull -> nonnull attribute - if (isKnownNonNull(DerivedPtr)) + if (isKnownNonNullAt(DerivedPtr, II, DT, TLI)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); // isDereferenceablePointer -> deref attribute @@ -1355,9 +1895,10 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, Value *TrampMem) { // Visit all the previous instructions in the basic block, and try to find a // init.trampoline which has a direct path to the adjust.trampoline. - for (BasicBlock::iterator I = AdjustTramp, - E = AdjustTramp->getParent()->begin(); I != E; ) { - Instruction *Inst = --I; + for (BasicBlock::iterator I = AdjustTramp->getIterator(), + E = AdjustTramp->getParent()->begin(); + I != E;) { + Instruction *Inst = &*--I; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) if (II->getIntrinsicID() == Intrinsic::init_trampoline && II->getOperand(0) == TrampMem) @@ -1400,20 +1941,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. 
+ SmallVector<unsigned, 4> Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { - if (!CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNull(V)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); @@ -1725,16 +2273,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), attrVec); + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); + Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = Builder->CreateInvoke(Callee, II->getNormalDest(), - II->getUnwindDest(), Args); + NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles); NC->takeName(II); cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); } else { CallInst *CI = cast<CallInst>(Caller); - NC = Builder->CreateCall(Callee, Args); + NC = Builder->CreateCall(Callee, Args, OpBundles); NC->takeName(CI); if (CI->isTailCall()) cast<CallInst>(NC)->setTailCall(); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 48ab0eb..0f01d18 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -21,11 +21,11 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear -/// expression. If so, decompose it, returning some value X, such that Val is +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. /// -static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, uint64_t &Offset) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { Offset = CI->getZExtValue(); @@ -62,7 +62,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, // where C1 is divisible by C2. unsigned SubScale; Value *SubVal = - DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); + decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; return SubVal; @@ -76,14 +76,14 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, return Val; } -/// PromoteCastOfAllocation - If we find a cast of an allocation instruction, -/// try to eliminate the cast by moving the type information into the alloc. +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc. 
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); - AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); + AllocaBuilder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); @@ -114,7 +114,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, unsigned ArraySizeScale; uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. - DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); + decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. @@ -154,9 +154,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that -/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually -/// insert the code to evaluate the expression. +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression. Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { @@ -261,9 +260,9 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction return Instruction::CastOps(Res); } -/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually -/// results in any code being generated and is interesting to optimize out. If -/// the cast can be eliminated by some other simple transformation, we prefer +/// Return true if the cast from "V to Ty" actually results in any code being +/// generated and is interesting to optimize out. +/// If the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { @@ -318,9 +317,9 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return nullptr; } -/// CanEvaluateTruncated - Return true if we can evaluate the specified -/// expression tree as type Ty instead of its larger type, and arrive with the -/// same value. This is used by code that tries to eliminate truncates. +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates. /// /// Ty will always be a type smaller than V. We should return true if trunc(V) /// can be computed by computing V in the smaller type. If V is an instruction, @@ -329,7 +328,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, Instruction *CxtI) { // We can always evaluate constants in another type. if (isa<Constant>(V)) @@ -359,8 +358,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, case Instruction::Or: case Instruction::Xor: // These operators can all arbitrarily be extended or truncated. 
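That claim can be made concrete; the recursion that continues just below relies on identities like this sketch (assumed i32 to i8 truncation, illustration only). The same identity holds for and/or/xor and for add/sub/mul, which is what lets canEvaluateTruncated recurse through them:

    #include <cstdint>

    // Computing in the narrow type first yields the same low bits as
    // truncating the wide result (not LLVM code).
    uint8_t truncAfter(uint32_t A, uint32_t B)  { return uint8_t(A & B); }
    uint8_t truncBefore(uint32_t A, uint32_t B) { return uint8_t(A) & uint8_t(B); }
    // truncAfter(A, B) == truncBefore(A, B) for every A and B.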
- return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); case Instruction::UDiv: case Instruction::URem: { @@ -371,8 +370,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth); if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); } } break; @@ -383,7 +382,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { uint32_t BitWidth = Ty->getScalarSizeInBits(); if (CI->getLimitedValue(BitWidth) < BitWidth) - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } break; case Instruction::LShr: @@ -396,7 +395,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (IC.MaskedValueIsZero(I->getOperand(0), APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) && CI->getLimitedValue(BitWidth) < BitWidth) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } } break; @@ -410,8 +409,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return true; case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); - return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && - CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); + return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && + canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -419,7 +418,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) + if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI)) return false; return true; } @@ -431,6 +430,50 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return false; } +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. 
+/// Example (big endian):
+///   trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
+///   --->
+///   extractelement <4 x i32> %X, 1
+static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
+                                         const DataLayout &DL) {
+  Value *TruncOp = Trunc.getOperand(0);
+  Type *DestType = Trunc.getType();
+  if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
+    return nullptr;
+
+  Value *VecInput = nullptr;
+  ConstantInt *ShiftVal = nullptr;
+  if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
+                                  m_LShr(m_BitCast(m_Value(VecInput)),
+                                         m_ConstantInt(ShiftVal)))) ||
+      !isa<VectorType>(VecInput->getType()))
+    return nullptr;
+
+  VectorType *VecType = cast<VectorType>(VecInput->getType());
+  unsigned VecWidth = VecType->getPrimitiveSizeInBits();
+  unsigned DestWidth = DestType->getPrimitiveSizeInBits();
+  unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
+
+  if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
+    return nullptr;
+
+  // If the element type of the vector doesn't match the result type,
+  // bitcast it to a vector type that we can extract from.
+  unsigned NumVecElts = VecWidth / DestWidth;
+  if (VecType->getElementType() != DestType) {
+    VecType = VectorType::get(DestType, NumVecElts);
+    VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc");
+  }
+
+  unsigned Elt = ShiftAmount / DestWidth;
+  if (DL.isBigEndian())
+    Elt = NumVecElts - 1 - Elt;
+
+  return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+}
+
 Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
@@ -441,7 +484,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // min/max.
   Value *LHS, *RHS;
   if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
-    if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN)
+    if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
 
   // See if we can simplify any instructions used by the input whose sole
@@ -457,7 +500,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateTruncated(Src, DestTy, *this, &CI)) {
+      canEvaluateTruncated(Src, DestTy, *this, &CI)) {
 
     // If this cast is a truncate, evaluating in a different type always
     // eliminates the cast, so it is always a win.
@@ -470,7 +513,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(Src->getType(), 1);
+    Constant *One = ConstantInt::get(SrcTy, 1);
     Src = Builder->CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
     return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
@@ -489,31 +532,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
     // If the shift amount is larger than the size of A, then the result is
     // known to be zero because all the input bits got shifted out.
     if (Cst->getZExtValue() >= ASize)
-      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType()));
+      return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy));
 
     // Since we're doing an lshr and a zero extend, and know that the shift
     // amount is smaller than ASize, it is always safe to do the shift in A's
     // type, then zero extend or truncate to the result.
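The hunk continues below with a new companion fold, trunc(lshr(sext A), Cst) -> ashr A, Cst. A standalone demonstration of why the two agree, assuming an i8 source sign-extended to i32 and Cst < 8 as the transform requires (and assuming the usual arithmetic shift for signed values; illustration only, not LLVM code):

    #include <cstdint>

    int8_t viaSextLshrTrunc(int8_t A, unsigned Cst) {
      int32_t Wide = A;                          // sext i8 -> i32
      uint32_t Shifted = uint32_t(Wide) >> Cst;  // lshr i32
      return int8_t(Shifted);                    // trunc i32 -> i8
    }
    int8_t viaAshr(int8_t A, unsigned Cst) {
      return int8_t(A >> Cst);                   // ashr i8
    }
    // For every A and every Cst < 8 the two agree: the bits the lshr pulls
    // down out of the sign extension are exactly the sign-bit copies an
    // ashr would produce.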
Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue()); Shift->takeName(Src); - return CastInst::CreateIntegerCast(Shift, CI.getType(), false); + return CastInst::CreateIntegerCast(Shift, DestTy, false); + } + + // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type + // conversion. + // It works because bits coming from sign extension have the same value as + // the sign bit of the original value; performing ashr instead of lshr + // generates bits of the same value as the sign bit. + if (Src->hasOneUse() && + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && + cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { + const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + // This optimization can be only performed when zero bits generated by + // the original lshr aren't pulled into the value after truncation, so we + // can only shift by values smaller than the size of destination type (in + // bits). + if (Cst->getValue().ult(ASize)) { + Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), true); + } } // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. - if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && - ShouldChangeType(Src->getType(), CI.getType()) && + if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && + ShouldChangeType(SrcTy, DestTy) && match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) { - Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr"); + Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr"); return BinaryOperator::CreateAnd(NewTrunc, - ConstantExpr::getTrunc(Cst, CI.getType())); + ConstantExpr::getTrunc(Cst, DestTy)); } + if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL)) + return I; + return nullptr; } -/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (zext icmp) to bitwise / integer operations in order to eliminate +/// the icmp. Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, bool DoXform) { // If we are just checking for a icmp eq of a single bit and zext'ing it @@ -525,19 +591,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. 
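The two rewrites named in the comment above, written out as a runnable scalar sketch with an assumed 32-bit width (illustration only, not LLVM code):

    #include <cstdint>

    // zext(x <s 0): the unsigned shift moves the sign bit down to bit 0.
    uint32_t zextIsNegative(int32_t X)    { return uint32_t(X) >> 31; }
    // zext(x >s -1): the same shift, inverted with an xor.
    uint32_t zextIsNonNegative(int32_t X) { return (uint32_t(X) >> 31) ^ 1; }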
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) || - (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) { + (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) { if (!DoXform) return ICI; Value *In = ICI->getOperand(0); Value *Sh = ConstantInt::get(In->getType(), - In->getType()->getScalarSizeInBits()-1); - In = Builder->CreateLShr(In, Sh, In->getName()+".lobit"); + In->getType()->getScalarSizeInBits() - 1); + In = Builder->CreateLShr(In, Sh, In->getName() + ".lobit"); if (In->getType() != CI.getType()) In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/); if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); - In = Builder->CreateXor(In, One, In->getName()+".not"); + In = Builder->CreateXor(In, One, In->getName() + ".not"); } return ReplaceInstUsesWith(CI, In); @@ -573,13 +639,13 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return ReplaceInstUsesWith(CI, Res); } - uint32_t ShiftAmt = KnownZeroMask.logBase2(); + uint32_t ShAmt = KnownZeroMask.logBase2(); Value *In = ICI->getOperand(0); - if (ShiftAmt) { + if (ShAmt) { // Perform a logical shr by shiftamt. // Insert the shift to put the result in the low bit. - In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt), - In->getName()+".lobit"); + In = Builder->CreateLShr(In, ConstantInt::get(In->getType(), ShAmt), + In->getName() + ".lobit"); } if ((Op1CV != 0) == isNE) { // Toggle the low bit. @@ -637,8 +703,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return nullptr; } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits. If not, return false. +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false. /// /// If this function returns true, it can also return a non-zero number of bits /// (in BitsToClear) which indicates that the value it computes is correct for @@ -655,7 +721,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, /// clear the top bits anyway, doing this has no extra cost. /// /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, +static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, InstCombiner &IC, Instruction *CxtI) { BitsToClear = 0; if (isa<Constant>(V)) @@ -685,8 +751,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) return false; // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) @@ -713,7 +779,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. 
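The shl case is the interesting part of the BitsToClear bookkeeping, and the code just below implements it. A concrete instance with an assumed i8 to i32 zero extension: evaluating the shift in the wide type is correct once the bits above the original width are masked off, and the shift itself supplies zeros at the low end, which is the reduction the comment above describes (illustration only, not LLVM code):

    #include <cstdint>

    uint32_t viaNarrowShl(uint8_t X, unsigned C) {
      return uint8_t(X << C);                  // shl in i8, then zext
    }
    uint32_t viaWideShl(uint8_t X, unsigned C) {
      return (uint32_t(X) << C) & 0xFF;        // shl in i32, then clear high bits
    }
    // viaNarrowShl(X, C) == viaWideShl(X, C) for every X and C < 8.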
if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; @@ -724,7 +790,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; BitsToClear += Amt->getZExtValue(); if (BitsToClear > V->getType()->getScalarSizeInBits()) @@ -734,8 +800,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // Cannot promote variable LSHR. return false; case Instruction::Select: - if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || + if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear are // known zero in the disagreeing side. Tmp != BitsToClear) @@ -747,10 +813,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) return false; for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear // are known zero in the disagreeing input. Tmp != BitsToClear) @@ -787,13 +853,13 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid zero extend: " << CI); + " to avoid zero extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); @@ -897,8 +963,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return nullptr; } -/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. 
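The canonical instance of the transform named above, as a scalar sketch with an assumed 32-bit width (and assuming the usual arithmetic shift for signed values; not LLVM code):

    #include <cstdint>

    // sext(x <s 0): an arithmetic shift smears the sign bit across the
    // word, yielding all-ones for negative x and zero otherwise.
    int32_t sextIsNegative(int32_t X) { return X >> 31; }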
 Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
   Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
   ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -985,15 +1050,14 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
   return nullptr;
 }
 
-/// CanEvaluateSExtd - Return true if we can take the specified value
-/// and return it as type Ty without inserting any new casts and without
-/// changing the value of the common low bits. This is used by code that tries
-/// to promote integer operations to a wider types will allow us to eliminate
-/// the extension.
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the common
+/// low bits. This is used by code that tries to promote integer operations to
+/// a wider type, which will allow us to eliminate the extension.
 ///
 /// This function works on both vectors and scalars.
 ///
-static bool CanEvaluateSExtd(Value *V, Type *Ty) {
+static bool canEvaluateSExtd(Value *V, Type *Ty) {
   assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
          "Can't sign extend type to a smaller type");
   // If this is a constant, it can be trivially promoted.
@@ -1023,15 +1087,15 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
   case Instruction::Sub:
   case Instruction::Mul:
     // These operators can all arbitrarily be extended if their inputs can.
-    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
-           CanEvaluateSExtd(I->getOperand(1), Ty);
+    return canEvaluateSExtd(I->getOperand(0), Ty) &&
+           canEvaluateSExtd(I->getOperand(1), Ty);
 
   //case Instruction::Shl:   TODO
   //case Instruction::LShr:  TODO
 
   case Instruction::Select:
-    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
-           CanEvaluateSExtd(I->getOperand(2), Ty);
+    return canEvaluateSExtd(I->getOperand(1), Ty) &&
+           canEvaluateSExtd(I->getOperand(2), Ty);
 
   case Instruction::PHI: {
     // We can change a phi if we can change all operands. Note that we never
@@ -1039,7 +1103,7 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
     for (Value *IncValue : PN->incoming_values())
-      if (!CanEvaluateSExtd(IncValue, Ty)) return false;
+      if (!canEvaluateSExtd(IncValue, Ty)) return false;
     return true;
   }
   default:
@@ -1081,10 +1145,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateSExtd(Src, DestTy)) {
+      canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
-          " to avoid sign extend: " << CI);
+                 " to avoid sign extend: " << CI << '\n');
     Value *Res = EvaluateInDifferentType(Src, DestTy, true);
     assert(Res->getType() == DestTy);
@@ -1149,9 +1213,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
 }
 
-/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
+/// Return a Constant* for the specified floating-point constant if it fits
 /// in the specified FP type without changing its value.
-static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { bool losesInfo; APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); @@ -1160,12 +1224,12 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return nullptr; } -/// LookThroughFPExtensions - If this is an fp extension instruction, look +/// If this is a floating-point extension instruction, look /// through it until we get the source value. -static Value *LookThroughFPExtensions(Value *V) { +static Value *lookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) - return LookThroughFPExtensions(I->getOperand(0)); + return lookThroughFPExtensions(I->getOperand(0)); // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn @@ -1174,14 +1238,14 @@ static Value *LookThroughFPExtensions(Value *V) { if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext())) return V; // No constant folding of this. // See if the value can be truncated to half and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf)) return V; // See if the value can be truncated to float and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle)) return V; if (CFP->getType()->isDoubleTy()) return V; // Won't shrink. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble)) return V; // Don't try to shrink to various long double types. } @@ -1193,7 +1257,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to - // simpilify this expression to avoid one or more of the trunc/extend + // simplify this expression to avoid one or more of the trunc/extend // operations if we can do so without changing the numerical results. // // The exact manner in which the widths of the operands interact to limit @@ -1201,8 +1265,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // is explained below in the various case statements. BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { - Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); - Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); + Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0)); + Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1)); unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth(); unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); @@ -1307,10 +1371,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // (fptrunc (select cond, R1, Cst)) --> // (select cond, (fptrunc R1), (fptrunc Cst)) + // + // - but only if this isn't part of a min/max operation, else we'll + // ruin min/max canonical form which is to have the select and + // compare's operands be of the same type with no casts to look through. 
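Returning to the fitsInFPType helper renamed earlier in this hunk: its round-trip test can be pictured with native types. The sketch below assumes IEEE-754 conversion behavior and glosses over NaN payloads, which the real APFloat-based code handles precisely (illustration only, not LLVM code):

    // A double can be narrowed to float when converting back reproduces
    // the original value exactly.
    bool fitsInFloat(double D) {
      float F = static_cast<float>(D);
      return static_cast<double>(F) == D;
    }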
+ Value *LHS, *RHS; SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); if (SI && (isa<ConstantFP>(SI->getOperand(1)) || - isa<ConstantFP>(SI->getOperand(2)))) { + isa<ConstantFP>(SI->getOperand(2))) && + matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) { Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), CI.getType()); Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), @@ -1327,9 +1397,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), CI.getType()); Type *IntrinsicType[] = { CI.getType() }; - Function *Overload = - Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), - II->getIntrinsicID(), IntrinsicType); + Function *Overload = Intrinsic::getDeclaration( + CI.getModule(), II->getIntrinsicID(), IntrinsicType); Value *Args[] = { InnerTrunc }; return CallInst::Create(Overload, Args, II->getName()); @@ -1483,12 +1552,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } -/// OptimizeVectorResize - This input value (which is known to have vector type) -/// is being zero extended or truncated to the specified vector type. Try to -/// replace it with a shuffle (and vector/vector bitcast) if possible. +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. @@ -1548,8 +1617,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { return Value / Ty->getPrimitiveSizeInBits(); } -/// CollectInsertionElements - V is a value which is inserted into a vector of -/// VecEltTy. Look through the value to see if we can decompose it into +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. @@ -1558,7 +1627,7 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned Shift, +static bool collectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value *> &Elements, Type *VecEltTy, bool isBigEndian) { assert(isMultipleOfTypeSize(Shift, VecEltTy) && @@ -1595,7 +1664,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, // If the constant is the size of a vector element, we just need to bitcast // it to the right type so it gets properly inserted. if (NumElts == 1) - return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), Shift, Elements, VecEltTy, isBigEndian); // Okay, this is a constant that covers multiple elements. 
Slice it up into @@ -1611,7 +1680,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, isBigEndian)) return false; } @@ -1625,19 +1694,19 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian) && - CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. @@ -1645,7 +1714,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); } @@ -1653,8 +1722,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, } -/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we -/// may be doing shifts and ors to assemble the elements of the vector manually. +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually. /// Try to rip the code out and replace it with insertelements. This is to /// optimize code like this: /// @@ -1667,13 +1736,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, /// %tmp43 = bitcast i64 %ins35 to <2 x float> /// /// Into two insertelements that do "buildvector{%inc, %inc5}". -static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); - if (!CollectInsertionElements(IntInput, 0, Elements, + if (!collectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC.getDataLayout().isBigEndian())) return nullptr; @@ -1692,63 +1761,29 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, return Result; } - -/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double -/// bitcast. The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. 
The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, + InstCombiner &IC, const DataLayout &DL) { - Value *Src = CI.getOperand(0); - Type *DestTy = CI.getType(); - - // If this is a bitcast from int to float, check to see if the int is an - // extraction from a vector. - Value *VecInput = nullptr; - // bitcast(trunc(bitcast(somevector))) - if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } - - unsigned Elt = 0; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; - // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = nullptr; - if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), - m_ConstantInt(ShAmt)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && - ShAmt->getZExtValue() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } + // The bitcast must be to a vectorizable type, otherwise we can't make a new + // type to extract from. + Type *DestType = BitCast.getType(); + if (!VectorType::isValidElementType(DestType)) + return nullptr; - unsigned Elt = ShAmt->getZExtValue() / DestWidth; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } - return nullptr; + unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); + auto *NewVecType = VectorType::get(DestType, NumElts); + auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(), + NewVecType, "bc"); + return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand()); } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1794,11 +1829,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - // Try to optimize int -> float bitcasts. 
- if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) - if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) - return I; - if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); @@ -1815,7 +1845,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), + if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -1823,7 +1853,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. - if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) + if (Value *V = optimizeIntegerToVectorInsertions(CI, *this)) return ReplaceInstUsesWith(CI, V); } } @@ -1872,6 +1902,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } + if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL)) + return I; + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 95bba3c..c0786af 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -216,8 +216,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, Max = KnownOne|UnknownBits; } - - /// FoldCmpLoadFromIndexedGlobal - Called we see this pattern: /// cmp pred (load (gep GV, ...)), cmpcst /// where GV is a global variable with a constant initializer. Try to simplify @@ -371,7 +369,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - // If this element is in range, update our magic bitvector. if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; @@ -469,7 +466,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } - // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 @@ -496,7 +492,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return nullptr; } - /// EvaluateGEPOffsetExpression - Return a value that can be used to compare /// the *offset* implied by a GEP to zero. For example, if we have &A[i], we /// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can @@ -562,8 +557,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, } } - - // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. 
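The EvaluateGEPOffsetExpression logic whose context closes the hunk above reduces a pointer comparison to an offset comparison: for &A[i] against &A[0], the addresses differ exactly when i is nonzero. A minimal standalone C++ sketch of that identity (the local array is a hypothetical stand-in for the GEP base, not anything from the patch):

#include <cassert>

int main() {
  int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  for (int I = 0; I < 8; ++I) {
    // icmp ne (gep A, I), A  <=>  icmp ne I, 0: with a nonzero element
    // size, the two addresses differ exactly when the offset is nonzero.
    assert((&A[I] != &A[0]) == (I != 0));
  }
  return 0;
}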
@@ -737,6 +730,83 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return nullptr; } +Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, + Value *Other) { + assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + + // It would be tempting to fold away comparisons between allocas and any + // pointer not based on that alloca (e.g. an argument). However, even + // though such pointers cannot alias, they can still compare equal. + // + // But LLVM doesn't specify where allocas get their memory, so if the alloca + // doesn't escape we can argue that it's impossible to guess its value, and we + // can therefore act as if any such guesses are wrong. + // + // The code below checks that the alloca doesn't escape, and that it's only + // used in a comparison once (the current instruction). The + // single-comparison-use condition ensures that we're trivially folding all + // comparisons against the alloca consistently, and avoids the risk of + // erroneously folding a comparison of the pointer with itself. + + unsigned MaxIter = 32; // Break cycles and bound to constant-time. + + SmallVector<Use *, 32> Worklist; + for (Use &U : Alloca->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + + unsigned NumCmps = 0; + while (!Worklist.empty()) { + assert(Worklist.size() <= MaxIter); + Use *U = Worklist.pop_back_val(); + Value *V = U->getUser(); + --MaxIter; + + if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || + isa<SelectInst>(V)) { + // Track the uses. + } else if (isa<LoadInst>(V)) { + // Loading from the pointer doesn't escape it. + continue; + } else if (auto *SI = dyn_cast<StoreInst>(V)) { + // Storing *to* the pointer is fine, but storing the pointer escapes it. + if (SI->getValueOperand() == U->get()) + return nullptr; + continue; + } else if (isa<ICmpInst>(V)) { + if (NumCmps++) + return nullptr; // Found more than one cmp. + continue; + } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) { + switch (Intrin->getIntrinsicID()) { + // These intrinsics don't escape or compare the pointer. Memset is safe + // because we don't allow ptrtoint. Memcpy and memmove are safe because + // we don't allow stores, so src cannot point to V. + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + continue; + default: + return nullptr; + } + } else { + return nullptr; + } + for (Use &U : V->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + } + + Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); + return ReplaceInstUsesWith( + ICI, + ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} + /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, @@ -851,7 +921,6 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } - } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. 
X/2 op 0 --> [-1, 2) @@ -996,7 +1065,6 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, return Res; } - // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; @@ -1074,18 +1142,22 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A, if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); - // Get the distance between the highest bit that's set. int Shift; - // Both the constants are negative, take their positive to calculate log. if (IsAShr && AP1.isNegative()) - // Get the ones' complement of AP2 and AP1 when computing the distance. - Shift = (~AP2).logBase2() - (~AP1).logBase2(); + Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); else - Shift = AP2.logBase2() - AP1.logBase2(); + Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); if (Shift > 0) { - if (IsAShr ? AP1 == AP2.ashr(Shift) : AP1 == AP2.lshr(Shift)) + if (IsAShr && AP1 == AP2.ashr(Shift)) { + // There are multiple solutions if we are comparing against -1 and the LHS + // of the ashr is not a power of two. + if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) + return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } else if (AP1 == AP2.lshr(Shift)) { return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } } // Shifting const2 will never be equal to const1. return getConstant(false); @@ -1145,6 +1217,14 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, switch (LHSI->getOpcode()) { case Instruction::Trunc: + if (RHS->isOne() && RHSV.getBitWidth() > 1) { + // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI->getOperand(0), m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } if (ICI.isEquality() && LHSI->hasOneUse()) { // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all // of the high bits truncated out of x are known. @@ -1447,9 +1527,35 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE, LHSI->getOperand(0), SubOne(RHS)); + + // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) + // iff C is a power of 2 + if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { + if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { + const APInt &AI = CI->getValue(); + int32_t ExactLogBase2 = AI.exactLogBase2(); + if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { + Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); + Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); + return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ + ? 
ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + Trunc, Constant::getNullValue(NTy)); + } + } + } break; case Instruction::Or: { + if (RHS->isOne()) { + // icmp slt signum(V) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse()) break; Value *P, *Q; @@ -2083,11 +2189,9 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // If the pattern matches, truncate the inputs to the narrower type and // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. - Module *M = I.getParent()->getParent()->getParent(); - Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - NewType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy *Builder = IC.Builder; @@ -2123,6 +2227,12 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, return true; }; + // If the overflow check was an add followed by a compare, the insertion point + // may be pointing to the compare. We want to insert the new instructions + // before the add in case there are uses of the add between the add and the + // compare. + Builder->SetInsertPoint(&OrigI); + switch (OCF) { case OCF_INVALID: llvm_unreachable("bad overflow check kind!"); @@ -2223,7 +2333,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); - Instruction *MulInstr = cast<Instruction>(MulVal); + auto *MulInstr = dyn_cast<Instruction>(MulVal); + if (!MulInstr) + return nullptr; assert(MulInstr->getOpcode() == Instruction::Mul); auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), @@ -2357,7 +2469,6 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, InstCombiner::BuilderTy *Builder = IC.Builder; Builder->SetInsertPoint(MulInstr); - Module *M = I.getParent()->getParent()->getParent(); // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) Value *MulA = A, *MulB = B; @@ -2365,8 +2476,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder->CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder->CreateZExt(B, MulType); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); @@ -2468,7 +2579,6 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, default: return APInt::getAllOnesValue(BitWidth); } - } /// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst @@ -2905,7 +3015,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_NE: { @@ -2950,7 +3059,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_ULT: @@ -3103,7 +3211,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and 
the select turned into a bitwise or. Value *Op1 = nullptr, *Op2 = nullptr; - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); CI = dyn_cast<ConstantInt>(Op1); @@ -3177,6 +3285,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ICmpInst::getSwappedPredicate(I.getPredicate()), I)) return NI; + // Try to optimize equality comparisons against alloca-based pointers. + if (Op0->getType()->isPointerTy() && I.isEquality()) { + assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1)) + return New; + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0)) + return New; + } + // Test to see if the operands of the icmp are casted versions of other // values. If the ptr->ptr cast can be stripped off both arguments, we do so // now. @@ -3304,6 +3423,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + // icmp sgt X, (Y + -1) -> icmp sge X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + + // icmp sle X, (Y + -1) -> icmp slt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + + // icmp sge X, (Y + 1) -> icmp sgt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + + // icmp slt X, (Y + 1) -> icmp sle X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); + // if C1 has greater magnitude than C2: // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y // s.t. C3 = C1 - C2 @@ -3473,6 +3612,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } + + if (BO0) { + // Transform A & (L - 1) `ult` L --> L != 0 + auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); + auto BitwiseAnd = + m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + + if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + auto *Zero = Constant::getNullValue(BO0->getType()); + return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); + } + } } { Value *A, *B; @@ -3697,15 +3848,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - // Check to see that the input is converted from an integer type that is small - // enough that preserves all bits. TODO: check here for "known" sign bits. - // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. - unsigned InputSize = IntTy->getScalarSizeInBits(); - - // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); - if (LHSUnsigned) - ++InputSize; if (I.isEquality()) { FCmpInst::Predicate P = I.getPredicate(); @@ -3732,13 +3875,30 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // equality compares as integer? } - // Comparisons with zero are a special case where we know we won't lose - // information. 
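The new `A & (L - 1) ult L --> L != 0` transform in the visitICmpInst hunk above is exhaustively checkable: for L == 0 nothing is unsigned-less-than zero, and for L != 0 the masked value is at most L - 1. A brute-force sketch over i8, with plain unsigned arithmetic standing in for APInt:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned L = 0; L < 256; ++L) {
      bool Original = (A & ((L - 1) & 0xFF)) < L; // A & (L - 1) `ult` L on i8
      bool Folded = (L != 0);                     // the replacement
      assert(Original == Folded);
    }
  return 0;
}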
- bool IsCmpZero = RHS.isPosZero(); + // Check to see that the input is converted from an integer type that is small + // enough that preserves all bits. TODO: check here for "known" sign bits. + // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. + unsigned InputSize = IntTy->getScalarSizeInBits(); - // If the conversion would lose info, don't hack on this. - if ((int)InputSize > MantissaWidth && !IsCmpZero) - return nullptr; + // Following test does NOT adjust InputSize downwards for signed inputs, + // because the most negative value still requires all the mantissa bits + // to distinguish it from one less than that value. + if ((int)InputSize > MantissaWidth) { + // Conversion would lose accuracy. Check if loss can impact comparison. + int Exp = ilogb(RHS); + if (Exp == APFloat::IEK_Inf) { + int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); + if (MaxExponent < (int)InputSize - !LHSUnsigned) + // Conversion could create infinity. + return nullptr; + } else { + // Note that if RHS is zero or NaN, then Exp is negative + // and first condition is trivially false. + if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned) + // Conversion could affect comparison. + return nullptr; + } + } // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ac934f1..e4e5065 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -281,6 +281,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); + Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other); Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); @@ -341,6 +342,7 @@ public: const unsigned SIOpd); private: + bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; @@ -360,6 +362,11 @@ private: /// \brief Try to optimize a sequence of instructions checking if an operation /// on LHS and RHS overflows. /// + /// If this overflow check is done via one of the overflow check intrinsics, + /// then CtxI has to be the call instruction calling that intrinsic. If this + /// overflow check is done by arithmetic followed by a compare, then CtxI has + /// to be the arithmetic instruction. + /// /// If a simplification is possible, stores the simplified result of the /// operation in OperationResult and result of the overflow check in /// OverflowResult, and return true. If no simplification is possible, @@ -393,7 +400,7 @@ public: assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(Old.getIterator(), New); // Insert inst Worklist.Add(New); return New; } @@ -407,7 +414,7 @@ public: /// \brief A combiner-aware RAUW-like routine. /// /// This method is to be used when an instruction is found to be dead, - /// replacable with another preexisting expression. 
Here we add all uses of + /// replaceable with another preexisting expression. Here we add all uses of /// I to the worklist, replace all uses of I with the new value, then return /// I, so that the inst combiner will know that I was modified. Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { @@ -539,6 +546,7 @@ private: Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); @@ -548,7 +556,7 @@ private: Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); - Instruction *MatchBSwap(BinaryOperator &I); + Instruction *MatchBSwapOrBitReverse(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e3179db..47406b9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" @@ -90,21 +91,23 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (CS.isCallee(&U)) continue; + unsigned DataOpNo = CS.getDataOperandNo(&U); + bool IsArgOperand = CS.isArgOperand(&U); + // Inalloca arguments are clobbered by the call. - unsigned ArgNo = CS.getArgumentNo(&U); - if (CS.isInAllocaArgument(ArgNo)) + if (IsArgOperand && CS.isInAllocaArgument(DataOpNo)) return false; // If this is a readonly/readnone call site, then we know it is just a // load (but one that potentially returns the value itself), so we can // ignore it if we know that the value isn't captured. if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) + if (IsArgOperand && CS.isByValArgument(DataOpNo)) continue; } @@ -186,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info // - BasicBlock::iterator It = New; + BasicBlock::iterator It(New); while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It; @@ -367,7 +370,13 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT MDB.createRange(NonNullInt, NullInt)); } break; - + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. 
+ if (NewTy->isPointerTy()) + NewLoad->setMetadata(ID, N); + break; case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type // conversions make it hard. If the new type is a pointer, we could @@ -418,6 +427,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value case LLVMContext::MD_invariant_load: case LLVMContext::MD_nonnull: case LLVMContext::MD_range: + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: // These don't apply for stores. break; } @@ -511,16 +523,46 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (!T->isAggregateType()) return nullptr; - assert(LI.getAlignment() && "Alignement must be set at this point"); + assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, LI.getName())); } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return nullptr; + + auto Name = LI.getName(); + SmallString<16> LoadName = Name; + LoadName += ".unpack"; + SmallString<16> EltName = Name; + EltName += ".elt"; + auto *Addr = LI.getPointerOperand(); + Value *V = UndefValue::get(T); + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); + auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); + V = IC.Builder->CreateInsertValue(V, L, i); + } + + V->setName(Name); + return IC.ReplaceInstUsesWith(LI, V); } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -681,7 +723,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // FIXME: If the GEP is not inbounds, and there are extra indices after the // one we'll replace, those could cause the address computation to wrap // (rendering the IsAllNonNegative() check below insufficient). We can do - // better, ignoring zero indicies (and other indicies we can prove small + // better, ignoring zero indices (and other indices we can prove small // enough not to wrap). if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) return false; @@ -748,19 +790,19 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. 
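unpackLoadToAggregate above splits a single-struct load into one load per element feeding insertvalue, and deliberately bails on padded layouts so the knowledge that padding exists is not lost. In C++ terms the rewrite amounts to the equivalence below (the Pair type is an illustrative stand-in for a padding-free struct, not part of the patch):

#include <cassert>
#include <cstdint>

struct Pair { uint32_t A; uint32_t B; }; // padding-free, so SL->hasPadding() would be false

int main() {
  Pair P = {7, 9};
  Pair Whole = P;  // the original aggregate load
  Pair Rebuilt;    // the unpacked form, built element by element
  Rebuilt.A = P.A; // load of GEP {0, 0}, then insertvalue at index 0
  Rebuilt.B = P.B; // load of GEP {0, 1}, then insertvalue at index 1
  assert(Whole.A == Rebuilt.A && Whole.B == Rebuilt.B);
  return 0;
}

The same element-by-element pattern, with extractvalue feeding per-field stores, appears in unpackStoreToAggregate further down.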
- BasicBlock::iterator BBI = &LI; + BasicBlock::iterator BBI(LI); AAMDNodes AATags; - if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, - 6, AA, &AATags)) { + if (Value *AvailableVal = + FindAvailableLoadedValue(Op, LI.getParent(), BBI, + DefMaxInstsToScan, AA, &AATags)) { if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) { unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(NLI, &LI, KnownIDs); }; @@ -822,7 +864,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } // load (select (cond, null, P)) -> load P - if (isa<ConstantPointerNull>(SI->getOperand(1)) && + if (isa<ConstantPointerNull>(SI->getOperand(1)) && LI.getPointerAddressSpace() == 0) { LI.setOperand(0, SI->getOperand(2)); return &LI; @@ -857,7 +899,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { /// /// \returns true if the store was successfully combined away. This indicates /// the caller must erase the store instruction. We have to let the caller erase -/// the store instruction sas otherwise there is no way to signal whether it was +/// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic @@ -893,11 +935,38 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { V = IC.Builder->CreateExtractValue(V, 0); combineStoreToNewValue(IC, SI, V); return true; } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return false; + + SmallString<16> EltName = V->getName(); + EltName += ".elt"; + auto *Addr = SI.getPointerOperand(); + SmallString<16> AddrName = Addr->getName(); + AddrName += ".repack"; + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName); + auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); + IC.Builder->CreateStore(Val, Ptr); + } + + return true; } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -971,9 +1040,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return &SI; } - // Don't hack volatile/atomic stores. - // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return nullptr; + // Don't hack volatile/ordered stores. + // FIXME: Some bits are legal for ordered atomic stores; needs refactoring. 
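When visitLoadInst above forwards an earlier loaded value and one load replaces another, combineMetadata merges the two instructions' metadata over the expanded KnownIDs list so nothing unsound survives; for presence-style kinds such as !nonnull that conservative merge behaves like keeping only what both loads carry. A loose sketch of that idea only (string kind names and a plain set intersection are illustrative, not the LLVM API, and some kinds merge differently, e.g. !range takes the most general range):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <set>
#include <string>

int main() {
  std::set<std::string> NewLoad = {"tbaa", "nonnull", "align"};
  std::set<std::string> OldLoad = {"tbaa", "range"};
  std::set<std::string> KeptOnBoth;
  // Keep only metadata both loads can vouch for.
  std::set_intersection(NewLoad.begin(), NewLoad.end(),
                        OldLoad.begin(), OldLoad.end(),
                        std::inserter(KeptOnBoth, KeptOnBoth.begin()));
  assert(KeptOnBoth == std::set<std::string>{"tbaa"});
  return 0;
}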
+ if (!SI.isUnordered()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -991,7 +1060,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Do really simple DSE, to catch cases where there are several consecutive // stores to the same location, separated by a few arithmetic operations. This // situation often occurs with bitfield accesses. - BasicBlock::iterator BBI = &SI; + BasicBlock::iterator BBI(SI); for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts; --ScanInsts) { --BBI; @@ -1005,7 +1074,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), + if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; ++BBI; @@ -1019,9 +1088,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { - if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && - LI->isSimple()) + if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { + assert(SI.isUnordered() && "can't eliminate ordering operation"); return EraseInstFromFunction(SI); + } // Otherwise, this is a load from some other location. Stores before it // may not be dead. @@ -1047,10 +1117,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa<UndefValue>(Val)) return EraseInstFromFunction(SI); + // The code below needs to be audited and adjusted for unordered atomics + if (!SI.isSimple()) + return nullptr; + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = SI.getIterator(); do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -1106,7 +1180,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // Verify that the other block ends in a branch and is not otherwise empty. - BasicBlock::iterator BBI = OtherBB->getTerminator(); + BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a554e9f..160792b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -22,9 +22,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// simplifyValueKnownNonZero - The specific integer value is used in a context -/// where it is known to be non-zero. If this allows us to simplify the -/// computation, do so and return the new operand, otherwise return null. +/// The specific integer value is used in a context where it is known to be +/// non-zero. If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null. 
static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine @@ -76,8 +76,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, } -/// MultiplyOverflows - True if the multiply can not be expressed in an int -/// this size. +/// True if the multiply can not be expressed in an int this size. static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product, bool IsSigned) { bool Overflow; @@ -95,6 +94,14 @@ static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, assert(C1.getBitWidth() == C2.getBitWidth() && "Inconsistent width of constants!"); + // Bail if we will divide by zero. + if (C2.isMinValue()) + return false; + + // Bail if we would divide INT_MIN by -1. + if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue()) + return false; + APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned); if (IsSigned) APInt::sdivrem(C1, C2, Quotient, Remainder); @@ -629,7 +636,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // if pattern detected emit alternate sequence if (OpX && OpY) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(Log2->getFastMathFlags()); + Builder->setFastMathFlags(Log2->getFastMathFlags()); Log2->setArgOperand(0, OpY); Value *FMulVal = Builder->CreateFMul(OpX, Log2); Value *FSub = Builder->CreateFSub(FMulVal, OpX); @@ -645,7 +652,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool IgnoreZeroSign = I.hasNoSignedZeros(); if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(I.getFastMathFlags()); + Builder->setFastMathFlags(I.getFastMathFlags()); Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); @@ -686,7 +693,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Y) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(I.getFastMathFlags()); + Builder->setFastMathFlags(I.getFastMathFlags()); Value *T = Builder->CreateFMul(Opnd1, Opnd1); Value *R = Builder->CreateFMul(T, Y); @@ -705,8 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select -/// instruction. +/// Try to fold a divide or remainder of a select instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { SelectInst *SI = cast<SelectInst>(I.getOperand(1)); @@ -740,7 +746,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { return true; // Scan the current block backward, looking for other uses of SI. 
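The early bail-outs added to IsMultiple above refuse the two division inputs APInt::sdivrem/udivrem must never see: a zero divisor, and the signed pair INT_MIN / -1, whose quotient needs one bit more than the operand width. A sketch of the overflow case on i8:

#include <cassert>
#include <cstdint>

int main() {
  // i8's INT_MIN / -1 is +128, one more bit than i8 can represent, and a
  // zero divisor has no quotient at all -- the two cases IsMultiple refuses.
  int WideQuotient = int(INT8_MIN) / -1;
  assert(WideQuotient == 128);     // representable once widened...
  assert(WideQuotient > INT8_MAX); // ...but it overflows the original i8
  return 0;
}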
- BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); + BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin(); while (BBI != BBFront) { --BBI; @@ -754,10 +760,10 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { I != E; ++I) { if (*I == SI) { *I = SI->getOperand(NonNullOperand); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } else if (*I == SelectCond) { *I = Builder->getInt1(NonNullOperand == 1); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 460f6eb..f1aa98b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "instcombine" @@ -245,7 +246,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// non-address-taken alloca. Doing so will cause us to not promote the alloca /// to a register. static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { - BasicBlock::iterator BBI = L, E = L->getParent()->end(); + BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); for (++BBI; BBI != E; ++BBI) if (BBI->mayWriteToMemory()) @@ -349,24 +350,40 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { Value *InVal = FirstLI->getOperand(0); NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment); + + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + }; - // Add all operands to the new PHI. + for (unsigned ID : KnownIDs) + NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + + // Add all operands to the new PHI and combine TBAA metadata. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { - Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); + LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); + combineMetadata(NewLI, LI, KnownIDs); + Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } - Value *PhiVal; if (InVal) { // The new PHI unions all of the same values together. This is really // common, so we handle it intelligently here for compile-time speed. - PhiVal = InVal; + NewLI->setOperand(0, InVal); delete NewPN; } else { InsertNewInstBefore(NewPN, PN); - PhiVal = NewPN; } // If this was a volatile load that we are merging, make sure to loop through @@ -376,17 +393,94 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (Value *IncValue : PN.incoming_values()) cast<LoadInst>(IncValue)->setVolatile(false); - LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); return NewLI; } +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. 
+Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = Phi.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + // Early exit for the common case of a phi with two operands. These are + // handled elsewhere. See the comment below where we check the count of zexts + // and constants for more details. + unsigned NumIncomingValues = Phi.getNumIncomingValues(); + if (NumIncomingValues < 3) + return nullptr; + // Find the narrower type specified by the first zext. + Type *NarrowType = nullptr; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + NarrowType = Zext->getSrcTy(); + break; + } + } + if (!NarrowType) + return nullptr; + + // Walk the phi operands checking that we only have zexts or constants that + // we can shrink for free. Store the new operands for the new phi. + SmallVector<Value *, 4> NewIncoming; + unsigned NumZexts = 0; + unsigned NumConsts = 0; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + // All zexts must be identical and have one use. + if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + return nullptr; + NewIncoming.push_back(Zext->getOperand(0)); + NumZexts++; + } else if (auto *C = dyn_cast<Constant>(V)) { + // Make sure that constants can fit in the new type. + Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); + if (ConstantExpr::getZExt(Trunc, C->getType()) != C) + return nullptr; + NewIncoming.push_back(Trunc); + NumConsts++; + } else { + // If it's not a cast or a constant, bail out. + return nullptr; + } + } + + // The more common cases of a phi with no constant operands or just one + // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi() + // respectively. FoldOpIntoPhi() wants to do the opposite transform that is + // performed here. It tries to replicate a cast in the phi operand's basic + // block to expose other folding opportunities. Thus, InstCombine will + // infinite loop without this check. + if (NumConsts == 0 || NumZexts < 2) + return nullptr; + + // All incoming values are zexts or constants that are safe to truncate. + // Create a new phi node of the narrow type, phi together all of the new + // operands, and zext the result back to the original type. + PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, + Phi.getName() + ".shrunk"); + for (unsigned i = 0; i != NumIncomingValues; ++i) + NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + + InsertNewInstBefore(NewPhi, Phi); + return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +} /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = PN.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); if (isa<GetElementPtrInst>(FirstInst)) @@ -740,7 +834,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } // Otherwise, do an extract in the predecessor. 
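FoldPHIArgZextsIntoPHI above narrows a phi whose incoming values are all one-use zexts from a common source type, or constants that survive a trunc/zext round-trip, into a narrow phi followed by a single zext. The two facts it relies on, sketched on i8 -> i32 with a select standing in for the phi merge (values here are made up for illustration):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t A = 200, B = 7;
  for (bool Cond : {false, true}) {
    // Widening commutes with the merge point.
    uint32_t ZextOfPhi = uint32_t(Cond ? A : B);
    uint32_t PhiOfZexts = Cond ? uint32_t(A) : uint32_t(B);
    assert(ZextOfPhi == PhiOfZexts);
  }
  // A constant incoming value is legal only if it survives the
  // ConstantExpr::getTrunc/getZExt round-trip performed above.
  assert(uint32_t(uint8_t(200)) == 200u); // fits i8: may be shrunk
  assert(uint32_t(uint8_t(300)) != 300u); // does not fit: phi is rejected
  return 0;
}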
- Builder->SetInsertPoint(Pred, Pred->getTerminator()); + Builder->SetInsertPoint(Pred->getTerminator()); Value *Res = InVal; if (Offset) Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(), @@ -787,6 +881,9 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC)) return ReplaceInstUsesWith(PN, V); + if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) + return Result; + // If all PHI operands are the same operation, pull them through the PHI, // reducing code size. if (isa<Instruction>(PN.getIncomingValue(0)) && diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f51442a..51219bc 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,7 +38,8 @@ getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) { } } -static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { +static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF, + bool Ordered=false) { switch (SPF) { default: llvm_unreachable("unhandled!"); @@ -51,17 +52,22 @@ static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { return ICmpInst::ICMP_SGT; case SPF_UMAX: return ICmpInst::ICMP_UGT; + case SPF_FMINNUM: + return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; + case SPF_FMAXNUM: + return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; } } static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder, SelectPatternFlavor SPF, Value *A, Value *B) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF); + assert(CmpInst::isIntPredicate(Pred)); return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B); } -/// GetSelectFoldableOperands - We want to turn code that looks like this: +/// We want to turn code that looks like this: /// %C = or %A, %B /// %D = select %cond, %C, %A /// into: @@ -90,8 +96,8 @@ static unsigned GetSelectFoldableOperands(Instruction *I) { } } -/// GetSelectFoldableConstant - For the same transformation as the previous -/// function, return the identity constant that goes into the select. +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select. static Constant *GetSelectFoldableConstant(Instruction *I) { switch (I->getOpcode()) { default: llvm_unreachable("This cannot happen!"); @@ -110,7 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) { } } -/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI +/// Here we have (select c, TI, FI), and we know that TI and FI /// have the same opcode and only one use each. Try to simplify this. Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI) { @@ -197,8 +203,8 @@ static bool isSelect01(Constant *C1, Constant *C2) { C2I->isOne() || C2I->isAllOnesValue(); } -/// FoldSelectIntoOp - Try fold the select into one of the operands to -/// facilitate further optimization. +/// Try to fold the select into one of the operands to allow further +/// optimization. 
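getCmpPredicateForMinMax above grows an Ordered flag because the float min/max flavors split into ordered (OLT/OGT) and unordered (ULT/UGT) compares, which select different operands once a NaN is involved. A standalone sketch of that difference:

#include <cassert>
#include <cmath>

int main() {
  double A = NAN, B = 1.0;
  bool OLT = (A < B);   // FCMP_OLT: ordered, false when either input is NaN
  bool ULT = !(A >= B); // FCMP_ULT: unordered, true when either input is NaN
  double SelOrdered = OLT ? A : B;
  double SelUnordered = ULT ? A : B;
  assert(SelOrdered == 1.0);        // the ordered form falls through to B
  assert(std::isnan(SelUnordered)); // the unordered form returns the NaN
  return 0;
}

Hence SPF_FMINNUM maps to FCMP_OLT only when the matched pattern was the ordered form, and to FCMP_ULT otherwise.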
Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the @@ -276,7 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, return nullptr; } -/// foldSelectICmpAndOr - We want to turn: +/// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: /// (or (shl (and X, C1), C3), y) @@ -394,9 +400,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, return nullptr; } -/// visitSelectInstWithICmp - Visit a SelectInst that has an -/// ICmpInst as its first operand. -/// +/// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { bool Changed = false; @@ -595,10 +599,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } -/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a -/// PHI node (but the two may be in different blocks). See if the true/false -/// values (V) are live in all of the predecessor blocks of the PHI. For -/// example, cases like this cannot be mapped: +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. For example, cases like this can't be mapped: /// /// X = phi [ C1, BB1], [C2, BB2] /// Y = add @@ -632,7 +635,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, return false; } -/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, @@ -745,10 +748,10 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return nullptr; } -/// foldSelectICmpAnd - If one of the constants is zero (we know they can't -/// both be) and we have an icmp instruction with zero, and we have an 'and' -/// with the non-constant value and a power of two we can turn the select -/// into a shift on the result of the 'and'. +/// If one of the constants is zero (we know they can't both be) and we have an +/// icmp instruction with zero, and we have an 'and' with the non-constant value +/// and a power of two we can turn the select into a shift on the result of the +/// 'and'. static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *FalseVal, InstCombiner::BuilderTy *Builder) { @@ -926,6 +929,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->setFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal, FCI->getName() + ".inv"); @@ -967,6 +972,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? 
X : Y if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->setFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal, FCI->getName() + ".inv"); @@ -1054,35 +1061,50 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } // See if we can fold the select into one of our operands. - if (SI.getType()->isIntOrIntVectorTy()) { + if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; Value *LHS, *RHS, *LHS2, *RHS2; Instruction::CastOps CastOp; - SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); + SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); + auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); - Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered); + + Value *Cmp; + if (CmpInst::isIntPredicate(Pred)) { + Cmp = Builder->CreateICmp(Pred, LHS, RHS); + } else { + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + Builder->setFastMathFlags(FMF); + Cmp = Builder->CreateFCmp(Pred, LHS, RHS); + } + Value *NewSI = Builder->CreateCast(CastOp, Builder->CreateSelect(Cmp, LHS, RHS), SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a - if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; - if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, SI, SPF, LHS)) return R; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index d04ed58..0c7defa 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -55,7 +55,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return nullptr; } -/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// See if we can compute the specified value, but shifted /// logically to the left or right by some number of bits. This should return /// true if the expression can be computed for the same cost as the current /// expression tree. This is used to eliminate extraneous shifting from things @@ -184,7 +184,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, } } -/// GetShiftedValue - When CanEvaluateShifted returned true for an expression, +/// When CanEvaluateShifted returned true for an expression, /// this value inserts the new computation that produces the shifted value. 
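The FoldSPFofSPF rules referenced above (MAX(MAX(a,b),a) -> MAX(a,b) and friends) are plain lattice identities; a small exhaustive check in C++:

#include <algorithm>
#include <cassert>

int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B) {
      assert(std::max(std::max(A, B), A) == std::max(A, B)); // MAX(MAX(a,b),a)
      assert(std::min(std::min(A, B), A) == std::min(A, B)); // MIN(MIN(a,b),a)
      assert(std::max(std::min(A, B), A) == A);              // MAX(MIN(a,b),a)
      assert(std::min(std::max(A, B), A) == A);              // MIN(MAX(a,b),a)
    }
  return 0;
}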
static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, InstCombiner &IC, const DataLayout &DL) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 80628b2..743d514 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -410,9 +410,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If this is a select as part of a min/max pattern, don't simplify any // further in case we break the structure. Value *LHS, *RHS; - if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN) + if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; - + if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero, RHSKnownOne, Depth + 1) || SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, @@ -1057,7 +1057,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts); if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) { for (unsigned i = 0; i < VWidth; i++) { - if (CV->getAggregateElement(i)->isNullValue()) + Constant *CElt = CV->getAggregateElement(i); + // Method isNullValue always returns false when called on a + // ConstantExpr. If CElt is a ConstantExpr then skip it in order to + // to avoid propagating incorrect information. + if (isa<ConstantExpr>(CElt)) + continue; + if (CElt->isNullValue()) LeftDemanded.clearBit(i); else RightDemanded.clearBit(i); @@ -1082,6 +1088,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); unsigned Ratio; if (VWidth == InVWidth) { @@ -1089,29 +1096,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // elements as are demanded of us. Ratio = 1; InputDemandedElts = DemandedElts; - } else if (VWidth > InVWidth) { - // Untested so far. - break; - - // If there are more elements in the result than there are in the source, - // then an input element is live if any of the corresponding output - // elements are live. - Ratio = VWidth/InVWidth; - for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) if (DemandedElts[OutIdx]) - InputDemandedElts.setBit(OutIdx/Ratio); - } - } else { - // Untested so far. - break; - - // If there are more elements in the source than there are in the result, - // then an input element is live if the corresponding output element is - // live. - Ratio = InVWidth/VWidth; + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. 
+ Ratio = InVWidth / VWidth; for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (DemandedElts[InIdx/Ratio]) + if (DemandedElts[InIdx / Ratio]) InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; } // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1125,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, MadeChange = true; } - UndefElts = UndefElts2; - if (VWidth > InVWidth) { - llvm_unreachable("Unimp"); - // If there are more elements in the result than there are in the source, - // then an output element is undef if the corresponding input element is - // undef. + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. } break; } @@ -1237,6 +1242,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; break; + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); + break; } break; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 2730472..5cde31a 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -22,10 +22,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation. isConstant indicates whether we're -/// extracting one known element. If false we're extracting a variable index. -static bool CheapToScalarize(Value *V, bool isConstant) { +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. isConstant indicates whether we're extracting one known +/// element. If false we're extracting a variable index. 
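The rewritten bitcast case above replaces the two untested VWidth > InVWidth / VWidth < InVWidth branches with mappings that work whenever one element count divides the other. A standalone model of the demanded-element direction, with std::vector<bool> standing in for APInt (invented names, not LLVM code):

#include <vector>

std::vector<bool> demandedInputElts(const std::vector<bool> &DemandedOut,
                                    unsigned InVWidth) {
  unsigned VWidth = DemandedOut.size();
  std::vector<bool> DemandedIn(InVWidth, false);
  if (VWidth % InVWidth == 0) {
    // Wider output: an input element is live if any of the Ratio output
    // elements it expands into is live.
    unsigned Ratio = VWidth / InVWidth;
    for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
      if (DemandedOut[OutIdx])
        DemandedIn[OutIdx / Ratio] = true;
  } else if (InVWidth % VWidth == 0) {
    // Narrower output: an input element is live if the single output
    // element it contributes to is live.
    unsigned Ratio = InVWidth / VWidth;
    for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
      if (DemandedOut[InIdx / Ratio])
        DemandedIn[InIdx] = true;
  }
  return DemandedIn;
}

The undef-element direction runs the same ratios in reverse: a wide output element is undef if the input element it came from is undef, while a narrow output element is undef only when all Ratio input elements feeding it are undef, which is what the countPopulation() == Ratio test above checks.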
+static bool cheapToScalarize(Value *V, bool isConstant) { if (Constant *C = dyn_cast<Constant>(V)) { if (isConstant) return true; @@ -50,13 +50,13 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return true; if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) if (BO->hasOneUse() && - (CheapToScalarize(BO->getOperand(0), isConstant) || - CheapToScalarize(BO->getOperand(1), isConstant))) + (cheapToScalarize(BO->getOperand(0), isConstant) || + cheapToScalarize(BO->getOperand(1), isConstant))) return true; if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->hasOneUse() && - (CheapToScalarize(CI->getOperand(0), isConstant) || - CheapToScalarize(CI->getOperand(1), isConstant))) + (cheapToScalarize(CI->getOperand(0), isConstant) || + cheapToScalarize(CI->getOperand(1), isConstant))) return true; return false; @@ -82,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true)) return nullptr; // Create a scalar PHI node that will replace the vector PHI node @@ -115,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; if (pos && !isa<PHINode>(pos)) { - InsertPos = pos; - ++InsertPos; + InsertPos = ++pos->getIterator(); } else { InsertPos = inBB->getFirstInsertionPt(); } @@ -137,7 +136,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) - if (CheapToScalarize(C, false)) + if (cheapToScalarize(C, false)) return ReplaceInstUsesWith(EI, C->getAggregateElement(0U)); // If extracting a specified index from the vector, see if we can recursively @@ -163,7 +162,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } - // If the this extractelement is directly using a bitcast from a vector of + // If this extractelement is directly using a bitcast from a vector of // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { @@ -184,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { // Push extractelement into predecessor operation if legal and - // profitable to do so + // profitable to do so. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { if (I->hasOneUse() && - CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { + cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { Value *newEI0 = Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1), EI.getName()+".lhs"); @@ -230,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { SrcIdx, false)); } } else if (CastInst *CI = dyn_cast<CastInst>(I)) { - // Canonicalize extractelement(cast) -> cast(extractelement) - // bitcasts can change the number of vector elements and they cost nothing + // Canonicalize extractelement(cast) -> cast(extractelement). 
+ // Bitcasts can change the number of vector elements, and they cost + // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { Value *EE = Builder->CreateExtractElement(CI->getOperand(0), EI.getIndexOperand()); @@ -245,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // fight the vectorizer. // If we are extracting an element from a vector select or a select on - // vectors, a select on the scalars extracted from the vector arguments. + // vectors, create a select on the scalars extracted from the vector + // arguments. Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); @@ -275,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; } -/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns -/// elements from either LHS or RHS, return the shuffle mask and true. -/// Otherwise, return false. -static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. +static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(LHS->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); @@ -315,7 +315,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); return true; @@ -330,7 +330,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. if (EI->getOperand(0) == LHS) { Mask[InsertedIdx % NumElts] = @@ -352,6 +352,58 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, return false; } +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, + ExtractElementInst *ExtElt, + InstCombiner &IC) { + VectorType *InsVecType = InsElt->getType(); + VectorType *ExtVecType = ExtElt->getVectorOperandType(); + unsigned NumInsElts = InsVecType->getVectorNumElements(); + unsigned NumExtElts = ExtVecType->getVectorNumElements(); + + // The inserted-to vector must be wider than the extracted-from vector. + if (InsVecType->getElementType() != ExtVecType->getElementType() || + NumExtElts >= NumInsElts) + return; + + // Create a shuffle mask to widen the extended-from vector using undefined + // values. The mask selects all of the values of the original vector followed + // by as many undefined values as needed to create a vector of the same length + // as the inserted-to vector. 
+ SmallVector<Constant *, 16> ExtendMask; + IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); + for (unsigned i = 0; i < NumExtElts; ++i) + ExtendMask.push_back(ConstantInt::get(IntType, i)); + for (unsigned i = NumExtElts; i < NumInsElts; ++i) + ExtendMask.push_back(UndefValue::get(IntType)); + + Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), + ConstantVector::get(ExtendMask)); + + // Insert the new shuffle after the vector operand of the extract is defined + // (as long as it's not a PHI) or at the start of the basic block of the + // extract, so any subsequent extracts in the same basic block can use it. + // TODO: Insert before the earliest ExtractElementInst that is replaced. + auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); + if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) + WideVec->insertAfter(ExtVecOpInst); + else + IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt()); + + // Replace extracts from the original narrow vector with extracts from the new + // wide vector. + for (User *U : ExtVecOp->users()) { + ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U); + if (!OldExt || OldExt->getParent() != WideVec->getParent()) + continue; + auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); + NewExt->insertAfter(WideVec); + IC.ReplaceInstUsesWith(*OldExt, NewExt); + } +} /// We are building a shuffle to create V, which is a sequence of insertelement, /// extractelement pairs. If PermittedRHS is set, then we must either use it or @@ -363,9 +415,10 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// often been chosen carefully to be efficiently implementable on the target. typedef std::pair<Value *, Value *> ShuffleOps; -static ShuffleOps CollectShuffleElements(Value *V, +static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<Constant *> &Mask, - Value *PermittedRHS) { + Value *PermittedRHS, + InstCombiner &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -396,10 +449,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // otherwise we'd end up with a shuffle of three inputs. if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { Value *RHS = EI->getOperand(0); - ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC); assert(LR.second == nullptr || LR.second == RHS); if (LR.first->getType() != RHS->getType()) { + // Although we are giving up for now, see if we can create extracts + // that match the inserts for another round of combining. + replaceExtractElements(IEI, EI, IC); + // We tried our best, but we can't find anything compatible with RHS // further up the chain. Return a trivial shuffle. for (unsigned i = 0; i < NumElts; ++i) @@ -429,14 +486,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // If this insertelement is a chain that comes from exactly these two // vectors, return the vector and the effective shuffle. if (EI->getOperand(0)->getType() == PermittedRHS->getType() && - CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, Mask)) return std::make_pair(EI->getOperand(0), PermittedRHS); } } } - // Otherwise, can't do anything fancy. Return an identity vector. + // Otherwise, we can't do anything fancy. Return an identity vector. 
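The mask that replaceExtractElements builds is just the first NumExtElts lane indices followed by undef padding up to NumInsElts lanes. The same construction in a standalone sketch, using the -1 convention for an undef lane (an invented helper, not LLVM code):

#include <vector>

// For NumExtElts = 2 and NumInsElts = 4 this yields {0, 1, -1, -1}.
std::vector<int> widenMask(unsigned NumExtElts, unsigned NumInsElts) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    Mask.push_back(static_cast<int>(i));  // keep the original lanes
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    Mask.push_back(-1);                   // pad with undef lanes
  return Mask;
}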
for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); return std::make_pair(V, nullptr); @@ -512,7 +569,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // (and any insertelements it points to), into one big shuffle. if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { SmallVector<Constant*, 16> Mask; - ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); // The proposed shuffle may be trivial, in which case we shouldn't // perform the combine. @@ -588,8 +645,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::GetElementPtr: { - for (int i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + for (Value *Operand : I->operands()) { + if (!CanEvaluateShuffled(Operand, Mask, Depth-1)) return false; } return true; @@ -617,7 +674,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, /// Rebuild a new instruction just like 'I' but with the new operands given. /// In the event of type mismatch, the type of the operands is correct. -static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { // We don't want to use the IRBuilder here because we want the replacement // instructions to appear next to 'I', not the builder's insertion point. switch (I->getOpcode()) { @@ -760,7 +817,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { NeedsRebuild |= (V != I->getOperand(i)); } if (NeedsRebuild) { - return BuildNew(I, NewOps); + return buildNew(I, NewOps); } return I; } @@ -792,7 +849,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, bool &isLHSID, bool &isRHSID) { isLHSID = isRHSID = true; @@ -891,7 +948,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (VWidth == LHSWidth) { // Analyze the shuffle: are the LHS or RHS identity shuffles? bool isLHSID, isRHSID; - RecognizeIdentityMask(Mask, isLHSID, isRHSID); + recognizeIdentityMask(Mask, isLHSID, isRHSID); // Eliminate identity shuffles. if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -1177,7 +1234,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If the result mask is an identity, replace uses of this instruction with // the corresponding argument.
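recognizeIdentityMask treats a shuffle as an LHS identity when every defined lane i reads element i, and as an RHS identity when every defined lane i reads element i + NumElts; an undef lane is compatible with both. A standalone model, assuming both operands have the same element count as the result (invented names, not LLVM code):

#include <vector>

void recognizeIdentity(const std::vector<int> &Mask,
                       bool &IsLHSID, bool &IsRHSID) {
  IsLHSID = IsRHSID = true;
  int N = static_cast<int>(Mask.size());
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue;                    // undef lane matches either operand
    IsLHSID &= (Mask[i] == i);     // lane i of the first operand
    IsRHSID &= (Mask[i] == i + N); // lane i of the second operand
  }
}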
bool isLHSID, isRHSID; - RecognizeIdentityMask(newMask, isLHSID, isRHSID); + recognizeIdentityMask(newMask, isLHSID, isRHSID); if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fd34a24..903a0b5 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,8 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -79,14 +80,12 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, DL, GEP); } -/// ShouldChangeType - Return true if it is desirable to convert a computation -/// from 'From' to 'To'. We don't want to convert from a legal to an illegal -/// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { - assert(From->isIntegerTy() && To->isIntegerTy()); - - unsigned FromWidth = From->getPrimitiveSizeInBits(); - unsigned ToWidth = To->getPrimitiveSizeInBits(); +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = DL.isLegalInteger(FromWidth); bool ToLegal = DL.isLegalInteger(ToWidth); @@ -103,6 +102,17 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { return true; } +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { + assert(From->isIntegerTy() && To->isIntegerTy()); + + unsigned FromWidth = From->getPrimitiveSizeInBits(); + unsigned ToWidth = To->getPrimitiveSizeInBits(); + return ShouldChangeType(FromWidth, ToWidth); +} + // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does @@ -156,27 +166,26 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { I.setFastMathFlags(FMF); } -/// SimplifyAssociativeOrCommutative - This performs a few simplifications for -/// operators which are associative or commutative: -// -// Commutative operators: -// -// 1. Order operands such that they are listed from right (least complex) to -// left (most complex). This puts constants before unary operators before -// binary operators. -// -// Associative operators: -// -// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. -// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. -// -// Associative and commutative operators: -// -// 4. 
Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. -// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. -// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" -// if C1 and C2 are constants. -// +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +/// Commutative operators: +/// +/// 1. Order operands such that they are listed from right (least complex) to +/// left (most complex). This puts constants before unary operators before +/// binary operators. +/// +/// Associative operators: +/// +/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +/// Associative and commutative operators: +/// +/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +/// if C1 and C2 are constants. bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -322,7 +331,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } while (1); } -/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to +/// Return whether "X LOp (Y ROp Z)" is always equal to /// "(X LOp Y) ROp (X LOp Z)". static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -361,7 +370,7 @@ static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, } } -/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to +/// Return whether "(X LOp Y) ROp Z" is always equal to /// "(X ROp Z) LOp (Y ROp Z)". static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -519,7 +528,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (isa<OverflowingBinaryOperator>(Op1)) HasNSW &= Op1->hasNoSignedWrap(); - // We can propogate 'nsw' if we know that + // We can propagate 'nsw' if we know that // %Y = mul nsw i16 %X, C // %Z = add nsw i16 %Y, %X // => @@ -537,11 +546,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, return SimplifiedInst; } -/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations -/// which some other binary operation distributes over either by factorizing -/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this -/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is -/// a win). Returns the simplified value, or null if it didn't simplify. +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify. 
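As a quick numeric check of the two rewrite directions described above, under ordinary C++ integer arithmetic (an invented example, not part of the patch):

#include <cassert>

int main() {
  int A = 7, B = 3, C = 5;
  // Factorize a common term: (A*B)+(A*C) -> A*(B+C).
  assert(A * B + A * C == A * (B + C));
  // Expand when it simplifies: A & (B | C) -> (A&B) | (A&C).
  assert((A & (B | C)) == ((A & B) | (A & C)));
  return 0;
}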
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); @@ -623,12 +632,38 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } + // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) + // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d))) + if (auto *SI0 = dyn_cast<SelectInst>(LHS)) { + if (auto *SI1 = dyn_cast<SelectInst>(RHS)) { + if (SI0->getCondition() == SI1->getCondition()) { + Value *SI = nullptr; + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect(SI0->getCondition(), + Builder->CreateBinOp(TopLevelOpcode, + SI0->getTrueValue(), + SI1->getTrueValue()), + V); + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(), + SI1->getTrueValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect( + SI0->getCondition(), V, + Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue())); + if (SI) { + SI->takeName(&I); + return SI; + } + } + } + } + return nullptr; } -// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction -// if the LHS is a constant zero (which is the 'negate' form). -// +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form). Value *InstCombiner::dyn_castNegVal(Value *V) const { if (BinaryOperator::isNeg(V)) return BinaryOperator::getNegArgument(V); @@ -644,10 +679,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { return nullptr; } -// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the -// instruction if the LHS is a constant negative zero (which is the 'negate' -// form). -// +/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is +/// a constant negative zero (which is the 'negate' form). Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); @@ -700,10 +733,10 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, llvm_unreachable("Unknown binary instruction type!"); } -// FoldOpIntoSelect - Given an instruction with a select as one operand and a -// constant as the other operand, try to fold the binary operator into the -// select arguments. This also works for Cast instructions, which obviously do -// not have a second operand. +/// Given an instruction with a select as one operand and a constant as the +/// other operand, try to fold the binary operator into the select arguments. +/// This also works for Cast instructions, which obviously do not have a second +/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions if (!SI->hasOneUse()) return nullptr; @@ -752,10 +785,9 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } -/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which -/// has a PHI node as operand #0, see if we can fold the instruction into the -/// PHI (which is only possible if all operands to the PHI are constants). 
-/// +/// Given a binary operator, cast instruction, or select which has a PHI node as +/// operand #0, see if we can fold the instruction into the PHI (which is only +/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -819,7 +851,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { NewPN->takeName(PN); // If we are going to have to insert a new computation, do so right before the - // predecessors terminator. + // predecessor's terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); @@ -893,10 +925,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a pointer type and a constant offset, determine -/// whether or not there is a sequence of GEP indices into the pointed type that -/// will land us at the specified offset. If so, fill them into NewIndices and -/// return the resultant element type, otherwise return null. +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null. Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices) { Type *Ty = PtrTy->getElementType(); @@ -965,8 +997,8 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { return true; } -/// Descale - Return a value X such that Val = X * Scale, or null if none. If -/// the multiplication is known not to overflow then NoSignedWrap is set. +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set. Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); assert(cast<IntegerType>(Val->getType())->getBitWidth() == @@ -1008,11 +1040,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // 0'th operand of Val. std::pair<Instruction*, unsigned> Parent; - // RequireNoSignedWrap - Set if the transform requires a descaling at deeper - // levels that doesn't overflow. + // Set if the transform requires a descaling at deeper levels that doesn't + // overflow. bool RequireNoSignedWrap = false; - // logScale - log base 2 of the scale. Negative if not a power of 2. + // Log base 2 of the scale. Negative if not a power of 2. int32_t logScale = Scale.exactLogBase2(); for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down @@ -1213,16 +1245,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { /// specified one but with other operands. 
static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, InstCombiner::BuilderTy *B) { - Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); - if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { - if (isa<OverflowingBinaryOperator>(NewBO)) { - NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); - NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); - } - if (isa<PossiblyExactOperator>(NewBO)) - NewBO->setIsExact(Inst.isExact()); - } - return BORes; + Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + // If LHS and RHS are constant, BO won't be a binary operator. + if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO)) + NewBO->copyIRFlags(&Inst); + return BO; } /// \brief Makes transformation of binary operation specific for vector types. @@ -1256,9 +1283,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { LShuf->getMask() == RShuf->getMask()) { Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), RShuf->getOperand(0), Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask()); - return Res; } } @@ -1294,18 +1320,11 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { } if (MayChange) { Constant *C2 = ConstantVector::get(C2M); - Value *NewLHS, *NewRHS; - if (isa<Constant>(LHS)) { - NewLHS = C2; - NewRHS = Shuffle->getOperand(0); - } else { - NewLHS = Shuffle->getOperand(0); - NewRHS = C2; - } + Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0); + Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2; Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(Inst.getType()), Shuffle->getMask()); - return Res; } } @@ -1323,7 +1342,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; - Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); + Type *IntPtrTy = + DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; @@ -1333,21 +1353,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!SeqTy) continue; + // Index type should have the same width as IntPtr + Type *IndexTy = (*I)->getType(); + Type *NewIndexType = IndexTy->isVectorTy() ? + VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy; + // If the element type has zero size then any index over it is equivalent // to an index of zero, so replace it with zero if it is not zero already. if (SeqTy->getElementType()->isSized() && DL.getTypeAllocSize(SeqTy->getElementType()) == 0) if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { - *I = Constant::getNullValue(IntPtrTy); + *I = Constant::getNullValue(NewIndexType); MadeChange = true; } - Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy) { + if (IndexTy != NewIndexType) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. 
- *I = Builder->CreateIntCast(*I, IntPtrTy, true); + *I = Builder->CreateIntCast(*I, NewIndexType, true); MadeChange = true; } } @@ -1421,8 +1445,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. + if (DI != -1 && !PN->hasOneUse()) + return nullptr; + GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. @@ -1432,11 +1461,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to // set that index. - Instruction *InsertPt = Builder->GetInsertPoint(); - Builder->SetInsertPoint(PN); - PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), - PN->getNumOperands()); - Builder->SetInsertPoint(InsertPt); + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(*Builder); + Builder->SetInsertPoint(PN); + NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } for (auto &I : PN->operands()) NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), @@ -1790,7 +1821,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI, I); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); ReplaceInstUsesWith(*BCI, I); } return &GEP; @@ -1931,7 +1962,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG - Module *M = II->getParent()->getParent()->getParent(); + Module *M = II->getModule(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), None, "", II->getParent()); @@ -2280,9 +2311,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } if (LoadInst *L = dyn_cast<LoadInst>(Agg)) // If the (non-volatile) load only has one use, we can rewrite this to a - // load from a GEP. This reduces the size of the load. - // FIXME: If a load is used only by extractvalue instructions then this - // could be done regardless of having multiple uses. + // load from a GEP. This reduces the size of the load. If a load is used + // only by extractvalue instructions then this either must have been + // optimized before, or it is a struct with padding, in which case we + // don't want to do the transformation as it loses padding knowledge. if (L->isSimple() && L->hasOneUse()) { // extractvalue has integer indices, getelementptr has Value*s. Convert. SmallVector<Value*, 4> Indices; @@ -2294,7 +2326,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. 
- Builder->SetInsertPoint(L->getParent(), L); + Builder->SetInsertPoint(L); Value *GEP = Builder->CreateInBoundsGEP(L->getType(), L->getPointerOperand(), Indices); // Returning the load directly will cause the main loop to insert it in @@ -2312,7 +2344,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return nullptr; } -/// isCatchAll - Return 'true' if the given typeinfo will match anything. +/// Return 'true' if the given typeinfo will match anything. static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { switch (Personality) { case EHPersonality::GNU_C: @@ -2330,6 +2362,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { case EHPersonality::MSVC_X86SEH: case EHPersonality::MSVC_Win64SEH: case EHPersonality::MSVC_CXX: + case EHPersonality::CoreCLR: return TypeInfo->isNullValue(); } llvm_unreachable("invalid enum"); @@ -2441,10 +2474,24 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { SawCatchAll = true; break; } - if (AlreadyCaught.count(TypeInfo)) - // Already caught by an earlier clause, so having it in the filter - // is pointless. - continue; + + // Even if we've seen a type in a catch clause, we don't want to + // remove it from the filter. An unexpected type handler may be + // set up for a call site which throws an exception of the same + // type caught. In order for the exception thrown by the unexpected + // handler to propagate correctly, the filter must be correctly + // described for the call site. + // + // Example: + // + // void unexpected() { throw 1;} + // void foo() throw (int) { + // std::set_unexpected(unexpected); + // try { + // throw 2.0; + // } catch (int i) {} + // } + // There is no point in having multiple copies of the same typeinfo in // a filter, so only add it if we didn't already. if (SeenInFilter.insert(TypeInfo).second) @@ -2637,15 +2684,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } -/// TryToSinkInstruction - Try to move the specified instruction from its -/// current block into the beginning of DestBlock, which can only happen if it's -/// safe to move the instruction past all of the instructions between it and the -/// end of its block. +/// Try to move the specified instruction from its current block into the +/// beginning of DestBlock, which can only happen if it's safe to move the +/// instruction past all of the instructions between it and the end of its +/// block. static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { assert(I->hasOneUse() && "Invariants didn't hold!"); // Cannot move control-flow-involving, volatile loads, vaarg, etc. - if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() || + if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || isa<TerminatorInst>(I)) return false; @@ -2654,17 +2701,24 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { &DestBlock->getParent()->getEntryBlock()) return false; + // Do not sink convergent call instructions. + if (auto *CI = dyn_cast<CallInst>(I)) { + if (CI->isConvergent()) + return false; + } + // We can only sink load instructions if there is nothing between the load and // the end of the block that could change the value.
if (I->mayReadFromMemory()) { - for (BasicBlock::iterator Scan = I, E = I->getParent()->end(); + for (BasicBlock::iterator Scan = I->getIterator(), + E = I->getParent()->end(); Scan != E; ++Scan) if (Scan->mayWriteToMemory()) return false; } BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); - I->moveBefore(InsertPos); + I->moveBefore(&*InsertPos); ++NumSunkInst; return true; } @@ -2698,6 +2752,27 @@ bool InstCombiner::run() { } } + // In general, it is possible for computeKnownBits to determine all bits in a + // value even when the operands are not all constants. + if (!I->use_empty() && I->getType()->isIntegerTy()) { + unsigned BitWidth = I->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I); + if ((KnownZero | KnownOne).isAllOnesValue()) { + Constant *C = ConstantInt::get(I->getContext(), KnownOne); + DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << + " from: " << *I << '\n'); + + // Add operands to the worklist. + ReplaceInstUsesWith(*I, C); + ++NumConstProp; + EraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + } + // See if we can trivially sink this instruction to a successor basic block. if (I->hasOneUse()) { BasicBlock *BB = I->getParent(); @@ -2738,7 +2813,7 @@ bool InstCombiner::run() { } // Now that we have an instruction, try combining it to simplify it. - Builder->SetInsertPoint(I->getParent(), I); + Builder->SetInsertPoint(I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); #ifndef NDEBUG @@ -2768,7 +2843,7 @@ bool InstCombiner::run() { // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); - BasicBlock::iterator InsertPos = I; + BasicBlock::iterator InsertPos = I->getIterator(); // If we replace a PHI with something that isn't a PHI, fix up the // insertion point. @@ -2801,8 +2876,8 @@ bool InstCombiner::run() { return MadeIRChange; } -/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding -/// all reachable code to the worklist. +/// Walk the function in depth-first order, adding all reachable code to the +/// worklist. /// /// This has a couple of tricks to make the code faster and more powerful. In /// particular, we constant fold and DCE instructions as we go, to avoid adding @@ -2829,7 +2904,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst, TLI)) { @@ -2900,8 +2975,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, } } - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - Worklist.push_back(TI->getSuccessor(i)); + for (BasicBlock *SuccBB : TI->successors()) + Worklist.push_back(SuccBB); } while (!Worklist.empty()); // Once we've found all of the instructions to add to instcombine's worklist, @@ -2909,8 +2984,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, // of the function down. This jives well with the way that it adds all uses // of instructions to the worklist after doing a transformation, thus avoiding // some N^2 behavior in pathological cases. 
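The new block at the top of run() constant-folds an instruction whenever computeKnownBits pins down every bit, even though not all operands are constants: if each bit is known to be either zero or one, the only possible value is the constant formed from the known-one bits. A standalone model of the final test, with 32-bit masks standing in for APInt (invented names, not LLVM code):

#include <cstdint>
#include <optional>

std::optional<uint32_t> foldIfFullyKnown(uint32_t KnownZero,
                                         uint32_t KnownOne) {
  // Every bit must be accounted for by one of the two sets.
  if ((KnownZero | KnownOne) == UINT32_MAX)
    return KnownOne;  // the value can only be this constant
  return std::nullopt;
}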
- ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], - InstrsForInstCombineWorklist.size()); + ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist); return MadeIRChange; } @@ -2930,13 +3004,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, // track of which blocks we visit. SmallPtrSet<BasicBlock *, 64> Visited; MadeIRChange |= - AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI); + AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI); // Do a quick scan over the function. If we find any blocks that are // unreachable, remove any instructions inside of them. This prevents // the instcombine code from having to deal with some bad special cases. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (Visited.count(BB)) + if (Visited.count(&*BB)) continue; // Delete the instructions backwards, as it has a reduced likelihood of @@ -2944,11 +3018,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; - if (!Inst->use_empty()) + Instruction *Inst = &*--EndInst->getIterator(); + if (!Inst->use_empty() && !Inst->getType()->isTokenTy()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad() || Inst->getType()->isTokenTy()) { EndInst = Inst; continue; } @@ -2968,8 +3041,6 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, LoopInfo *LI = nullptr) { - // Minimizing size? - bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); auto &DL = F.getParent()->getDataLayout(); /// Builder - This is an IRBuilder that automatically inserts new @@ -2992,7 +3063,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) Changed = true; - InstCombiner IC(Worklist, &Builder, MinimizeSize, + InstCombiner IC(Worklist, &Builder, F.optForMinSize(), AA, &AC, &TLI, &DT, DL, LI); if (IC.run()) Changed = true; @@ -3046,11 +3117,12 @@ public: void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3058,7 +3130,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { return false; // Required analyses. 
- auto AA = &getAnalysis<AliasAnalysis>(); + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3076,7 +3148,8 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index e7ef9f9..a9df5e5 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -90,7 +91,9 @@ static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v5"; +static const char *const kAsanInitName = "__asan_init"; +static const char *const kAsanVersionCheckName = + "__asan_version_mismatch_check_v6"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; @@ -119,6 +122,10 @@ static const unsigned kAllocaRzSize = 32; static cl::opt<bool> ClEnableKasan( "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClRecover( + "asan-recover", + cl::desc("Enable recovery mode (continue-after-error)."), + cl::Hidden, cl::init(false)); // This flag may need to be replaced with -f[no-]asan-reads. 
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", @@ -177,7 +184,7 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix( cl::init("__asan_")); static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", cl::desc("instrument dynamic allocas"), - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(true)); static cl::opt<bool> ClSkipPromotableAllocas( "asan-skip-promotable-allocas", cl::desc("Do not instrument promotable allocas"), cl::Hidden, @@ -273,6 +280,11 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} + void reset() { + inited_ = false; + Entries.clear(); + } + void init(Module &M) { assert(!inited_); inited_ = true; @@ -321,7 +333,7 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, bool IsKasan) { - bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsAndroid = TargetTriple.isAndroid(); bool IsIOS = TargetTriple.isiOS(); bool IsFreeBSD = TargetTriple.isOSFreeBSD(); bool IsLinux = TargetTriple.isOSLinux(); @@ -338,6 +350,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, ShadowMapping Mapping; if (LongSize == 32) { + // Android is always PIE, which means that the beginning of the address + // space is always available. if (IsAndroid) Mapping.Offset = 0; else if (IsMIPS32) @@ -376,7 +390,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, // OR-ing shadow offset is more efficient (at least on x86) if the offset // is a power of two, but on ppc64 we have to use add since the shadow // offset is not necessarily 1/8-th of the address space. - Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1)); + Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 + && !(Mapping.Offset & (Mapping.Offset - 1)); return Mapping; } @@ -389,8 +404,9 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public FunctionPass { - explicit AddressSanitizer(bool CompileKernel = false) - : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan) { + explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false) + : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) { initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); } const char *getPassName() const override { @@ -437,7 +453,9 @@ struct AddressSanitizer : public FunctionPass { Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F) override; bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void markEscapedLocalAllocas(Function &F); bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; static char ID; // Pass identification, replacement for typeid DominatorTree &getDominatorTree() const { return *DT; } @@ -450,10 +468,21 @@ struct AddressSanitizer : public FunctionPass { bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, uint64_t TypeSize) const; + /// Helper to clean up per-function state.
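For reference, the mapping these parameters configure computes Shadow = (Addr >> Scale) + Offset, and OrShadowOffset swaps the add for an OR, which is only sound when the offset is a lone high bit that cannot collide with any bit of the shifted address. A standalone sketch with an invented power-of-two offset, not a real platform's constant:

#include <cstdint>

constexpr unsigned kScale = 3;            // 8 application bytes per shadow byte
constexpr uint64_t kOffset = 1ULL << 36;  // illustrative power-of-two offset

constexpr bool isPowerOfTwo(uint64_t X) { return X && !(X & (X - 1)); }

uint64_t memToShadow(uint64_t Addr, bool OrShadowOffset) {
  uint64_t Shifted = Addr >> kScale;
  // OR and ADD agree when the offset bit lies above every bit of Shifted.
  return (OrShadowOffset && isPowerOfTwo(kOffset)) ? (Shifted | kOffset)
                                                   : (Shifted + kOffset);
}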
+ struct FunctionStateRAII { + AddressSanitizer *Pass; + FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) { + assert(Pass->ProcessedAllocas.empty() && + "last pass forgot to clear cache"); + } + ~FunctionStateRAII() { Pass->ProcessedAllocas.clear(); } + }; + LLVMContext *C; Triple TargetTriple; int LongSize; bool CompileKernel; + bool Recover; Type *IntptrTy; ShadowMapping Mapping; DominatorTree *DT; @@ -477,8 +506,10 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: - explicit AddressSanitizerModule(bool CompileKernel = false) - : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan) {} + explicit AddressSanitizerModule(bool CompileKernel = false, + bool Recover = false) + : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid const char *getPassName() const override { return "AddressSanitizerModule"; } @@ -496,6 +527,7 @@ class AddressSanitizerModule : public ModulePass { GlobalsMetadata GlobalsMD; bool CompileKernel; + bool Recover; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -525,6 +557,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { ShadowMapping Mapping; SmallVector<AllocaInst *, 16> AllocaVec; + SmallSetVector<AllocaInst *, 16> NonInstrumentedStaticAllocaVec; SmallVector<Instruction *, 8> RetVec; unsigned StackAlignment; @@ -545,12 +578,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { SmallVector<AllocaInst *, 1> DynamicAllocaVec; SmallVector<IntrinsicInst *, 1> StackRestoreVec; AllocaInst *DynamicAllocaLayout = nullptr; + IntrinsicInst *LocalEscapeCall = nullptr; // Maps Value to an AllocaInst from which the Value is originated. typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; AllocaForValueMapTy AllocaForValue; - bool HasNonEmptyInlineAsm; + bool HasNonEmptyInlineAsm = false; + bool HasReturnsTwiceCall = false; std::unique_ptr<CallInst> EmptyInlineAsm; FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) @@ -562,7 +597,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), StackAlignment(1 << Mapping.Scale), - HasNonEmptyInlineAsm(false), EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} bool runOnFunction() { @@ -596,9 +630,24 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, Value *SavedStack) { IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. 
+ if (!isa<ReturnInst>(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + IRB.CreateCall(AsanAllocasUnpoisonFunc, - {IRB.CreateLoad(DynamicAllocaLayout), - IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr}); } // Unpoison dynamic allocas redzones. @@ -625,7 +674,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { /// \brief Collect Alloca instructions we want (and can) handle. void visitAllocaInst(AllocaInst &AI) { - if (!ASan.isInterestingAlloca(AI)) return; + if (!ASan.isInterestingAlloca(AI)) { + if (AI.isStaticAlloca()) NonInstrumentedStaticAllocaVec.insert(&AI); + return; + } StackAlignment = std::max(StackAlignment, AI.getAlignment()); if (ASan.isDynamicAlloca(AI)) @@ -639,6 +691,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void visitIntrinsicInst(IntrinsicInst &II) { Intrinsic::ID ID = II.getIntrinsicID(); if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); + if (ID == Intrinsic::localescape) LocalEscapeCall = &II; if (!ClCheckLifetime) return; if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) return; @@ -660,9 +713,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaPoisonCallVec.push_back(APC); } - void visitCallInst(CallInst &CI) { - HasNonEmptyInlineAsm |= - CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); + void visitCallSite(CallSite CS) { + Instruction *I = CS.getInstruction(); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + HasNonEmptyInlineAsm |= + CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get()); + HasReturnsTwiceCall |= CI->canReturnTwice(); + } } // ---------------------- Helpers. @@ -689,7 +746,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { Instruction *ThenTerm, Value *ValueIfFalse); }; -} // namespace +} // anonymous namespace char AddressSanitizer::ID = 0; INITIALIZE_PASS_BEGIN( @@ -697,12 +754,15 @@ INITIALIZE_PASS_BEGIN( "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) -FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel) { - return new AddressSanitizer(CompileKernel); +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizer(CompileKernel, Recover); } char AddressSanitizerModule::ID = 0; @@ -711,8 +771,10 @@ INITIALIZE_PASS( "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) -ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel) { - return new AddressSanitizerModule(CompileKernel); +ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizerModule(CompileKernel, Recover); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -799,8 +861,10 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) { getAllocaSizeInBytes(&AI) > 0 && // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. - (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || - isDynamicAlloca(AI))); + (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them as well. + !AI.isUsedWithInAlloca()); ProcessedAllocas[&AI] = IsInteresting; return IsInteresting; @@ -868,10 +932,8 @@ static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) { } else { return false; } - if (!isPointerOperand(I->getOperand(0)) || - !isPointerOperand(I->getOperand(1))) - return false; - return true; + return isPointerOperand(I->getOperand(0)) && + isPointerOperand(I->getOperand(1)); } bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { @@ -919,7 +981,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); - if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) && isSafeAccess(ObjSizeVis, Addr, TypeSize)) { NumOptimizedAccessesToGlobalVar++; return; @@ -1041,13 +1103,17 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); - BasicBlock *CrashBlock = + if (Recover) { + CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false); + } else { + BasicBlock *CrashBlock = BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); - CrashTerm = new UnreachableInst(*C, CrashBlock); - BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); - ReplaceInstWithInst(CheckTerm, NewTerm); + CrashTerm = new UnreachableInst(*C, CrashBlock); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } } else { - CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); + CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover); } Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, @@ -1084,7 +1150,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName) { // Set up the arguments to our poison/unpoison functions. - IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); + IRBuilder<> IRB(&GlobalInit.front(), + GlobalInit.front().getFirstInsertionPt()); // Add a call to poison all external globals before the given function starts. 
Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); @@ -1147,6 +1214,14 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // Do not instrument globals from special LLVM sections. if (Section.find("__llvm") != StringRef::npos) return false; + // Do not instrument function pointers to initialization and termination + // routines: dynamic linker will not properly handle redzones. + if (Section.startswith(".preinit_array") || + Section.startswith(".init_array") || + Section.startswith(".fini_array")) { + return false; + } + // Callbacks put into the CRT initializer/terminator sections // should not be instrumented. // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 @@ -1162,10 +1237,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { bool TAAParsed; std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); - if (!ErrorCode.empty()) { - assert(false && "Invalid section specifier."); - return false; - } + assert(ErrorCode.empty() && "Invalid section specifier."); // Ignore the globals from the __OBJC section. The ObjC runtime assumes // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to @@ -1383,13 +1455,11 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string TypeStr = AccessIsWrite ? "store" : "load"; const std::string ExpStr = Exp ? "exp_" : ""; const std::string SuffixStr = CompileKernel ? "N" : "_n"; - const std::string EndingStr = CompileKernel ? "_noabort" : ""; - const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr; - // TODO(glider): for KASan builds add _noabort to error reporting - // functions and make them actually noabort (remove the UnreachableInst). + const std::string EndingStr = Recover ? "_noabort" : ""; + Type *ExpType = Exp ? 
Type::getInt32Ty(*C) : nullptr; AsanErrorCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr, + kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr, IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1400,7 +1470,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex); AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + Suffix, + kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr, IRB.getVoidTy(), IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1448,15 +1518,20 @@ bool AddressSanitizer::doInitialization(Module &M) { if (!CompileKernel) { std::tie(AsanCtorFunction, AsanInitFunction) = - createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}); + createSanitizerCtorAndInitFunctions( + M, kAsanModuleCtorName, kAsanInitName, + /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); } Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); return true; } +bool AddressSanitizer::doFinalization(Module &M) { + GlobalsMD.reset(); + return false; +} + bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. @@ -1466,13 +1541,41 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // We cannot just ignore these methods, because they may call other // instrumented functions. if (F.getName().find(" load]") != std::string::npos) { - IRBuilder<> IRB(F.begin()->begin()); + IRBuilder<> IRB(&F.front(), F.front().begin()); IRB.CreateCall(AsanInitFunction, {}); return true; } return false; } +void AddressSanitizer::markEscapedLocalAllocas(Function &F) { + // Find the one possible call to llvm.localescape and pre-mark allocas passed + // to it as uninteresting. This assumes we haven't started processing allocas + // yet. This check is done up front because iterating the use list in + // isInterestingAlloca would be algorithmically slower. + assert(ProcessedAllocas.empty() && "must process localescape before allocas"); + + // Try to get the declaration of llvm.localescape. If it's not in the module, + // we can exit early. + if (!F.getParent()->getFunction("llvm.localescape")) return; + + // Look for a call to llvm.localescape call in the entry block. It can't be in + // any other block. + for (Instruction &I : F.getEntryBlock()) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (II && II->getIntrinsicID() == Intrinsic::localescape) { + // We found a call. Mark all the allocas passed in as uninteresting. 
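+      // The arguments may be wrapped in pointer casts, so look through them
+      // before classifying each one as an alloca.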
+ for (Value *Arg : II->arg_operands()) { + AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts()); + assert(AI && AI->isStaticAlloca() && + "non-static alloca arg to localescape"); + ProcessedAllocas[AI] = false; + } + break; + } + } +} + bool AddressSanitizer::runOnFunction(Function &F) { if (&F == AsanCtorFunction) return false; if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; @@ -1488,6 +1591,12 @@ bool AddressSanitizer::runOnFunction(Function &F) { if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; + FunctionStateRAII CleanupObj(this); + + // We can't instrument allocas used with llvm.localescape. Only static allocas + // can be passed to that intrinsic. + markEscapedLocalAllocas(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). SmallSet<Value *, 16> TempsToInstrument; @@ -1715,6 +1824,16 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { void FunctionStackPoisoner::poisonStack() { assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); + // Insert poison calls for lifetime intrinsics for alloca. + bool HavePoisonedAllocas = false; + for (const auto &APC : AllocaPoisonCallVec) { + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); + HavePoisonedAllocas |= APC.DoPoison; + } + if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) { // Handle dynamic allocas. createDynamicAllocasInitStorage(); @@ -1723,7 +1842,7 @@ void FunctionStackPoisoner::poisonStack() { unpoisonDynamicAllocas(); } - if (AllocaVec.size() == 0) return; + if (AllocaVec.empty()) return; int StackMallocIdx = -1; DebugLoc EntryDebugLocation; @@ -1734,6 +1853,19 @@ void FunctionStackPoisoner::poisonStack() { IRBuilder<> IRB(InsBefore); IRB.SetCurrentDebugLocation(EntryDebugLocation); + // Make sure non-instrumented allocas stay in the entry block. Otherwise, + // debug info is broken, because only entry-block allocas are treated as + // regular stack slots. + auto InsBeforeB = InsBefore->getParent(); + assert(InsBeforeB == &F.getEntryBlock()); + for (BasicBlock::iterator I(InsBefore); I != InsBeforeB->end(); ++I) + if (auto *AI = dyn_cast<AllocaInst>(I)) + if (NonInstrumentedStaticAllocaVec.count(AI) > 0) + AI->moveBefore(InsBefore); + + // If we have a call to llvm.localescape, keep it in the entry block. + if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore); + SmallVector<ASanStackVariableDescription, 16> SVD; SVD.reserve(AllocaVec.size()); for (AllocaInst *AI : AllocaVec) { @@ -1751,10 +1883,15 @@ void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel && LocalStackSize <= kMaxStackMallocSize; - // Don't do dynamic alloca or stack malloc in presence of inline asm: - // too often it makes assumptions on which registers are available. - bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; - DoStackMalloc &= !HasNonEmptyInlineAsm; + bool DoDynamicAlloca = ClDynamicAllocaStack; + // Don't do dynamic alloca or stack malloc if: + // 1) There is inline asm: too often it makes assumptions on which registers + // are available. + // 2) There is a returns_twice call (typically setjmp), which is + // optimization-hostile, and doesn't play well with introduced indirect + // register-relative calculation of local variable addresses. 
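+  //    (For example, locals addressed relative to a fake-stack frame may be
+  //    recomputed from stale register values after the second return.)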
+ DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; + DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; Value *StaticAlloca = DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); @@ -1804,16 +1941,6 @@ void FunctionStackPoisoner::poisonStack() { DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; } - // Insert poison calls for lifetime intrinsics for alloca. - bool HavePoisonedAllocas = false; - for (const auto &APC : AllocaPoisonCallVec) { - assert(APC.InsBefore); - assert(APC.AI); - IRBuilder<> IRB(APC.InsBefore); - poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); - HavePoisonedAllocas |= APC.DoPoison; - } - // Replace Alloca instructions with base+offset. for (const auto &Desc : SVD) { AllocaInst *AI = Desc.AI; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index f685803..fd3dfd9 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -106,7 +106,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { } ++ChecksAdded; - Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock::iterator Inst = Builder->GetInsertPoint(); BasicBlock *OldBB = Inst->getParent(); BasicBlock *Cont = OldBB->splitBasicBlock(Inst); OldBB->getTerminator()->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h new file mode 100644 index 0000000..c47fdbf --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -0,0 +1,217 @@ +//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Union-find algorithm to compute Minimum Spanning Tree +// for a given CFG. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +namespace llvm { + +#define DEBUG_TYPE "cfgmst" + +/// \brief An union-find based Minimum Spanning Tree for CFG +/// +/// Implements a Union-find algorithm to compute Minimum Spanning Tree +/// for a given CFG. +template <class Edge, class BBInfo> class CFGMST { +public: + Function &F; + + // Store all the edges in CFG. It may contain some stale edges + // when Removed is set. + std::vector<std::unique_ptr<Edge>> AllEdges; + + // This map records the auxiliary information for each BB. + DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos; + + // Find the root group of the G and compress the path from G to the root. + BBInfo *findAndCompressGroup(BBInfo *G) { + if (G->Group != G) + G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group)); + return static_cast<BBInfo *>(G->Group); + } + + // Union BB1 and BB2 into the same group and return true. 
+  // Returns false if BB1 and BB2 are already in the same group.
+  bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+    BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+    BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+    if (BB1G == BB2G)
+      return false;
+
+    // Make the smaller-rank tree a direct child of the root of the
+    // higher-rank tree.
+    if (BB1G->Rank < BB2G->Rank)
+      BB1G->Group = BB2G;
+    else {
+      BB2G->Group = BB1G;
+      // If the ranks are the same, increment the rank of the new root by one.
+      if (BB1G->Rank == BB2G->Rank)
+        BB1G->Rank++;
+    }
+    return true;
+  }
+
+  // Given a BB, return its auxiliary information.
+  BBInfo &getBBInfo(const BasicBlock *BB) const {
+    auto It = BBInfos.find(BB);
+    assert(It->second.get() != nullptr);
+    return *It->second.get();
+  }
+
+  // Traverse the CFG using a stack. Find all the edges and assign the weight.
+  // Edges with large weight will be put into MST first so they are less likely
+  // to be instrumented.
+  void buildEdges() {
+    DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+    const BasicBlock *BB = &(F.getEntryBlock());
+    uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+    // Add a fake edge to the entry.
+    addEdge(nullptr, BB, EntryWeight);
+
+    // Special handling for single-BB functions.
+    if (succ_empty(BB)) {
+      addEdge(BB, nullptr, EntryWeight);
+      return;
+    }
+
+    static const uint32_t CriticalEdgeMultiplier = 1000;
+
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      uint64_t BBWeight =
+          (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+      uint64_t Weight = 2;
+      if (int successors = TI->getNumSuccessors()) {
+        for (int i = 0; i != successors; ++i) {
+          BasicBlock *TargetBB = TI->getSuccessor(i);
+          bool Critical = isCriticalEdge(TI, i);
+          uint64_t scaleFactor = BBWeight;
+          if (Critical) {
+            if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+              scaleFactor *= CriticalEdgeMultiplier;
+            else
+              scaleFactor = UINT64_MAX;
+          }
+          if (BPI != nullptr)
+            Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+          addEdge(&*BB, TargetBB, Weight).IsCritical = Critical;
+          DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+                       << TargetBB->getName() << " w=" << Weight << "\n");
+        }
+      } else {
+        addEdge(&*BB, nullptr, BBWeight);
+        DEBUG(dbgs() << " Edge: from " << BB->getName() << " to exit"
+                     << " w = " << BBWeight << "\n");
+      }
+    }
+  }
+
+  // Sort CFG edges by weight.
+  void sortEdgesByWeight() {
+    std::stable_sort(AllEdges.begin(), AllEdges.end(),
+                     [](const std::unique_ptr<Edge> &Edge1,
+                        const std::unique_ptr<Edge> &Edge2) {
+                       return Edge1->Weight > Edge2->Weight;
+                     });
+  }
+
+  // Traverse all the edges and compute the Minimum Weight Spanning Tree
+  // using the union-find algorithm.
+  void computeMinimumSpanningTree() {
+    // First, put all critical edges whose Dest is a landing pad into the MST.
+    // This works around insufficient support for splitting critical edges
+    // whose destination BB is a landing pad.
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (Ei->IsCritical) {
+        if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+          if (unionGroups(Ei->SrcBB, Ei->DestBB))
+            Ei->InMST = true;
+        }
+      }
+    }
+
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (unionGroups(Ei->SrcBB, Ei->DestBB))
+        Ei->InMST = true;
+    }
+  }
+
+  // Dump the debug information about the instrumentation.
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const { + if (!Message.str().empty()) + OS << Message << "\n"; + OS << " Number of Basic Blocks: " << BBInfos.size() << "\n"; + for (auto &BI : BBInfos) { + const BasicBlock *BB = BI.first; + OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " " + << BI.second->infoString() << "\n"; + } + + OS << " Number of Edges: " << AllEdges.size() + << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; + uint32_t Count = 0; + for (auto &EI : AllEdges) + OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" + << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; + } + + // Add an edge to AllEdges with weight W. + Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { + uint32_t Index = BBInfos.size(); + auto Iter = BBInfos.end(); + bool Inserted; + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); + if (Inserted) { + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Index++; + } + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); + if (Inserted) + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + AllEdges.emplace_back(new Edge(Src, Dest, W)); + return *AllEdges.back(); + } + + BranchProbabilityInfo *BPI; + BlockFrequencyInfo *BFI; + +public: + CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, + BlockFrequencyInfo *BFI_ = nullptr) + : F(Func), BPI(BPI_), BFI(BFI_) { + buildEdges(); + sortEdgesByWeight(); + computeMinimumSpanningTree(); + } +}; + +#undef DEBUG_TYPE // "cfgmst" +} // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2de6e1a..d459fc5 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -72,6 +72,11 @@ using namespace llvm; +// External symbol to be used when generating the shadow address for +// architectures with multiple VMAs. Instead of using a constant integer +// the runtime will set the external mask based on the VMA range. +static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask"; + // The -dfsan-preserve-alignment flag controls whether this pass assumes that // alignment requirements provided by the input IR are correct. 
For example, // if the input IR contains a load with alignment 8, this flag will cause @@ -124,6 +129,7 @@ static cl::opt<bool> ClDebugNonzeroLabels( "load or return with a nonzero label"), cl::Hidden); + namespace { StringRef GetGlobalTypeString(const GlobalValue &G) { @@ -231,6 +237,7 @@ class DataFlowSanitizer : public ModulePass { void *(*GetRetvalTLSPtr)(); Constant *GetArgTLS; Constant *GetRetvalTLS; + Constant *ExternalShadowMask; FunctionType *DFSanUnionFnTy; FunctionType *DFSanUnionLoadFnTy; FunctionType *DFSanUnimplementedFnTy; @@ -248,7 +255,7 @@ class DataFlowSanitizer : public ModulePass { DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; - DenseMap<const Function *, DISubprogram *> FunctionDIs; + bool DFSanRuntimeShadowMask; Value *getShadowAddress(Value *Addr, Instruction *Pos); bool isInstrumented(const Function *F); @@ -362,7 +369,8 @@ llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, DataFlowSanitizer::DataFlowSanitizer( const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), void *(*getRetValTLS)()) - : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), + DFSanRuntimeShadowMask(false) { std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); @@ -420,6 +428,8 @@ bool DataFlowSanitizer::doInitialization(Module &M) { bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 || + TargetTriple.getArch() == llvm::Triple::aarch64_be; const DataLayout &DL = M.getDataLayout(); @@ -434,6 +444,9 @@ bool DataFlowSanitizer::doInitialization(Module &M) { ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); else if (IsMIPS64) ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + // AArch64 supports multiple VMAs and the shadow mask is set at runtime. 
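+  // (AArch64 Linux kernels are configured with different virtual address
+  // sizes, such as 39-bit or 42-bit, so no single compile-time constant mask
+  // fits all of them.)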
+ else if (IsAArch64) + DFSanRuntimeShadowMask = true; else report_fatal_error("unsupported triple"); @@ -578,7 +591,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) - DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSF.ValShadowMap[&*ValAI] = &*ShadowAI; DFSanVisitor(DFSF).visitCallInst(*CI); if (!FT->getReturnType()->isVoidTy()) new StoreInst(DFSF.getShadow(RI->getReturnValue()), @@ -592,8 +605,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (ABIList.isIn(M, "skip")) return false; - FunctionDIs = makeSubprogramMap(M); - if (!GetArgTLSPtr) { Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); @@ -606,6 +617,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) { G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); } + ExternalShadowMask = + Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy); + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); @@ -643,16 +657,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) { std::vector<Function *> FnsToInstrument; llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; - for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { - if (!i->isIntrinsic() && - i != DFSanUnionFn && - i != DFSanCheckedUnionFn && - i != DFSanUnionLoadFn && - i != DFSanUnimplementedFn && - i != DFSanSetLabelFn && - i != DFSanNonzeroLabelFn && - i != DFSanVarargWrapperFn) - FnsToInstrument.push_back(&*i); + for (Function &i : M) { + if (!i.isIntrinsic() && + &i != DFSanUnionFn && + &i != DFSanCheckedUnionFn && + &i != DFSanUnionLoadFn && + &i != DFSanUnimplementedFn && + &i != DFSanSetLabelFn && + &i != DFSanNonzeroLabelFn && + &i != DFSanVarargWrapperFn) + FnsToInstrument.push_back(&i); } // Give function aliases prefixes when necessary, and build wrappers where the @@ -710,7 +724,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { NewFArg = NewF->arg_begin(), FArgEnd = F.arg_end(); FArg != FArgEnd; ++FArg, ++NewFArg) { - FArg->replaceAllUsesWith(NewFArg); + FArg->replaceAllUsesWith(&*NewFArg); } NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); @@ -750,11 +764,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); F.replaceAllUsesWith(WrappedFnCst); - // Patch the pointer to LLVM function in debug info descriptor. 
- auto DI = FunctionDIs.find(&F); - if (DI != FunctionDIs.end()) - DI->second->replaceFunction(&F); - UnwrappedFnMap[WrappedFnCst] = &F; *i = NewF; @@ -842,7 +851,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (Instruction *I = dyn_cast<Instruction>(V)) Pos = I->getNextNode(); else - Pos = DFSF.F->getEntryBlock().begin(); + Pos = &DFSF.F->getEntryBlock().front(); while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) Pos = Pos->getNextNode(); IRBuilder<> IRB(Pos); @@ -864,7 +873,7 @@ Value *DFSanFunction::getArgTLSPtr() { if (DFS.ArgTLS) return ArgTLSPtr = DFS.ArgTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {}); } @@ -874,7 +883,7 @@ Value *DFSanFunction::getRetvalTLS() { if (DFS.RetvalTLS) return RetvalTLSPtr = DFS.RetvalTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {}); } @@ -906,7 +915,7 @@ Value *DFSanFunction::getShadow(Value *V) { Function::arg_iterator i = F->arg_begin(); while (ArgIdx--) ++i; - Shadow = i; + Shadow = &*i; assert(Shadow->getType() == DFS.ShadowTy); break; } @@ -928,9 +937,15 @@ void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { assert(Addr != RetvalTLS && "Reinstrumenting?"); IRBuilder<> IRB(Pos); + Value *ShadowPtrMaskValue; + if (DFSanRuntimeShadowMask) + ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask); + else + ShadowPtrMaskValue = ShadowPtrMask; return IRB.CreateIntToPtr( IRB.CreateMul( - IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), + IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)), ShadowPtrMul), ShadowPtrTy); } @@ -991,7 +1006,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { Call->addAttribute(2, Attribute::ZExt); BasicBlock *Tail = BI->getSuccessor(0); - PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); Phi->addIncoming(Call, Call->getParent()); Phi->addIncoming(V1, Head); @@ -1105,7 +1120,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); BasicBlock *Head = Pos->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(Pos); + BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator()); if (DomTreeNode *OldNode = DT.getNode(Head)) { std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); @@ -1475,8 +1490,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (FT->isVarArg()) { auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, CS.arg_size() - FT->getNumParams()); - auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", - DFSF.F->getEntryBlock().begin()); + auto *LabelVAAlloca = new AllocaInst( + LabelVATy, "labelva", &DFSF.F->getEntryBlock().front()); for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); @@ -1490,7 +1505,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (!DFSF.LabelReturnAlloca) { DFSF.LabelReturnAlloca = new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", - DFSF.F->getEntryBlock().begin()); + &DFSF.F->getEntryBlock().front()); } Args.push_back(DFSF.LabelReturnAlloca); } @@ -1529,13 +1544,14 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if 
(!CS.getType()->isVoidTy()) { if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { if (II->getNormalDest()->getSinglePredecessor()) { - Next = II->getNormalDest()->begin(); + Next = &II->getNormalDest()->front(); } else { BasicBlock *NewBB = SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); - Next = NewBB->begin(); + Next = &NewBB->front(); } } else { + assert(CS->getIterator() != CS->getParent()->end()); Next = CS->getNextNode(); } @@ -1568,7 +1584,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); AllocaInst *VarArgShadow = - new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); + new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front()); Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0)); for (unsigned n = 0; i != e; ++i, ++n) { IRB.CreateStore( diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9a3ed5c..fa939ae 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -138,6 +138,7 @@ namespace { Module *M; LLVMContext *Ctx; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; + DenseMap<DISubprogram *, Function *> FnMap; }; } @@ -309,13 +310,12 @@ namespace { // object users can construct, the blocks and lines will be rooted here. class GCOVFunction : public GCOVRecord { public: - GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, - bool UseCfgChecksum, bool ExitBlockBeforeBody) + GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os, + uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody) : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), ReturnBlock(1, os) { this->os = os; - Function *F = SP->getFunction(); DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); uint32_t i = 0; @@ -347,8 +347,8 @@ namespace { std::string EdgeDestinations; raw_string_ostream EDOS(EdgeDestinations); Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) EDOS << Block.OutEdges[i]->Number; } @@ -389,8 +389,8 @@ namespace { // Emit edges between blocks. if (Blocks.empty()) return; Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); if (Block.OutEdges.empty()) continue; writeBytes(EdgeTag, 4); @@ -405,9 +405,8 @@ namespace { } // Emit lines for each block. 
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - getBlock(I).writeOut(); - } + for (BasicBlock &I : *F) + getBlock(&I).writeOut(); } private: @@ -451,6 +450,12 @@ bool GCOVProfiler::runOnModule(Module &M) { this->M = &M; Ctx = &M.getContext(); + FnMap.clear(); + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + FnMap[SP] = &F; + } + if (Options.EmitNotes) emitProfileNotes(); if (Options.EmitData) return emitProfileArcs(); return false; @@ -495,7 +500,7 @@ void GCOVProfiler::emitProfileNotes() { unsigned FunctionIdent = 0; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; @@ -507,13 +512,13 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, + Funcs.push_back(make_unique<GCOVFunction>(SP, F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func.getBlock(BB); + GCOVBlock &Block = Func.getBlock(&*BB); TerminatorInst *TI = BB->getTerminator(); if (int successors = TI->getNumSuccessors()) { for (int i = 0; i != successors; ++i) { @@ -574,7 +579,7 @@ bool GCOVProfiler::emitProfileArcs() { auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; if (!Result) Result = true; @@ -605,7 +610,7 @@ bool GCOVProfiler::emitProfileArcs() { int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); if (Successors) { if (Successors == 1) { - IRBuilder<> Builder(BB->getFirstInsertionPt()); + IRBuilder<> Builder(&*BB->getFirstInsertionPt()); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge); Value *Count = Builder.CreateLoad(Counter); @@ -625,7 +630,7 @@ bool GCOVProfiler::emitProfileArcs() { Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Counter); } else { - ComplexEdgePreds.insert(BB); + ComplexEdgePreds.insert(&*BB); for (int i = 0; i != Successors; ++i) ComplexEdgeSuccs.insert(TI->getSuccessor(i)); } @@ -641,13 +646,13 @@ bool GCOVProfiler::emitProfileArcs() { GlobalVariable *EdgeState = getEdgeStateValue(); for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { - IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgePreds[i + 1]->getFirstInsertionPt()); Builder.CreateStore(Builder.getInt32(i), EdgeState); } for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { // Call runtime to perform increment. 
- IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgeSuccs[i + 1]->getFirstInsertionPt()); Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); @@ -731,8 +736,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( IRBuilder<> Builder(Succ); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge + i); - EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + - (Preds.idFor(BB)-1)] = cast<Constant>(Counter); + EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) + + (Preds.idFor(&*BB) - 1)] = cast<Constant>(Counter); } } Edge += Successors; @@ -901,7 +906,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint32_t pred = *predecessor; // if (pred == 0xffffffff) return; - Argument *Arg = Fn->arg_begin(); + Argument *Arg = &*Fn->arg_begin(); Arg->setName("predecessor"); Value *Pred = Builder.CreateLoad(Arg, "pred"); Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff)); @@ -912,7 +917,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint64_t *counter = counters[pred]; // if (!counter) return; Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); - Arg = std::next(Fn->arg_begin()); + Arg = &*std::next(Fn->arg_begin()); Arg->setName("counters"); Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred); Value *Counter = Builder.CreateLoad(GEP, "counter"); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 712bf8e..28483e7 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -7,18 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This pass lowers instrprof_increment intrinsics emitted by a frontend for -// profiling. It also builds the data structures and initialization code needed -// for updating execution counts and emitting the profile at runtime. +// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// It also builds the data structures and initialization code needed for +// updating execution counts and emitting the profile at runtime. // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Instrumentation.h" - #include "llvm/ADT/Triple.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -49,7 +49,15 @@ public: private: InstrProfOptions Options; Module *M; - DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; + typedef struct PerFunctionProfileData { + uint32_t NumValueSites[IPVK_Last+1]; + GlobalVariable* RegionCounters; + GlobalVariable* DataVar; + PerFunctionProfileData() : RegionCounters(nullptr), DataVar(nullptr) { + memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last+1)); + } + } PerFunctionProfileData; + DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap; std::vector<Value *> UsedVars; bool isMachO() const { @@ -58,29 +66,35 @@ private: /// Get the section name for the counter variables. StringRef getCountersSection() const { - return isMachO() ? 
"__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; + return getInstrProfCountersSectionName(isMachO()); } /// Get the section name for the name variables. StringRef getNameSection() const { - return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; + return getInstrProfNameSectionName(isMachO()); } /// Get the section name for the profile data variables. StringRef getDataSection() const { - return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; + return getInstrProfDataSectionName(isMachO()); } /// Get the section name for the coverage mapping data. StringRef getCoverageSection() const { - return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; + return getInstrProfCoverageSectionName(isMachO()); } + /// Count the number of instrumented value sites for the function. + void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins); + + /// Replace instrprof_value_profile with a call to runtime library. + void lowerValueProfileInst(InstrProfValueProfileInst *Ins); + /// Replace instrprof_increment with an increment of the appropriate value. void lowerIncrement(InstrProfIncrementInst *Inc); - /// Set up the section and uses for coverage data and its references. - void lowerCoverageData(GlobalVariable *CoverageData); + /// Force emitting of name vars for unused functions. + void lowerCoverageData(GlobalVariable *CoverageNamesVar); /// Get the region counters for an increment, creating them if necessary. /// @@ -117,20 +131,37 @@ bool InstrProfiling::runOnModule(Module &M) { bool MadeChange = false; this->M = &M; - RegionCounters.clear(); + ProfileDataMap.clear(); UsedVars.clear(); + // We did not know how many value sites there would be inside + // the instrumented function. This is counting the number of instrumented + // target value sites to enter it as field in the profile data variable. 
for (Function &F : M) for (BasicBlock &BB : F) for (auto I = BB.begin(), E = BB.end(); I != E;) - if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { + if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I++)) + computeNumValueSiteCounts(Ind); + + for (Function &F : M) + for (BasicBlock &BB : F) + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(Instr)) { lowerIncrement(Inc); MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; } - if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { - lowerCoverageData(Coverage); + } + + if (GlobalVariable *CoverageNamesVar = + M.getNamedGlobal(getCoverageNamesVarName())) { + lowerCoverageData(CoverageNamesVar); MadeChange = true; } + if (!MadeChange) return false; @@ -141,10 +172,59 @@ bool InstrProfiling::runOnModule(Module &M) { return true; } +static Constant *getOrInsertValueProfilingCall(Module &M) { + LLVMContext &Ctx = M.getContext(); + auto *ReturnTy = Type::getVoidTy(M.getContext()); + Type *ParamTypes[] = { +#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *ValueProfilingCallTy = + FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false); + return M.getOrInsertFunction(getInstrProfValueProfFuncName(), + ValueProfilingCallTy); +} + +void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { + + GlobalVariable *Name = Ind->getName(); + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + auto It = ProfileDataMap.find(Name); + if (It == ProfileDataMap.end()) { + PerFunctionProfileData PD; + PD.NumValueSites[ValueKind] = Index + 1; + ProfileDataMap[Name] = PD; + } else if (It->second.NumValueSites[ValueKind] <= Index) + It->second.NumValueSites[ValueKind] = Index + 1; +} + +void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + + GlobalVariable *Name = Ind->getName(); + auto It = ProfileDataMap.find(Name); + assert(It != ProfileDataMap.end() && It->second.DataVar && + "value profiling detected in function with no counter incerement"); + + GlobalVariable *DataVar = It->second.DataVar; + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind) + Index += It->second.NumValueSites[Kind]; + + IRBuilder<> Builder(Ind); + Value* Args[3] = {Ind->getTargetValue(), + Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), + Builder.getInt32(Index)}; + Ind->replaceAllUsesWith( + Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args)); + Ind->eraseFromParent(); +} + void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { GlobalVariable *Counters = getOrCreateRegionCounters(Inc); - IRBuilder<> Builder(Inc->getParent(), *Inc); + IRBuilder<> Builder(Inc); uint64_t Index = Inc->getIndex()->getZExtValue(); Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); Value *Count = Builder.CreateLoad(Addr, "pgocount"); @@ -153,29 +233,16 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { Inc->eraseFromParent(); } -void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { - CoverageData->setSection(getCoverageSection()); - CoverageData->setAlignment(8); - - Constant *Init = CoverageData->getInitializer(); - // We're 
expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] }
-  // for some C. If not, the frontend's given us something broken.
-  assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map");
-  assert(isa<ConstantArray>(Init->getAggregateElement(4)) &&
-         "invalid function list in coverage map");
-  ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4));
-  for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) {
-    Constant *Record = Records->getOperand(I);
-    Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts();
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
+  ConstantArray *Names =
+      cast<ConstantArray>(CoverageNamesVar->getInitializer());
+  for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
+    Constant *NC = Names->getOperand(I);
+    Value *V = NC->stripPointerCasts();
     assert(isa<GlobalVariable>(V) && "Missing reference to function name");
     GlobalVariable *Name = cast<GlobalVariable>(V);
-    // If we have region counters for this name, we've already handled it.
-    auto It = RegionCounters.find(Name);
-    if (It != RegionCounters.end())
-      continue;
-
     // Move the name variable to the right section.
     Name->setSection(getNameSection());
     Name->setAlignment(1);
@@ -183,69 +250,108 @@
 }
 
 /// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) {
-  auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer());
-  StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
-  return ("__llvm_profile_" + VarName + "_" + Name).str();
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+  StringRef NamePrefix = getInstrProfNameVarPrefix();
+  StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
+  return (Prefix + Name).str();
+}
+
+static inline bool shouldRecordFunctionAddr(Function *F) {
+  // Check the linkage.
+  if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
+      !F->hasAvailableExternallyLinkage())
+    return true;
+  // Check for uses of this function other than direct calls or invokes to it.
+  return F->hasAddressTaken();
+}
+
+static inline Comdat *getOrCreateProfileComdat(Module &M,
+                                               InstrProfIncrementInst *Inc) {
+  // COFF format requires a COMDAT section to have a key symbol with the same
+  // name. The COFF linker also requires that the section a COMDAT section is
+  // associated with precede the associating section. For this reason, we must
+  // choose the name var's name as the name of the comdat.
+  StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF()
+                                ? getInstrProfNameVarPrefix()
+                                : getInstrProfComdatPrefix());
+  return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix)));
 }
 
 GlobalVariable *
 InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
-  GlobalVariable *Name = Inc->getName();
-  auto It = RegionCounters.find(Name);
-  if (It != RegionCounters.end())
-    return It->second;
-
-  // Move the name variable to the right section. Make sure it is placed in the
-  // same comdat as its associated function. Otherwise, we may get multiple
-  // counters for the same function in certain cases.
+ GlobalVariable *NamePtr = Inc->getName(); + auto It = ProfileDataMap.find(NamePtr); + PerFunctionProfileData PD; + if (It != ProfileDataMap.end()) { + if (It->second.RegionCounters) + return It->second.RegionCounters; + PD = It->second; + } + + // Move the name variable to the right section. Place them in a COMDAT group + // if the associated function is a COMDAT. This will make sure that + // only one copy of counters of the COMDAT function will be emitted after + // linking. Function *Fn = Inc->getParent()->getParent(); - Name->setSection(getNameSection()); - Name->setAlignment(1); - Name->setComdat(Fn->getComdat()); + Comdat *ProfileVarsComdat = nullptr; + if (Fn->hasComdat()) + ProfileVarsComdat = getOrCreateProfileComdat(*M, Inc); + NamePtr->setSection(getNameSection()); + NamePtr->setAlignment(1); + NamePtr->setComdat(ProfileVarsComdat); uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); LLVMContext &Ctx = M->getContext(); ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); // Create the counters variable. - auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), - Constant::getNullValue(CounterTy), - getVarName(Inc, "counters")); - Counters->setVisibility(Name->getVisibility()); - Counters->setSection(getCountersSection()); - Counters->setAlignment(8); - Counters->setComdat(Fn->getComdat()); - - RegionCounters[Inc->getName()] = Counters; + auto *CounterPtr = + new GlobalVariable(*M, CounterTy, false, NamePtr->getLinkage(), + Constant::getNullValue(CounterTy), + getVarName(Inc, getInstrProfCountersVarPrefix())); + CounterPtr->setVisibility(NamePtr->getVisibility()); + CounterPtr->setSection(getCountersSection()); + CounterPtr->setAlignment(8); + CounterPtr->setComdat(ProfileVarsComdat); // Create data variable. - auto *NameArrayTy = Name->getType()->getPointerElementType(); - auto *Int32Ty = Type::getInt32Ty(Ctx); - auto *Int64Ty = Type::getInt64Ty(Ctx); auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); - auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); - - Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last+1); + Type *DataTypes[] = { + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, + #include "llvm/ProfileData/InstrProfData.inc" + }; auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + + Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) ? 
+ ConstantExpr::getBitCast(Fn, Int8PtrTy) : + ConstantPointerNull::get(Int8PtrTy); + + Constant *Int16ArrayVals[IPVK_Last+1]; + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + Constant *DataVals[] = { - ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), - ConstantInt::get(Int32Ty, NumCounters), - ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), - ConstantExpr::getBitCast(Name, Int8PtrTy), - ConstantExpr::getBitCast(Counters, Int64PtrTy)}; - auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, + #include "llvm/ProfileData/InstrProfData.inc" + }; + auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(), ConstantStruct::get(DataTy, DataVals), - getVarName(Inc, "data")); - Data->setVisibility(Name->getVisibility()); + getVarName(Inc, getInstrProfDataVarPrefix())); + Data->setVisibility(NamePtr->getVisibility()); Data->setSection(getDataSection()); - Data->setAlignment(8); - Data->setComdat(Fn->getComdat()); + Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); + Data->setComdat(ProfileVarsComdat); + + PD.RegionCounters = CounterPtr; + PD.DataVar = Data; + ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. UsedVars.push_back(Data); - return Counters; + return CounterPtr; } void InstrProfiling::emitRegistration() { @@ -253,20 +359,24 @@ void InstrProfiling::emitRegistration() { if (Triple(M->getTargetTriple()).isOSDarwin()) return; + // Use linker script magic to get data/cnts/name start/end. + if (Triple(M->getTargetTriple()).isOSLinux() || + Triple(M->getTargetTriple()).isOSFreeBSD()) + return; + // Construct the function. auto *VoidTy = Type::getVoidTy(M->getContext()); auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext()); auto *RegisterFTy = FunctionType::get(VoidTy, false); auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, - "__llvm_profile_register_functions", M); + getInstrProfRegFuncsName(), M); RegisterF->setUnnamedAddr(true); - if (Options.NoRedZone) - RegisterF->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) RegisterF->addFnAttr(Attribute::NoRedZone); auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false); auto *RuntimeRegisterF = Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, - "__llvm_profile_register_function", M); + getInstrProfRegFuncName(), M); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF)); for (Value *Data : UsedVars) @@ -275,26 +385,27 @@ void InstrProfiling::emitRegistration() { } void InstrProfiling::emitRuntimeHook() { - const char *const RuntimeVarName = "__llvm_profile_runtime"; - const char *const RuntimeUserName = "__llvm_profile_runtime_user"; - // If the module's provided its own runtime, we don't need to do anything. - if (M->getGlobalVariable(RuntimeVarName)) + // We expect the linker to be invoked with -u<hook_var> flag for linux, + // for which case there is no need to emit the user function. + if (Triple(M->getTargetTriple()).isOSLinux()) return; + // If the module's provided its own runtime, we don't need to do anything. + if (M->getGlobalVariable(getInstrProfRuntimeHookVarName())) return; + // Declare an external variable that will pull in the runtime initialization. 
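+  // Referencing that variable from a hidden, no-inline helper (created below)
+  // forces the linker to pull in the runtime object that defines it.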
auto *Int32Ty = Type::getInt32Ty(M->getContext()); auto *Var = new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, - nullptr, RuntimeVarName); + nullptr, getInstrProfRuntimeHookVarName()); // Make a function that uses it. - auto *User = - Function::Create(FunctionType::get(Int32Ty, false), - GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); + auto *User = Function::Create(FunctionType::get(Int32Ty, false), + GlobalValue::LinkOnceODRLinkage, + getInstrProfRuntimeHookVarUseFuncName(), M); User->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - User->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone); User->setVisibility(GlobalValue::HiddenVisibility); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); @@ -330,26 +441,23 @@ void InstrProfiling::emitUses() { LLVMUsed = new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, ConstantArray::get(ATy, MergedVars), "llvm.used"); - LLVMUsed->setSection("llvm.metadata"); } void InstrProfiling::emitInitialization() { std::string InstrProfileOutput = Options.InstrProfileOutput; - Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); - if (!RegisterF && InstrProfileOutput.empty()) - return; + Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName()); + if (!RegisterF && InstrProfileOutput.empty()) return; // Create the initialization function. auto *VoidTy = Type::getVoidTy(M->getContext()); - auto *F = - Function::Create(FunctionType::get(VoidTy, false), - GlobalValue::InternalLinkage, "__llvm_profile_init", M); + auto *F = Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, + getInstrProfInitFuncName(), M); F->setUnnamedAddr(true); F->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - F->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) F->addFnAttr(Attribute::NoRedZone); // Add the basic block and the necessary calls. IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); @@ -358,9 +466,8 @@ void InstrProfiling::emitInitialization() { if (!InstrProfileOutput.empty()) { auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); - auto *SetNameF = - Function::Create(SetNameTy, GlobalValue::ExternalLinkage, - "__llvm_profile_override_default_filename", M); + auto *SetNameF = Function::Create(SetNameTy, GlobalValue::ExternalLinkage, + getInstrProfFileOverriderFuncName(), M); // Create variable for profile name. Constant *ProfileNameConst = diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index 2750585..a05a5fa 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -12,12 +12,47 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm-c/Initialization.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" using namespace llvm; +/// Moves I before IP. Returns new insert point. +static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) { + // If I is IP, move the insert point down. + if (I == IP) + return ++IP; + // Otherwise, move I before IP and return IP. 
+ I->moveBefore(&*IP); + return IP; +} + +/// Instrumentation passes often insert conditional checks into entry blocks. +/// Call this function before splitting the entry block to move instructions +/// that must remain in the entry block up before the split point. Static +/// allocas and llvm.localescape calls, for example, must remain in the entry +/// block. +BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, + BasicBlock::iterator IP) { + assert(&BB.getParent()->getEntryBlock() == &BB); + for (auto I = IP, E = BB.end(); I != E; ++I) { + bool KeepInEntry = false; + if (auto *AI = dyn_cast<AllocaInst>(I)) { + if (AI->isStaticAlloca()) + KeepInEntry = true; + } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == llvm::Intrinsic::localescape) + KeepInEntry = true; + } + if (KeepInEntry) + IP = moveBeforeInsertPoint(I, IP); + } + return IP; +} + /// initializeInstrumentation - Initialize all passes in the Instrumentation /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { @@ -25,6 +60,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeGCOVProfilerPass(Registry); + initializePGOInstrumentationGenPass(Registry); + initializePGOInstrumentationUsePass(Registry); initializeInstrProfilingPass(Registry); initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 286a563..34aaa7f 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -148,7 +148,7 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call", cl::desc("poison uninitialized stack variables with a call"), cl::Hidden, cl::init(false)); static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern", - cl::desc("poison uninitialized stack variables with the given patter"), + cl::desc("poison uninitialized stack variables with the given pattern"), cl::Hidden, cl::init(0xff)); static cl::opt<bool> ClPoisonUndef("msan-poison-undef", cl::desc("poison undef temps"), @@ -222,10 +222,17 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = { // x86_64 Linux static const MemoryMapParams Linux_X86_64_MemoryMapParams = { +#ifdef MSAN_LINUX_X86_64_OLD_MAPPING 0x400000000000, // AndMask 0, // XorMask (not used) 0, // ShadowBase (not used) 0x200000000000, // OriginBase +#else + 0, // AndMask (not used) + 0x500000000000, // XorMask + 0, // ShadowBase (not used) + 0x100000000000, // OriginBase +#endif }; // mips64 Linux @@ -244,6 +251,14 @@ static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = { 0x1C0000000000, // OriginBase }; +// aarch64 Linux +static const MemoryMapParams Linux_AArch64_MemoryMapParams = { + 0, // AndMask (not used) + 0x06000000000, // XorMask + 0, // ShadowBase (not used) + 0x01000000000, // OriginBase +}; + // i386 FreeBSD static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { 0x000180000000, // AndMask @@ -266,15 +281,20 @@ static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { }; static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { - NULL, + nullptr, &Linux_MIPS64_MemoryMapParams, }; static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = { - NULL, + nullptr, &Linux_PowerPC64_MemoryMapParams, }; +static
const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = { + nullptr, + &Linux_AArch64_MemoryMapParams, +}; + static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { &FreeBSD_I386_MemoryMapParams, &FreeBSD_X86_64_MemoryMapParams, @@ -353,8 +373,9 @@ class MemorySanitizer : public FunctionPass { friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; friend struct VarArgMIPS64Helper; + friend struct VarArgAArch64Helper; }; -} // namespace +} // anonymous namespace char MemorySanitizer::ID = 0; INITIALIZE_PASS(MemorySanitizer, "msan", @@ -377,7 +398,6 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M, GlobalValue::PrivateLinkage, StrConst, ""); } - /// \brief Insert extern declaration of runtime-provided functions and globals. void MemorySanitizer::initializeCallbacks(Module &M) { // Only do this once. @@ -496,6 +516,10 @@ bool MemorySanitizer::doInitialization(Module &M) { case Triple::ppc64le: MapParams = Linux_PowerPC_MemoryMapParams.bits64; break; + case Triple::aarch64: + case Triple::aarch64_be: + MapParams = Linux_ARM_MemoryMapParams.bits64; + break; default: report_fatal_error("unsupported architecture"); } @@ -668,7 +692,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { const DataLayout &DL = F.getParent()->getDataLayout(); unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); - if (isa<StructType>(Shadow->getType())) { + if (Shadow->getType()->isAggregateType()) { paintOrigin(IRB, updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB, Alignment), StoreSize, OriginAlignment); @@ -697,7 +721,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Cmp = IRB.CreateICmpNE( ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( - Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); IRBuilder<> IRBNew(CheckTerm); paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), getOriginPtr(Addr, IRBNew, Alignment), StoreSize, @@ -893,16 +917,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Offset = (Addr & ~AndMask) ^ XorMask Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) { + Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy); + uint64_t AndMask = MS.MapParams->AndMask; - assert(AndMask != 0 && "AndMask shall be specified"); - Value *OffsetLong = - IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, ~AndMask)); + if (AndMask) + OffsetLong = + IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask)); uint64_t XorMask = MS.MapParams->XorMask; - if (XorMask != 0) - OffsetLong = IRB.CreateXor(OffsetLong, - ConstantInt::get(MS.IntptrTy, XorMask)); + if (XorMask) + OffsetLong = + IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask)); return OffsetLong; } @@ -1339,6 +1364,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitBitCastInst(BitCastInst &I) { + // Special case: if this is the bitcast (there is exactly 1 allowed) between + // a musttail call and a ret, don't instrument. New instructions are not + // allowed after a musttail call. 
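// A hedged illustration (not from this change) of the only IR shape the
// verifier allows here -- a musttail call, at most one bitcast of its
// result, then the ret:
//   %res = musttail call i8* @callee(i8* %arg)
//   %cst = bitcast i8* %res to i8*
//   ret i8* %cst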
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0))) + if (CI->isMustTailCall()) + return; IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I))); setOrigin(&I, getOrigin(&I, 0)); @@ -1570,18 +1601,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Type *EltTy = Ty->getSequentialElementType(); SmallVector<Constant *, 16> Elements; for (unsigned Idx = 0; Idx < NumElements; ++Idx) { - ConstantInt *Elt = - dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - Elements.push_back(ConstantInt::get(EltTy, V2)); + if (ConstantInt *Elt = + dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } else { + Elements.push_back(ConstantInt::get(EltTy, 1)); + } } ShadowMul = ConstantVector::get(Elements); } else { - ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - ShadowMul = ConstantInt::get(Elt->getType(), V2); + if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Ty, V2); + } else { + ShadowMul = ConstantInt::get(Ty, 1); + } } IRBuilder<> IRB(&I); @@ -1730,25 +1767,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Instrument signed relational comparisons. /// - /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by - /// propagating the highest bit of the shadow. Everything else is delegated - /// to handleShadowOr(). + /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest + /// bit of the shadow. Everything else is delegated to handleShadowOr(). 
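/// A hedged plain-C++ model of that propagation (illustrative; the function
/// name is invented): x < 0 depends only on the sign bit of x, so the result
/// is poisoned exactly when the sign bit of x's shadow is set, which is what
/// the CreateICmpSLT(shadow, clean shadow) below computes:
///   static bool exampleResultPoisoned(int32_t ShadowOfX) {
///     return ShadowOfX < 0; // tests the sign bit of the shadow
///   }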
void handleSignedRelationalComparison(ICmpInst &I) { - Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); - Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); - Value* op = nullptr; - CmpInst::Predicate pre = I.getPredicate(); - if (constOp0 && constOp0->isNullValue() && - (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { - op = I.getOperand(1); - } else if (constOp1 && constOp1->isNullValue() && - (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) { + Constant *constOp; + Value *op = nullptr; + CmpInst::Predicate pre; + if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) { op = I.getOperand(0); + pre = I.getPredicate(); + } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) { + op = I.getOperand(1); + pre = I.getSwappedPredicate(); + } else { + handleShadowOr(I); + return; } - if (op) { + + if ((constOp->isNullValue() && + (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) || + (constOp->isAllOnesValue() && + (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) { IRBuilder<> IRB(&I); - Value* Shadow = - IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt"); + Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), + "_msprop_icmp_s"); setShadow(&I, Shadow); setOrigin(&I, getOrigin(op)); } else { @@ -1860,25 +1902,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { VAHelper->visitVACopyInst(I); } - enum IntrinsicKind { - IK_DoesNotAccessMemory, - IK_OnlyReadsMemory, - IK_WritesMemory - }; - - static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) { - const int DoesNotAccessMemory = IK_DoesNotAccessMemory; - const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory; - const int OnlyReadsMemory = IK_OnlyReadsMemory; - const int OnlyAccessesArgumentPointees = IK_WritesMemory; - const int UnknownModRefBehavior = IK_WritesMemory; -#define GET_INTRINSIC_MODREF_BEHAVIOR -#define ModRefBehavior IntrinsicKind -#include "llvm/IR/Intrinsics.gen" -#undef ModRefBehavior -#undef GET_INTRINSIC_MODREF_BEHAVIOR - } - /// \brief Handle vector store-like intrinsics. /// /// Instrument intrinsics that look like a simple SIMD store: writes memory, @@ -1978,17 +2001,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 0) return false; - Intrinsic::ID iid = I.getIntrinsicID(); - IntrinsicKind IK = getIntrinsicKind(iid); - bool OnlyReadsMemory = IK == IK_OnlyReadsMemory; - bool WritesMemory = IK == IK_WritesMemory; - assert(!(OnlyReadsMemory && WritesMemory)); - if (NumArgOperands == 2 && I.getArgOperand(0)->getType()->isPointerTy() && I.getArgOperand(1)->getType()->isVectorTy() && I.getType()->isVoidTy() && - WritesMemory) { + !I.onlyReadsMemory()) { // This looks like a vector store. return handleVectorStoreIntrinsic(I); } @@ -1996,12 +2013,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 1 && I.getArgOperand(0)->getType()->isPointerTy() && I.getType()->isVectorTy() && - OnlyReadsMemory) { + I.onlyReadsMemory()) { // This looks like a vector load. return handleVectorLoadIntrinsic(I); } - if (!OnlyReadsMemory && !WritesMemory) + if (I.doesNotAccessMemory()) if (maybeHandleSimpleNomemIntrinsic(I)) return true; @@ -2493,13 +2510,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Now, get the shadow for the RetVal. if (!I.getType()->isSized()) return; + // Don't emit the epilogue for musttail call returns. 
+ if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return; IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); - Instruction *NextInsn = nullptr; + BasicBlock::iterator NextInsn; if (CS.isCall()) { - NextInsn = I.getNextNode(); + NextInsn = ++I.getIterator(); + assert(NextInsn != I.getParent()->end()); } else { BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); if (!NormalDest->getSinglePredecessor()) { @@ -2511,10 +2531,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return; } NextInsn = NormalDest->getFirstInsertionPt(); - assert(NextInsn && + assert(NextInsn != NormalDest->end() && "Could not find insertion point for retval shadow load"); } - IRBuilder<> IRBAfter(NextInsn); + IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter), kShadowTLSAlignment, "_msret"); @@ -2523,10 +2543,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter))); } + bool isAMustTailRetVal(Value *RetVal) { + if (auto *I = dyn_cast<BitCastInst>(RetVal)) { + RetVal = I->getOperand(0); + } + if (auto *I = dyn_cast<CallInst>(RetVal)) { + return I->isMustTailCall(); + } + return false; + } + void visitReturnInst(ReturnInst &I) { IRBuilder<> IRB(&I); Value *RetVal = I.getReturnValue(); if (!RetVal) return; + // Don't emit the epilogue for musttail call returns. + if (isAMustTailRetVal(RetVal)) return; Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); if (CheckReturnValue) { insertShadowCheck(RetVal, &I); @@ -2653,6 +2685,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, getCleanOrigin()); } + void visitCatchSwitchInst(CatchSwitchInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitFuncletPadInst(FuncletPadInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + void visitGetElementPtrInst(GetElementPtrInst &I) { handleShadowOr(I); } @@ -2696,6 +2738,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Nothing to do here. } + void visitCleanupReturnInst(CleanupReturnInst &CRI) { + DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n"); + // Nothing to do here. + } + + void visitCatchReturnInst(CatchReturnInst &CRI) { + DEBUG(dbgs() << "CatchReturn: " << CRI << "\n"); + // Nothing to do here. + } + void visitInstruction(Instruction &I) { // Everything else: stop propagating and check for poisoned shadow. if (ClDumpStrictInstructions) @@ -2808,6 +2860,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVAStartInst(VAStartInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); @@ -2820,6 +2874,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVACopyInst(VACopyInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); @@ -2979,6 +3035,242 @@ struct VarArgMIPS64Helper : public VarArgHelper { } }; + +/// \brief AArch64-specific implementation of VarArgHelper. 
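/// A hedged restatement (illustrative) of the va_arg shadow TLS layout that
/// the offset constants below imply:
///   [0, 56)    shadow of GP-register arguments (AArch64GrBeg/EndOffset)
///   [56, 64)   padding so the VR area is 16-byte aligned
///   [64, 192)  shadow of FP/SIMD-register arguments (AArch64VrBeg/EndOffset)
///   [192, ...) shadow of stack-passed (overflow) arguments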
+struct VarArgAArch64Helper : public VarArgHelper { + static const unsigned kAArch64GrArgSize = 56; + static const unsigned kAArch64VrArgSize = 128; + + static const unsigned AArch64GrBegOffset = 0; + static const unsigned AArch64GrEndOffset = kAArch64GrArgSize; + // Make VR space aligned to 16 bytes. + static const unsigned AArch64VrBegOffset = AArch64GrEndOffset + 8; + static const unsigned AArch64VrEndOffset = AArch64VrBegOffset + + kAArch64VrArgSize; + static const unsigned AArch64VAEndOffset = AArch64VrEndOffset; + + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy; + Value *VAArgOverflowSize; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + VarArgAArch64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgOverflowSize(nullptr) {} + + enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + + ArgKind classifyArgument(Value* arg) { + Type *T = arg->getType(); + if (T->isFPOrFPVectorTy()) + return AK_FloatingPoint; + if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) + || (T->isPointerTy())) + return AK_GeneralPurpose; + return AK_Memory; + } + + // The instrumentation stores the argument shadow in a non-ABI-specific + // format because it does not know which argument is named (since Clang, + // as in the x86_64 case, lowers the va_args in the frontend and this pass + // only sees the low-level code that deals with va_list internals). + // The first seven GR registers are saved in the first 56 bytes of the + // va_arg TLS array, followed by the first 8 FP/SIMD registers, and then + // the remaining arguments. + // Using a constant offset within the va_arg TLS array allows a fast copy + // in the finalize instrumentation. + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { + unsigned GrOffset = AArch64GrBegOffset; + unsigned VrOffset = AArch64VrBegOffset; + unsigned OverflowOffset = AArch64VAEndOffset; + + const DataLayout &DL = F.getParent()->getDataLayout(); + for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset) + AK = AK_Memory; + Value *Base; + switch (AK) { + case AK_GeneralPurpose: + Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset); + GrOffset += 8; + break; + case AK_FloatingPoint: + Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset); + VrOffset += 16; + break; + case AK_Memory: + uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); + Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + OverflowOffset += RoundUpToAlignment(ArgSize, 8); + break; + } + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + } + Constant *OverflowSize = + ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); + IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); + } + + /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants (size of va_list). + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */32, /* alignment */8, false); + } + + void visitVACopyInst(VACopyInst &I) override { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants (size of va_list). + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */32, /* alignment */8, false); + } + + // Retrieve a va_list field of 'void*' size. + Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { + Value *SaveAreaPtrPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, offset)), + Type::getInt64PtrTy(*MS.C)); + return IRB.CreateLoad(SaveAreaPtrPtr); + } + + // Retrieve a va_list field of 'int' size. + Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { + Value *SaveAreaPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, offset)), + Type::getInt32PtrTy(*MS.C)); + Value *SaveArea32 = IRB.CreateLoad(SaveAreaPtr); + return IRB.CreateSExt(SaveArea32, MS.IntptrTy); + } + + void finalizeInstrumentation() override { + assert(!VAArgOverflowSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); + Value *CopySize = + IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), + VAArgOverflowSize); + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); + } + + Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); + Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize); + + // Instrument va_start, copy va_list shadow from the backup copy of + // the TLS contents. + for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { + CallInst *OrigInst = VAStartInstrumentationList[i]; + IRBuilder<> IRB(OrigInst->getNextNode()); + + Value *VAListTag = OrigInst->getArgOperand(0); + + // The variadic ABI for AArch64 creates two areas to save the incoming + // argument registers (one for the 64-bit general registers xn-x7 and + // another for the 128-bit FP/SIMD registers vn-v7). + // We then need to propagate the shadow arguments to both regions + // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'. + // The remaining arguments are saved on the shadow for 'va::stack'.
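// (A hedged worked example of the adjustment described next: since AAPCS64
// defines __gr_offs as '0 - ((8 - named_gr) * 8)', one named GP argument
// gives __gr_offs = -56, so the copy starts at TLS offset 56 - 56 = 0 and
// takes the whole GP shadow area; three named arguments give an offset of
// 56 - 40 = 16, skipping the shadow bytes that belong to named arguments.)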
+ // One caveat is that only the non-named arguments need to be + // propagated; however, the call site instrumentation saves 'all' the + // arguments. So to copy the shadow values from the va_arg TLS array + // we need to adjust the offset for both GR and VR fields based on + // the __{gr,vr}_offs value (since they are stored based on the incoming + // named arguments). + + // Read the stack pointer from the va_list. + Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0); + + // Read both the __gr_top and __gr_off and add them up. + Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8); + Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24); + + Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea); + + // Read both the __vr_top and __vr_off and add them up. + Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16); + Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28); + + Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea); + + // We do not know how many named arguments are being used and, at the + // call site, all the arguments were saved. Since __gr_off is defined as + // '0 - ((8 - named_gr) * 8)', the idea is to just propagate the variadic + // arguments by ignoring the shadow bytes of the named arguments. + Value *GrRegSaveAreaShadowPtrOff = + IRB.CreateAdd(GrArgSize, GrOffSaveArea); + + Value *GrRegSaveAreaShadowPtr = + MSV.getShadowPtr(GrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + GrRegSaveAreaShadowPtrOff); + Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff); + + IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8); + + // Again, but for FP/SIMD values. + Value *VrRegSaveAreaShadowPtrOff = + IRB.CreateAdd(VrArgSize, VrOffSaveArea); + + Value *VrRegSaveAreaShadowPtr = + MSV.getShadowPtr(VrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *VrSrcPtr = IRB.CreateInBoundsGEP( + IRB.getInt8Ty(), + IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + IRB.getInt32(AArch64VrBegOffset)), + VrRegSaveAreaShadowPtrOff); + Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); + + IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8); + + // And finally for the remaining arguments. + Value *StackSaveAreaShadowPtr = + MSV.getShadowPtr(StackSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *StackSrcPtr = + IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + IRB.getInt32(AArch64VAEndOffset)); + + IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr, + VAArgOverflowSize, 16); + } + } +}; + /// \brief A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper { VarArgNoOpHelper(Function &F, MemorySanitizer &MS, @@ -3003,11 +3295,13 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, else if (TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el) return new VarArgMIPS64Helper(Func, Msan, Visitor); + else if (TargetTriple.getArch() == llvm::Triple::aarch64) + return new VarArgAArch64Helper(Func, Msan, Visitor); else return new VarArgNoOpHelper(Func, Msan, Visitor); } -} // namespace +} // anonymous namespace bool MemorySanitizer::runOnFunction(Function &F) { if (&F == MsanCtorFunction) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp new file mode 100644 index 0000000..4b59b93 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -0,0 +1,718 @@ +//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements PGO instrumentation using a minimum spanning tree based +// on the following paper: +// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points +// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13, +// Issue 3, pp 313-322 +// The idea of the algorithm is based on the fact that for each node (except +// for the entry and exit), the sum of incoming edge counts equals the sum of +// outgoing edge counts. The count of an edge on the spanning tree can be +// derived from the counts of edges not on the spanning tree. Knuth proves +// this method instruments the minimum number of edges. +// +// The minimal spanning tree here is actually a maximum weight tree -- on-tree +// edges have higher frequencies (more likely to execute). The idea is to +// instrument those less frequently executed edges to reduce the runtime +// overhead of instrumented binaries. +// +// This file contains two passes: +// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge +// count profile, and +// (2) Pass PGOInstrumentationUse which reads the edge count profile and +// annotates the branch weights. +// To get precise counter information, these two passes need to be invoked at +// the same compilation point (so they see the same IR). For pass +// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For +// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc, and +// the profile is opened at the module level and passed to each PGOUseFunc +// instance. The shared code for PGOInstrumentationGen and PGOInstrumentationUse +// is put in class FuncPGOInstrumentation. +// +// Class PGOEdge represents a CFG edge and some auxiliary information. Class +// BBInfo contains auxiliary information for each BB. These two classes are used +// in pass PGOInstrumentationGen. Classes PGOUseEdge and UseBBInfo are the +// derived classes of PGOEdge and BBInfo, respectively; they contain extra data +// structures used in populating the profile counters. +// The MST implementation is in class CFGMST (CFGMST.h).
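// A hedged worked example of the scheme above (illustrative, not from the
// sources): for a diamond CFG  Entry -> {A, B} -> Exit  plus the fake edge
// Exit -> Entry that makes counts flow-conserving at every node, |E| = 5 and
// |V| = 4, so only |E| - |V| + 1 = 2 off-tree edges need counters. With
// A->Exit = m and B->Exit = n instrumented, conservation yields
// Entry->A = m, Entry->B = n, and a function entry count of m + n.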
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "CFGMST.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProfReader.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/JamCRC.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "pgo-instrumentation" + +STATISTIC(NumOfPGOInstrument, "Number of edges instrumented."); +STATISTIC(NumOfPGOEdge, "Number of edges."); +STATISTIC(NumOfPGOBB, "Number of basic-blocks."); +STATISTIC(NumOfPGOSplit, "Number of critical edge splits."); +STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts."); +STATISTIC(NumOfPGOMismatch, "Number of functions having mismatched profiles."); +STATISTIC(NumOfPGOMissing, "Number of functions without profile."); + +// Command line option to specify the file to read the profile from. This is +// mainly used for testing. +static cl::opt<std::string> + PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of the profile data file. This is " + "mainly for test purposes.")); + +namespace { +class PGOInstrumentationGen : public ModulePass { +public: + static char ID; + + PGOInstrumentationGen() : ModulePass(ID) { + initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationGenPass"; + } + +private: + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + } +}; + +class PGOInstrumentationUse : public ModulePass { +public: + static char ID; + + // Provide the profile filename as the parameter.
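// (A hedged usage sketch for the -pgo-test-profile-file flag above; the pass
// names come from the INITIALIZE_PASS registrations below, but the exact opt
// invocation is illustrative, not taken from this change:
//   opt -pgo-instr-gen foo.bc -o foo.inst.bc
//   opt -pgo-instr-use -pgo-test-profile-file=foo.profdata foo.bc -o foo.opt.bc
// The constructor that follows accepts the same filename programmatically.)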
+ PGOInstrumentationUse(std::string Filename = "") + : ModulePass(ID), ProfileFileName(Filename) { + if (!PGOTestProfileFile.empty()) + ProfileFileName = PGOTestProfileFile; + initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationUsePass"; + } + +private: + std::string ProfileFileName; + std::unique_ptr<IndexedInstrProfReader> PGOReader; + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + } +}; +} // end anonymous namespace + +char PGOInstrumentationGen::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) + +ModulePass *llvm::createPGOInstrumentationGenPass() { + return new PGOInstrumentationGen(); +} + +char PGOInstrumentationUse::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) + +ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) { + return new PGOInstrumentationUse(Filename.str()); +} + +namespace { +/// \brief An MST-based instrumentation for PGO +/// +/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO +/// at the function level. +struct PGOEdge { + // This class implements the CFG edges. Note the CFG can be a multi-graph. + // So there might be multiple edges with the same SrcBB and DestBB. + const BasicBlock *SrcBB; + const BasicBlock *DestBB; + uint64_t Weight; + bool InMST; + bool Removed; + bool IsCritical; + PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false), + IsCritical(false) {} + // Return the information string of an edge. + const std::string infoString() const { + return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") + + (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str(); + } +}; + +// This class stores the auxiliary information for each BB. +struct BBInfo { + BBInfo *Group; + uint32_t Index; + uint32_t Rank; + + BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {} + + // Return the information string of this object. + const std::string infoString() const { + return (Twine("Index=") + Twine(Index)).str(); + } +}; + +// This class implements the instrumentation code shared by +// PGOInstrumentationGen and PGOInstrumentationUse (see the file comment). +template <class Edge, class BBInfo> class FuncPGOInstrumentation { +private: + Function &F; + void computeCFGHash(); + +public: + std::string FuncName; + GlobalVariable *FuncNameVar; + // CFG hash value for this function. + uint64_t FunctionHash; + + // The Minimum Spanning Tree of the function's CFG. + CFGMST<Edge, BBInfo> MST; + + // Given an edge, find the BB that will be instrumented. + // Return nullptr if there is no BB to be instrumented. + BasicBlock *getInstrBB(Edge *E); + + // Return the auxiliary BB information. + BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); } + + // Dump edges and BB information.
+ void dumpInfo(std::string Str = "") const { + MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " + + Twine(FunctionHash) + "\t" + Str); + } + + FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false, + BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), FunctionHash(0), MST(F, BPI, BFI) { + FuncName = getPGOFuncName(F); + computeCFGHash(); + DEBUG(dumpInfo("after CFGMST")); + + NumOfPGOBB += MST.BBInfos.size(); + for (auto &E : MST.AllEdges) { + if (E->Removed) + continue; + NumOfPGOEdge++; + if (!E->InMST) + NumOfPGOInstrument++; + } + + if (CreateGlobalVar) + FuncNameVar = createPGOFuncNameVar(F, FuncName); + }; +}; + +// Compute the hash value for the CFG: the lower 32 bits are the CRC32 of the +// index values of the BBs in the CFG. The higher 32 bits record the number of +// edges. +template <class Edge, class BBInfo> +void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { + std::vector<char> Indexes; + JamCRC JC; + for (auto &BB : F) { + const TerminatorInst *TI = BB.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Index = getBBInfo(Succ).Index; + for (int J = 0; J < 4; J++) + Indexes.push_back((char)(Index >> (J * 8))); + } + } + JC.update(Indexes); + FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); +} + +// Given a CFG edge E to be instrumented, find which BB to place the +// instrumentation code in. The function will split the critical edge if +// necessary. +template <class Edge, class BBInfo> +BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) { + if (E->InMST || E->Removed) + return nullptr; + + BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); + BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == nullptr) + return DestBB; + if (DestBB == nullptr) + return SrcBB; + + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + TerminatorInst *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1) + return SrcBB; + if (!E->IsCritical) + return DestBB; + + // For a critical edge, we have to split. Instrument the newly + // created BB. + NumOfPGOSplit++; + DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> " + << getBBInfo(DestBB).Index << "\n"); + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + assert(InstrBB && "Critical edge is not split"); + + E->Removed = true; + return InstrBB; +} + +// Visit all edges and instrument the edges not in the MST. +// Critical edges will be split.
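// Each counter placed by the function below is a single intrinsic call of
// the following shape (a hedged sketch of the emitted IR; operand names are
// illustrative):
//   call void @llvm.instrprof.increment(i8* <FuncNameVar>, i64 <FunctionHash>,
//                                       i32 <NumCounters>, i32 <Index>)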
+static void instrumentOneFunc(Function &F, Module *M, + BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI) { + unsigned NumCounters = 0; + FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, true, BPI, BFI); + for (auto &E : FuncInfo.MST.AllEdges) { + if (!E->InMST && !E->Removed) + NumCounters++; + } + + uint32_t I = 0; + for (auto &E : FuncInfo.MST.AllEdges) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get()); + if (!InstrBB) + continue; + + IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt()); + assert(Builder.GetInsertPoint() != InstrBB->end() && + "Cannot get the Instrumentation point"); + Type *I8PtrTy = Type::getInt8PtrTy(M->getContext()); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), + {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); + } +} + +// This class represents a CFG edge in profile use compilation. +struct PGOUseEdge : public PGOEdge { + bool CountValid; + uint64_t CountValue; + PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {} + + // Set edge count value + void setEdgeCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string for this object. + const std::string infoString() const { + if (!CountValid) + return PGOEdge::infoString(); + return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +typedef SmallVector<PGOUseEdge *, 2> DirectEdges; + +// This class stores the auxiliary information for each BB. +struct UseBBInfo : public BBInfo { + uint64_t CountValue; + bool CountValid; + int32_t UnknownCountInEdge; + int32_t UnknownCountOutEdge; + DirectEdges InEdges; + DirectEdges OutEdges; + UseBBInfo(unsigned IX) + : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + UseBBInfo(unsigned IX, uint64_t C) + : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + + // Set the profile count value for this BB. + void setBBInfoCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string of this object. + const std::string infoString() const { + if (!CountValid) + return BBInfo::infoString(); + return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +// Sum up the count values for all the edges. +static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) { + uint64_t Total = 0; + for (auto &E : Edges) { + if (E->Removed) + continue; + Total += E->CountValue; + } + return Total; +} + +class PGOUseFunc { +private: + Function &F; + Module *M; + // This member stores the shared information with class PGOGenFunc. + FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; + + // Return the auxiliary BB information. + UseBBInfo &getBBInfo(const BasicBlock *BB) const { + return FuncInfo.getBBInfo(BB); + } + + // The maximum count value in the profile. This is only used in PGO use + // compilation. + uint64_t ProgramMaxCount; + + // Find the Instrumented BB and set the value. + void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile); + + // Set the edge counter value for the unknown edge -- there should be only + // one unknown edge. 
+ void setEdgeCount(DirectEdges &Edges, uint64_t Value); + + // Return the FuncName string. + const std::string getFuncName() const { return FuncInfo.FuncName; } + + // Set the hot/cold inline hints based on the count values. + // FIXME: This function should be removed once the functionality in + // the inliner is implemented. + void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { + if (ProgramMaxCount == 0) + return; + // Threshold of the hot functions. + const BranchProbability HotFunctionThreshold(1, 100); + // Threshold of the cold functions. + const BranchProbability ColdFunctionThreshold(2, 10000); + if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::InlineHint); + else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::Cold); + } + +public: + PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {} + + // Read counts for the instrumented BB from the profile. + bool readCounters(IndexedInstrProfReader *PGOReader); + + // Populate the counts for all BBs. + void populateCounters(); + + // Set the branch weights based on the count values. + void setBranchWeights(); +}; + +// Visit all the edges and assign the count values for the instrumented +// edges and the BBs. +void PGOUseFunc::setInstrumentedCounts( + const std::vector<uint64_t> &CountFromProfile) { + + // Use a worklist as we will update the vector during the iteration. + std::vector<PGOUseEdge *> WorkList; + for (auto &E : FuncInfo.MST.AllEdges) + WorkList.push_back(E.get()); + + uint32_t I = 0; + for (auto &E : WorkList) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E); + if (!InstrBB) + continue; + uint64_t CountValue = CountFromProfile[I++]; + if (!E->Removed) { + getBBInfo(InstrBB).setBBInfoCount(CountValue); + E->setEdgeCount(CountValue); + continue; + } + + // Need to add two new edges. + BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); + BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); + // Add new edge of SrcBB->InstrBB. + PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0); + NewEdge.setEdgeCount(CountValue); + // Add new edge of InstrBB->DestBB. + PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0); + NewEdge1.setEdgeCount(CountValue); + NewEdge1.InMST = true; + getBBInfo(InstrBB).setBBInfoCount(CountValue); + } +} + +// Set the count value for the unknown edge. There should be one and only one +// unknown edge in the Edges vector. +void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { + for (auto &E : Edges) { + if (E->CountValid) + continue; + E->setEdgeCount(Value); + + getBBInfo(E->SrcBB).UnknownCountOutEdge--; + getBBInfo(E->DestBB).UnknownCountInEdge--; + return; + } + llvm_unreachable("Cannot find the unknown count edge"); +} + +// Read the profile from ProfileFileName and assign the values to the +// instrumented BBs and edges. This function also updates ProgramMaxCount. +// Return true if the profile is successfully read, and false on errors.
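// (A hedged worked example, with invented numbers, for the thresholds in
// applyFunctionAttributes above: if ProgramMaxCount = 1,000,000, a function
// gets inlinehint when EntryCount >= 1/100 * 1,000,000 = 10,000, and cold
// when MaxCount <= 2/10000 * 1,000,000 = 200.)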
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) { + auto &Ctx = M->getContext(); + ErrorOr<InstrProfRecord> Result = + PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); + if (std::error_code EC = Result.getError()) { + if (EC == instrprof_error::unknown_function) + NumOfPGOMissing++; + else if (EC == instrprof_error::hash_mismatch || + EC == llvm::instrprof_error::malformed) + NumOfPGOMismatch++; + + std::string Msg = EC.message() + std::string(" ") + F.getName().str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + return false; + } + std::vector<uint64_t> &CountFromProfile = Result.get().Counts; + + NumOfPGOFunc++; + DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + uint64_t ValueSum = 0; + for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { + DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n"); + ValueSum += CountFromProfile[I]; + } + + DEBUG(dbgs() << "SUM = " << ValueSum << "\n"); + + getBBInfo(nullptr).UnknownCountOutEdge = 2; + getBBInfo(nullptr).UnknownCountInEdge = 2; + + setInstrumentedCounts(CountFromProfile); + ProgramMaxCount = PGOReader->getMaximumFunctionCount(); + return true; +} + +// Populate the counters from instrumented BBs to all BBs. +// At the end of this operation, all BBs should have a valid count value. +void PGOUseFunc::populateCounters() { + // First, set up the Count variable for all BBs. + for (auto &E : FuncInfo.MST.AllEdges) { + if (E->Removed) + continue; + + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + UseBBInfo &SrcInfo = getBBInfo(SrcBB); + UseBBInfo &DestInfo = getBBInfo(DestBB); + SrcInfo.OutEdges.push_back(E.get()); + DestInfo.InEdges.push_back(E.get()); + SrcInfo.UnknownCountOutEdge++; + DestInfo.UnknownCountInEdge++; + + if (!E->CountValid) + continue; + DestInfo.UnknownCountInEdge--; + SrcInfo.UnknownCountOutEdge--; + } + + bool Changes = true; + unsigned NumPasses = 0; + while (Changes) { + NumPasses++; + Changes = false; + + // For efficient traversal, it's better to start from the end as most + // of the instrumented edges are at the end. + for (auto &BB : reverse(F)) { + UseBBInfo &Count = getBBInfo(&BB); + if (!Count.CountValid) { + if (Count.UnknownCountOutEdge == 0) { + Count.CountValue = sumEdgeCount(Count.OutEdges); + Count.CountValid = true; + Changes = true; + } else if (Count.UnknownCountInEdge == 0) { + Count.CountValue = sumEdgeCount(Count.InEdges); + Count.CountValid = true; + Changes = true; + } + } + if (Count.CountValid) { + if (Count.UnknownCountOutEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges); + setEdgeCount(Count.OutEdges, Total); + Changes = true; + } + if (Count.UnknownCountInEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges); + setEdgeCount(Count.InEdges, Total); + Changes = true; + } + } + } + } + + DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n"); + // Assert every BB has a valid counter. + uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; + uint64_t FuncMaxCount = FuncEntryCount; + for (auto &BB : F) { + assert(getBBInfo(&BB).CountValid && "BB count is not valid"); + uint64_t Count = getBBInfo(&BB).CountValue; + if (Count > FuncMaxCount) + FuncMaxCount = Count; + } + applyFunctionAttributes(FuncEntryCount, FuncMaxCount); + + DEBUG(FuncInfo.dumpInfo("after reading profile.")); +} + +// Assign the scaled count values to the BBs with multiple out edges.
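// (A hedged example of the scaling below, assuming calculateCountScale
// chooses a divisor that fits MaxCount into 32 bits: out-edge counts of
// 6,000,000,000 and 2,000,000,000 do not fit in a uint32_t, so both are
// divided by the same scale before being recorded as !prof branch weights,
// preserving the 3:1 ratio the optimizers consume.)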
+void PGOUseFunc::setBranchWeights() { + // Generate MD_prof metadata for every branch instruction. + DEBUG(dbgs() << "\nSetting branch weights.\n"); + MDBuilder MDB(M->getContext()); + for (auto &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + if (getBBInfo(&BB).CountValue == 0) + continue; + + // We have a non-zero Branch BB. + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + unsigned Size = BBCountInfo.OutEdges.size(); + SmallVector<unsigned, 2> EdgeCounts(Size, 0); + uint64_t MaxCount = 0; + for (unsigned s = 0; s < Size; s++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == 0) + continue; + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector<unsigned, 4> Weights; + for (const auto &ECI : EdgeCounts) + Weights.push_back(scaleBranchCount(ECI, Scale)); + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + DEBUG(dbgs() << "Weight is: "; + for (const auto &W : Weights) { dbgs() << W << " "; } + dbgs() << "\n";); + } +} +} // end anonymous namespace + +bool PGOInstrumentationGen::runOnModule(Module &M) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + instrumentOneFunc(F, &M, BPI, BFI); + } + return true; +} + +static void setPGOCountOnFunc(PGOUseFunc &Func, + IndexedInstrProfReader *PGOReader) { + if (Func.readCounters(PGOReader)) { + Func.populateCounters(); + Func.setBranchWeights(); + } +} + +bool PGOInstrumentationUse::runOnModule(Module &M) { + DEBUG(dbgs() << "Read in profile counters: "); + auto &Ctx = M.getContext(); + // Read the counter array from file. 
+ auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + if (std::error_code EC = ReaderOrErr.getError()) { + Ctx.diagnose( + DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message())); + return false; + } + + PGOReader = std::move(ReaderOrErr.get()); + if (!PGOReader) { + Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(), + "Cannot get PGOReader")); + return false; + } + + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + PGOUseFunc Func(F, &M, BPI, BFI); + setPGOCountOnFunc(Func, PGOReader.get()); + } + return true; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp index 6b185a2..abed465 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp @@ -18,8 +18,9 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -37,6 +38,8 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_os_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -44,6 +47,17 @@ using namespace llvm; #define DEBUG_TYPE "safestack" +enum UnsafeStackPtrStorageVal { ThreadLocalUSP, SingleThreadUSP }; + +static cl::opt<UnsafeStackPtrStorageVal> USPStorage("safe-stack-usp-storage", + cl::Hidden, cl::init(ThreadLocalUSP), + cl::desc("Type of storage for the unsafe stack pointer"), + cl::values(clEnumValN(ThreadLocalUSP, "thread-local", + "Thread-local storage"), + clEnumValN(SingleThreadUSP, "single-thread", + "Non-thread-local storage"), + clEnumValEnd)); + namespace llvm { STATISTIC(NumFunctions, "Total number of functions"); @@ -54,118 +68,48 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm namespace { -/// Check whether a given alloca instruction (AI) should be put on the safe -/// stack or not. The function analyzes all uses of AI and checks whether it is -/// only accessed in a memory safe way (as decided statically). -bool IsSafeStackAlloca(const AllocaInst *AI) { - // Go through all uses of this alloca and check whether all accesses to the - // allocated object are statically known to be memory safe and, hence, the - // object can be placed on the safe stack. - - SmallPtrSet<const Value *, 16> Visited; - SmallVector<const Instruction *, 8> WorkList; - WorkList.push_back(AI); +/// Rewrite an SCEV expression for a memory access address to an expression that +/// represents offset from the given alloca. 
+/// +/// The implementation simply replaces all mentions of the alloca with zero. +class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> { + const Value *AllocaPtr; - // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. - while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); - for (const Use &UI : V->uses()) { - auto I = cast<const Instruction>(UI.getUser()); - assert(V == UI.get()); - - switch (I->getOpcode()) { - case Instruction::Load: - // Loading from a pointer is safe. - break; - case Instruction::VAArg: - // "va-arg" from a pointer is safe. - break; - case Instruction::Store: - if (V == I->getOperand(0)) - // Stored the pointer - conservatively assume it may be unsafe. - return false; - // Storing to the pointee is safe. - break; - - case Instruction::GetElementPtr: - if (!cast<const GetElementPtrInst>(I)->hasAllConstantIndices()) - // GEP with non-constant indices can lead to memory errors. - // This also applies to inbounds GEPs, as the inbounds attribute - // represents an assumption that the address is in bounds, rather than - // an assertion that it is. - return false; - - // We assume that GEP on static alloca with constant indices is safe, - // otherwise a compiler would detect it and warn during compilation. - - if (!isa<const ConstantInt>(AI->getArraySize())) - // However, if the array size itself is not constant, the access - // might still be unsafe at runtime. - return false; - - /* fallthrough */ - - case Instruction::BitCast: - case Instruction::IntToPtr: - case Instruction::PHI: - case Instruction::PtrToInt: - case Instruction::Select: - // The object can be safe or not, depending on how the result of the - // instruction is used. - if (Visited.insert(I).second) - WorkList.push_back(cast<const Instruction>(I)); - break; - - case Instruction::Call: - case Instruction::Invoke: { - // FIXME: add support for memset and memcpy intrinsics. - ImmutableCallSite CS(I); - - // LLVM 'nocapture' attribute is only set for arguments whose address - // is not stored, passed around, or used in any other non-trivial way. - // We assume that passing a pointer to an object as a 'nocapture' - // argument is safe. - // FIXME: a more precise solution would require an interprocedural - // analysis here, which would look at all uses of an argument inside - // the function being called. - ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) - if (A->get() == V && !CS.doesNotCapture(A - B)) - // The parameter is not marked 'nocapture' - unsafe. - return false; - continue; - } +public: + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} - default: - // The object is unsafe if it is used in any other way. - return false; - } - } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + if (Expr->getValue() == AllocaPtr) + return SE.getZero(Expr->getType()); + return Expr; } +}; - // All uses of the alloca are safe, we can place it on the safe stack. - return true; -} - -/// The SafeStack pass splits the stack of each function into the -/// safe stack, which is only accessed through memory safe dereferences -/// (as determined statically), and the unsafe stack, which contains all -/// local variables that are accessed in unsafe ways. 
+/// The SafeStack pass splits the stack of each function into the safe +/// stack, which is only accessed through memory safe dereferences (as +/// determined statically), and the unsafe stack, which contains all +/// local variables that are accessed in ways that we can't prove to +/// be safe. class SafeStack : public FunctionPass { + const TargetMachine *TM; + const TargetLoweringBase *TL; const DataLayout *DL; + ScalarEvolution *SE; Type *StackPtrTy; Type *IntPtrTy; Type *Int32Ty; Type *Int8Ty; - Constant *UnsafeStackPtr = nullptr; + Value *UnsafeStackPtr = nullptr; /// Unsafe stack alignment. Each stack frame must ensure that the stack is /// aligned to this value. We need to re-align the unsafe stack if the @@ -175,26 +119,31 @@ class SafeStack : public FunctionPass { /// might expect to appear on the stack on most common targets. enum { StackAlignment = 16 }; - /// \brief Build a constant representing a pointer to the unsafe stack - /// pointer. - Constant *getOrCreateUnsafeStackPtr(Module &M); + /// \brief Build a value representing a pointer to the unsafe stack pointer. + Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F); /// \brief Find all static allocas, dynamic allocas, return instructions and /// stack restore points (exception unwind blocks and setjmp calls) in the /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size can not be statically determined. + uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. /// /// \returns A pointer to the top of the unsafe stack after all unsafe static /// allocas are allocated. - Value *moveStaticAllocasToUnsafeStack(Function &F, + Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -203,7 +152,7 @@ class SafeStack : public FunctionPass { /// \returns A local variable in which to maintain the dynamic top of the /// unsafe stack if needed. AllocaInst * - createStackRestorePoints(Function &F, + createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop); @@ -214,17 +163,26 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef<AllocaInst *> DynamicAllocas); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); + + bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); + public: static char ID; // Pass identification, replacement for typeid. 
- SafeStack() : FunctionPass(ID), DL(nullptr) { + SafeStack(const TargetMachine *TM) + : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { initializeSafeStackPass(*PassRegistry::getPassRegistry()); } + SafeStack() : SafeStack(nullptr) {} - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { DL = &M.getDataLayout(); StackPtrTy = Type::getInt8PtrTy(M.getContext()); @@ -235,51 +193,203 @@ public: return false; } - bool runOnFunction(Function &F); - + bool runOnFunction(Function &F) override; }; // class SafeStack -Constant *SafeStack::getOrCreateUnsafeStackPtr(Module &M) { - // The unsafe stack pointer is stored in a global variable with a magic name. - const char *kUnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + + uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + ConstantRange SizeRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); + ConstantRange AccessRange = AccessStartRange.add(SizeRange); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); + bool Safe = AllocaRange.contains(AccessRange); + + DEBUG(dbgs() << "[SafeStack] " + << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" + << " Access " << *Addr << "\n" + << " SCEV " << *Expr + << " U: " << SE->getUnsignedRange(Expr) + << ", S: " << SE->getSignedRange(Expr) << "\n" + << " Range " << AccessRange << "\n" + << " AllocaRange " << AllocaRange << "\n" + << " " << (Safe ? "safe" : "unsafe") << "\n"); + + return Safe; +} + +bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, + uint64_t AllocaSize) { + // All MemIntrinsics have destination address in Arg0 and size in Arg2. + if (MI->getRawDest() != U) return true; + const auto *Len = dyn_cast<ConstantInt>(MI->getLength()); + // Non-constant size => unsafe. FIXME: try SCEV getRange. + if (!Len) return false; + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); +} + +/// Check whether a given allocation must be put on the safe +/// stack or not. The function analyzes all uses of AI and checks whether it is +/// only accessed in a memory safe way (as decided statically). +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { + // Go through all uses of this alloca and check whether all accesses to the + // allocated object are statically known to be memory safe and, hence, the + // object can be placed on the safe stack. + SmallPtrSet<const Value *, 16> Visited; + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AllocaPtr); + + // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. 
+ while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (const Use &UI : V->uses()) { + auto I = cast<const Instruction>(UI.getUser()); + assert(V == UI.get()); + + switch (I->getOpcode()) { + case Instruction::Load: { + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) + return false; + break; + } + case Instruction::VAArg: + // "va-arg" from a pointer is safe. + break; + case Instruction::Store: { + if (V == I->getOperand(0)) { + // Stored the pointer - conservatively assume it may be unsafe. + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n store of address: " << *I << "\n"); + return false; + } + + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) + return false; + break; + } + case Instruction::Ret: { + // Information leak. + return false; + } + + case Instruction::Call: + case Instruction::Invoke: { + ImmutableCallSite CS(I); + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + continue; + } + + if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe memintrinsic: " << *I + << "\n"); + return false; + } + continue; + } + // LLVM 'nocapture' attribute is only set for arguments whose address + // is not stored, passed around, or used in any other non-trivial way. + // We assume that passing a pointer to an object as a 'nocapture + // readnone' argument is safe. + // FIXME: a more precise solution would require an interprocedural + // analysis here, which would look at all uses of an argument inside + // the function being called. + ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V) + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe call: " << *I << "\n"); + return false; + } + continue; + } + + default: + if (Visited.insert(I).second) + WorkList.push_back(cast<const Instruction>(I)); + } + } + } + + // All uses of the alloca are safe, we can place it on the safe stack. + return true; +} + +Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { + // Check if there is a target-specific location for the unsafe stack pointer. + if (TL) + if (Value *V = TL->getSafeStackPointerLocation(IRB)) + return V; + + // Otherwise, assume the target links with compiler-rt, which provides a + // thread-local variable with a magic name. + Module &M = *F.getParent(); + const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; auto UnsafeStackPtr = - dyn_cast_or_null<GlobalVariable>(M.getNamedValue(kUnsafeStackPtrVar)); + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(UnsafeStackPtrVar)); + + bool UseTLS = USPStorage == ThreadLocalUSP; if (!UnsafeStackPtr) { + auto TLSModel = UseTLS ? + GlobalValue::InitialExecTLSModel : + GlobalValue::NotThreadLocal; // The global variable is not defined yet, define it ourselves. - // We use the initial-exec TLS model because we do not support the variable - // living anywhere other than in the main executable. 
+ // We use the initial-exec TLS model because we do not support the + // variable living anywhere other than in the main executable. UnsafeStackPtr = new GlobalVariable( - /*Module=*/M, /*Type=*/StackPtrTy, - /*isConstant=*/false, /*Linkage=*/GlobalValue::ExternalLinkage, - /*Initializer=*/0, /*Name=*/kUnsafeStackPtrVar, - /*InsertBefore=*/nullptr, - /*ThreadLocalMode=*/GlobalValue::InitialExecTLSModel); + M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr, + UnsafeStackPtrVar, nullptr, TLSModel); } else { // The variable exists, check its type and attributes. - if (UnsafeStackPtr->getValueType() != StackPtrTy) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must have void* type"); - } - - if (!UnsafeStackPtr->isThreadLocal()) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must be thread-local"); - } + if (UnsafeStackPtr->getValueType() != StackPtrTy) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type"); + if (UseTLS != UnsafeStackPtr->isThreadLocal()) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must " + + (UseTLS ? "" : "not ") + "be thread-local"); } - return UnsafeStackPtr; } void SafeStack::findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints) { - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast<AllocaInst>(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -304,19 +414,26 @@ void SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * -SafeStack::createStackRestorePoints(Function &F, +SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop) { if (StackRestorePoints.empty()) return nullptr; - IRBuilder<> IRB(StaticTop - ? cast<Instruction>(StaticTop)->getNextNode() - : (Instruction *)F.getEntryBlock().getFirstInsertionPt()); - // We need the current value of the shadow stack pointer to restore // after longjmp or exception catching. @@ -342,7 +459,7 @@ SafeStack::createStackRestorePoints(Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast<Instruction>(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? 
IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -350,14 +467,12 @@ SafeStack::createStackRestorePoints(Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(Function &F, - ArrayRef<AllocaInst *> StaticAllocas, - ArrayRef<ReturnInst *> Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; - IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); DIBuilder DIB(*F.getParent()); // We explicitly compute and set the unsafe stack layout for all unsafe @@ -377,6 +492,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -388,22 +510,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast<Instruction>(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. + IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. + replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast<ConstantInt>(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. 
@@ -423,7 +574,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, cast<Instruction>(NewAI)->takeName(AI); // Replace alloc with the new location. - replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -StaticOffset); AI->replaceAllUsesWith(NewAI); AI->eraseFromParent(); } @@ -434,7 +585,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -478,7 +629,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( if (DynamicTop) IRB.CreateStore(NewTop, DynamicTop); - Value *NewAI = IRB.CreateIntToPtr(SP, AI->getType()); + Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType()); if (AI->hasName() && isa<Instruction>(NewAI)) NewAI->takeName(AI); @@ -513,8 +664,6 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( } bool SafeStack::runOnFunction(Function &F) { - auto AA = &getAnalysis<AliasAnalysis>(); - DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); if (!F.hasFnAttribute(Attribute::SafeStack)) { @@ -529,6 +678,9 @@ bool SafeStack::runOnFunction(Function &F) { return false; } + TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + { // Make sure the regular stack protector won't run on this function // (safestack attribute takes precedence). @@ -541,16 +693,11 @@ bool SafeStack::runOnFunction(Function &F) { AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B)); } - if (AA->onlyReadsMemory(&F)) { - // XXX: we don't protect against information leak attacks for now. - DEBUG(dbgs() << "[SafeStack] function only reads memory\n"); - return false; - } - ++NumFunctions; SmallVector<AllocaInst *, 16> StaticAllocas; SmallVector<AllocaInst *, 4> DynamicAllocas; + SmallVector<Argument *, 4> ByValArguments; SmallVector<ReturnInst *, 4> Returns; // Collect all points where stack gets unwound and needs to be restored @@ -562,23 +709,26 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) ++NumUnsafeStackRestorePointsFunctions; - if (!UnsafeStackPtr) - UnsafeStackPtr = getOrCreateUnsafeStackPtr(*F.getParent()); + IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt()); + UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. 
- Value *StaticTop = moveStaticAllocasToUnsafeStack(F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. @@ -587,7 +737,7 @@ bool SafeStack::runOnFunction(Function &F) { // FIXME: a better alternative might be to store the unsafe stack pointer // before setjmp / invoke instructions. AllocaInst *DynamicTop = createStackRestorePoints( - F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); + IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); // Handle dynamic allocas. moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop, @@ -597,13 +747,14 @@ bool SafeStack::runOnFunction(Function &F) { return true; } -} // end anonymous namespace +} // anonymous namespace char SafeStack::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStack, "safe-stack", - "Safe Stack instrumentation pass", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(SafeStack, "safe-stack", "Safe Stack instrumentation pass", - false, false) +INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) +INITIALIZE_TM_PASS_END(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) -FunctionPass *llvm::createSafeStackPass() { return new SafeStack(); } +FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) { + return new SafeStack(TM); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 7a5b4cb..09de7a2 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,6 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -59,6 +60,7 @@ static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16"; static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter"; static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block"; static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp"; +static const char *const kSanCovTraceSwitch = "__sanitizer_cov_trace_switch"; static const char *const kSanCovModuleCtorName = "sancov.module_ctor"; static const uint64_t kSanCtorAndDtorPriority = 2; @@ -148,19 +150,25 @@ class SanitizerCoverageModule : public ModulePass { void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + void InjectTraceForSwitch(Function &F, + ArrayRef<Instruction *> SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); void SetNoSanitizeMetadata(Instruction *I); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + return SanCovFunction->getNumUses() + + SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + + SanCovTraceEnter->getNumUses(); } Function *SanCovFunction; Function *SanCovWithCheckFunction; Function 
*SanCovIndirCallFunction; Function *SanCovTraceEnter, *SanCovTraceBB; Function *SanCovTraceCmpFunction; + Function *SanCovTraceSwitchFunction; InlineAsm *EmptyAsm; - Type *IntptrTy, *Int64Ty; + Type *IntptrTy, *Int64Ty, *Int64PtrTy; + Module *CurModule; LLVMContext *C; const DataLayout *DL; @@ -177,11 +185,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { return false; C = &(M.getContext()); DL = &M.getDataLayout(); + CurModule = &M; IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int64Ty = IRB.getInt64Ty(); SanCovFunction = checkSanitizerInterfaceFunction( @@ -194,18 +204,19 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTraceCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr)); + SanCovTraceSwitchFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovTraceSwitch, VoidTy, Int64Ty, Int64PtrTy, nullptr)); // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (Options.TraceBB) { - SanCovTraceEnter = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); - } + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); // At this point we create a dummy array of guards because we don't // know how many elements we will need. @@ -280,11 +291,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (F.empty()) return false; if (F.getName().find(".module_ctor") != std::string::npos) return false; // Should not instrument sanitizer init functions. + // Don't instrument functions using SEH for now. Splitting basic blocks like + // we do for coverage breaks WinEHPrepare. + // FIXME: Remove this when SEH no longer uses landingpad pattern matching. 
+ if (F.hasPersonalityFn() && + isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) + return false; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F); SmallVector<Instruction*, 8> IndirCalls; SmallVector<BasicBlock*, 16> AllBlocks; SmallVector<Instruction*, 8> CmpTraceTargets; + SmallVector<Instruction*, 8> SwitchTraceTargets; for (auto &BB : F) { AllBlocks.push_back(&BB); for (auto &Inst : BB) { @@ -293,13 +311,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (CS && !CS.getCalledFunction()) IndirCalls.push_back(&Inst); } - if (Options.TraceCmp && isa<ICmpInst>(&Inst)) - CmpTraceTargets.push_back(&Inst); + if (Options.TraceCmp) { + if (isa<ICmpInst>(&Inst)) + CmpTraceTargets.push_back(&Inst); + if (isa<SwitchInst>(&Inst)) + SwitchTraceTargets.push_back(&Inst); + } } } InjectCoverage(F, AllBlocks); InjectCoverageForIndirectCalls(F, IndirCalls); InjectTraceForCmp(F, CmpTraceTargets); + InjectTraceForSwitch(F, SwitchTraceTargets); return true; } @@ -348,6 +371,45 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( } } +// For every switch statement we insert a call: +// __sanitizer_cov_trace_switch(CondValue, +// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) + +void SanitizerCoverageModule::InjectTraceForSwitch( + Function &F, ArrayRef<Instruction *> SwitchTraceTargets) { + for (auto I : SwitchTraceTargets) { + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + IRBuilder<> IRB(I); + SmallVector<Constant *, 16> Initializers; + Value *Cond = SI->getCondition(); + if (Cond->getType()->getScalarSizeInBits() > + Int64Ty->getScalarSizeInBits()) + continue; + Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases())); + Initializers.push_back( + ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits())); + if (Cond->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + Cond = IRB.CreateIntCast(Cond, Int64Ty, false); + for (auto It: SI->cases()) { + Constant *C = It.getCaseValue(); + if (C->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty); + Initializers.push_back(C); + } + ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size()); + GlobalVariable *GV = new GlobalVariable( + *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage, + ConstantArray::get(ArrayOfInt64Ty, Initializers), + "__sancov_gen_cov_switch_values"); + IRB.CreateCall(SanCovTraceSwitchFunction, + {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)}); + } + } +} + + void SanitizerCoverageModule::InjectTraceForCmp( Function &F, ArrayRef<Instruction *> CmpTraceTargets) { for (auto I : CmpTraceTargets) { @@ -369,8 +431,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { I->setMetadata( - I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); + I->getModule()->getMDKindID("nosanitize"), MDNode::get(*C, None)); } void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, @@ -382,34 +443,31 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, // locations. if (isa<UnreachableInst>(BB.getTerminator())) return; - BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); - // Skip static allocas at the top of the entry block so they don't become - // dynamic when we split the block. 
If we used our optimized stack layout, - // then there will only be one alloca and it will come first. - for (; IP != BE; ++IP) { - AllocaInst *AI = dyn_cast<AllocaInst>(IP); - if (!AI || !AI->isStaticAlloca()) - break; - } + BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); DebugLoc EntryLoc; if (IsEntryBB) { if (auto SP = getDISubprogram(&F)) EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); + // Keep static allocas and llvm.localescape calls in the entry block. Even + // if we aren't splitting the block, it's nice for allocas to be before + // calls. + IP = PrepareToSplitEntryBlock(BB, IP); } else { EntryLoc = IP->getDebugLoc(); } - IRBuilder<> IRB(IP); + IRBuilder<> IRB(&*IP); IRB.SetCurrentDebugLocation(EntryLoc); - SmallVector<Value *, 1> Indices; Value *GuardP = IRB.CreateAdd( IRB.CreatePointerCast(GuardArray, IntptrTy), ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { + if (Options.TraceBB) { + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } else if (UseCalls) { IRB.CreateCall(SanCovWithCheckFunction, GuardP); } else { LoadInst *Load = IRB.CreateLoad(GuardP); @@ -418,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(Load); Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); IRB.SetInsertPoint(Ins); IRB.SetCurrentDebugLocation(EntryLoc); // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. @@ -427,7 +485,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.Use8bitCounters) { - IRB.SetInsertPoint(IP); + IRB.SetInsertPoint(&*IP); Value *P = IRB.CreateAdd( IRB.CreatePointerCast(EightBitCounterArray, IntptrTy), ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); @@ -438,13 +496,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(LI); SetNoSanitizeMetadata(SI); } - - if (Options.TraceBB) { - // Experimental support for tracing. - // Insert a callback with the same guard variable as used for coverage. - IRB.SetInsertPoint(IP); - IRB.CreateCall(IsEntryBB ? 
SanCovTraceEnter : SanCovTraceBB, GuardP); - } } char SanitizerCoverageModule::ID = 0; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 1a46bbb..9331e1d 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -142,37 +142,35 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr)); OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - const size_t ByteSize = 1 << i; - const size_t BitSize = ByteSize * 8; - SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + const unsigned ByteSize = 1U << i; + const unsigned BitSize = ByteSize * 8; + std::string ByteSizeStr = utostr(ByteSize); + std::string BitSizeStr = utostr(BitSize); + SmallString<32> ReadName("__tsan_read" + ByteSizeStr); TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + SmallString<32> WriteName("__tsan_write" + ByteSizeStr); TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedReadName("__tsan_unaligned_read" + - itostr(ByteSize)); + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr); TsanUnalignedRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + - itostr(ByteSize)); + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr); TsanUnalignedWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); Type *Ty = Type::getIntNTy(M.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); - SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + - "_load"); + SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load"); TsanAtomicLoad[i] = checkSanitizerInterfaceFunction( M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); - SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + - "_store"); + SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store"); TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); @@ -201,7 +199,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); } - SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + + SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr + "_compare_exchange_val"); TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); @@ -513,8 +511,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -527,8 +525,8 
@@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -544,8 +542,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; if (!F) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -558,8 +556,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h deleted file mode 100644 index 636c65c..0000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h +++ /dev/null @@ -1,123 +0,0 @@ -//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H - -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Function.h" - -namespace llvm { -namespace objcarc { - -/// \enum ARCInstKind -/// -/// \brief Equivalence classes of instructions in the ARC Model. -/// -/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, -/// we instead operate on equivalence classes of instructions. -/// -/// TODO: This should be split into two enums: a runtime entry point enum -/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals -/// with effects of instructions in the ARC model (which would handle the notion -/// of a User or CallOrUser). -enum class ARCInstKind { - Retain, ///< objc_retain - RetainRV, ///< objc_retainAutoreleasedReturnValue - RetainBlock, ///< objc_retainBlock - Release, ///< objc_release - Autorelease, ///< objc_autorelease - AutoreleaseRV, ///< objc_autoreleaseReturnValue - AutoreleasepoolPush, ///< objc_autoreleasePoolPush - AutoreleasepoolPop, ///< objc_autoreleasePoolPop - NoopCast, ///< objc_retainedObject, etc. 
- FusedRetainAutorelease, ///< objc_retainAutorelease - FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - StoreWeak, ///< objc_storeWeak (primitive) - InitWeak, ///< objc_initWeak (derived) - LoadWeak, ///< objc_loadWeak (derived) - MoveWeak, ///< objc_moveWeak (derived) - CopyWeak, ///< objc_copyWeak (derived) - DestroyWeak, ///< objc_destroyWeak (derived) - StoreStrong, ///< objc_storeStrong (derived) - IntrinsicUser, ///< clang.arc.use - CallOrUser, ///< could call objc_release and/or "use" pointers - Call, ///< could call objc_release - User, ///< could "use" a pointer - None ///< anything that is inert from an ARC perspective. -}; - -raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); - -/// \brief Test if the given class is a kind of user. -bool IsUser(ARCInstKind Class); - -/// \brief Test if the given class is objc_retain or equivalent. -bool IsRetain(ARCInstKind Class); - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool IsAutorelease(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -bool IsForwarding(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -bool IsNoopOnNull(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool IsAlwaysTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -bool IsNeverTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool IsNoThrow(ARCInstKind Class); - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -bool CanInterruptRV(ARCInstKind Class); - -/// \brief Determine if F is one of the special known Functions. If it isn't, -/// return ARCInstKind::CallOrUser. -ARCInstKind GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetARCInstKind except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline ARCInstKind GetBasicARCInstKind(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return ARCInstKind::CallOrUser; - } - - // Otherwise, be conservative. - return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User; -} - -/// Map V to its ARCInstKind equivalence class. -ARCInstKind GetARCInstKind(const Value *V); - -/// Returns false if conservatively we can prove that any instruction mapped to -/// this kind can not decrement ref counts. Returns true otherwise. 
-bool CanDecrementRefCount(ARCInstKind Kind); - -} // end namespace objcarc -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4edd029..9d78e5a 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -49,7 +49,7 @@ bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, assert(CS && "Only calls can alter reference counts!"); // See if AliasAnalysis can help us with the call. - AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); if (AliasAnalysis::onlyReadsMemory(MRB)) return false; if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { @@ -226,7 +226,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, SmallPtrSetImpl<Instruction *> &DependingInsts, SmallPtrSetImpl<const BasicBlock *> &Visited, ProvenanceAnalysis &PA) { - BasicBlock::iterator StartPos = StartInst; + BasicBlock::iterator StartPos = StartInst->getIterator(); SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; Worklist.push_back(std::make_pair(StartBB, StartPos)); @@ -252,7 +252,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, break; } - Instruction *Inst = --LocalStartPos; + Instruction *Inst = &*--LocalStartPos; if (Depends(Flavor, Inst, Arg, PA)) { DependingInsts.insert(Inst); break; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 6ea038b..d860723 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -26,18 +26,10 @@ namespace llvm { using namespace llvm; using namespace llvm::objcarc; -/// \brief A handy option to enable/disable all ARC Optimizations. -bool llvm::objcarc::EnableARCOpts; -static cl::opt<bool, true> -EnableARCOptimizations("enable-objc-arc-opts", - cl::desc("enable/disable all ARC Optimizations"), - cl::location(EnableARCOpts), - cl::init(true)); - /// initializeObjCARCOptsPasses - Initialize all passes linked into the /// ObjCARCOpts library. void llvm::initializeObjCARCOpts(PassRegistry &Registry) { - initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAAWrapperPassPass(Registry); initializeObjCARCAPElimPass(Registry); initializeObjCARCExpandPass(Registry); initializeObjCARCContractPass(Registry); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 7595e2d..5fd45b0 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -26,6 +26,8 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -34,7 +36,6 @@ #include "llvm/Pass.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" -#include "ARCInstKind.h" namespace llvm { class raw_ostream; @@ -43,99 +44,6 @@ class raw_ostream; namespace llvm { namespace objcarc { -/// \brief A handy option to enable/disable all ARC Optimizations. -extern bool EnableARCOpts; - -/// \brief Test if the given module looks interesting to run ARC optimization -/// on. 
-static inline bool ModuleHasARC(const Module &M) { - return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer") || - M.getNamedValue("clang.arc.use"); -} - -/// \brief This is a wrapper around getUnderlyingObject which also knows how to -/// look through objc_retain and objc_autorelease calls, which we know to return -/// their argument verbatim. -static inline const Value *GetUnderlyingObjCPtr(const Value *V, - const DataLayout &DL) { - for (;;) { - V = GetUnderlyingObject(V, DL); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - - return V; -} - -/// The RCIdentity root of a value \p V is a dominating value U for which -/// retaining or releasing U is equivalent to retaining or releasing V. In other -/// words, ARC operations on \p V are equivalent to ARC operations on \p U. -/// -/// We use this in the ARC optimizer to make it easier to match up ARC -/// operations by always mapping ARC operations to RCIdentityRoots instead of -/// pointers themselves. -/// -/// The two ways that we see RCIdentical values in ObjC are via: -/// -/// 1. PointerCasts -/// 2. Forwarding Calls that return their argument verbatim. -/// -/// Thus this function strips off pointer casts and forwarding calls. *NOTE* -/// This implies that two RCIdentical values must alias. -static inline const Value *GetRCIdentityRoot(const Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just -/// casts away the const of the result. For documentation about what an -/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that -/// function. -static inline Value *GetRCIdentityRoot(Value *V) { - return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); -} - -/// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the RCIdentity root of the argument of -/// the call. -static inline Value *GetArgRCIdentityRoot(Value *Inst) { - return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); -} - -static inline bool IsNullOrUndef(const Value *V) { - return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); -} - -static inline bool IsNoopInstruction(const Instruction *I) { - return isa<BitCastInst>(I) || - (isa<GetElementPtrInst>(I) && - cast<GetElementPtrInst>(I)->hasAllZeroIndices()); -} - - /// \brief Erase the given instruction. /// /// Many ObjC calls return their argument verbatim, @@ -162,152 +70,6 @@ static inline void EraseInstruction(Instruction *CI) { RecursivelyDeleteTriviallyDeadInstructions(OldArg); } -/// \brief Test whether the given value is possible a retainable object pointer. 
-static inline bool IsPotentialRetainableObjPtr(const Value *Op) { - // Pointers to static or stack storage are not valid retainable object - // pointers. - if (isa<Constant>(Op) || isa<AllocaInst>(Op)) - return false; - // Special arguments can not be a valid retainable object pointer. - if (const Argument *Arg = dyn_cast<Argument>(Op)) - if (Arg->hasByValAttr() || - Arg->hasInAllocaAttr() || - Arg->hasNestAttr() || - Arg->hasStructRetAttr()) - return false; - // Only consider values with pointer types. - // - // It seemes intuitive to exclude function pointer types as well, since - // functions are never retainable object pointers, however clang occasionally - // bitcasts retainable object pointers to function-pointer type temporarily. - PointerType *Ty = dyn_cast<PointerType>(Op->getType()); - if (!Ty) - return false; - // Conservatively assume anything else is a potential retainable object - // pointer. - return true; -} - -static inline bool IsPotentialRetainableObjPtr(const Value *Op, - AliasAnalysis &AA) { - // First make the rudimentary check. - if (!IsPotentialRetainableObjPtr(Op)) - return false; - - // Objects in constant memory are not reference-counted. - if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} - -/// \brief Helper for GetARCInstKind. Determines what kind of construct CS -/// is. -static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - if (IsPotentialRetainableObjPtr(*I)) - return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser; - - return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; -} - -/// \brief Return true if this value refers to a distinct and identifiable -/// object. -/// -/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses -/// special knowledge of ObjC conventions. -static inline bool IsObjCIdentifiedObject(const Value *V) { - // Assume that call results and arguments have their own "provenance". - // Constants (including GlobalVariables) and Allocas are never - // reference-counted. - if (isa<CallInst>(V) || isa<InvokeInst>(V) || - isa<Argument>(V) || isa<Constant>(V) || - isa<AllocaInst>(V)) - return true; - - if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { - const Value *Pointer = - GetRCIdentityRoot(LI->getPointerOperand()); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (GV->isConstant()) - return true; - StringRef Name = GV->getName(); - // These special variables are known to hold values which are not - // reference-counted pointers. 
- if (Name.startswith("\01l_objc_msgSend_fixup_")) - return true; - - StringRef Section = GV->getSection(); - if (Section.find("__message_refs") != StringRef::npos || - Section.find("__objc_classrefs") != StringRef::npos || - Section.find("__objc_superrefs") != StringRef::npos || - Section.find("__objc_methname") != StringRef::npos || - Section.find("__cstring") != StringRef::npos) - return true; - } - } - - return false; -} - -enum class ARCMDKindID { - ImpreciseRelease, - CopyOnEscape, - NoObjCARCExceptions, -}; - -/// A cache of MDKinds used by various ARC optimizations. -class ARCMDKindCache { - Module *M; - - /// The Metadata Kind for clang.imprecise_release metadata. - llvm::Optional<unsigned> ImpreciseReleaseMDKind; - - /// The Metadata Kind for clang.arc.copy_on_escape metadata. - llvm::Optional<unsigned> CopyOnEscapeMDKind; - - /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. - llvm::Optional<unsigned> NoObjCARCExceptionsMDKind; - -public: - void init(Module *Mod) { - M = Mod; - ImpreciseReleaseMDKind = NoneType::None; - CopyOnEscapeMDKind = NoneType::None; - NoObjCARCExceptionsMDKind = NoneType::None; - } - - unsigned get(ARCMDKindID ID) { - switch (ID) { - case ARCMDKindID::ImpreciseRelease: - if (!ImpreciseReleaseMDKind) - ImpreciseReleaseMDKind = - M->getContext().getMDKindID("clang.imprecise_release"); - return *ImpreciseReleaseMDKind; - case ARCMDKindID::CopyOnEscape: - if (!CopyOnEscapeMDKind) - CopyOnEscapeMDKind = - M->getContext().getMDKindID("clang.arc.copy_on_escape"); - return *CopyOnEscapeMDKind; - case ARCMDKindID::NoObjCARCExceptions: - if (!NoObjCARCExceptionsMDKind) - NoObjCARCExceptionsMDKind = - M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); - return *NoObjCARCExceptionsMDKind; - } - llvm_unreachable("Covered switch isn't covered?!"); - } -}; - } // end namespace objcarc } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index d318643..969e77c 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -72,12 +72,9 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::const_iterator I = Callee->begin(), E = Callee->end(); - I != E; ++I) { - const BasicBlock *BB = I; - for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); - J != F; ++J) - if (ImmutableCallSite JCS = ImmutableCallSite(J)) + for (const BasicBlock &BB : *Callee) { + for (const Instruction &I : BB) + if (ImmutableCallSite JCS = ImmutableCallSite(&I)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -96,7 +93,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; switch (GetBasicARCInstKind(Inst)) { case ARCInstKind::AutoreleasepoolPush: Push = Inst; @@ -169,7 +166,7 @@ bool ObjCARCAPElim::runOnModule(Module &M) { if (std::next(F->begin()) != F->end()) continue; // Ok, a single-block constructor function definition. Try to optimize it. 
- Changed |= OptimizeBB(F->begin()); + Changed |= OptimizeBB(&F->front()); } return Changed; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h deleted file mode 100644 index eecc82f..0000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ /dev/null @@ -1,74 +0,0 @@ -//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Pass.h" - -namespace llvm { -namespace objcarc { - - /// \brief This is a simple alias analysis implementation that uses knowledge - /// of ARC constructs to answer queries. - /// - /// TODO: This class could be generalized to know about other ObjC-specific - /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing - /// even though their offsets are dynamic. - class ObjCARCAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - ObjCARCAliasAnalysis() : ImmutablePass(ID) { - initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - private: - bool doInitialization(Module &M) override; - - /// This method is used when a pass implements an analysis interface through - /// multiple inheritance. If needed, it should override this to adjust the - /// this pointer as needed for the specified pass info. 
- void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return static_cast<AliasAnalysis *>(this); - return this; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - }; - -} // namespace objcarc -} // namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index baca76b..1cdf568 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -119,9 +119,9 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) { return false; // Check that the call is next to the retain. - BasicBlock::const_iterator I = Call; - ++I; - while (IsNoopInstruction(I)) ++I; + BasicBlock::const_iterator I = ++Call->getIterator(); + while (IsNoopInstruction(&*I)) + ++I; if (&*I != Retain) return false; @@ -247,7 +247,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load, // Ok, now we know we have not seen a store yet. See if Inst can write to // our load location, if it can not, just ignore the instruction. - if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) + if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod)) continue; Store = dyn_cast<StoreInst>(Inst); @@ -282,9 +282,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, Instruction *Release, ProvenanceAnalysis &PA) { // Walk up from the Store to find the retain. - BasicBlock::iterator I = Store; + BasicBlock::iterator I = Store->getIterator(); BasicBlock::iterator Begin = Store->getParent()->begin(); - while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { + while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) { Instruction *Inst = &*I; // It is only safe to move the retain to the store if we can prove @@ -294,7 +294,7 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, return nullptr; --I; } - Instruction *Retain = I; + Instruction *Retain = &*I; if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain) return nullptr; if (GetArgRCIdentityRoot(Retain) != New) @@ -429,7 +429,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( // insert it now. if (!RetainRVMarker) return false; - BasicBlock::iterator BBI = Inst; + BasicBlock::iterator BBI = Inst->getIterator(); BasicBlock *InstParent = Inst->getParent(); // Step up to see if the call immediately precedes the RetainRV call. 
@@ -440,11 +440,11 @@ bool ObjCARCContract::tryToPeepholeInstruction( BasicBlock *Pred = InstParent->getSinglePredecessor(); if (!Pred) goto decline_rv_optimization; - BBI = Pred->getTerminator(); + BBI = Pred->getTerminator()->getIterator(); break; } --BBI; - } while (IsNoopInstruction(BBI)); + } while (IsNoopInstruction(&*BBI)); if (&*BBI == GetArgRCIdentityRoot(Inst)) { DEBUG(dbgs() << "Adding inline asm marker for " @@ -511,10 +511,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { return false; Changed = false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); @@ -629,13 +629,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { char ObjCARCContract::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 9edbb17..f0ee6e2 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -28,7 +28,6 @@ #include "ARCRuntimeEntryPoints.h" #include "BlotMapVector.h" #include "DependencyAnalysis.h" -#include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "PtrState.h" #include "llvm/ADT/DenseMap.h" @@ -36,6 +35,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" @@ -482,7 +482,7 @@ namespace { /// A flag indicating whether this optimization pass should run. bool Run; - /// Flags which determine whether each of the interesting runtine functions + /// Flags which determine whether each of the interesting runtime functions /// is in fact used in the current function. unsigned UsedInThisFunction; @@ -556,7 +556,7 @@ namespace { char ObjCARCOpt::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass) INITIALIZE_PASS_END(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) @@ -565,8 +565,8 @@ Pass *llvm::createObjCARCOptPass() { } void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ObjCARCAliasAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ObjCARCAAWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); // ARC optimization doesn't currently split critical edges. 
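//===----------------------------------------------------------------------===//
// Editor's note: the recurring migration in this import is that passes no
// longer depend on the old AliasAnalysis analysis group; they require
// AAResultsWrapperPass and pull the aggregated AAResults out of it, as the
// ObjCARCContract hunks above do. A minimal sketch of a legacy pass written
// against the new interface (the pass itself is illustrative, not from the
// patch; INITIALIZE_PASS registration boilerplate is omitted):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct ExampleAAClient : public FunctionPass {
  static char ID;
  ExampleAAClient() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Was: AU.addRequired<AliasAnalysis>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    // Was: AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
    AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    (void)AA; // Issue queries: AA.alias(...), AA.getModRefInfo(...), ...
    return false;
  }
};
}
char ExampleAAClient::ID = 0;
//===----------------------------------------------------------------------===//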
AU.setPreservesCFG(); } @@ -581,16 +581,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { ImmutableCallSite CS(Arg); if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::const_iterator I = Call; + BasicBlock::const_iterator I(Call); ++I; - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { BasicBlock::const_iterator I = RetainRVParent->begin(); - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } @@ -599,18 +601,21 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + BasicBlock::iterator I = RetainRV->getIterator(), + Begin = RetainRV->getParent()->begin(); if (I != Begin) { - do --I; while (I != Begin && IsNoopInstruction(I)); - if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && - GetArgRCIdentityRoot(I) == Arg) { + do + --I; + while (I != Begin && IsNoopInstruction(&*I)); + if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && + GetArgRCIdentityRoot(&*I) == Arg) { Changed = true; ++NumPeeps; DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" << "Erasing " << *RetainRV << "\n"); - EraseInstruction(I); + EraseInstruction(&*I); EraseInstruction(RetainRV); return true; } @@ -1216,7 +1221,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { - Instruction *Inst = std::prev(I); + Instruction *Inst = &*std::prev(I); // Invoke instructions are visited as part of their successors (below). if (isa<InvokeInst>(Inst)) @@ -1264,7 +1269,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetArgRCIdentityRoot(Inst); TopDownPtrState &S = MyStates.getPtrTopDownState(Arg); NestingDetected |= S.InitTopDown(Class, Inst); - // A retain can be a potential use; procede to the generic checking + // A retain can be a potential use; proceed to the generic checking // code below. break; } @@ -1342,12 +1347,10 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, << "Performing Dataflow:\n"); // Visit all the instructions, top-down. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - Instruction *Inst = I; + for (Instruction &Inst : *BB) { + DEBUG(dbgs() << " Visiting " << Inst << "\n"); - DEBUG(dbgs() << " Visiting " << *Inst << "\n"); - - NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates); } DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n" @@ -1413,16 +1416,15 @@ ComputePostOrders(Function &F, // Functions may have many exits, and there are also blocks which we treat // as exits due to ignored edges. 
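//===----------------------------------------------------------------------===//
// Editor's note: an illustrative (hand-written) IR instance of the pair that
// OptimizeRetainRVCall above erases. When an objc_retainAutoreleasedReturnValue
// is preceded, modulo no-op instructions, by an objc_autoreleaseReturnValue of
// the same reference-count identity root, the two calls cancel and both are
// deleted:
//
//   %0 = call i8* @objc_autoreleaseReturnValue(i8* %x)
//   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
//
// The net reference-count effect of the pair is zero, so neither call is
// needed.
//===----------------------------------------------------------------------===//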
SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *ExitBB = I; - BBState &MyStates = BBStates[ExitBB]; + for (BasicBlock &ExitBB : F) { + BBState &MyStates = BBStates[&ExitBB]; if (!MyStates.isExit()) continue; MyStates.SetAsExit(); - PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); - Visited.insert(ExitBB); + PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin())); + Visited.insert(&ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); @@ -1830,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // analysis too, but that would want caching. A better approach would be to // use the technique that EarlyCSE uses. inst_iterator Current = std::prev(I); - BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + BasicBlock *CurrentBB = &*Current.getBasicBlockIterator(); for (BasicBlock::iterator B = CurrentBB->begin(), J = Current.getInstructionIterator(); J != B; --J) { @@ -2008,10 +2010,7 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, // Check that the call is a regular call. ARCInstKind Class = GetBasicARCInstKind(Call); - if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) - return false; - - return true; + return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call; } /// Find a dependent retain that precedes the given autorelease for which there @@ -2081,9 +2080,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { SmallPtrSet<Instruction *, 4> DependingInstructions; SmallPtrSet<const BasicBlock *, 4> Visited; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - BasicBlock *BB = FI; - ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + for (BasicBlock &BB: F) { + ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back()); DEBUG(dbgs() << "Visiting: " << *Ret << "\n"); @@ -2095,19 +2093,16 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Look for an ``autorelease'' instruction that is a predecessor of Ret and // dependent on Arg such that there are no instructions dependent on Arg // that need a positive ref count in between the autorelease and Ret. 
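//===----------------------------------------------------------------------===//
// Editor's note: ComputePostOrders and OptimizeReturns above trade manual
// Function::iterator loops for range-based for. Since the loop variable is
// now a reference rather than an iterator, call sites that relied on the
// implicit iterator-to-pointer conversion must take an address instead. A
// minimal before/after sketch (collectBlocks is an illustrative name):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void collectBlocks(Function &F, SmallPtrSetImpl<BasicBlock *> &Set) {
  // Old style (no longer compiles once the implicit conversion is gone):
  //   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
  //     Set.insert(I);
  for (BasicBlock &BB : F)
    Set.insert(&BB); // Take the address explicitly.
}
//===----------------------------------------------------------------------===//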
- CallInst *Autorelease = - FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret, - DependingInstructions, Visited, - PA); + CallInst *Autorelease = FindPredecessorAutoreleaseWithSafePath( + Arg, &BB, Ret, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); if (!Autorelease) continue; - CallInst *Retain = - FindPredecessorRetainWithSafePath(Arg, BB, Autorelease, - DependingInstructions, Visited, PA); + CallInst *Retain = FindPredecessorRetainWithSafePath( + Arg, &BB, Autorelease, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); @@ -2192,7 +2187,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) { DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>" "\n"); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); #ifndef NDEBUG if (AreStatisticsEnabled()) { diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 0ac41d3..1a12b659 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -26,10 +26,10 @@ #define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h" namespace llvm { class Value; - class AliasAnalysis; class DataLayout; class PHINode; class SelectInst; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index 0be75af..c274e81 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -35,7 +35,7 @@ char PAEval::ID = 0; PAEval::PAEval() : FunctionPass(ID) {} void PAEval::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } static StringRef getName(Value *V) { @@ -65,7 +65,7 @@ bool PAEval::runOnFunction(Function &F) { } ProvenanceAnalysis PA; - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); const DataLayout &DL = F.getParent()->getDataLayout(); for (Value *V1 : Values) { @@ -89,6 +89,6 @@ FunctionPass *llvm::createPAEvalPass() { return new PAEval(); } INITIALIZE_PASS_BEGIN(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp index ae20e7e..df64fa3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -256,9 +256,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, // one of its successor blocks, since we can't insert code after it // in its own block, and we don't want to split critical edges. 
if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); SetSeq(S_Use); } else if (Seq == S_Release && IsUser(Class)) { DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; " @@ -268,9 +268,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, assert(!HasReverseInsertPts()); // As above; handle invoke specially. if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); } break; case S_Stop: diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h index e45e1ea..9749e44 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -17,8 +17,8 @@ #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H #define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H -#include "ARCInstKind.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Value.h" #include "llvm/Support/raw_ostream.h" @@ -96,7 +96,7 @@ struct RRInfo { }; /// \brief This class summarizes several per-pointer runtime properties which -/// are propogated through the flow graph. +/// are propagated through the flow graph. class PtrState { protected: /// True if the reference count is known to be incremented. @@ -172,7 +172,7 @@ struct BottomUpPtrState : PtrState { bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I); /// Return true if this set of releases can be paired with a release. Modifies - /// state appropriately to reflect that the matching occured if it is + /// state appropriately to reflect that the matching occurred if it is /// successful. /// /// It is assumed that one has already checked that the RCIdentity of the @@ -194,7 +194,7 @@ struct TopDownPtrState : PtrState { /// Return true if this set of retains can be paired with the given /// release. Modifies state appropriately to reflect that the matching - /// occured. + /// occurred. 
bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release); void HandlePotentialUse(Instruction *Inst, const Value *Ptr, diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index d6fc916..590a52d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -1,4 +1,4 @@ -//===- DCE.cpp - Code to perform dead code elimination --------------------===// +//===- ADCE.cpp - Code to perform dead code elimination -------------------===// // // The LLVM Compiler Infrastructure // @@ -14,52 +14,33 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "adce" STATISTIC(NumRemoved, "Number of instructions removed"); -namespace { -struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; - - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } -}; -} - -char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) - -bool ADCE::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - +static bool aggressiveDCE(Function& F) { SmallPtrSet<Instruction*, 128> Alive; SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + for (Instruction &I : instructions(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || + I.mayHaveSideEffects()) { Alive.insert(&I); Worklist.push_back(&I); } @@ -79,7 +60,7 @@ bool ADCE::runOnFunction(Function& F) { // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. 
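//===----------------------------------------------------------------------===//
// Editor's note: the ADCE root-collection change above widens "always live"
// from landingpads to Instruction::isEHPad(), so the newer EH pad
// instructions (catchpad, cleanuppad, catchswitch) are also kept as roots.
// A sketch of the predicate as this patch leaves it (isAlwaysLive here
// mirrors the inline check in aggressiveDCE):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool isAlwaysLive(const Instruction &I) {
  // Terminators, debug intrinsics, EH pads, and side-effecting instructions
  // must be treated as live roots of the liveness propagation.
  return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
         I.mayHaveSideEffects();
}
//===----------------------------------------------------------------------===//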
- for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (!Alive.count(&I)) { Worklist.push_back(&I); I.dropAllReferences(); @@ -94,6 +75,34 @@ bool ADCE::runOnFunction(Function& F) { return !Worklist.empty(); } -FunctionPass *llvm::createAggressiveDCEPass() { - return new ADCE(); +PreservedAnalyses ADCEPass::run(Function &F) { + if (aggressiveDCE(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } + +namespace { +struct ADCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { + initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override { + if (skipOptnoneFunction(F)) + return false; + return aggressiveDCE(F); + } + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char ADCELegacyPass::ID = 0; +INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", + false, false) + +FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8918909..4b721d3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,6 +21,8 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -54,13 +56,15 @@ struct AlignmentFromAssumptions : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); } // For memory transfers, we need a common alignment for both the source and @@ -84,7 +88,7 @@ INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) @@ -249,8 +253,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, // The mask must have some trailing ones (otherwise the condition is // trivial and tells us nothing about the alignment of the left operand). 
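//===----------------------------------------------------------------------===//
// Editor's note: the ADCE rewrite above follows the standard two-pass-manager
// port: the transform becomes a free function, a new-pass-manager class
// exposes it through run() returning PreservedAnalyses, and a thin legacy
// FunctionPass shim keeps old pipelines working. A minimal sketch of the
// shape (the Example* names are illustrative; registration boilerplate is
// omitted):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;

static bool runExampleTransform(Function &F) {
  return false; // Shared implementation; both shells delegate here.
}

// New pass manager: a lightweight value type with a run() method.
struct ExamplePass {
  PreservedAnalyses run(Function &F) {
    return runExampleTransform(F) ? PreservedAnalyses::none()
                                  : PreservedAnalyses::all();
  }
};

// Legacy pass manager: the familiar FunctionPass boilerplate.
namespace {
struct ExampleLegacyPass : public FunctionPass {
  static char ID;
  ExampleLegacyPass() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override { return runExampleTransform(F); }
};
}
char ExampleLegacyPass::ID = 0;
//===----------------------------------------------------------------------===//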
- unsigned TrailingOnes = - MaskSCEV->getValue()->getValue().countTrailingOnes(); + unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); if (!TrailingOnes) return false; @@ -270,7 +273,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, OffSCEV = nullptr; if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) { AAPtr = PToI->getPointerOperand(); - OffSCEV = SE->getConstant(Int64Ty, 0); + OffSCEV = SE->getZero(Int64Ty); } else if (const SCEVAddExpr* AndLHSAddSCEV = dyn_cast<SCEVAddExpr>(AndLHSSCEV)) { // Try to find the ptrtoint; subtract it and the rest is the offset. @@ -410,7 +413,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); NewDestAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 09c605e..cb9b8b6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,26 +15,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "bdce" @@ -53,342 +45,42 @@ struct BDCE : public FunctionPass { void getAnalysisUsage(AnalysisUsage& AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DemandedBits>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - - void determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2); - - AssumptionCache *AC; - DominatorTree *DT; }; } char BDCE::ID = 0; INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -static bool isAlwaysLive(Instruction *I) { - return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I->mayHaveSideEffects(); -} - -void BDCE::determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2) { - unsigned BitWidth = AB.getBitWidth(); - - // We're called once per operand, but 
for some instructions, we need to - // compute known bits of both operands in order to determine the live bits of - // either (when both operands are instructions themselves). We don't, - // however, want to do this twice, so we cache the result in APInts that live - // in the caller. For the two-relevant-operands case, both operand values are - // provided here. - auto ComputeKnownBits = - [&](unsigned BitWidth, const Value *V1, const Value *V2) { - const DataLayout &DL = I->getModule()->getDataLayout(); - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, - AC, UserI, DT); - - if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, - 0, AC, UserI, DT); - } - }; - - switch (UserI->getOpcode()) { - default: break; - case Instruction::Call: - case Instruction::Invoke: - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::bswap: - // The alive bits of the input are the swapped alive bits of - // the output. - AB = AOut.byteSwap(); - break; - case Intrinsic::ctlz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the left of, and including, the leftmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countLeadingZeros()+1)); - } - break; - case Intrinsic::cttz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the right of, and including, the rightmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countTrailingZeros()+1)); - } - break; - } - break; - case Instruction::Add: - case Instruction::Sub: - // Find the highest live output bit. We don't need any more input - // bits than that (adds, and thus subtracts, ripple only to the - // left). - AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); - break; - case Instruction::Shl: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.lshr(ShiftAmt); - - // If the shift is nuw/nsw, then the high bits are not dead - // (because we've promised that they *must* be zero). - const ShlOperator *S = cast<ShlOperator>(UserI); - if (S->hasNoSignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); - else if (S->hasNoUnsignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::LShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<LShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::AShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. 
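//===----------------------------------------------------------------------===//
// Editor's note: a worked instance of the Add/Sub rule above. Carries ripple
// only leftward, so if AOut has k active bits (highest live output bit is
// bit k-1), no input bit above bit k-1 can influence a live output bit. For
// AOut = 0b00000110, getActiveBits() == 3 and the live input mask is
// 0b00000111. (liveAddOperandBits is an illustrative helper, not the
// patch's own code.)

#include "llvm/ADT/APInt.h"
using namespace llvm;

static APInt liveAddOperandBits(const APInt &AOut) {
  // Keep exactly the input bits at or below the highest live output bit.
  return APInt::getLowBitsSet(AOut.getBitWidth(), AOut.getActiveBits());
}
//===----------------------------------------------------------------------===//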
- if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) - .getBoolValue()) - AB.setBit(BitWidth-1); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<AShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::And: - AB = AOut; - - // For bits that are known zero, the corresponding bits in the - // other operand are dead (unless they're both zero, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownZero2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownZero & ~KnownZero2); - } - break; - case Instruction::Or: - AB = AOut; - - // For bits that are known one, the corresponding bits in the - // other operand are dead (unless they're both one, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownOne2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownOne & ~KnownOne2); - } - break; - case Instruction::Xor: - case Instruction::PHI: - AB = AOut; - break; - case Instruction::Trunc: - AB = AOut.zext(BitWidth); - break; - case Instruction::ZExt: - AB = AOut.trunc(BitWidth); - break; - case Instruction::SExt: - AB = AOut.trunc(BitWidth); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. - if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), - AOut.getBitWidth() - BitWidth)) - .getBoolValue()) - AB.setBit(BitWidth-1); - break; - case Instruction::Select: - if (OperandNo != 0) - AB = AOut; - break; - } -} - bool BDCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; + DemandedBits &DB = getAnalysis<DemandedBits>(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - DenseMap<Instruction *, APInt> AliveBits; SmallVector<Instruction*, 128> Worklist; - - // The set of visited instructions (non-integer-typed only). - SmallPtrSet<Instruction*, 128> Visited; - - // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (!isAlwaysLive(&I)) - continue; - - DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); - // For integer-valued instructions, set up an initial empty set of alive - // bits and add the instruction to the work list. For other instructions - // add their operands to the work list (for integer values operands, mark - // all bits as live). - if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - if (!AliveBits.count(&I)) { - AliveBits[&I] = APInt(IT->getBitWidth(), 0); - Worklist.push_back(&I); - } - - continue; - } - - // Non-integer-typed instructions... - for (Use &OI : I.operands()) { - if (Instruction *J = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) - AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); - Worklist.push_back(J); - } - } - // To save memory, we don't add I to the Visited set here. 
Instead, we - // check isAlwaysLive on every instruction when searching for dead - // instructions later (we need to check isAlwaysLive for the - // integer-typed instructions anyway). - } - - // Propagate liveness backwards to operands. - while (!Worklist.empty()) { - Instruction *UserI = Worklist.pop_back_val(); - - DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); - APInt AOut; - if (UserI->getType()->isIntegerTy()) { - AOut = AliveBits[UserI]; - DEBUG(dbgs() << " Alive Out: " << AOut); - } - DEBUG(dbgs() << "\n"); - - if (!UserI->getType()->isIntegerTy()) - Visited.insert(UserI); - - APInt KnownZero, KnownOne, KnownZero2, KnownOne2; - // Compute the set of alive bits for each operand. These are anded into the - // existing set, if any, and if that changes the set of alive bits, the - // operand is added to the work-list. - for (Use &OI : UserI->operands()) { - if (Instruction *I = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { - unsigned BitWidth = IT->getBitWidth(); - APInt AB = APInt::getAllOnesValue(BitWidth); - if (UserI->getType()->isIntegerTy() && !AOut && - !isAlwaysLive(UserI)) { - AB = APInt(BitWidth, 0); - } else { - // If all bits of the output are dead, then all bits of the input - // Bits of each operand that are used to compute alive bits of the - // output are alive, all others are dead. - determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, - KnownZero, KnownOne, - KnownZero2, KnownOne2); - } - - // If we've added to the set of alive bits (or the operand has not - // been previously visited), then re-queue the operand to be visited - // again. - APInt ABPrev(BitWidth, 0); - auto ABI = AliveBits.find(I); - if (ABI != AliveBits.end()) - ABPrev = ABI->second; - - APInt ABNew = AB | ABPrev; - if (ABNew != ABPrev || ABI == AliveBits.end()) { - AliveBits[I] = std::move(ABNew); - Worklist.push_back(I); - } - } else if (!Visited.count(I)) { - Worklist.push_back(I); - } - } - } - } - bool Changed = false; - // The inverse of the live set is the dead set. These are those instructions - // which have no side effects and do not influence the control flow or return - // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the Worklist vector here for memory efficiency. - for (Instruction &I : inst_range(F)) { - // For live instructions that have all dead bits, first make them dead by - // replacing all uses with something else. Then, if they don't need to - // remain live (because they have side effects, etc.) we can remove them. - if (I.getType()->isIntegerTy()) { - auto ABI = AliveBits.find(&I); - if (ABI != AliveBits.end()) { - if (ABI->second.getBoolValue()) - continue; - - DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); - // FIXME: In theory we could substitute undef here instead of zero. - // This should be reconsidered once we settle on the semantics of - // undef, poison, etc. - Value *Zero = ConstantInt::get(I.getType(), 0); - ++NumSimplified; - I.replaceAllUsesWith(Zero); - Changed = true; - } - } else if (Visited.count(&I)) { - continue; + for (Instruction &I : instructions(F)) { + if (I.getType()->isIntegerTy() && + !DB.getDemandedBits(&I).getBoolValue()) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. 
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. + Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; } - - if (isAlwaysLive(&I)) + if (!DB.isInstructionDead(&I)) continue; Worklist.push_back(&I); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 4288742..84f7f5f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -223,10 +223,10 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, } // The simple and common case. This also includes constant expressions. - if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + if (!isa<PHINode>(Inst) && !Inst->isEHPad()) return Inst; - // We can't insert directly before a phi node or landing pad. Insert before + // We can't insert directly before a phi node or an eh pad. Insert before // the terminator of the incoming or dominating block. assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); if (Idx != ~0U && isa<PHINode>(Inst)) @@ -365,9 +365,9 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// into an instruction itself. void ConstantHoisting::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (Function::iterator BB : Fn) - for (BasicBlock::iterator Inst : *BB) - collectConstantCandidates(ConstCandMap, Inst); + for (BasicBlock &BB : Fn) + for (Instruction &Inst : BB) + collectConstantCandidates(ConstCandMap, &Inst); } /// \brief Find the base constant within the given range and rebase all other diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 79624b2..686bd40 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/IR/CFG.h" @@ -32,6 +33,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumReturns, "Number of return values propagated"); STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { @@ -43,6 +45,11 @@ namespace { bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); bool processSwitch(SwitchInst *SI); + bool processCallSite(CallSite CS); + + /// Return a constant value for V usable at At and everything it + /// dominates. If no such Constant can be found, return nullptr. 
+ Constant *getConstantAt(Value *V, Instruction *At); public: static char ID; @@ -54,6 +61,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } @@ -178,44 +186,33 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { return true; } -/// processCmp - If the value of this comparison could be determined locally, -/// constant propagation would already have figured it out. Instead, walk -/// the predecessors and statically evaluate the comparison based on information -/// available on that edge. If a given static evaluation is true on ALL -/// incoming edges, then it's true universally and we can simplify the compare. +/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, +/// or range information is sufficient to prove this comparison. Even for +/// local conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information. bool CorrelatedValuePropagation::processCmp(CmpInst *C) { Value *Op0 = C->getOperand(0); - if (isa<Instruction>(Op0) && - cast<Instruction>(Op0)->getParent() == C->getParent()) - return false; - Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; - pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); - if (PI == PE) return false; + // As a policy choice, we choose not to waste compile time on anything where + // the comparison is testing local values. While LVI can sometimes reason + // about such cases, it's not its primary purpose. We do make sure to do + // the block local query for uses from terminator instructions, but that's + // handled in the code for each terminator. + auto *I = dyn_cast<Instruction>(Op0); + if (I && I->getParent() == C->getParent()) + return false; - LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C); if (Result == LazyValueInfo::Unknown) return false; - ++PI; - while (PI != PE) { - LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); - if (Res != Result) return false; - ++PI; - } - ++NumCmps; - if (Result == LazyValueInfo::True) C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); else C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - C->eraseFromParent(); return true; @@ -307,6 +304,59 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { return Changed; } +/// processCallSite - Infer nonnull attributes for the arguments at the +/// specified callsite. 
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) { + SmallVector<unsigned, 4> Indices; + unsigned ArgNo = 0; + + for (Value *V : CS.args()) { + PointerType *Type = dyn_cast<PointerType>(V->getType()); + + if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, + ConstantPointerNull::get(Type), + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); + ArgNo++; + } + + assert(ArgNo == CS.arg_size() && "sanity check"); + + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; +} + +Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { + if (Constant *C = LVI->getConstant(V, At->getParent(), At)) + return C; + + // TODO: The following really should be sunk inside LVI's core algorithm, or + // at least the outer shims around such. + auto *C = dyn_cast<CmpInst>(V); + if (!C) return nullptr; + + Value *Op0 = C->getOperand(0); + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return nullptr; + + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At); + if (Result == LazyValueInfo::Unknown) + return nullptr; + + return (Result == LazyValueInfo::True) ? + ConstantInt::getTrue(C->getContext()) : + ConstantInt::getFalse(C->getContext()); +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -318,7 +368,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { bool BBChanged = false; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { - Instruction *II = BI++; + Instruction *II = &*BI++; switch (II->getOpcode()) { case Instruction::Select: BBChanged |= processSelect(cast<SelectInst>(II)); @@ -334,6 +384,10 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Store: BBChanged |= processMemAccess(II); break; + case Instruction::Call: + case Instruction::Invoke: + BBChanged |= processCallSite(CallSite(II)); + break; } } @@ -342,7 +396,21 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Switch: BBChanged |= processSwitch(cast<SwitchInst>(Term)); break; + case Instruction::Ret: { + auto *RI = cast<ReturnInst>(Term); + // Try to determine the return value if we can. This is mainly here to + // simplify the writing of unit tests, but also helps to enable IPO by + // constant folding the return values of callees. 
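//===----------------------------------------------------------------------===//
// Editor's note: processCallSite above reduces nonnull inference to a single
// LazyValueInfo query: an argument is provably nonnull when "arg == null"
// folds to False at the call site. The same query in isolation (an
// illustrative helper; it assumes LVI was obtained from the pass as above):

#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isProvablyNonNull(LazyValueInfo *LVI, Value *V, Instruction *At) {
  auto *PT = dyn_cast<PointerType>(V->getType());
  if (!PT)
    return false; // Only pointer values can carry nonnull.
  // If "V == null" is known false at 'At', then V is nonnull there.
  return LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
                             ConstantPointerNull::get(PT),
                             At) == LazyValueInfo::False;
}
//===----------------------------------------------------------------------===//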
+ auto *RetVal = RI->getReturnValue(); + if (!RetVal) break; // handle "ret void" + if (isa<Constant>(RetVal)) break; // nothing to do + if (auto *C = getConstantAt(RetVal, RI)) { + ++NumReturns; + RI->replaceUsesOfWith(RetVal, C); + BBChanged = true; + } } + }; FnChanged |= BBChanged; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 3b262a2..b67c3c7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" @@ -46,7 +47,7 @@ namespace { TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; @@ -92,6 +93,34 @@ namespace { char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) +static bool DCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + ++DCEEliminated; + return true; + } + return false; +} + bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -99,39 +128,24 @@ bool DCE::runOnFunction(Function &F) { auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - // Start out with all of the instructions in the worklist... - std::vector<Instruction*> WorkList; - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) - WorkList.push_back(&*i); - - // Loop over the worklist finding instructions that are dead. If they are - // dead make them drop all of their uses, making other instructions - // potentially dead, and work until the worklist is empty. - // bool MadeChange = false; + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) { + Instruction *I = &*FI; + ++FI; + + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. + if (!WorkList.count(I)) + MadeChange |= DCEInstruction(I, WorkList, TLI); + } + while (!WorkList.empty()) { - Instruction *I = WorkList.back(); - WorkList.pop_back(); - - if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. 
- // Loop over all of the values that the instruction uses, if there are - // instructions being used, add them to the worklist, because they might - // go dead after this one is removed. - // - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *Used = dyn_cast<Instruction>(*OI)) - WorkList.push_back(Used); - - // Remove the instruction. - I->eraseFromParent(); - - // Remove the instruction from the worklist if it still exists in it. - WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), - WorkList.end()); - - MadeChange = true; - ++DCEEliminated; - } + Instruction *I = WorkList.pop_back_val(); + MadeChange |= DCEInstruction(I, WorkList, TLI); } return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index c505584..36ad0a5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -40,6 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "dse" +STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -59,23 +61,24 @@ namespace { if (skipOptnoneFunction(F)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = AA->getTargetLibraryInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &I : F) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
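//===----------------------------------------------------------------------===//
// Editor's note: the DCE rewrite above replaces "seed the worklist with every
// instruction" by a single sweep that only enqueues operands which may have
// just become dead; the deduplicating SmallSetVector also removes the old
// linear erase-from-worklist scan. A compact sketch of that driver under the
// same assumptions (runSimpleDCE is an illustrative name; the TLI parameter
// is omitted for brevity):

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static bool runSimpleDCE(Function &F) {
  bool Changed = false;
  SmallSetVector<Instruction *, 16> WorkList;
  for (inst_iterator It = inst_begin(F), E = inst_end(F); It != E;) {
    Instruction *I = &*It++; // Advance first: I may be erased below.
    if (!WorkList.count(I) && isInstructionTriviallyDead(I)) {
      for (Value *OpV : I->operand_values())
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          WorkList.insert(OpI); // May become dead once I is gone.
      I->eraseFromParent();
      Changed = true;
    }
  }
  // Drain instructions that became dead while their operands were deleted.
  while (!WorkList.empty()) {
    Instruction *I = WorkList.pop_back_val();
    if (isInstructionTriviallyDead(I)) {
      for (Value *OpV : I->operand_values())
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (OpI != I)
            WorkList.insert(OpI);
      I->eraseFromParent();
      Changed = true;
    }
  }
  return Changed;
}
//===----------------------------------------------------------------------===//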
- if (DT->isReachableFromEntry(I)) - Changed |= runOnBasicBlock(*I); + if (DT->isReachableFromEntry(&I)) + Changed |= runOnBasicBlock(I); AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } bool runOnBasicBlock(BasicBlock &BB); + bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, @@ -85,10 +88,11 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -97,8 +101,10 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -115,7 +121,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -140,7 +146,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) + if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } @@ -153,7 +159,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -170,20 +176,20 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - if (TLI && TLI->has(LibFunc::strcpy) && - F->getName() == TLI->getName(LibFunc::strcpy)) { + if (TLI.has(LibFunc::strcpy) && + F->getName() == TLI.getName(LibFunc::strcpy)) { return true; } - if (TLI && TLI->has(LibFunc::strncpy) && - F->getName() == TLI->getName(LibFunc::strncpy)) { + if (TLI.has(LibFunc::strncpy) && + F->getName() == TLI.getName(LibFunc::strncpy)) { return true; } - if (TLI && TLI->has(LibFunc::strcat) && - F->getName() == TLI->getName(LibFunc::strcat)) { + if (TLI.has(LibFunc::strcat) && + F->getName() == TLI.getName(LibFunc::strcat)) { return true; } - if (TLI && TLI->has(LibFunc::strncat) && - F->getName() == TLI->getName(LibFunc::strncat)) { + if (TLI.has(LibFunc::strncat) && + F->getName() == TLI.getName(LibFunc::strncat)) { return true; } } @@ -224,9 +230,9 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static MemoryLocation getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && - "Unknown instruction case"); +static MemoryLocation getLocForRead(Instruction *Inst, + const TargetLibraryInfo &TLI) { + assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -313,9 +319,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { uint64_t Size; - if (getObjectSize(V, Size, DL, TLI)) + if (getObjectSize(V, Size, DL, &TLI)) return Size; return MemoryLocation::UnknownSize; } @@ -336,7 +342,7 @@ namespace { static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -442,10 +448,12 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, /// because the DSE inducing instruction may be a self-read. static bool isPossibleSelfRead(Instruction *Inst, const MemoryLocation &InstStoreLoc, - Instruction *DepWrite, AliasAnalysis &AA) { + Instruction *DepWrite, + const TargetLibraryInfo &TLI, + AliasAnalysis &AA) { // Self reads can only happen for instructions that read memory. Get the // location read. - MemoryLocation InstReadLoc = getLocForRead(Inst, AA); + MemoryLocation InstReadLoc = getLocForRead(Inst, TLI); if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. @@ -459,7 +467,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Here we don't know if A/B may alias, but we do know that B/B are must // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one. 
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, AA); + MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; @@ -475,11 +483,12 @@ static bool isPossibleSelfRead(Instruction *Inst, //===----------------------------------------------------------------------===// bool DSE::runOnBasicBlock(BasicBlock &BB) { + const DataLayout &DL = BB.getModule()->getDataLayout(); bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst, TLI)) { @@ -488,42 +497,68 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst, TLI)) - continue; - - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) + if (!hasMemoryWrite(Inst, *TLI)) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + + auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { + // DeleteDeadInstruction can delete the current instruction. Save BBI + // in case we need it. + WeakVH NextInst(&*BBI); + + DeleteDeadInstruction(DeadInst, *MD, *TLI); + + if (!NextInst) // Next instruction deleted. + BBI = BB.begin(); + else if (BBI != BB.begin()) // Revisit this instruction if possible. + --BBI; + ++NumRedundantStores; + MadeChange = true; + }; + + if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && isRemovable(SI)) { + isRemovable(SI) && + MemoryIsNotModifiedBetween(DepLoad, SI)) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(BBI); + RemoveDeadInstAndUpdateBBI(SI); + continue; + } + } - DeleteDeadInstruction(SI, *MD, TLI); + // Remove null stores into the calloc'ed objects + Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); - if (!NextInst) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; - ++NumFastStores; - MadeChange = true; + if (StoredConstant && StoredConstant->isNullValue() && + isRemovable(SI)) { + Instruction *UnderlyingPointer = dyn_cast<Instruction>( + GetUnderlyingObject(SI->getPointerOperand(), DL)); + + if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && + MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { + DEBUG(dbgs() + << "DSE: Remove null store to the calloc'ed object:\n DEAD: " + << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); + + RemoveDeadInstAndUpdateBBI(SI); continue; } } } + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + // Figure out what location is being stored to. 
MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -549,24 +584,22 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - const DataLayout &DL = BB.getModule()->getDataLayout(); OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), - DepWriteOffset, InstWriteOffset); + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, TLI); + DeleteDeadInstruction(DepWrite, *MD, *TLI); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. - BBI = Inst; + BBI = Inst->getIterator(); if (BBI != BB.begin()) --BBI; break; @@ -609,10 +642,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) break; - InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); } } @@ -624,6 +658,64 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } +/// Returns true if the memory which is accessed by the second instruction is not +/// modified between the first and the second instruction. +/// Precondition: Second instruction must be dominated by the first +/// instruction. +bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, + Instruction *SecondI) { + SmallVector<BasicBlock *, 16> WorkList; + SmallPtrSet<BasicBlock *, 8> Visited; + BasicBlock::iterator FirstBBI(FirstI); + ++FirstBBI; + BasicBlock::iterator SecondBBI(SecondI); + BasicBlock *FirstBB = FirstI->getParent(); + BasicBlock *SecondBB = SecondI->getParent(); + MemoryLocation MemLoc = MemoryLocation::get(SecondI); + + // Start checking the store-block. + WorkList.push_back(SecondBB); + bool isFirstBlock = true; + + // Check all blocks going backward until we reach the load-block. + while (!WorkList.empty()) { + BasicBlock *B = WorkList.pop_back_val(); + + // Ignore instructions before LI if this is the FirstBB. + BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin()); + + BasicBlock::iterator EI; + if (isFirstBlock) { + // Ignore instructions after SI if this is the first visit of SecondBB. + assert(B == SecondBB && "first block is not the store block"); + EI = SecondBBI; + isFirstBlock = false; + } else { + // It's not SecondBB or (in case of a loop) the second visit of SecondBB. + // In this case we also have to look at instructions after SI. 
+ EI = B->end(); + } + for (; BI != EI; ++BI) { + Instruction *I = &*BI; + if (I->mayWriteToMemory() && I != SecondI) { + auto Res = AA->getModRefInfo(I, MemLoc); + if (Res != MRI_NoModRef) + return false; + } + } + if (B != FirstBB) { + assert(B != &FirstBB->getParent()->getEntryBlock() && + "Should not hit the entry block because SI must be dominated by LI"); + for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { + if (!Visited.insert(*PredI).second) + continue; + WorkList.push_back(*PredI); + } + } + } + return true; +} + /// Find all blocks that will unconditionally lead to the block BB and append /// them to F. static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, @@ -655,10 +747,11 @@ bool DSE::HandleFree(CallInst *F) { Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; - MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); + MemDepResult Dep = + MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -668,10 +761,10 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = std::next(BasicBlock::iterator(Dependency)); + auto Next = ++Dependency->getIterator(); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, TLI); + DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; @@ -704,23 +797,22 @@ bool DSE::handleEndBlock(BasicBlock &BB) { SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. - BasicBlock *Entry = BB.getParent()->begin(); - for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (isa<AllocaInst>(I)) - DeadStackObjects.insert(I); + BasicBlock &Entry = BB.getParent()->front(); + for (Instruction &I : Entry) { + if (isa<AllocaInst>(&I)) + DeadStackObjects.insert(&I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) - DeadStackObjects.insert(I); + else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true)) + DeadStackObjects.insert(&I); } // Treat byval or inalloca arguments the same, stores to them are dead at the // end of the function. - for (Function::arg_iterator AI = BB.getParent()->arg_begin(), - AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValOrInAllocaAttr()) - DeadStackObjects.insert(AI); + for (Argument &AI : BB.getParent()->args()) + if (AI.hasByValOrInAllocaAttr()) + DeadStackObjects.insert(&AI); const DataLayout &DL = BB.getModule()->getDataLayout(); @@ -729,10 +821,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { + if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); + GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. 
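The walk MemoryIsNotModifiedBetween performs generalizes to any CFG: scan backward from the second instruction across all paths until the first is reached, failing on any potential clobber. A self-contained toy model follows, with illustrative names and an integer-indexed CFG standing in for the LLVM structures:

#include <cassert>
#include <vector>

struct Block {
  std::vector<bool> MayWrite; // one flag per instruction in the block
  std::vector<int> Preds;     // indices of predecessor blocks
};

// Returns true if no instruction strictly between First and Second (on any
// path) may write memory. First must dominate Second, as in the real code.
bool notModifiedBetween(const std::vector<Block> &CFG, int FirstBB,
                        int FirstIdx, int SecondBB, int SecondIdx) {
  std::vector<int> WorkList{SecondBB};
  std::vector<bool> Visited(CFG.size(), false);
  bool IsFirstVisitOfSecondBB = true;
  while (!WorkList.empty()) {
    int B = WorkList.back();
    WorkList.pop_back();
    // Skip everything up to and including First in its own block.
    int Begin = (B == FirstBB) ? FirstIdx + 1 : 0;
    // On the first visit of Second's block stop at Second itself; a revisit
    // (via a loop) means the whole block lies on a path and is scanned fully.
    int End = (IsFirstVisitOfSecondBB && B == SecondBB)
                  ? SecondIdx
                  : (int)CFG[B].MayWrite.size();
    IsFirstVisitOfSecondBB = false;
    for (int I = Begin; I < End; ++I)
      if (CFG[B].MayWrite[I])
        return false; // a potential clobber on some path
    if (B != FirstBB) // keep walking backward until the first block
      for (int P : CFG[B].Preds)
        if (!Visited[P]) {
          Visited[P] = true;
          WorkList.push_back(P);
        }
  }
  return true;
}

int main() {
  // B0 holds First at index 0 and feeds B1, which holds Second at index 1.
  std::vector<Block> CFG(2);
  CFG[0] = {{false, false}, {}};
  CFG[1] = {{false, false}, {0}};
  assert(notModifiedBetween(CFG, 0, 0, 1, 1));
  CFG[1].MayWrite[0] = true; // introduce a may-write between the two
  assert(!notModifiedBetween(CFG, 0, 0, 1, 1));
}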
bool AllDead = true; @@ -744,7 +836,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } if (AllDead) { - Instruction *Dead = BBI++; + Instruction *Dead = &*BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Objects: "; @@ -757,7 +849,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -765,9 +857,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI, TLI)) { - Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); + if (isInstructionTriviallyDead(&*BBI, TLI)) { + Instruction *Inst = &*BBI++; + DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -776,15 +868,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (isa<AllocaInst>(BBI)) { // Remove allocas from the list of dead stack objects; there can't be // any references before the definition. - DeadStackObjects.remove(BBI); + DeadStackObjects.remove(&*BBI); continue; } - if (auto CS = CallSite(BBI)) { + if (auto CS = CallSite(&*BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, TLI)) - DeadStackObjects.remove(BBI); + if (isAllocLikeFn(&*BBI, TLI)) + DeadStackObjects.remove(&*BBI); // If this call does not access memory, it can't be loading any of our // pointers. @@ -795,10 +887,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = AA->getModRefInfo( - CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); + ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + return A == MRI_ModRef || A == MRI_Ref; }); // If all of the allocas were clobbered by the call then we're not going @@ -864,8 +955,7 @@ void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, - getPointerSize(I, DL, AA->getTargetLibraryInfo())); + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 029b44c..7ef062e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,7 +264,6 @@ namespace { /// expected that a later pass of GVN will catch the interesting/hard cases. class EarlyCSE { public: - Function &F; const TargetLibraryInfo &TLI; const TargetTransformInfo &TTI; DominatorTree &DT; @@ -281,20 +281,37 @@ public: /// that dominated values can succeed in their lookup. 
ScopedHTType AvailableValues; - /// \brief A scoped hash table of the current values of loads. + /// A scoped hash table of the current values of previously encountered memory + /// locations. /// - /// This allows us to get efficient access to dominating loads when we have - /// a fully redundant load. In addition to the most recent load, we keep - /// track of a generation count of the read, which is compared against the - /// current generation count. The current generation count is incremented + /// This allows us to get efficient access to dominating loads or stores when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is incremented /// after every possibly writing memory operation, which ensures that we only - /// CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator< - BumpPtrAllocator, - ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + /// CSE loads with other loads that have no intervening store. Ordering + /// events (such as fences or atomic instructions) increment the generation + /// count as well; essentially, we model these as writes to all possible + /// locations. Note that atomic and/or volatile loads and stores can be + /// present in the table; it is the responsibility of the consumer to inspect + /// the atomicity/volatility if needed. + struct LoadValue { + Value *Data; + unsigned Generation; + int MatchingId; + bool IsAtomic; + LoadValue() + : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} + LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, + bool IsAtomic) + : Data(Data), Generation(Generation), MatchingId(MatchingId), + IsAtomic(IsAtomic) {} + }; + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value *, LoadValue>> LoadMapAllocator; - typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, - DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, + LoadMapAllocator> LoadHTType; LoadHTType AvailableLoads; /// \brief A scoped hash table of the current values of read-only call @@ -308,10 +325,9 @@ public: unsigned CurrentGeneration; /// \brief Set up the EarlyCSE runner for a particular function.
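A toy model of the generation scheme documented above, stripped of scoping, matching ids, and atomicity; names are illustrative and this is not the pass's data structure:

#include <iostream>
#include <map>
#include <string>

// Stand-in for LoadValue; the real table is a scoped hash table keyed on
// pointer operands, with matching ids and atomicity flags as well.
struct LoadValueSketch {
  int Data;
  unsigned Generation;
};

int main() {
  std::map<std::string, LoadValueSketch> AvailableLoads;
  unsigned CurrentGeneration = 0;

  AvailableLoads["%p"] = {42, CurrentGeneration}; // %v1 = load %p
  LoadValueSketch Hit = AvailableLoads["%p"];     // %v2 = load %p
  std::cout << (Hit.Generation == CurrentGeneration) << '\n'; // 1: CSE ok

  ++CurrentGeneration; // a store, fence, or atomic operation was seen
  Hit = AvailableLoads["%p"];
  std::cout << (Hit.Generation == CurrentGeneration) << '\n'; // 0: stale
}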
- EarlyCSE(Function &F, const TargetLibraryInfo &TLI, - const TargetTransformInfo &TTI, DominatorTree &DT, - AssumptionCache &AC) - : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} + EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, + DominatorTree &DT, AssumptionCache &AC) + : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} bool run(); @@ -382,57 +398,91 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Load = true; - Vol = !LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa<LoadInst>(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa<StoreInst>(Inst); + } + bool isAtomic() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + return Inst->isAtomic(); + } + bool isUnordered() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return true; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isUnordered(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->isUnordered(); + } + // Conservative answer + return !Inst->isAtomic(); + } + + bool isVolatile() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - Store = true; - Vol = !SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isVolatile(); } + // Conservative answer + return true; } - bool isLoad() { return Load; } - bool isStore() { return Store; } - bool isVolatile() { return Vol; } - bool isMatchingMemLoc(const ParseMemoryInst &Inst) { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + + + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() { return Ptr != nullptr; } - int getMatchingId() { return MatchingId; } - Value *getPtr() { return Ptr; } - bool mayReadFromMemory() { return MayReadFromMemory; } - bool mayWriteToMemory() { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool Vol; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. 
That field contains // non-negative values only. - int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { + if (IsTargetMemInst) return Info.ReadMem; + return Inst->mayReadFromMemory(); + } + bool mayWriteToMemory() const { + if (IsTargetMemInst) return Info.WriteMem; + return Inst->mayWriteToMemory(); + } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -497,7 +547,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Dead instructions should just be removed. if (isInstructionTriviallyDead(Inst, &TLI)) { @@ -548,24 +598,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. if (MemInst.isValid() && MemInst.isLoad()) { - // Ignore volatile loads. - if (MemInst.isVolatile()) { + // (conservatively) we can't peak past the ordering implied by this + // operation, but we can add this load to our set of available values + if (MemInst.isVolatile() || !MemInst.isUnordered()) { LastStore = nullptr; - // Don't CSE across synchronization boundaries. - if (Inst->mayWriteToMemory()) - ++CurrentGeneration; - continue; + ++CurrentGeneration; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. - std::pair<Value *, unsigned> InVal = - AvailableLoads.lookup(MemInst.getPtr()); - if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing loads with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered() && + // We can't replace an atomic load with one which isn't also atomic. + InVal.IsAtomic >= MemInst.isAtomic()) { + Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst - << " to: " << *InVal.first << '\n'); + << " to: " << *InVal.Data << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); Inst->eraseFromParent(); @@ -576,8 +628,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); LastStore = nullptr; continue; } @@ -613,6 +667,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // A release fence requires that all stores complete before it, but does + // not prevent the reordering of following loads 'before' the fence. 
As a + // result, we don't need to consider it as writing to memory and don't need + // to advance the generation. We do need to prevent DSE across the fence, + // but that's handled above. + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + if (FI->getOrdering() == Release) { + assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above"); + continue; + } + + // write back DSE - If we write back the same value we just loaded from + // the same location and haven't passed any intervening writes or ordering + // operations, we can remove the write. The primary benefit is in allowing + // the available load table to remain valid and value forward past where + // the store originally was. + if (MemInst.isValid() && MemInst.isStore()) { + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data && + InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) && + InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing stores with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered()) { + assert((!LastStore || + ParseMemoryInst(LastStore, TTI).getPointerOperand() == + MemInst.getPointerOperand()) && + "can't have an intervening store!"); + DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumDSE; + // We can avoid incrementing the generation count since we were able + // to eliminate this store. + continue; + } + } + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. @@ -622,8 +714,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. + // At the moment, we don't remove ordered stores, but do remove + // unordered atomic stores. There's no special requirement (for + // unordered atomics) about removing atomic stores only in favor of + // other atomic stores since we we're going to execute the non-atomic + // one anyway and the atomic one might never have become visible. if (LastStore) { ParseMemoryInst LastStoreMemInst(LastStore, TTI); + assert(LastStoreMemInst.isUnordered() && + !LastStoreMemInst.isVolatile() && + "Violated invariant"); if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " << *Inst << '\n'); @@ -640,12 +740,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // version of the pointer. It is safe to forward from volatile stores // to non-volatile loads, so we don't have to check for volatility of // the store. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); - - // Remember that this was the last store we saw for DSE. - if (!MemInst.isVolatile()) + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); + + // Remember that this was the last unordered store we saw for DSE. We + // don't yet handle DSE on ordered or volatile stores since we don't + // have a good way to model the ordering requirement for following + // passes once the store is removed. 
We could insert a fence, but + // since fences are slightly stronger than stores in their ordering, + // it's not clear this is a profitable transform. Another option would + // be to merge the ordering with that of the post dominating store. + if (MemInst.isUnordered() && !MemInst.isVolatile()) LastStore = Inst; + else + LastStore = nullptr; } } } @@ -714,7 +824,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto &DT = AM->getResult<DominatorTreeAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); if (!CSE.run()) return PreservedAnalyses::all(); @@ -751,7 +861,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); return CSE.run(); } @@ -761,6 +871,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 0430c18..185cdbd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -30,7 +30,7 @@ public: bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } private: @@ -41,7 +41,7 @@ private: char FlattenCFGPass::ID = 0; INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) @@ -59,7 +59,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(BBIt++, AA)) { + if (FlattenCFG(&*BBIt++, AA)) { LocalChange = true; } } @@ -69,7 +69,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } bool FlattenCFGPass::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. while (iterativelyFlattenCFG(F, AA)) { diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index c931422..7f5d786 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" @@ -41,7 +43,7 @@ using namespace llvm; // integer domain inputs, produce an integer output; fadd, for example. // // If a non-mappable instruction is seen, this entire def-use graph is marked -// as non-transformable. If we see an instruction that converts from the +// as non-transformable. 
If we see an instruction that converts from the // integer domain to FP domain (uitofp,sitofp), we terminate our walk. /// The largest integer type worth dealing with. @@ -60,6 +62,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); @@ -82,7 +85,9 @@ namespace { } char Float2Int::ID = 0; -INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false) // Given a FCmp predicate, return a matching ICmp predicate if one // exists, otherwise return BAD_ICMP_PREDICATE. @@ -125,7 +130,9 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { + if (isa<VectorType>(I.getType())) + continue; switch (I.getOpcode()) { default: break; case Instruction::FPToUI: @@ -133,7 +140,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { Roots.insert(&I); break; case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != CmpInst::BAD_ICMP_PREDICATE) Roots.insert(&I); break; @@ -176,7 +183,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) { // - walkForwards: Iterate over SeenInsts in reverse order, so we visit // defs before their uses. Calculate the real range info. -// Breadth-first walk of the use-def graph; determine the set of nodes +// Breadth-first walk of the use-def graph; determine the set of nodes // we care about and eagerly determine if some of them are poisonous. void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); @@ -222,14 +229,14 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { seen(I, unknownRange()); break; } - + for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { // Unify def-use chains if they interfere. ECs.unionSets(I, OI); - if (SeenInsts.find(I)->second != badRange()) + if (SeenInsts.find(I)->second != badRange()) Worklist.push_back(OI); - } else if (!isa<ConstantFP>(O)) { + } else if (!isa<ConstantFP>(O)) { // Not an instruction or ConstantFP? we can't do anything. seen(I, badRange()); } @@ -240,11 +247,11 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { // Walk forwards down the list of seen instructions, so we visit defs before // uses. void Float2Int::walkForwards() { - for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { - if (It->second != unknownRange()) + for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { + if (It.second != unknownRange()) continue; - Instruction *I = It->first; + Instruction *I = It.first; std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; switch (I->getOpcode()) { // FIXME: Handle select and phi nodes. 
@@ -299,7 +306,7 @@ void Float2Int::walkForwards() { for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); + "def not seen before use!"); OpRanges.push_back(SeenInsts.find(OI)->second); } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { // Work out if the floating point number can be losslessly represented @@ -314,11 +321,11 @@ void Float2Int::walkForwards() { APFloat F = CF->getValueAPF(); // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless + // can't be represented and neither can negative zero, unless // we're in fast math mode. if (!F.isFinite() || (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && - !I->hasNoSignedZeros())) { + !I->hasNoSignedZeros())) { seen(I, badRange()); Abort = true; break; @@ -345,7 +352,7 @@ void Float2Int::walkForwards() { // Reduce the operands' ranges to a single range and return. if (!Abort) - seen(I, Op(OpRanges)); + seen(I, Op(OpRanges)); } } @@ -395,7 +402,7 @@ bool Float2Int::validateAndTransform() { R.isFullSet() || R.isSignWrappedSet()) continue; assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); - + // The number of bits required is the maximum of the upper and // lower limits, plus one so it can be signed. unsigned MinBW = std::max(R.getLower().getMinSignedBits(), @@ -505,9 +512,8 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) { // Perform dead code elimination on the instructions we just modified. void Float2Int::cleanup() { - for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); - I != E; ++I) - I->first->eraseFromParent(); + for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) + I.first->eraseFromParent(); } bool Float2Int::runOnFunction(Function &F) { @@ -534,7 +540,4 @@ bool Float2Int::runOnFunction(Function &F) { return Modified; } -FunctionPass *llvm::createFloat2IntPass() { - return new Float2Int(); -} - +FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 89a0d0a..a028b8c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -128,6 +129,7 @@ namespace { uint32_t lookup(Value *V) const; uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); void erase(Value *v); @@ -388,6 +390,9 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } } +/// Returns true if a value number exists for the specified value. +bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } + /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t ValueTable::lookup_or_add(Value *V) { @@ -608,6 +613,10 @@ namespace { DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; + // Block-local map of equivalent values to their leader, does not + // propagate to any successors. 
Entries added mid-block are applied + // to the remaining instructions in the block. + SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap; SmallVector<Instruction*, 8> InstrsToErase; typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; @@ -689,16 +698,17 @@ namespace { AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - // Helper fuctions of redundant load elimination + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); bool processNonLocalLoad(LoadInst *L); + bool processAssumeIntrinsic(IntrinsicInst *II); void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); @@ -719,7 +729,9 @@ namespace { void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool replaceOperandsWithConsts(Instruction *I) const; + bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge); bool processFoldableCondBr(BranchInst *BI); void addDeadBlock(BasicBlock *BB); void assignValNumForDeadCode(); @@ -738,7 +750,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1290,8 +1303,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; + for (const AvailableValueInBlock &AV : ValuesPerBlock) { BasicBlock *BB = AV.BB; if (SSAUpdate.HasValueForBlock(BB)) @@ -1301,24 +1313,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, } // Perform PHI construction. - Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - - // If new PHI nodes were created, notify alias analysis. - if (V->getType()->getScalarType()->isPointerTy()) { - AliasAnalysis *AA = gvn.getAliasAnalysis(); - - // Scan the new PHIs and inform alias analysis that we've added potentially - // escaping uses to any values that are operands to these PHIs. - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { - PHINode *P = NewPHIs[i]; - for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - AA->addEscapingUse(P->getOperandUse(jj)); - } - } - } - - return V; + return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); } Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, @@ -1518,9 +1513,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // that we only have to insert *one* load (which means we're basically moving // the load, not inserting a new one). 
- SmallPtrSet<BasicBlock *, 4> Blockers; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - Blockers.insert(UnavailableBlocks[i]); + SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(), + UnavailableBlocks.end()); // Let's find the first basic block with more than one predecessor. Walk // backwards through predecessors if needed. @@ -1550,15 +1544,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // available. MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) - FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - FullyAvailableBlocks[UnavailableBlocks[i]] = false; + for (const AvailableValueInBlock &AV : ValuesPerBlock) + FullyAvailableBlocks[AV.BB] = true; + for (BasicBlock *UnavailableBB : UnavailableBlocks) + FullyAvailableBlocks[UnavailableBB] = false; SmallVector<BasicBlock *, 4> CriticalEdgePred; - for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); - PI != E; ++PI) { - BasicBlock *Pred = *PI; + for (BasicBlock *Pred : predecessors(LoadBB)) { + // If any predecessor block is an EH pad that does not allow non-PHI + // instructions before the terminator, we can't PRE the load. + if (Pred->getTerminator()->isEHPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } @@ -1570,9 +1571,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - if (LoadBB->isLandingPad()) { + if (LoadBB->isEHPad()) { DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1655,12 +1656,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, << *NewInsts.back() << '\n'); // Assign value numbers to the new instructions. - for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + for (Instruction *I : NewInsts) { // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. - VN.lookup_or_add(NewInsts[i]); + VN.lookup_or_add(I); } for (const auto &PredLoad : PredLoads) { @@ -1677,6 +1678,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (Tags) NewLoad->setAAMetadata(Tags); + if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load)) + NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); + if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) + NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); + // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1704,6 +1710,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { + // non-local speculations are not allowed under asan. + if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + return false; + // Step 1: Find the non-local dependencies of the load. 
LoadDepVect Deps; MD->getNonLocalPointerDependency(LI, Deps); @@ -1777,6 +1787,63 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { + assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && + "This function can only be called with llvm.assume intrinsic"); + Value *V = IntrinsicI->getArgOperand(0); + + if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { + if (Cond->isZero()) { + Type *Int8Ty = Type::getInt8Ty(V->getContext()); + // Insert a new store to null instruction before the load to indicate that + // this code is not reachable. FIXME: We could insert unreachable + // instruction directly because we can modify the CFG. + new StoreInst(UndefValue::get(Int8Ty), + Constant::getNullValue(Int8Ty->getPointerTo()), + IntrinsicI); + } + markInstructionForDeletion(IntrinsicI); + return false; + } + + Constant *True = ConstantInt::getTrue(V->getContext()); + bool Changed = false; + + for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { + BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); + + // This property is only true in dominated successors, propagateEquality + // will check dominance for us. + Changed |= propagateEquality(V, True, Edge, false); + } + + // We can replace assume value with true, which covers cases like this: + // call void @llvm.assume(i1 %cmp) + // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true + ReplaceWithConstMap[V] = True; + + // If one of *cmp *eq operand is const, adding it to map will cover this: + // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen + // call void @llvm.assume(i1 %cmp) + // ret float %0 ; will change it to ret float 3.000000e+00 + if (auto *CmpI = dyn_cast<CmpInst>(V)) { + if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || + CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + CmpI->getFastMathFlags().noNaNs())) { + Value *CmpLHS = CmpI->getOperand(0); + Value *CmpRHS = CmpI->getOperand(1); + if (isa<Constant>(CmpLHS)) + std::swap(CmpLHS, CmpRHS); + auto *RHSConst = dyn_cast<Constant>(CmpRHS); + + // If only one operand is constant. + if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) + ReplaceWithConstMap[CmpLHS] = RHSConst; + } + } + return Changed; +} static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value @@ -1789,7 +1856,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { // FIXME: If both the original and replacement value are part of the // same control-flow region (meaning that the execution of one - // guarentees the executation of the other), then we can combine the + // guarantees the execution of the other), then we can combine the // noalias scopes here and do better than the general conservative // answer used in combineMetadata(). @@ -1797,13 +1864,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { // regions, and so we need a conservative combination of the noalias // scopes. 
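A toy model of the ReplaceWithConstMap mechanism introduced above: within one block, operands that llvm.assume proved equal to a constant are rewritten before each instruction is processed. The representation and names here are illustrative only:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// One "instruction" with named operands; purely a stand-in for llvm::Instruction.
struct Inst {
  std::string Op;
  std::vector<std::string> Operands;
};

// Rewrite any operand that the block-local map proved equal to a constant.
bool replaceOperandsWithConsts(Inst &I,
                               const std::map<std::string, std::string> &M) {
  bool Changed = false;
  for (std::string &Operand : I.Operands) {
    auto It = M.find(Operand);
    if (It != M.end()) {
      Operand = It->second; // e.g. %x -> 3.0 after assume(fcmp oeq %x, 3.0)
      Changed = true;
    }
  }
  return Changed;
}

int main() {
  std::map<std::string, std::string> ReplaceWithConstMap{{"%x", "3.0"}};
  Inst Ret{"ret", {"%x"}};
  replaceOperandsWithConsts(Ret, ReplaceWithConstMap);
  std::cout << Ret.Op << ' ' << Ret.Operands[0] << '\n'; // prints: ret 3.0
}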
static const unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; combineMetadata(ReplInst, I, KnownIDs); } } @@ -1890,10 +1954,8 @@ bool GVN::processLoad(LoadInst *L) { ++NumGVNLoad; return true; } - } - // If the value isn't available, don't do anything! - if (Dep.isClobber()) { + // If the value isn't available, don't do anything! DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -2049,11 +2111,31 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +// Tries to replace instruction with const, using information from +// ReplaceWithConstMap. +bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { + bool Changed = false; + for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceWithConstMap.find(Operand); + if (it != ReplaceWithConstMap.end()) { + assert(!isa<Constant>(Operand) && + "Replacing constants with constants is invalid"); + DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second + << " in instruction " << *Instr << '\n'); + Instr->setOperand(OpNum, it->second); + Changed = true; + } + } + return Changed; +} + /// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -bool GVN::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root) { +/// If DominatesByEdge is false, then it means that it is dominated by Root.End. +bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2065,11 +2147,13 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::pair<Value*, Value*> Item = Worklist.pop_back_val(); LHS = Item.first; RHS = Item.second; - if (LHS == RHS) continue; + if (LHS == RHS) + continue; assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + continue; // Prefer a constant on the right-hand side, or an Argument if no constants. if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) @@ -2108,7 +2192,11 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); + unsigned NumReplacements = + DominatesByEdge + ? 
replaceDominatedUsesWith(LHS, RHS, *DT, Root) + : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); + Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2180,7 +2268,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, Value *NotCmp = findLeader(Root.getEnd(), Num); if (NotCmp && isa<Instruction>(NotCmp)) { unsigned NumReplacements = - replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); + DominatesByEdge + ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) + : replaceDominatedUsesWith(NotCmp, NotVal, *DT, + Root.getEnd()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2220,6 +2311,10 @@ bool GVN::processInstruction(Instruction *I) { return true; } + if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) + if (IntrinsicI->getIntrinsicID() == Intrinsic::assume) + return processAssumeIntrinsic(IntrinsicI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (processLoad(LI)) return true; @@ -2250,11 +2345,11 @@ bool GVN::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); return Changed; } @@ -2276,7 +2371,7 @@ bool GVN::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true); } } return Changed; @@ -2284,7 +2379,8 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. - if (I->getType()->isVoidTy()) return false; + if (I->getType()->isVoidTy()) + return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2306,17 +2402,21 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. - Value *repl = findLeader(I->getParent(), Num); - if (!repl) { + Value *Repl = findLeader(I->getParent(), Num); + if (!Repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; + } else if (Repl == I) { + // If I was the result of a shortcut PRE, it might already be in the table + // and the best replacement for itself. Nothing to do. + return false; } // Remove it! 
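The fast-path elimination in miniature: expressions hashing to an existing value number collapse onto the first definition seen, the leader. A standalone toy that ignores scopes, dominance, and commutativity; purely illustrative:

#include <iostream>
#include <map>
#include <string>
#include <tuple>

int main() {
  std::map<std::tuple<std::string, int, int>, int> VN; // expr -> value number
  std::map<int, std::string> Leader; // value number -> first definition seen
  int Next = 0;
  auto Number = [&](const std::string &Op, int A, int B,
                    const std::string &Name) {
    auto Key = std::make_tuple(Op, A, B);
    auto It = VN.find(Key);
    if (It != VN.end()) { // known expression: report its leader
      std::cout << Name << " -> replaced by leader " << Leader[It->second]
                << '\n';
      return It->second;
    }
    VN[Key] = Next; // fresh expression: it becomes its own leader
    Leader[Next] = Name;
    return Next++;
  };
  int X = Next++, Y = Next++; // incoming arguments get fresh numbers
  Number("add", X, Y, "%a");  // first "add X, Y": kept, becomes the leader
  Number("add", X, Y, "%b");  // identical expression: folds onto %a
}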
- patchAndReplaceAllUsesWith(I, repl); - if (MD && repl->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(repl); + patchAndReplaceAllUsesWith(I, Repl); + if (MD && Repl->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Repl); markInstructionForDeletion(I); return true; } @@ -2331,7 +2431,7 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); + VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2341,10 +2441,10 @@ bool GVN::runOnFunction(Function& F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; - bool removedBlock = MergeBlockIntoPredecessor( - BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); + bool removedBlock = + MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2382,7 +2482,6 @@ bool GVN::runOnFunction(Function& F) { return Changed; } - bool GVN::processBlock(BasicBlock *BB) { // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function // (and incrementing BI before processing an instruction). @@ -2391,11 +2490,16 @@ bool GVN::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; + // Clearing map before every BB because it can be used only for single BB. + ReplaceWithConstMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - ChangedFunction |= processInstruction(BI); + if (!ReplaceWithConstMap.empty()) + ChangedFunction |= replaceOperandsWithConsts(&*BI); + ChangedFunction |= processInstruction(&*BI); + if (InstrsToErase.empty()) { ++BI; continue; @@ -2439,7 +2543,14 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Value *Op = Instr->getOperand(i); if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) continue; - + // This could be a newly inserted instruction, in which case, we won't + // find a value number, and should give up before we hurt ourselves. + // FIXME: Rewrite the infrastructure to let it easier to value number + // and process newly inserted instructions. + if (!VN.exists(Op)) { + success = false; + break; + } if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { @@ -2499,9 +2610,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { BasicBlock *CurrentBlock = CurInst->getParent(); predMap.clear(); - for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors // that are not reachable. @@ -2570,7 +2679,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Create a PHI to make the value available in this block. 
PHINode *Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { if (Value *V = predMap[i].first) Phi->addIncoming(V, predMap[i].second); @@ -2582,18 +2691,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } - - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } + if (MD && Phi->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2616,15 +2715,15 @@ bool GVN::performPRE(Function &F) { if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) + // Don't perform PRE on an EH pad. + if (CurrentBlock->isEHPad()) continue; for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE;) { - Instruction *CurInst = BI++; - Changed = performScalarPRE(CurInst); + Instruction *CurInst = &*BI++; + Changed |= performScalarPRE(CurInst); } } @@ -2637,8 +2736,8 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { - BasicBlock *BB = SplitCriticalEdge( - Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + BasicBlock *BB = + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); if (MD) MD->invalidateCachedPredecessors(); return BB; @@ -2652,7 +2751,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + CriticalEdgeSplittingOptions(DT)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); return true; @@ -2728,17 +2827,14 @@ void GVN::addDeadBlock(BasicBlock *BB) { DeadBlocks.insert(Dom.begin(), Dom.end()); // Figure out the dominance-frontier(D). 
- for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), - E = Dom.end(); I != E; I++) { - BasicBlock *B = *I; - for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { - BasicBlock *S = *SI; + for (BasicBlock *B : Dom) { + for (BasicBlock *S : successors(B)) { if (DeadBlocks.count(S)) continue; bool AllPredDead = true; - for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) - if (!DeadBlocks.count(*PI)) { + for (BasicBlock *P : predecessors(S)) + if (!DeadBlocks.count(P)) { AllPredDead = false; break; } @@ -2766,10 +2862,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { continue; SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); - for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), - PE = Preds.end(); PI != PE; PI++) { - BasicBlock *P = *PI; - + for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; @@ -2794,7 +2887,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { // R be the target of the dead out-coming edge. // 1) Identify the set of dead blocks implied by the branch's dead outcoming // edge. The result of this step will be {X| X is dominated by R} -// 2) Identify those blocks which haves at least one dead prodecessor. The +// 2) Identify those blocks which haves at least one dead predecessor. The // result of this step will be dominance-frontier(R). // 3) Update the PHIs in DF(R) by replacing the operands corresponding to // dead blocks with "UndefVal" in an hope these PHIs will optimized away. @@ -2829,14 +2922,10 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if instruction involved is dead or not. void GVN::assignValNumForDeadCode() { - for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), - E = DeadBlocks.end(); I != E; I++) { - BasicBlock *BB = *I; - for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); - II != EE; II++) { - Instruction *Inst = &*II; - unsigned ValNum = VN.lookup_or_add(Inst); - addToLeaderTable(ValNum, Inst, BB); + for (BasicBlock *BB : DeadBlocks) { + for (Instruction &Inst : *BB) { + unsigned ValNum = VN.lookup_or_add(&Inst); + addToLeaderTable(ValNum, &Inst, BB); } } } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2a954d9..ec5e15f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -28,9 +28,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" @@ -48,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -83,64 +86,62 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue( namespace { struct RewritePhi; -} -namespace { - class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - SmallVector<WeakVH, 16> 
DeadInsts; - bool Changed; - public: - - static char ID; // Pass identification, replacement for typeid - IndVarSimplify() - : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { - initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); - } +class IndVarSimplify : public LoopPass { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.setPreservesCFG(); - } + SmallVector<WeakVH, 16> DeadInsts; + bool Changed; +public: - private: - void releaseMemory() override { - DeadInsts.clear(); - } + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool isValidRewrite(Value *FromVal, Value *ToVal); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.setPreservesCFG(); + } - void HandleFloatingPointIV(Loop *L, PHINode *PH); - void RewriteNonIntegerIVs(Loop *L); +private: + void releaseMemory() override { + DeadInsts.clear(); + } - void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); + bool isValidRewrite(Value *FromVal, Value *ToVal); - bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + void handleFloatingPointIV(Loop *L, PHINode *PH); + void rewriteNonIntegerIVs(Loop *L); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, SCEVExpander &Rewriter); + void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - void SinkUnusedInvariants(Loop *L); + bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); + void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, - Instruction *InsertPt, Type *Ty, - bool &IsHighCostExpansion); - }; + Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); + + void sinkUnusedInvariants(Loop *L); + + Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, + Instruction *InsertPt, Type *Ty); +}; } char IndVarSimplify::ID = 0; @@ -148,7 +149,7 @@ INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) 
INITIALIZE_PASS_END(IndVarSimplify, "indvars", @@ -158,10 +159,10 @@ Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); } -/// isValidRewrite - Return true if the SCEV expansion generated by the -/// rewriter can replace the original value. SCEV guarantees that it -/// produces the same value, but the way it is produced may be illegal IR. -/// Ideally, this function will only be called for verification. +/// Return true if the SCEV expansion generated by the rewriter can replace the +/// original value. SCEV guarantees that it produces the same value, but the way +/// it is produced may be illegal IR. Ideally, this function will only be +/// called for verification. bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // If an SCEV expression subsumed multiple pointers, its expansion could // reassociate the GEP changing the base pointer. This is illegal because the @@ -175,10 +176,10 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // because it understands lcssa phis while SCEV does not. Value *FromPtr = FromVal; Value *ToPtr = ToVal; - if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) { FromPtr = GEP->getPointerOperand(); } - if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) { ToPtr = GEP->getPointerOperand(); } if (FromPtr != FromVal || ToPtr != ToVal) { @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast<PHINode>(User); if (!PHI) return User; @@ -234,17 +235,28 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast<Instruction>(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// -// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +// rewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// -/// ConvertToSInt - Convert APF to an integer, if possible. +/// Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; // See if we can convert this to an int64_t @@ -256,8 +268,8 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } -/// HandleFloatingPointIV - If the loop has floating induction variable -/// then insert corresponding integer induction variable if possible. +/// If the loop has a floating-point induction variable, then insert a +/// corresponding integer induction variable if possible.
/// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) @@ -265,13 +277,12 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// -void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { +void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. - ConstantFP *InitValueVal = - dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); + auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) @@ -279,8 +290,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Check IV increment. Reject this PN if increment operation is not // an add or increment value can not be represented by an integer. - BinaryOperator *Incr = - dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); + auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp @@ -456,14 +466,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstInsertionPt()); + &*PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } -void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { +void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. @@ -477,7 +487,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) - HandleFloatingPointIV(L, PN); + handleFloatingPointIV(L, PN); // If the loop previously had floating-point IV, ScalarEvolution // may not have been able to compute a trip count. Now that we've done some @@ -488,7 +498,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { namespace { // Collect information about PHI nodes which can be transformed in -// RewriteLoopExitValues. +// rewriteLoopExitValues. struct RewritePhi { PHINode *PN; unsigned Ith; // Ith incoming value. @@ -501,70 +511,37 @@ struct RewritePhi { }; } -Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, +Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, - Type *ResultTy, - bool &IsHighCostExpansion) { - using namespace llvm::PatternMatch; - - if (!Rewriter.isHighCostExpansion(S, L)) { - IsHighCostExpansion = false; - return Rewriter.expandCodeFor(S, ResultTy, InsertPt); - } - + Type *ResultTy) { // Before expanding S into an expensive LLVM expression, see if we can use an - // already existing value as the expansion for S. There is potential to make - // this significantly smarter, but this simple heuristic already gets some - // interesting cases. 
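For the ConvertToSInt helper above, here is a rough standalone analogue (plain C++; the real code uses APFloat::convertToInteger with an isExact flag, so this is only a sketch of the exactness requirement):

#include <cassert>
#include <cmath>
#include <cstdint>

// Accept a double only if it is exactly representable as an int64_t:
// finite, in range, and with no fractional part.
bool convertToSInt(double d, int64_t &out) {
  if (std::isnan(d) || d < -9223372036854775808.0 ||
      d >= 9223372036854775808.0)
    return false;
  if (d != std::trunc(d))
    return false; // not an exact integer value
  out = (int64_t)d;
  return true;
}

int main() {
  int64_t v;
  assert(convertToSInt(10000.0, v) && v == 10000); // the bar(i) example
  assert(!convertToSInt(0.5, v));                  // rejected: inexact
  return 0;
}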
- - SmallVector<BasicBlock *, 4> Latches; - L->getLoopLatches(Latches); - - for (BasicBlock *BB : Latches) { - ICmpInst::Predicate Pred; - Instruction *LHS, *RHS; - BasicBlock *TrueBB, *FalseBB; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), - TrueBB, FalseBB))) - continue; - - if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { - IsHighCostExpansion = false; - return LHS; - } - - if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { - IsHighCostExpansion = false; - return RHS; - } - } + // already existing value as the expansion for S. + if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L)) + if (ExistingValue->getType() == ResultTy) + return ExistingValue; // We didn't find anything, fall back to using SCEVExpander. - assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); - IsHighCostExpansion = true; return Rewriter.expandCodeFor(S, ResultTy, InsertPt); } //===----------------------------------------------------------------------===// -// RewriteLoopExitValues - Optimize IV users outside the loop. +// rewriteLoopExitValues - Optimize IV users outside the loop. // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// -/// RewriteLoopExitValues - Check to see if this loop has a computable -/// loop-invariant execution count. If so, this means that we can compute the -/// final value of any expressions that are recurrent in the loop, and -/// substitute the exit values from the loop into any instructions outside of -/// the loop that use the final values of the current expressions. +/// Check to see if this loop has a computable loop-invariant execution count. +/// If so, this means that we can compute the final value of any expressions +/// that are recurrent in the loop, and substitute the exit values from the loop +/// into any instructions outside of the loop that use the final values of the +/// current expressions. /// /// This is mostly redundant with the regular IndVarSimplify activities that /// happen later, except that it's more powerful in some cases, because it's /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); +void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -679,9 +656,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { continue; } - bool HighCost = false; - Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, - PN->getType(), HighCost); + bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); + Value *ExitVal = + expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType()); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' << " LoopVal = " << *Inst << "\n"); @@ -698,7 +675,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } } - bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); + bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet); // Transformation. 
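The findExistingExpansion call above replaces the old hand-rolled latch scan; the underlying idea is reuse-before-recompute. A generic standalone sketch of that pattern (ordinary map-based caching, not the SCEVExpander API):

#include <map>
#include <string>

int computeExpensively(const std::string &expr) { return (int)expr.size(); }

// Return a previously materialized value for `expr` when one exists;
// otherwise fall back to the expensive expansion and remember the result.
int expandIfNeeded(std::map<std::string, int> &existing,
                   const std::string &expr) {
  auto it = existing.find(expr);
  if (it != existing.end())
    return it->second; // reuse an already existing value
  return existing[expr] = computeExpensively(expr);
}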
for (const RewritePhi &Phi : RewritePhiSet) { @@ -735,10 +712,10 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Rewriter.clearInsertPoint(); } -/// CanLoopBeDeleted - Check whether it is possible to delete the loop after -/// rewriting exit value. If it is possible, ignore ReplaceExitValue and -/// do rewriting aggressively. -bool IndVarSimplify::CanLoopBeDeleted( +/// Check whether it is possible to delete the loop after rewriting exit +/// value. If it is possible, ignore ReplaceExitValue and do rewriting +/// aggressively. +bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { BasicBlock *Preheader = L->getLoopPreheader(); @@ -782,14 +759,9 @@ bool IndVarSimplify::CanLoopBeDeleted( ++BI; } - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; - ++BI) { - if (BI->mayHaveSideEffects()) - return false; - } - } + for (auto *BB : L->blocks()) + if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } @@ -799,22 +771,19 @@ bool IndVarSimplify::CanLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { - // Collect information about induction variables that are used by sign/zero - // extend operations. This information is recorded by CollectExtend and - // provides the input to WidenIV. - struct WideIVInfo { - PHINode *NarrowIV; - Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was a sext user seen before a zext? - - WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), - IsSigned(false) {} - }; +// Collect information about induction variables that are used by sign/zero +// extend operations. This information is recorded by CollectExtend and provides +// the input to WidenIV. +struct WideIVInfo { + PHINode *NarrowIV = nullptr; + Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext + bool IsSigned = false; // Was a sext user seen before a zext? +}; } -/// visitCast - Update information about the induction variable that is -/// extended by this sign or zero extend operation. This is used to determine -/// the final width of the IV before actually widening it. +/// Update information about the induction variable that is extended by this +/// sign or zero extend operation. This is used to determine the final width of +/// the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; @@ -855,24 +824,29 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, namespace { -/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the -/// WideIV that computes the same value as the Narrow IV def. This avoids -/// caching Use* pointers. +/// Record a link in the Narrow IV def-use chain along with the WideIV that +/// computes the same value as the Narrow IV def. This avoids caching Use* +/// pointers. 
struct NarrowIVDefUse { - Instruction *NarrowDef; - Instruction *NarrowUse; - Instruction *WideDef; - - NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} - - NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): - NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} + Instruction *NarrowDef = nullptr; + Instruction *NarrowUse = nullptr; + Instruction *WideDef = nullptr; + + // True if the narrow def is never negative. Tracking this information lets + // us use a sign extension instead of a zero extension or vice versa, when + // profitable and legal. + bool NeverNegative = false; + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD, + bool NeverNegative) + : NarrowDef(ND), NarrowUse(NU), WideDef(WD), + NeverNegative(NeverNegative) {} }; -/// WidenIV - The goal of this transform is to remove sign and zero extends -/// without creating any new induction variables. To do this, it creates a new -/// phi of the wider type and redirects all users, either removing extends or -/// inserting truncs whenever we stop propagating the type. +/// The goal of this transform is to remove sign and zero extends without +/// creating any new induction variables. To do this, it creates a new phi of +/// the wider type and redirects all users, either removing extends or inserting +/// truncs whenever we stop propagating the type. /// class WidenIV { // Parameters @@ -913,32 +887,35 @@ public: assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } - PHINode *CreateWideIV(SCEVExpander &Rewriter); + PHINode *createWideIV(SCEVExpander &Rewriter); protected: - Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use); + Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); - Instruction *CloneIVUser(NarrowIVDefUse DU); + Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR); + Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR); + Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU); - const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse); - const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU); - const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const; - Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); - bool WidenLoopCompare(NarrowIVDefUse DU); + bool widenLoopCompare(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. +/// Perform a quick domtree based check for loop invariance assuming that V is +/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this +/// purpose. 
static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { Instruction *Inst = dyn_cast<Instruction>(V); if (!Inst) @@ -947,8 +924,8 @@ static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { return DT->properlyDominates(Inst->getParent(), L->getHeader()); } -Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use) { +Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType, + bool IsSigned, Instruction *Use) { // Set the debug location and conservative insertion point. IRBuilder<> Builder(Use); // Hoist the insertion point into loop preheaders as far as possible. @@ -961,10 +938,11 @@ Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, Builder.CreateZExt(NarrowOper, WideType); } -/// CloneIVUser - Instantiate a wide operation to replace a narrow -/// operation. This only needs to handle operations that can evaluation to -/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. -Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { +/// Instantiate a wide operation to replace a narrow operation. This only needs +/// to handle operations that can evaluate to SCEVAddRec. It can safely return +/// 0 for any operation we decide not to clone. +Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: @@ -973,40 +951,140 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::Mul: case Instruction::UDiv: case Instruction::Sub: + return cloneArithmeticIVUser(DU, WideAR); + case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - - // Replace NarrowDef operands with WideDef. Otherwise, we don't know - // anything about the narrow operand yet so must insert a [sz]ext. It is - // probably loop invariant and will be folded or hoisted. If it actually - // comes from a widened IV, it should be removed during a future call to - // WidenIVUse. - Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); - Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - - BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); - BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), - LHS, RHS, - NarrowBO->getName()); - IRBuilder<> Builder(DU.NarrowUse); - Builder.Insert(WideBO); - if (const OverflowingBinaryOperator *OBO = - dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { - if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); - if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + return cloneBitwiseIVUser(DU); + } +} + +Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything + // about the narrow operand yet so must insert a [sz]ext. It is probably loop + // invariant and will be folded or hoisted. If it actually comes from a + // widened IV, it should be removed during a future call to widenIVUse.
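A plain C++ stand-in for the operand rule just described (hypothetical 32-to-64-bit widening; the real code builds IR instructions, so this only mirrors the value-level effect):

#include <cstdint>

// wideIV is the already-widened IV value; `other` is a narrow non-IV
// operand that must be extended to match. isSigned selects sext vs. zext.
int64_t widenAddUser(int64_t wideIV, int32_t other, bool isSigned) {
  int64_t wideOther =
      isSigned ? (int64_t)other : (int64_t)(uint32_t)other; // [sz]ext
  return wideIV + wideOther;
}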
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + IsSigned, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + IsSigned, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; +} + +Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + + unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; + + // We're trying to find X such that + // + // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X + // + // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef), + // and check using SCEV if any of them are correct. + + // Returns true if extending NonIVNarrowDef according to `SignExt` is a + // correct solution to X. + auto GuessNonIVOperand = [&](bool SignExt) { + const SCEV *WideLHS; + const SCEV *WideRHS; + + auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { + if (SignExt) + return SE->getSignExtendExpr(S, Ty); + return SE->getZeroExtendExpr(S, Ty); + }; + + if (IVOpIdx == 0) { + WideLHS = SE->getSCEV(WideDef); + const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1)); + WideRHS = GetExtend(NarrowRHS, WideType); + } else { + const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0)); + WideLHS = GetExtend(NarrowLHS, WideType); + WideRHS = SE->getSCEV(WideDef); + } + + // WideUse is "WideDef `op.wide` X" as described in the comment. + const SCEV *WideUse = nullptr; + + switch (NarrowUse->getOpcode()) { + default: + llvm_unreachable("No other possibility!"); + + case Instruction::Add: + WideUse = SE->getAddExpr(WideLHS, WideRHS); + break; + + case Instruction::Mul: + WideUse = SE->getMulExpr(WideLHS, WideRHS); + break; + + case Instruction::UDiv: + WideUse = SE->getUDivExpr(WideLHS, WideRHS); + break; + + case Instruction::Sub: + WideUse = SE->getMinusSCEV(WideLHS, WideRHS); + break; } - return WideBO; + + return WideUse == WideAR; + }; + + bool SignExtend = IsSigned; + if (!GuessNonIVOperand(SignExtend)) { + SignExtend = !SignExtend; + if (!GuessNonIVOperand(SignExtend)) + return nullptr; } + + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + SignExtend, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + SignExtend, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; } -const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const { if (OpCode == Instruction::Add) return SE->getAddExpr(LHS, RHS); @@ -1022,7 +1100,7 @@ const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands. 
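The guess-and-check in cloneArithmeticIVUser above can be checked numerically; a hedged standalone sketch, with plain integers standing in for the SCEV expressions the real code compares:

#include <cassert>
#include <cstdint>

int main() {
  int32_t narrowIV = -5, other = 7;
  int64_t wideIV = narrowIV; // the IV was already sign-extended
  // Widen(narrowIV + other): the value the wide operation must reproduce.
  int64_t expected = (int64_t)(int32_t)(narrowIV + other);
  int64_t guessSExt = wideIV + (int64_t)other;           // X = sext(other)
  int64_t guessZExt = wideIV + (int64_t)(uint32_t)other; // X = zext(other)
  // At least one of the two guesses must match, or the clone is rejected.
  assert(guessSExt == expected || guessZExt == expected);
  return 0;
}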
Generate the SCEV value for the widened operation without /// actually modifying the IR yet. If the expression after extending the /// operands is an AddRec for this loop, return it. -const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { +const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); @@ -1062,19 +1140,18 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { if (ExtendOperIdx == 0) std::swap(lhs, rhs); const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); + dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode)); if (!AddRec || AddRec->getLoop() != L) return nullptr; return AddRec; } -/// GetWideRecurrence - Is this instruction potentially interesting for further -/// simplification after widening it's type? In other words, can the -/// extend be safely hoisted out of the loop with SCEV reducing the value to a -/// recurrence on the same loop. If so, return the sign or zero extended -/// recurrence. Otherwise return NULL. -const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { +/// Is this instruction potentially interesting for further simplification after +/// widening its type? In other words, can the extend be safely hoisted out of +/// the loop with SCEV reducing the value to a recurrence on the same loop? If +/// so, return the sign or zero extended recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return nullptr; @@ -1097,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widened. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1108,13 +1186,27 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { /// If the narrow use is a compare instruction, then widen the compare /// (and possibly the other operand). The extend operation is hoisted into the /// loop preheader as far as possible. -bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { +bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse); if (!Cmp) return false; - // Sign of IV user and compare must match. - if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) + // We can legally widen the comparison in the following two cases: + // + // - The signedness of the IV extension and comparison match + // + // - The narrow IV is always positive (and thus its sign extension is equal + // to its zero extension). For instance, let's say we're zero extending + // %narrow for the following use + // + // icmp slt i32 %narrow, %val ... (A) + // + // and %narrow is always positive.
Then + // + // (A) == icmp slt i32 sext(%narrow), sext(%val) + // == icmp slt i32 zext(%narrow), sext(%val) + + if (!(DU.NeverNegative || IsSigned == Cmp->isSigned())) return false; Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); @@ -1123,20 +1215,21 @@ bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. if (CastWidth < IVWidth) { - Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); + Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp); DU.NarrowUse->replaceUsesOfWith(Op, ExtOp); } return true; } -/// WidenIVUse - Determine whether an individual user of the narrow IV can be -/// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { +/// Determine whether an individual user of the narrow IV can be widened. If so, +/// return the wide clone of the user. +Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { @@ -1145,13 +1238,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", UsePhi); WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); - IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt()); Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); @@ -1200,20 +1293,20 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse); if (!WideAddRec) - WideAddRec = GetExtendedOperandRecurrence(DU); + WideAddRec = getExtendedOperandRecurrence(DU); if (!WideAddRec) { // If the use is a loop condition, try to promote the condition instead of // truncating the IV first. - if (WidenLoopCompare(DU)) + if (widenLoopCompare(DU)) return nullptr; // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence.
We can't to @@ -1228,7 +1321,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { - WideUse = CloneIVUser(DU); + WideUse = cloneIVUser(DU, WideAddRec); if (!WideUse) return nullptr; } @@ -1248,9 +1341,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return WideUse; } -/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { + const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); + bool NeverNegative = + SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, + SE->getConstant(NarrowSCEV->getType(), 0)); for (User *U : NarrowDef->users()) { Instruction *NarrowUser = cast<Instruction>(U); @@ -1258,21 +1355,21 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { if (!Widened.insert(NarrowUser).second) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); + NarrowIVUsers.push_back( + NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative)); } } -/// CreateWideIV - Process a single induction variable. First use the -/// SCEVExpander to create a wide induction variable that evaluates to the same -/// recurrence as the original narrow IV. Then use a worklist to forward -/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all -/// interesting IV users, the narrow IV will be isolated for removal by -/// DeleteDeadPHIs. +/// Process a single induction variable. First use the SCEVExpander to create a +/// wide induction variable that evaluates to the same recurrence as the +/// original narrow IV. Then use a worklist to forward traverse the narrow IV's +/// def-use chain. After widenIVUse has processed all interesting IV users, the +/// narrow IV will be isolated for removal by DeleteDeadPHIs. /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. /// -PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { +PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) @@ -1302,11 +1399,11 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // either find an existing phi or materialize a new one. Either way, we // expect a well-formed cyclic phi-with-increments. i.e. any operand not part // of the phi-SCC dominates the loop entry. - Instruction *InsertPt = L->getHeader()->begin(); + Instruction *InsertPt = &L->getHeader()->front(); WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); // Remembering the WideIV increment generated by SCEVExpander allows - // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // widenIVUse to reuse it when widening the narrow IV's increment. We don't // employ a general reuse mechanism because the call above is the only call to // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. if (BasicBlock *LatchBlock = L->getLoopLatch()) { @@ -1329,13 +1426,13 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. 
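The NeverNegative flag computed in pushNarrowIVUsers above rests on a simple fact, demonstrated here in standalone C++ (integer casts standing in for the IR-level sext/zext):

#include <cassert>
#include <cstdint>

int main() {
  int32_t nonNeg = 42; // the NeverNegative case
  assert((int64_t)nonNeg == (int64_t)(uint32_t)nonNeg); // sext == zext
  int32_t negative = -1; // extensions diverge once the value can be negative
  assert((int64_t)negative != (int64_t)(uint32_t)negative);
  return 0;
}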
- Instruction *WideUse = WidenIVUse(DU, Rewriter); + Instruction *WideUse = widenIVUse(DU, Rewriter); // Follow all def-use edges from the previous narrow use. if (WideUse) pushNarrowIVUsers(DU.NarrowUse, WideUse); - // WidenIVUse may have removed the def-use edge. + // widenIVUse may have removed the def-use edge. if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } @@ -1352,38 +1449,38 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// namespace { - class IndVarSimplifyVisitor : public IVVisitor { - ScalarEvolution *SE; - const TargetTransformInfo *TTI; - PHINode *IVPhi; - - public: - WideIVInfo WI; - - IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const TargetTransformInfo *TTI, - const DominatorTree *DTree) - : SE(SCEV), TTI(TTI), IVPhi(IV) { - DT = DTree; - WI.NarrowIV = IVPhi; - if (ReduceLiveIVs) - setSplitOverflowIntrinsics(); - } +class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + PHINode *IVPhi; - // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } - }; +public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const TargetTransformInfo *TTI, + const DominatorTree *DTree) + : SE(SCEV), TTI(TTI), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } +}; } -/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV -/// users. Each successive simplification may push more users which may -/// themselves be candidates for simplification. +/// Iteratively perform simplification on a worklist of IV users. Each +/// successive simplification may push more users which may themselves be +/// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. /// -void IndVarSimplify::SimplifyAndExtend(Loop *L, +void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, - LPPassManager &LPM) { + LoopInfo *LI) { SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; @@ -1400,14 +1497,14 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyAndExtend. + // other SCEV based analysis prior to simplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. 
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); + Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -1416,7 +1513,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, for (; !WideIVs.empty(); WideIVs.pop_back()) { WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); - if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { + if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } @@ -1425,12 +1522,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, } //===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +// linearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken -/// count expression can be safely and cheaply expanded into an instruction -/// sequence that can be used by LinearFunctionTestReplace. +/// Return true if this loop's backedge taken count expression can be safely and +/// cheaply expanded into an instruction sequence that can be used by +/// linearFunctionTestReplace. /// /// TODO: This fails for pointer-type loop counters with greater than one byte /// strides, consequently preventing LFTR from running. For the purpose of LFTR @@ -1461,8 +1558,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE, return true; } -/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop -/// invariant value to the phi. +/// Return the loop header phi IFF IncV adds a loop invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) @@ -1513,8 +1609,8 @@ static ICmpInst *getLoopTest(Loop *L) { return dyn_cast<ICmpInst>(BI->getCondition()); } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. +/// linearFunctionTestReplace policy. Return true unless we can show that the +/// current exit test is already sufficiently canonical. static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. ICmpInst *Cond = getLoopTest(L); @@ -1574,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, return false; // Optimistically handle other instructions. - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI).second) + for (Value *Op : I->operands()) { + if (!Visited.insert(Op).second) continue; - if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + if (!hasConcreteDefImpl(Op, Visited, Depth+1)) return false; } return true; @@ -1594,8 +1690,8 @@ static bool hasConcreteDef(Value *V) { return hasConcreteDefImpl(V, Visited, 0); } -/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to -/// be rewritten) loop exit test. +/// Return true if this IV has any uses other than the (soon to be rewritten) +/// loop exit test. 
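hasConcreteDefImpl above is a depth-capped DFS over operands with a visited set; its generic shape, in standalone C++ with integer node IDs (an illustrative assumption, not the LLVM types):

#include <set>
#include <vector>

// ops[n] lists n's operands; bad[n] marks undef-like nodes. Shared
// subtrees are scanned once thanks to the visited set, and deep
// expressions are conservatively rejected by the depth cap.
bool allConcrete(int n, const std::vector<std::vector<int>> &ops,
                 const std::vector<bool> &bad, std::set<int> &visited,
                 unsigned depth) {
  if (bad[n])
    return false;
  if (depth > 6)
    return false; // give up rather than recurse without bound
  for (int op : ops[n]) {
    if (!visited.insert(op).second)
      continue; // already checked
    if (!allConcrete(op, ops, bad, visited, depth + 1))
      return false;
  }
  return true;
}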
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); @@ -1608,7 +1704,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { return true; } -/// FindLoopCounter - Find an affine IV in canonical form. +/// Find an affine IV in canonical form. /// /// BECount may be an i8* pointer type. The pointer difference is already /// valid count without scaling the address stride, so it remains a pointer @@ -1702,8 +1798,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, return BestPhi; } -/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that -/// holds the RHS of the new loop test. +/// Help linearFunctionTestReplace by generating a value that holds the RHS of +/// the new loop test. static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, SCEVExpander &Rewriter, ScalarEvolution *SE) { const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); @@ -1785,13 +1881,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, } } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable. This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. +/// This method rewrites the exit condition of the loop to be a canonical != +/// comparison against the incremented loop induction variable. This pass is +/// able to rewrite the exit tests of any loop where the SCEV analysis can +/// determine a loop-invariant trip count of the loop, which is actually a much +/// broader range than just linear tests. Value *IndVarSimplify:: -LinearFunctionTestReplace(Loop *L, +linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter) { @@ -1809,7 +1905,7 @@ LinearFunctionTestReplace(Loop *L, // This addition may overflow, which is valid as long as the comparison is // truncated to BackedgeTakenCount->getType(). IVCount = SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); + SE->getOne(BackedgeTakenCount->getType())); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. @@ -1847,8 +1943,8 @@ LinearFunctionTestReplace(Loop *L, const SCEV *ARStep = AR->getStepRecurrence(*SE); // For constant IVCount, avoid truncation. if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { - const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); - APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt(); + APInt Count = cast<SCEVConstant>(IVCount)->getAPInt(); // Note that the post-inc value of BackedgeTakenCount may have overflowed // above such that IVCount is now zero. if (IVCount != BackedgeTakenCount && Count == 0) { @@ -1886,21 +1982,21 @@ LinearFunctionTestReplace(Loop *L, } //===----------------------------------------------------------------------===// -// SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 
//===----------------------------------------------------------------------===// /// If there's a single exit block, sink any loop-invariant values that /// were defined in the preheader but not used inside the loop into the /// exit block to reduce register pressure in the loop. -void IndVarSimplify::SinkUnusedInvariants(Loop *L) { +void IndVarSimplify::sinkUnusedInvariants(Loop *L) { BasicBlock *ExitBlock = L->getExitBlock(); if (!ExitBlock) return; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); - BasicBlock::iterator I = Preheader->getTerminator(); + Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt(); + BasicBlock::iterator I(Preheader->getTerminator()); while (I != Preheader->begin()) { --I; // New instructions were inserted at the end of the preheader. @@ -1920,8 +2016,8 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; - // Skip landingpad instructions. - if (isa<LandingPadInst>(I)) + // Skip eh pad instructions. + if (I->isEHPad()) continue; // Don't sink alloca: we never want to sink static alloca's out of the @@ -1953,7 +2049,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { continue; // Otherwise, sink it to the exit block. - Instruction *ToMove = I; + Instruction *ToMove = &*I; bool Done = false; if (I != Preheader->begin()) { @@ -1994,7 +2090,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -2007,7 +2103,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. - RewriteNonIntegerIVs(L); + rewriteNonIntegerIVs(L); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); @@ -2024,7 +2120,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); + simplifyAndExtend(L, Rewriter, LI); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -2034,7 +2130,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // if (ReplaceExitValue != NeverRepl && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) - RewriteLoopExitValues(L, Rewriter); + rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); @@ -2054,7 +2150,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // explicitly check any assumptions made by SCEV. Brittle. 
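As background for the linearFunctionTestReplace call below: the IVCount = BackedgeTakenCount + 1 computation noted earlier may wrap, which is why the exit comparison stays in (or is truncated to) the narrow type. A two-assert standalone demonstration:

#include <cassert>
#include <cstdint>

int main() {
  // With an i8 trip count, a loop whose backedge is taken 255 times runs
  // 256 times, and the +1 wraps the count to 0 in the narrow type.
  uint8_t backedgeTaken = 255;
  uint8_t ivCount = (uint8_t)(backedgeTaken + 1);
  assert(ivCount == 0);
  return 0;
}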
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); if (!AR || AR->getLoop()->getLoopPreheader()) - (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } } @@ -2074,13 +2170,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop-invariant instructions in the preheader that aren't used in the // loop may be sunk below the loop to reduce register pressure. - SinkUnusedInvariants(L); + sinkUnusedInvariants(L); // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cbdacad..dea61f6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -214,8 +214,8 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -224,8 +224,15 @@ public: char InductiveRangeCheckElimination::ID = 0; } -INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", - "Inductive range check elimination", false, false) +INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) const char *InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { @@ -1044,9 +1051,9 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", - &F, BBInsertLocation); + &F, &*BBInsertLocation); RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, - BBInsertLocation); + &*BBInsertLocation); BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); bool Increasing = LS.IndVarIncreasing; @@ -1399,8 +1406,9 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { LLVMContext &Context = Preheader->getContext(); InductiveRangeCheck::AllocatorTy IRCAlloc; SmallVector<InductiveRangeCheck *, 16> RangeChecks; - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); for (auto BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) diff --git 
a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1130d22..dcdcfed 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,15 +18,22 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -36,6 +43,8 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <memory> using namespace llvm; #define DEBUG_TYPE "jump-threading" @@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); +static cl::opt<unsigned> +ImplicationSearchThreshold( + "jump-threading-implication-search-threshold", + cl::desc("The number of predecessors to search for a stronger " + "condition to use to thread over a weaker condition"), + cl::init(3), cl::Hidden); + namespace { // These are at global scope so static functions can use them too. 
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; @@ -80,10 +96,13 @@ namespace { class JumpThreading : public FunctionPass { TargetLibraryInfo *TLI; LazyValueInfo *LVI; + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData; #ifdef NDEBUG - SmallPtrSet<BasicBlock*, 16> LoopHeaders; + SmallPtrSet<const BasicBlock *, 16> LoopHeaders; #else - SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; + SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders; #endif DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; @@ -114,9 +133,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } + void releaseMemory() override { + BFI.reset(); + BPI.reset(); + } + void FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, @@ -134,9 +159,17 @@ namespace { bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); + bool ProcessImpliedCondition(BasicBlock *BB); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); + bool TryToUnfoldSelectInCurrBB(BasicBlock *BB); + + private: + BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, + const char *Suffix); + void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, + BasicBlock *NewBB, BasicBlock *SuccBB); }; } @@ -160,23 +193,34 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); + BFI.reset(); + BPI.reset(); + // When profile data is available, we need to update edge weights after + // successful jump threading, which requires both BPI and BFI being available. HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } // Remove unreachable blocks from the function as they may result in an infinite // loop. We do threading if we found something profitable. Jump threading a // branch can create other opportunities. If these opportunities form a cycle - // i.e. if any jump treading is undoing previous threading in the path, then + // i.e. if any jump threading is undoing previous threading in the path, then // we will loop forever. We take care of this issue by not jump threading for // back edges. This works for normal cases but not for unreachable blocks as // they may have a cycle with no back edge. - removeUnreachableBlocks(F); + bool EverChanged = false; + EverChanged |= removeUnreachableBlocks(F, LVI); FindLoopHeaders(F); - bool Changed, EverChanged = false; + bool Changed; do { Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I; + BasicBlock *BB = &*I; // Thread all of the branches we can over this block. while (ProcessBlock(BB)) Changed = true; @@ -239,11 +283,26 @@ bool JumpThreading::runOnFunction(Function &F) { static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, unsigned Threshold) { /// Ignore PHI nodes; these will be flattened when duplication happens.
- BasicBlock::const_iterator I = BB->getFirstNonPHI(); + BasicBlock::const_iterator I(BB->getFirstNonPHI()); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. + unsigned Bonus = 0; + const TerminatorInst *BBTerm = BB->getTerminator(); + // Threading through a switch statement is particularly profitable. If this + // block ends in a switch, decrease its cost to make it more likely to happen. + if (isa<SwitchInst>(BBTerm)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(BBTerm)) + Bonus = 8; + + // Bump the threshold up so the early exit from the loop doesn't skip the + // terminator-based Size adjustment at the end. + Threshold += Bonus; + // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; @@ -260,6 +319,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) continue; + // Bail out if this instruction gives back a token type, it is not possible + // to duplicate it if it is used outside this BB. + if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) + return ~0U; + // All other instructions count for at least one unit. ++Size; @@ -268,7 +332,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate()) + if (CI->cannotDuplicate() || CI->isConvergent()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. return ~0U; @@ -279,16 +343,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, } } - // Threading through a switch statement is particularly profitable. If this - // block ends in a switch, decrease its cost to make it more likely to happen. - if (isa<SwitchInst>(I)) - Size = Size > 6 ? Size-6 : 0; - - // The same holds for indirect branches, but slightly more so. - if (isa<IndirectBrInst>(I)) - Size = Size > 8 ? Size-8 : 0; - - return Size; + return Size > Bonus ? Size - Bonus : 0; } /// FindLoopHeaders - We do not want jump threading to turn proper loop @@ -310,8 +365,8 @@ void JumpThreading::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); - for (unsigned i = 0, e = Edges.size(); i != e; ++i) - LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second)); + for (const auto &Edge : Edges) + LoopHeaders.insert(Edge.second); } /// getKnownConstant - Helper method to determine if we can thread over a @@ -357,8 +412,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If V is a constant, then it is known in all predecessors. if (Constant *KC = getKnownConstant(V, Preference)) { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(KC, *PI)); + for (BasicBlock *Pred : predecessors(BB)) + Result.push_back(std::make_pair(KC, Pred)); return true; } @@ -381,8 +436,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. // Perhaps getConstantOnEdge should be smart enough to do this? 
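The getJumpThreadDuplicationCost hunk above reworks the switch/indirect-branch discount: instead of subtracting it after the size loop, where the early exit on Size > Threshold could return before the adjustment ever ran, the bonus is now folded into the threshold up front and subtracted at the end. A rough standalone sketch of the resulting arithmetic; InstKind, the per-instruction weights, and duplicationCost are illustrative stand-ins, not the LLVM API:

    #include <cstdio>
    #include <vector>

    enum class InstKind { Plain, Free, Call, Switch, IndirectBr, NoDuplicate };

    unsigned duplicationCost(const std::vector<InstKind> &Body, InstKind Term,
                             unsigned Threshold) {
      unsigned Bonus = 0;
      if (Term == InstKind::Switch)
        Bonus = 6;            // threading through a switch is extra profitable
      if (Term == InstKind::IndirectBr)
        Bonus = 8;            // and through an indirect branch, slightly more so
      // Add the bonus to the threshold up front so the early exit below can no
      // longer skip the terminator-based adjustment (the bug the hunk fixes).
      Threshold += Bonus;

      unsigned Size = 0;
      for (InstKind K : Body) {
        if (Size > Threshold)           // block already too big to duplicate
          return Size;
        if (K == InstKind::Free)        // e.g. debug intrinsics, pointer bitcasts
          continue;
        if (K == InstKind::NoDuplicate) // noduplicate/convergent calls: never copy
          return ~0U;
        Size += (K == InstKind::Call) ? 3 : 1; // calls cost roughly 3x
      }
      return Size > Bonus ? Size - Bonus : 0;
    }

    int main() {
      std::vector<InstKind> Body(7, InstKind::Plain);
      // Seven plain instructions exceed the default threshold of 6 on their
      // own, but the switch terminator's bonus keeps the block duplicatable.
      std::printf("%u\n", duplicationCost(Body, InstKind::Switch, 6)); // 1
    }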
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI); @@ -438,22 +492,17 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // Scan for the sentinel. If we find an undef, force it to the // interesting value: x|undef -> true and x&undef -> false. - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) - if (LHSVals[i].first == InterestingVal || - isa<UndefValue>(LHSVals[i].first)) { - Result.push_back(LHSVals[i]); - Result.back().first = InterestingVal; - LHSKnownBBs.insert(LHSVals[i].second); + for (const auto &LHSVal : LHSVals) + if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) { + Result.emplace_back(InterestingVal, LHSVal.second); + LHSKnownBBs.insert(LHSVal.second); } - for (unsigned i = 0, e = RHSVals.size(); i != e; ++i) - if (RHSVals[i].first == InterestingVal || - isa<UndefValue>(RHSVals[i].first)) { + for (const auto &RHSVal : RHSVals) + if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) { // If we already inferred a value for this block on the LHS, don't // re-add it. - if (!LHSKnownBBs.count(RHSVals[i].second)) { - Result.push_back(RHSVals[i]); - Result.back().first = InterestingVal; - } + if (!LHSKnownBBs.count(RHSVal.second)) + Result.emplace_back(InterestingVal, RHSVal.second); } return !Result.empty(); @@ -469,8 +518,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, return false; // Invert the known values. - for (unsigned i = 0, e = Result.size(); i != e; ++i) - Result[i].first = ConstantExpr::getNot(Result[i].first); + for (auto &R : Result) + R.first = ConstantExpr::getNot(R.first); return true; } @@ -485,12 +534,12 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, WantInteger, CxtI); // Try to use constant folding to simplify the binary operator. - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first; + for (const auto &LHSVal : LHSVals) { + Constant *V = LHSVal.first; Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); if (Constant *KC = getKnownConstant(Folded, WantInteger)) - Result.push_back(std::make_pair(KC, LHSVals[i].second)); + Result.push_back(std::make_pair(KC, LHSVal.second)); } } @@ -538,8 +587,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){ - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. 
LazyValueInfo::Tristate Res = @@ -562,12 +610,12 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, WantInteger, CxtI); - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first; + for (const auto &LHSVal : LHSVals) { + Constant *V = LHSVal.first; Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), V, CmpConst); if (Constant *KC = getKnownConstant(Folded, WantInteger)) - Result.push_back(std::make_pair(KC, LHSVals[i].second)); + Result.push_back(std::make_pair(KC, LHSVal.second)); } return !Result.empty(); @@ -584,8 +632,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, if ((TrueVal || FalseVal) && ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, WantInteger, CxtI)) { - for (unsigned i = 0, e = Conds.size(); i != e; ++i) { - Constant *Cond = Conds[i].first; + for (auto &C : Conds) { + Constant *Cond = C.first; // Figure out what value to use for the condition. bool KnownCond; @@ -602,7 +650,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // See if the select has a known constant value for this predecessor. if (Constant *Val = KnownCond ? TrueVal : FalseVal) - Result.push_back(std::make_pair(Val, Conds[i].second)); + Result.push_back(std::make_pair(Val, C.second)); } return !Result.empty(); @@ -612,8 +660,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If all else fails, see if LVI can figure out a constant value for us. Constant *CI = LVI->getConstant(V, BB, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(KC, *PI)); + for (BasicBlock *Pred : predecessors(BB)) + Result.push_back(std::make_pair(KC, Pred)); } return !Result.empty(); @@ -669,7 +717,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - if (SinglePred->getTerminator()->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -682,6 +731,9 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } + if (TryToUnfoldSelectInCurrBB(BB)) + return true; + // What kind of constant we're looking for. ConstantPreference Preference = WantInteger; @@ -761,7 +813,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. + // TODO: This should be extended to handle switches as well. 
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); if (CondBr && CondConst && CondBr->isConditional()) { @@ -829,9 +881,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); + // Search for a stronger dominating condition that can be used to simplify a + // conditional branch leaving BB. + if (ProcessImpliedCondition(BB)) + return true; + + return false; +} + +bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + Value *Cond = BI->getCondition(); + BasicBlock *CurrentBB = BB; + BasicBlock *CurrentPred = BB->getSinglePredecessor(); + unsigned Iter = 0; - // TODO: If we have: "br (X > 0)" and we have a predecessor where we know - // "(X == 4)", thread through this block. + auto &DL = BB->getModule()->getDataLayout(); + + while (CurrentPred && Iter++ < ImplicationSearchThreshold) { + auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); + if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) + return false; + + if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { + BI->getSuccessor(1)->removePredecessor(BB); + BranchInst::Create(BI->getSuccessor(0), BI); + BI->eraseFromParent(); + return true; + } + CurrentBB = CurrentPred; + CurrentPred = CurrentBB->getSinglePredecessor(); + } return false; } @@ -850,10 +933,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; - // If the load is defined in a landing pad, it can't be partially redundant, - // because the edges between the invoke and the landing pad cannot have other + // If the load is defined in an EH pad, it can't be partially redundant, + // because the edges between the invoke and the EH pad cannot have other // instructions between them. - if (LoadBB->isLandingPad()) + if (LoadBB->isEHPad()) return false; Value *LoadedPtr = LI->getOperand(0); @@ -866,11 +949,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. - BasicBlock::iterator BBIt = LI; + BasicBlock::iterator BBIt(LI); if (Value *AvailableVal = - FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { - // If the value if the load is locally available within the block, just use + FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { + // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; @@ -903,10 +986,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. - for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); - PI != PE; ++PI) { - BasicBlock *PredBB = *PI; - + for (BasicBlock *PredBB : predecessors(LoadBB)) { // If we already scanned this predecessor, skip it. if (!PredsScanned.insert(PredBB).second) continue; @@ -914,7 +994,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
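ProcessImpliedCondition, added above, walks up a chain of unique predecessors (capped by -jump-threading-implication-search-threshold) and folds the branch when a dominating condition on the true edge already implies the current one, e.g. a dominating x < 3 implies x < 4. A toy model of that search; Block, Cond, and implies are invented stand-ins for the LLVM types, with implication reduced to comparing integer upper bounds:

    #include <cstdio>

    struct Cond { int Bound; };          // models the condition "x < Bound"

    // (x < A.Bound) implies (x < B.Bound) exactly when A.Bound <= B.Bound;
    // this stands in for LLVM's isImpliedCondition.
    static bool implies(Cond A, Cond B) { return A.Bound <= B.Bound; }

    struct Block {
      Cond BrCond;        // condition of this block's conditional branch
      Block *TrueSucc;    // successor taken when BrCond holds
      Block *SinglePred;  // unique predecessor, or nullptr if there are several
    };

    // Walk up to Threshold single-predecessor hops; if every hop enters along
    // the predecessor's true edge and some predecessor's condition implies
    // BB's, then BB's conditional branch can fold to its true successor.
    static bool impliedByDominatingCondition(Block *BB, unsigned Threshold) {
      Cond C = BB->BrCond;
      Block *Cur = BB;
      Block *Pred = BB->SinglePred;
      for (unsigned Iter = 0; Pred && Iter < Threshold; ++Iter) {
        if (Pred->TrueSucc != Cur)   // must arrive along the true edge
          return false;
        if (implies(Pred->BrCond, C))
          return true;
        Cur = Pred;
        Pred = Cur->SinglePred;
      }
      return false;
    }

    int main() {
      // pred: br (x < 3), bb, ...   bb: br (x < 4), t, f   =>  bb folds to t.
      Block BB = {{4}, nullptr, nullptr};
      Block Pred = {{3}, &BB, nullptr};
      BB.SinglePred = &Pred;
      std::printf("%d\n", impliedByDominatingCondition(&BB, 3) ? 1 : 0); // 1
    }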
BBIt = PredBB->end(); AAMDNodes ThisAATags; - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, + DefMaxInstsToScan, nullptr, &ThisAATags); if (!PredAvailable) { OneUnavailablePred = PredBB; @@ -952,13 +1033,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { SmallVector<BasicBlock*, 8> PredsToSplit; SmallPtrSet<BasicBlock*, 8> AvailablePredSet; - for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i) - AvailablePredSet.insert(AvailablePreds[i].first); + for (const auto &AvailablePred : AvailablePreds) + AvailablePredSet.insert(AvailablePred.first); // Add all the unavailable predecessors to the PredsToSplit list. - for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(LoadBB)) { // If the predecessor is an indirect goto, we can't split the edge. if (isa<IndirectBrInst>(P->getTerminator())) return false; @@ -968,8 +1047,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { } // Split them out to their own block. - UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); + UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -995,7 +1073,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Create a PHI node at the start of the block for the PRE'd load value. pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", - LoadBB->begin()); + &LoadBB->front()); PN->takeName(LI); PN->setDebugLoc(LI->getDebugLoc()); @@ -1044,9 +1122,9 @@ FindMostPopularDest(BasicBlock *BB, // blocks with known and real destinations to threading undef. We'll handle // them later if interesting. DenseMap<BasicBlock*, unsigned> DestPopularity; - for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) - if (PredToDestList[i].second) - DestPopularity[PredToDestList[i].second]++; + for (const auto &PredToDest : PredToDestList) + if (PredToDest.second) + DestPopularity[PredToDest.second]++; // Find the most popular dest. DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); @@ -1109,10 +1187,10 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, "ComputeValueKnownInPredecessors returned true with no values"); DEBUG(dbgs() << "IN BB: " << *BB; - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { + for (const auto &PredValue : PredValues) { dbgs() << " BB '" << BB->getName() << "': FOUND condition = " - << *PredValues[i].first - << " for pred '" << PredValues[i].second->getName() << "'.\n"; + << *PredValue.first + << " for pred '" << PredValue.second->getName() << "'.\n"; }); // Decide what we want to thread through. Convert our list of known values to @@ -1125,8 +1203,8 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { - BasicBlock *Pred = PredValues[i].second; + for (const auto &PredValue : PredValues) { + BasicBlock *Pred = PredValue.second; if (!SeenPreds.insert(Pred).second) continue; // Duplicate predecessor entry. 
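FindMostPopularDest, modernized above to range-based for, is a frequency count over the (predecessor, destination) pairs followed by an arg-max, skipping null destinations that stand for undef/unknown. A minimal standalone version; the names are illustrative, and the real code additionally breaks ties by successor order, which this sketch ignores:

    #include <cstdio>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Count how many predecessors branch to each destination and return the
    // most popular one; null destinations are skipped, matching the
    // DestPopularity loop in the hunk above.
    template <typename BlockT>
    BlockT *findMostPopularDest(
        const std::vector<std::pair<BlockT *, BlockT *>> &PredToDest) {
      std::unordered_map<BlockT *, unsigned> Popularity;
      for (const auto &PD : PredToDest)
        if (PD.second)
          ++Popularity[PD.second];

      BlockT *Best = nullptr;
      unsigned BestCount = 0;
      for (const auto &Entry : Popularity)
        if (Entry.second > BestCount) {
          Best = Entry.first;
          BestCount = Entry.second;
        }
      return Best;
    }

    int main() {
      struct Block { const char *Name; };
      Block A = {"a"}, B = {"b"}, P1 = {"p1"}, P2 = {"p2"}, P3 = {"p3"};
      std::vector<std::pair<Block *, Block *>> PredToDest = {
          {&P1, &A}, {&P2, &B}, {&P3, &B}};
      std::printf("%s\n", findMostPopularDest(PredToDest)->Name); // "b"
    }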
@@ -1135,7 +1213,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (isa<IndirectBrInst>(Pred->getTerminator())) continue; - Constant *Val = PredValues[i].first; + Constant *Val = PredValue.first; BasicBlock *DestBB; if (isa<UndefValue>(Val)) @@ -1175,16 +1253,15 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. SmallVector<BasicBlock*, 16> PredsToFactor; - for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) - if (PredToDestList[i].second == MostPopularDest) { - BasicBlock *Pred = PredToDestList[i].first; + for (const auto &PredToDest : PredToDestList) + if (PredToDest.second == MostPopularDest) { + BasicBlock *Pred = PredToDest.first; // This predecessor may be a switch or something else that has multiple // edges to the block. Factor each of these edges by listing them // according to # occurrences in PredsToFactor. - TerminatorInst *PredTI = Pred->getTerminator(); - for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i) - if (PredTI->getSuccessor(i) == BB) + for (BasicBlock *Succ : successors(Pred)) + if (Succ == BB) PredsToFactor.push_back(Pred); } @@ -1262,7 +1339,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Into: // BB': // %Y = icmp ne i32 %A, %B - // br i1 %Z, ... + // br i1 %Y, ... PredValueInfoTy XorOpValues; bool isLHS = true; @@ -1281,11 +1358,11 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Scan the information to see which is most popular: true or false. The // predecessors can be of the set true, false, or undef. unsigned NumTrue = 0, NumFalse = 0; - for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (isa<UndefValue>(XorOpValues[i].first)) + for (const auto &XorOpValue : XorOpValues) { + if (isa<UndefValue>(XorOpValue.first)) // Ignore undefs for the count. continue; - if (cast<ConstantInt>(XorOpValues[i].first)->isZero()) + if (cast<ConstantInt>(XorOpValue.first)->isZero()) ++NumFalse; else ++NumTrue; @@ -1301,12 +1378,11 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Collect all of the blocks that this can be folded into so that we can // factor this once and clone it once. SmallVector<BasicBlock*, 8> BlocksToFoldInto; - for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (XorOpValues[i].first != SplitVal && - !isa<UndefValue>(XorOpValues[i].first)) + for (const auto &XorOpValue : XorOpValues) { + if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first)) continue; - BlocksToFoldInto.push_back(XorOpValues[i].second); + BlocksToFoldInto.push_back(XorOpValue.second); } // If we inferred a value for all of the predecessors, then duplication won't @@ -1387,14 +1463,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1415,6 +1491,13 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BB->getParent(), BB); NewBB->moveAfter(PredBB); + // Set the block frequency of NewBB. 
+ if (HasProfileData) { + auto NewBBFreq = + BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1425,7 +1508,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, Instruction *New = BI->clone(); New->setName(BI->getName()); NewBB->getInstList().push_back(New); - ValueMapping[BI] = New; + ValueMapping[&*BI] = New; // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) @@ -1438,7 +1521,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the @@ -1451,10 +1534,10 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector<Use*, 16> UsesToRename; - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Use &U : I->uses()) { + for (Use &U : I.uses()) { Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { if (UserPN->getIncomingBlock(U) == BB) @@ -1469,14 +1552,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, if (UsesToRename.empty()) continue; - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1499,11 +1582,98 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TLI); + // Update the edge weight from BB to SuccBB, which should be less than before. + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + // Threaded an edge! ++NumThreads; return true; } +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is available, update the frequency of +/// this new block. +BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, + ArrayRef<BasicBlock *> Preds, + const char *Suffix) { + // Collect the frequencies of all predecessors of BB, which will be used to + // update the edge weight on BB->SuccBB.
+ BlockFrequency PredBBFreq(0); + if (HasProfileData) + for (auto Pred : Preds) + PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + + BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + + // Set the block frequency of the newly created PredBB, which is the sum of + // frequencies of Preds. + if (HasProfileData) + BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); + return PredBB; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { + if (!HasProfileData) + return; + + assert(BFI && BPI && "BFI & BPI should have been created here"); + + // As the edge from PredBB to BB is deleted, we have to update the block + // frequency of BB. + auto BBOrigFreq = BFI->getBlockFreq(BB); + auto NewBBFreq = BFI->getBlockFreq(NewBB); + auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); + auto BBNewFreq = BBOrigFreq - NewBBFreq; + BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + + // Collect updated outgoing edges' frequencies from BB and use them to update + // edge probabilities. + SmallVector<uint64_t, 4> BBSuccFreq; + for (BasicBlock *Succ : successors(BB)) { + auto SuccFreq = (Succ == SuccBB) + ? BB2SuccBBFreq - NewBBFreq + : BBOrigFreq * BPI->getEdgeProbability(BB, Succ); + BBSuccFreq.push_back(SuccFreq.getFrequency()); + } + + uint64_t MaxBBSuccFreq = + *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + + SmallVector<BranchProbability, 4> BBSuccProbs; + if (MaxBBSuccFreq == 0) + BBSuccProbs.assign(BBSuccFreq.size(), + {1, static_cast<unsigned>(BBSuccFreq.size())}); + else { + for (uint64_t Freq : BBSuccFreq) + BBSuccProbs.push_back( + BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); + // Normalize edge probabilities so that they sum up to one. + BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), + BBSuccProbs.end()); + } + + // Update edge probabilities in BPI. + for (int I = 0, E = BBSuccProbs.size(); I < E; I++) + BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); + + if (BBSuccProbs.size() >= 2) { + SmallVector<uint32_t, 4> Weights; + for (auto Prob : BBSuccProbs) + Weights.push_back(Prob.getNumerator()); + + auto TI = BB->getTerminator(); + TI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + } +} + /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch /// to BB which contains an i1 PHI node and a conditional branch on that PHI. /// If we can duplicate the contents of BB up into PredBB do so now, this @@ -1530,14 +1700,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
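The new UpdateBlockFreqAndEdgeWeight above deducts the threaded-away frequency from BB, recomputes each outgoing edge's frequency, and converts the results back into normalized branch probabilities for the prof metadata. A simplified take on just the normalization step, assuming the frequencies are already collected; Denom is an arbitrary stand-in for BranchProbability's denominator and overflow handling is omitted for the sketch:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Turn raw successor frequencies into weights that sum (approximately) to
    // a fixed denominator -- a simplified take on getBranchProbability plus
    // normalizeProbabilities in the hunk above.
    static std::vector<uint32_t> normalize(const std::vector<uint64_t> &Freq,
                                           uint32_t Denom = 1u << 20) {
      std::vector<uint32_t> W(Freq.size());
      uint64_t Sum = 0;
      for (uint64_t F : Freq)
        Sum += F;
      if (Sum == 0) {                // all successors look cold: split evenly
        for (uint32_t &X : W)
          X = static_cast<uint32_t>(Denom / W.size());
        return W;
      }
      for (size_t I = 0; I < Freq.size(); ++I)
        W[I] = static_cast<uint32_t>(Freq[I] * Denom / Sum);
      return W;
    }

    int main() {
      // After threading, the BB->SuccBB edge keeps only the frequency not
      // siphoned off into NewBB, e.g. 10 units against 90 on the other edge.
      for (uint32_t W : normalize({10, 90}))
        std::printf("%u\n", W);      // ~10% and ~90% of Denom
    }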
Clone all the instructions in BB onto the end @@ -1581,12 +1751,12 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; - ValueMapping[BI] = IV; + ValueMapping[&*BI] = IV; } else { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); - PredBB->getInstList().insert(OldPredBranch, New); - ValueMapping[BI] = New; + PredBB->getInstList().insert(OldPredBranch->getIterator(), New); + ValueMapping[&*BI] = New; } } @@ -1604,10 +1774,10 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector<Use*, 16> UsesToRename; - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Use &U : I->uses()) { + for (Use &U : I.uses()) { Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { if (UserPN->getIncomingBlock(U) == BB) @@ -1622,14 +1792,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (UsesToRename.empty()) continue; - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1724,3 +1894,62 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { } return false; } + +/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form +/// bb: +/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... +/// %s = select p, trueval, falseval +/// +/// And expand the select into a branch structure. This later enables +/// jump-threading over bb in this pass. +/// +/// Using a similar approach to SimplifyCFG::FoldCondBranchOnPHI(), unfold the +/// select if the associated PHI has at least one constant. If the unfolded +/// select is not jump-threaded, it will be folded again in later +/// optimizations. +bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { + // If threading this would thread across a loop header, don't thread the edge. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB)) + return false; + + // Look for a Phi/Select pair in the same basic block. The Phi feeds the + // condition of the Select and at least one of the incoming values is a + // constant.
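Before the function body below, a source-level picture of the unfolding this comment describes may help; the following is a hand-written analogue of the transformation, not output of the pass:

    // Before unfolding: the select hides the control dependence, so jump
    // threading cannot exploit the constant phi inputs feeding %p.
    int before(bool p, int tv, int fv) {
      return p ? tv : fv;      // %s = select i1 %p, i32 %tv, i32 %fv
    }

    // After a SplitBlockAndInsertIfThen-style unfolding: the same value
    // computed through an explicit branch; only a "then" block is inserted,
    // so the false value flows along the fall-through edge into the new phi.
    int after(bool p, int tv, int fv) {
      int s = fv;              // fall-through (false) path keeps falseval
      if (p)
        s = tv;                // inserted "then" block supplies trueval
      return s;                // s corresponds to the new two-input phi
    }

    int main() { return before(true, 1, 2) == after(true, 1, 2) ? 0 : 1; }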
+ for (BasicBlock::iterator BI = BB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + unsigned NumPHIValues = PN->getNumIncomingValues(); + if (NumPHIValues == 0 || !PN->hasOneUse()) + continue; + + SelectInst *SI = dyn_cast<SelectInst>(PN->user_back()); + if (!SI || SI->getParent() != BB) + continue; + + Value *Cond = SI->getCondition(); + if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1)) + continue; + + bool HasConst = false; + for (unsigned i = 0; i != NumPHIValues; ++i) { + if (PN->getIncomingBlock(i) == BB) + return false; + if (isa<ConstantInt>(PN->getIncomingValue(i))) + HasConst = true; + } + + if (HasConst) { + // Expand the select. + TerminatorInst *Term = + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); + NewPN->addIncoming(SI->getFalseValue(), BB); + SI->replaceAllUsesWith(NewPN); + SI->eraseFromParent(); + return true; + } + } + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 43fc50e..8923ff7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -34,10 +34,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -72,10 +75,12 @@ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop); +static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo); static bool hoist(Instruction &I, BasicBlock *Preheader); static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST ); + const Loop *CurLoop, AliasSetTracker *CurAST, + const LICMSafetyInfo *SafetyInfo); static bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -89,10 +94,10 @@ static bool isSafeToExecuteUnconditionally(const Instruction &Inst, static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, const AAMDNodes &AAInfo, AliasSetTracker *CurAST); -static Instruction *CloneInstructionInExitBlock(const Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN, - const LoopInfo *LI); +static Instruction * +CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, + const LoopInfo *LI, + const LICMSafetyInfo *SafetyInfo); static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, @@ -118,9 +123,12 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + 
AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -164,9 +172,12 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -183,7 +194,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Get our Loop and Alias Analysis information... LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -192,9 +203,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST = new AliasSetTracker(*AA); // Collect Alias info from subloops. - for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); - LoopItr != LoopItrE; ++LoopItr) { - Loop *InnerL = *LoopItr; + for (Loop *InnerL : L->getSubLoops()) { AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL]; assert(InnerAST && "Where is my AST?"); @@ -216,9 +225,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Because subloops have already been incorporated into AST, we skip blocks in // subloops. // - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; + for (BasicBlock *BB : L->blocks()) { if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. CurAST->add(*BB); // Incorporate the specified basic block } @@ -252,9 +259,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { PredIteratorCache PIC; // Loop over all of the alias sets in the tracker object. - for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); - I != E; ++I) - Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, + for (AliasSet &AS : *CurAST) + Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT, CurLoop, CurAST, &SafetyInfo); @@ -264,9 +270,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // FIXME: This is really heavy handed. It would be a bit better to use an // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. - if (Changed) - formLCSSARecursively(*L, *DT, LI, - getAnalysisIfAvailable<ScalarEvolution>()); + if (Changed) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr); + } } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -312,9 +319,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // We are processing blocks in reverse dfo, so process children first. 
const std::vector<DomTreeNode*> &Children = N->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= - sinkRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + for (DomTreeNode *Child : Children) + Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). if (inSubLoop(BB,CurLoop,LI)) return Changed; @@ -338,10 +345,10 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I, CurLoop) && + if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) { ++II; - Changed |= sink(I, LI, DT, CurLoop, CurAST); + Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo); } } return Changed; @@ -395,14 +402,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, } const std::vector<DomTreeNode*> &Children = N->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= - hoistRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + for (DomTreeNode *Child : Children) + Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); return Changed; } /// Computes loop safety information, checks loop body & header -/// for the possiblity of may throw exception. +/// for the possibility of may throw exception. /// void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { assert(CurLoop != nullptr && "CurLoop cant be null"); @@ -410,7 +416,7 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { // Setting default safety values. SafetyInfo->MayThrow = false; SafetyInfo->HeaderMayThrow = false; - // Iterate over header and compute dafety info. + // Iterate over header and compute safety info. for (BasicBlock::iterator I = Header->begin(), E = Header->end(); (I != E) && !SafetyInfo->HeaderMayThrow; ++I) SafetyInfo->HeaderMayThrow |= I->mayThrow(); @@ -422,6 +428,14 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); (I != E) && !SafetyInfo->MayThrow; ++I) SafetyInfo->MayThrow |= I->mayThrow(); + + // Compute funclet colors if we might sink/hoist in a function with a funclet + // personality routine. + Function *Fn = CurLoop->getHeader()->getParent(); + if (Fn->hasPersonalityFn()) + if (Constant *PersonalityFn = Fn->getPersonalityFn()) + if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))) + SafetyInfo->BlockColors = colorEHFunclets(*Fn); } /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this @@ -445,7 +459,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) - Size = AA->getTypeStoreSize(LI->getType()); + Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -456,17 +470,30 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, if (isa<DbgInfoIntrinsic>(I)) return false; + // Don't sink calls which can throw. + if (CI->mayThrow()) + return false; + // Handle simple cases by querying alias analysis. 
- AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); - if (Behavior == AliasAnalysis::DoesNotAccessMemory) + FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); + if (Behavior == FMRB_DoesNotAccessMemory) return true; if (AliasAnalysis::onlyReadsMemory(Behavior)) { + // A readonly argmemonly function only reads from memory pointed to by + // its arguments with arbitrary offsets. If we can prove there are no + // writes to this memory in the loop, we can hoist or sink. + if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) { + for (Value *Op : CI->arg_operands()) + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize, + AAMDNodes(), CurAST)) + return false; + return true; + } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate. bool FoundMod = false; - for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); - I != E; ++I) { - AliasSet &AS = *I; + for (AliasSet &AS : *CurAST) { if (!AS.isForwardingAliasSet() && AS.isMod()) { FoundMod = true; break; @@ -513,10 +540,24 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { /// the loop. If this is true, we can sink the instruction to the exit /// blocks of the loop. /// -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) { +static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo) { + const auto &BlockColors = SafetyInfo->BlockColors; for (const User *U : I.users()) { const Instruction *UI = cast<Instruction>(U); if (const PHINode *PN = dyn_cast<PHINode>(UI)) { + const BasicBlock *BB = PN->getParent(); + // We cannot sink uses in catchswitches. + if (isa<CatchSwitchInst>(BB->getTerminator())) + return false; + + // We need to sink a callsite to a unique funclet. Avoid sinking if the + // phi use is too muddled. + if (isa<CallInst>(I)) + if (!BlockColors.empty() && + BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1) + return false; + // A PHI node where all of the incoming values are this instruction are // special -- they can just be RAUW'ed with the instruction and thus // don't require a use in the predecessor. This is a particular important @@ -544,11 +585,41 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) { return true; } -static Instruction *CloneInstructionInExitBlock(const Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN, - const LoopInfo *LI) { - Instruction *New = I.clone(); +static Instruction * +CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, + const LoopInfo *LI, + const LICMSafetyInfo *SafetyInfo) { + Instruction *New; + if (auto *CI = dyn_cast<CallInst>(&I)) { + const auto &BlockColors = SafetyInfo->BlockColors; + + // Sinking call-sites need to be handled differently from other + // instructions. The cloned call-site needs a funclet bundle operand + // appropriate for its location in the CFG.
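The canSinkOrHoistInst change above adds a middle case between "no memory access" and plain "readonly": a readonly argmemonly call can move if none of its pointer arguments may be written inside the loop, so only those locations are queried instead of every alias set. A schematic version of that decision ladder; MemBehavior, PtrArg, and the write-set representation are invented for the sketch, whereas the pass consults its AliasSetTracker:

    #include <vector>

    enum class MemBehavior {
      DoesNotAccessMemory, ReadOnlyArgMem, ReadOnly, Other
    };

    struct PtrArg { int Id; }; // identifies the memory a pointer arg names

    // Stand-in for pointerInvalidatedByLoop: is there a possible store in the
    // loop to the memory this pointer argument refers to?
    static bool mayBeWrittenInLoop(const PtrArg &P,
                                   const std::vector<int> &LoopWrites) {
      for (int Id : LoopWrites)
        if (Id == P.Id)
          return true;
      return false;
    }

    // Mirrors the new ladder: no memory access is always movable;
    // readonly+argmemonly only needs its pointer arguments checked; plain
    // readonly needs the whole loop to be free of stores.
    static bool canHoistOrSinkCall(MemBehavior B,
                                   const std::vector<PtrArg> &PtrArgs,
                                   const std::vector<int> &LoopWrites) {
      if (B == MemBehavior::DoesNotAccessMemory)
        return true;
      if (B == MemBehavior::ReadOnlyArgMem) {
        for (const PtrArg &P : PtrArgs)
          if (mayBeWrittenInLoop(P, LoopWrites))
            return false;
        return true;
      }
      if (B == MemBehavior::ReadOnly)
        return LoopWrites.empty(); // any modifying alias set blocks the motion
      return false;
    }

    int main() {
      std::vector<PtrArg> Args = {{1}, {2}};
      std::vector<int> LoopWrites = {3}; // the loop writes only to location 3
      // Movable as argmemonly even though the loop is not store-free overall.
      return canHoistOrSinkCall(MemBehavior::ReadOnlyArgMem, Args, LoopWrites)
                 ? 0 : 1;
    }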
+ SmallVector<OperandBundleDef, 1> OpBundles; + for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles(); + BundleIdx != BundleEnd; ++BundleIdx) { + OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx); + if (Bundle.getTagID() == LLVMContext::OB_funclet) + continue; + + OpBundles.emplace_back(Bundle); + } + + if (!BlockColors.empty()) { + const ColorVector &CV = BlockColors.find(&ExitBlock)->second; + assert(CV.size() == 1 && "non-unique color for exit block!"); + BasicBlock *BBColor = CV.front(); + Instruction *EHPad = BBColor->getFirstNonPHI(); + if (EHPad->isEHPad()) + OpBundles.emplace_back("funclet", EHPad); + } + + New = CallInst::Create(CI, OpBundles); + } else { + New = I.clone(); + } + ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); if (!I.getName().empty()) New->setName(I.getName() + ".le"); @@ -566,7 +637,7 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, if (!OLoop->contains(&PN)) { PHINode *OpPN = PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock.begin()); + OInst->getName() + ".lcssa", &ExitBlock.front()); for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); *OI = OpPN; @@ -580,7 +651,8 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST ) { + const Loop *CurLoop, AliasSetTracker *CurAST, + const LICMSafetyInfo *SafetyInfo) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); bool Changed = false; if (isa<LoadInst>(I)) ++NumMovedLoads; @@ -631,7 +703,7 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, New = It->second; else New = SunkCopies[ExitBlock] = - CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI); + CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo); PN->replaceAllUsesWith(New); PN->eraseFromParent(); @@ -651,6 +723,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + I.dropUnknownNonDebugMetadata(); + if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; @@ -699,8 +775,8 @@ static bool isGuaranteedToExecute(const Instruction &Inst, CurLoop->getExitBlocks(ExitBlocks); // Verify that the block dominates each of the exit blocks of the loop. - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) + for (BasicBlock *ExitBlock : ExitBlocks) + if (!DT->dominates(Inst.getParent(), ExitBlock)) return false; // As a degenerate case, if the loop is statically infinite then we haven't @@ -730,9 +806,9 @@ namespace { if (!L->contains(BB)) { // We need to create an LCSSA PHI node for the incoming value and // store that. 
- PHINode *PN = PHINode::Create( - I->getType(), PredCache.size(BB), - I->getName() + ".lcssa", BB->begin()); + PHINode *PN = + PHINode::Create(I->getType(), PredCache.size(BB), + I->getName() + ".lcssa", &BB->front()); for (BasicBlock *Pred : PredCache.get(BB)) PN->addIncoming(I, Pred); return PN; @@ -867,17 +943,17 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // If there is an non-load/store instruction in the loop, we can't promote // it. - if (const LoadInst *load = dyn_cast<LoadInst>(UI)) { - assert(!load->isVolatile() && "AST broken"); - if (!load->isSimple()) + if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) { + assert(!Load->isVolatile() && "AST broken"); + if (!Load->isSimple()) return Changed; - } else if (const StoreInst *store = dyn_cast<StoreInst>(UI)) { + } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (UI->getOperand(1) != ASIV) continue; - assert(!store->isVolatile() && "AST broken"); - if (!store->isSimple()) + assert(!Store->isVolatile() && "AST broken"); + if (!Store->isSimple()) return Changed; // Don't sink stores from loops without dedicated block exits. Exits // containing indirect branches are not transformed by loop simplify, @@ -895,7 +971,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // restrictive (and performant) alignment and if we are sure this // instruction will be executed, update the alignment. // Larger is better, with the exception of 0 being the best alignment. - unsigned InstAlignment = store->getAlignment(); + unsigned InstAlignment = Store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { GuaranteedToExecute = true; @@ -925,6 +1001,21 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, if (!GuaranteedToExecute) return Changed; + // Figure out the loop exits and their insertion points, if this is the + // first promotion. + if (ExitBlocks.empty()) { + CurLoop->getUniqueExitBlocks(ExitBlocks); + InsertPts.clear(); + InsertPts.reserve(ExitBlocks.size()); + for (BasicBlock *ExitBlock : ExitBlocks) + InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); + } + + // Can't insert into a catchswitch. + for (BasicBlock *ExitBlock : ExitBlocks) + if (isa<CatchSwitchInst>(ExitBlock->getTerminator())) + return Changed; + // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); Changed = true; @@ -936,15 +1027,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // location is better than none. DebugLoc DL = LoopUses[0]->getDebugLoc(); - // Figure out the loop exits and their insertion points, if this is the - // first promotion. - if (ExitBlocks.empty()) { - CurLoop->getUniqueExitBlocks(ExitBlocks); - InsertPts.resize(ExitBlocks.size()); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); - } - // We use the SSAUpdater interface to insert phi nodes as required. SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); @@ -973,7 +1055,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; } -/// Simple Analysis hook. Clone alias set info. +/// Simple analysis hook. Clone alias set info. 
/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index c19cd19..1648878 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -56,7 +57,7 @@ class LoadCombine : public BasicBlockPass { public: LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); + initializeLoadCombinePass(*PassRegistry::getPassRegistry()); } using llvm::Pass::doInitialization; @@ -223,7 +224,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (skipOptnoneFunction(BB)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); IRBuilder<true, TargetFolder> TheBuilder( BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); @@ -262,8 +263,8 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } char LoadCombine::ID = 0; @@ -274,7 +275,8 @@ BasicBlockPass *llvm::createLoadCombinePass() { INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 98b068e..7b1940b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" @@ -35,18 +36,19 @@ namespace { } // Possibly eliminate loop L if it is dead. 
- bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -64,7 +66,7 @@ INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", @@ -130,7 +132,7 @@ bool LoopDeletion::isLoopDead(Loop *L, /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -169,7 +171,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; @@ -242,9 +244,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); - // The last step is to inform the loop pass manager that we've - // eliminated this loop. - LPM.deleteLoopFromQueue(L); + // The last step is to update LoopInfo now that we've eliminated this loop. + loopInfo.markAsRemoved(L); Changed = true; ++NumDeleted; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1b9859b..3d3cf3e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <list> @@ -54,6 +55,11 @@ static cl::opt<bool> DistributeNonIfConvertible( "if-convertible by the loop vectorizer"), cl::init(false)); +static cl::opt<unsigned> DistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Distribution")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -164,9 +170,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. 
- for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { - auto *Inst = *I; - + for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -373,7 +377,7 @@ public: /// \brief This performs the main chunk of the work of cloning the loops for /// the partitions. - void cloneLoops(Pass *P) { + void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); // At this point the predecessor of the preheader is either the memcheck // block or the top part of the original preheader. @@ -547,11 +551,11 @@ public: MemoryInstructionDependences( const SmallVectorImpl<Instruction *> &Instructions, - const SmallVectorImpl<Dependence> &InterestingDependences) { + const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); DEBUG(dbgs() << "Backward dependences:\n"); - for (auto &Dep : InterestingDependences) + for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program // order, i.e. source is always first. (The direction is given by the @@ -567,25 +571,6 @@ private: AccessesType Accesses; }; -/// \brief Returns the instructions that use values defined in the loop. -static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { - SmallVector<Instruction *, 8> UsedOutside; - - for (auto *Block : L->getBlocks()) - // FIXME: I believe that this could use copy_if if the Inst reference could - // be adapted into a pointer. - for (auto &Inst : *Block) { - auto Users = Inst.users(); - if (std::any_of(Users.begin(), Users.end(), [&](User *U) { - auto *Use = cast<Instruction>(U); - return !L->contains(Use->getParent()); - })) - UsedOutside.push_back(&Inst); - } - - return UsedOutside; -} - /// \brief The pass class. class LoopDistribute : public FunctionPass { public: @@ -597,6 +582,7 @@ public: LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LAA = &getAnalysis<LoopAccessAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators @@ -619,6 +605,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); @@ -629,6 +616,45 @@ public: static char ID; private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. 
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + /// \brief Try to distribute an inner-most loop. bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -655,9 +681,8 @@ private: DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); return false; } - auto *InterestingDependences = - LAI.getDepChecker().getInterestingDependences(); - if (!InterestingDependences || InterestingDependences->empty()) { + auto *Dependences = LAI.getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) { DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); return false; } @@ -685,7 +710,7 @@ private: // NumUnsafeDependencesActive reaches 0. const MemoryDepChecker &DepChecker = LAI.getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), - *InterestingDependences); + *Dependences); int NumUnsafeDependencesActive = 0; for (auto &InstDep : MID) { @@ -735,6 +760,13 @@ private: return false; } + // Don't distribute the loop if we need too many SCEV run-time checks. + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. @@ -746,20 +778,25 @@ private: if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) SplitBlock(PH, PH->getTerminator(), DT, LI); - // If we need run-time checks to disambiguate pointers are run-time, version - // the loop now. + // If we need run-time checks, version the loop now. 
auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); - if (LVer.needsRuntimeChecks()) { + const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, + RtPtrChecking); + + if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); - LVer.versionLoop(this); - LVer.addPHINodes(DefsUsedOutside); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(LAI, L, LI, DT, SE, false); + LVer.setAliasChecks(std::move(Checks)); + LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.versionLoop(DefsUsedOutside); } // Create identical copies of the original loop for each partition and hook // them up sequentially. - Partitions.cloneLoops(this); + Partitions.cloneLoops(); // Now, we remove the instruction from each loop that don't belong to that // partition. @@ -780,6 +817,7 @@ private: LoopInfo *LI; LoopAccessAnalysis *LAA; DominatorTree *DT; + ScalarEvolution *SE; }; } // anonymous namespace @@ -790,6 +828,7 @@ INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a21ca24..4521640 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@ // void foo(_Complex float *P) // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior. Negative strides *do* happen for memset/memcpy loops. -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // @@ -44,7 +39,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -67,149 +65,87 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { - class LoopIdiomRecognize; +class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + AliasAnalysis *AA; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } - /// This class defines some utility functions for loop idiom recognization. 
- class LIRUtil { - public: - /// Return true iff the block contains nothing but an uncondition branch - /// (aka goto instruction). - static bool isAlmostEmpty(BasicBlock *); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } - static BranchInst *getBranch(BasicBlock *BB) { - return dyn_cast<BranchInst>(BB->getTerminator()); - } +private: + typedef SmallVector<StoreInst *, 8> StoreList; + StoreList StoreRefsForMemset; + StoreList StoreRefsForMemcpy; + bool HasMemset; + bool HasMemsetPattern; + bool HasMemcpy; - /// Derive the precondition block (i.e the block that guards the loop - /// preheader) from the given preheader. - static BasicBlock *getPrecondBb(BasicBlock *PreHead); - }; - - /// This class is to recoginize idioms of population-count conducted in - /// a noncountable loop. Currently it only recognizes this pattern: - /// \code - /// while(x) {cnt++; ...; x &= x - 1; ...} - /// \endcode - class NclPopcountRecognize { - LoopIdiomRecognize &LIR; - Loop *CurLoop; - BasicBlock *PreCondBB; - - typedef IRBuilder<> IRBuilderTy; - - public: - explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); - bool recognize(); - - private: - /// Take a glimpse of the loop to see if we need to go ahead recoginizing - /// the idiom. - bool preliminaryScreen(); - - /// Check if the given conditional branch is based on the comparison - /// between a variable and zero, and if the variable is non-zero, the - /// control yields to the loop entry. If the branch matches the behavior, - /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop - /// are in desirable form. - Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; - - /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the population bit. 2) \p CntPhi - /// is set to the corresponding phi node. 3) \p Var is set to the value - /// whose population bits are being counted. - bool detectIdiom - (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; - - /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); - - /// Create llvm.ctpop.* intrinsic function. 
- CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); - }; - - class LoopIdiomRecognize : public LoopPass { - Loop *CurLoop; - DominatorTree *DT; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DT = nullptr; - SE = nullptr; - TLI = nullptr; - TTI = nullptr; - } + /// \name Countable Loop Idiom Handling + /// @{ - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks); - - bool processLoopStore(StoreInst *SI, const SCEV *BECount); - bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - - bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, - Value *SplatValue, Instruction *TheStore, - const SCEVAddRecExpr *Ev, - const SCEV *BECount); - bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); - DominatorTree *getDominatorTree() { - return DT ? DT - : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); - } + void collectStores(BasicBlock *BB); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - ScalarEvolution *getScalarEvolution() { - return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); - } + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride); + bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); - TargetLibraryInfo *getTargetLibraryInfo() { - if (!TLI) - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + /// @} + /// \name Noncountable Loop Idiom Handling + /// @{ - return TLI; - } + bool runOnNoncountableLoop(); - const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI - : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent())); - } + bool recognizePopcount(); + void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var); - Loop *getLoop() const { return CurLoop; } + /// @} +}; - private: - bool runOnNoncountableLoop(); - bool runOnCountableLoop(); - }; -} +} // End anonymous namespace. 
char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", @@ -218,9 +154,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -242,448 +181,244 @@ static void deleteDeadInstruction(Instruction *I, //===----------------------------------------------------------------------===// // -// Implementation of LIRUtil -// -//===----------------------------------------------------------------------===// - -// This function will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader function is -// "almost" empty such that generated intrinsic functions can be moved across -// the preheader and be placed at the end of the precondition block without -// the concern of breaking data dependence. -bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { - if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && Br == BB->begin(); - } - return false; -} - -BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { - if (BasicBlock *BB = PreHead->getSinglePredecessor()) { - BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : nullptr; - } - return nullptr; -} - -//===----------------------------------------------------------------------===// -// -// Implementation of NclPopcountRecognize +// Implementation of LoopIdiomRecognize // //===----------------------------------------------------------------------===// -NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { -} - -bool NclPopcountRecognize::preliminaryScreen() { - const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); - if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) - return false; - - // Counting population are usually conducted by few arithmetic instructions. - // Such instructions can be easilly "absorbed" by vacant slots in a - // non-compact loop. Therefore, recognizing popcount idiom only makes sense - // in a compact loop. - - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; - - BasicBlock *LoopBody = *(CurLoop->block_begin()); - if (LoopBody->size() >= 20) { - // The loop is too big, bail out. +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) return false; - } - // It should have a preheader containing nothing but a goto instruction. - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) + CurLoop = L; + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. 
+ if (!L->getLoopPreheader()) return false; - // It should have a precondition block where the generated popcount instrinsic - // function will be inserted. - PreCondBB = LIRUtil::getPrecondBb(PreHead); - if (!PreCondBB) + // Disable loop idiom recognition if the function's name is a common idiom. + StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") return false; - return true; -} - -Value *NclPopcountRecognize::matchCondition(BranchInst *Br, - BasicBlock *LoopEntry) const { - if (!Br || !Br->isConditional()) - return nullptr; + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *CurLoop->getHeader()->getParent()); + DL = &CurLoop->getHeader()->getModule()->getDataLayout(); - ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); - if (!Cond) - return nullptr; + HasMemset = TLI->has(LibFunc::memset); + HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); + HasMemcpy = TLI->has(LibFunc::memcpy); - ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); - if (!CmpZero || !CmpZero->isZero()) - return nullptr; - - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) - return Cond->getOperand(0); + if (HasMemset || HasMemsetPattern || HasMemcpy) + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); - return nullptr; + return runOnNoncountableLoop(); } -bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, - PHINode *&CntPhi, - Value *&Var) const { - // Following code tries to detect this idiom: - // - // if (x0 != 0) - // goto loop-exit // the precondition of the loop - // cnt0 = init-val; - // do { - // x1 = phi (x0, x2); - // cnt1 = phi(cnt0, cnt2); - // - // cnt2 = cnt1 + 1; - // ... - // x2 = x1 & (x1 - 1); - // ... - // } while(x != 0); - // - // loop-exit: - // - - // step 1: Check to see if the look-back branch match this pattern: - // "if (a!=0) goto loop-entry". - BasicBlock *LoopEntry; - Instruction *DefX2, *CountInst; - Value *VarX1, *VarX0; - PHINode *PhiX, *CountPhi; - - DefX2 = CountInst = nullptr; - VarX1 = VarX0 = nullptr; - PhiX = CountPhi = nullptr; - LoopEntry = *(CurLoop->block_begin()); - - // step 1: Check if the loop-back branch is in desirable form. 
- { - if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) - DefX2 = dyn_cast<Instruction>(T); - else - return false; - } - - // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" - { - if (!DefX2 || DefX2->getOpcode() != Instruction::And) - return false; - - BinaryOperator *SubOneOp; - - if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) - VarX1 = DefX2->getOperand(1); - else { - VarX1 = DefX2->getOperand(0); - SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); - } - if (!SubOneOp) - return false; - - Instruction *SubInst = cast<Instruction>(SubOneOp); - ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); - if (!Dec || - !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { - return false; - } - } +bool LoopIdiomRecognize::runOnCountableLoop() { + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); + assert(!isa<SCEVCouldNotCompute>(BECount) && + "runOnCountableLoop() called on a loop without a predictable" + "backedge-taken count"); - // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast<PHINode>(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + if (BECst->getAPInt() == 0) return false; - } - } - // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 - { - CountInst = nullptr; - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), - IterE = LoopEntry->end(); Iter != IterE; Iter++) { - Instruction *Inst = Iter; - if (Inst->getOpcode() != Instruction::Add) - continue; - - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); - if (!Inc || !Inc->isOne()) - continue; - - PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) - continue; - - // Check if the result of the instruction is live of the loop. - bool LiveOutLoop = false; - for (User *U : Inst->users()) { - if ((cast<Instruction>(U))->getParent() != LoopEntry) { - LiveOutLoop = true; break; - } - } - - if (LiveOutLoop) { - CountInst = Inst; - CountPhi = Phi; - break; - } - } + SmallVector<BasicBlock *, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); - if (!CountInst) - return false; - } + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() << "] Loop %" + << CurLoop->getHeader()->getName() << "\n"); - // step 5: check if the precondition is in this form: - // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" - { - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); - if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) - return false; + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (auto *BB : CurLoop->getBlocks()) { + // Ignore blocks in subloops. 
+ if (LI->getLoopFor(BB) != CurLoop) + continue; - CntInst = CountInst; - CntPhi = CountPhi; - Var = T; + MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } - - return true; + return MadeChange; } -void NclPopcountRecognize::transform(Instruction *CntInst, - PHINode *CntPhi, Value *Var) { - - ScalarEvolution *SE = LIR.getScalarEvolution(); - TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - const DebugLoc DL = CntInst->getDebugLoc(); - - // Assuming before transformation, the loop is following: - // if (x) // the precondition - // do { cnt++; x &= x - 1; } while(x); - - // Step 1: Insert the ctpop instruction at the end of the precondition block - IRBuilderTy Builder(PreCondBr); - Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; - { - PopCnt = createPopcntIntrinsic(Builder, Var, DL); - NewCount = PopCntZext = - Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); - - if (NewCount != PopCnt) - (cast<Instruction>(NewCount))->setDebugLoc(DL); - - // TripCnt is exactly the number of iterations the loop has - TripCnt = NewCount; - - // If the population counter's initial value is not zero, insert Add Inst. - Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); - ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); - if (!InitConst || !InitConst->isZero()) { - NewCount = Builder.CreateAdd(NewCount, CntInitVal); - (cast<Instruction>(NewCount))->setDebugLoc(DL); - } - } - - // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to - // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic - // function would be partial dead code, and downstream passes will drag - // it back from the precondition block to the preheader. - { - ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); - - Value *Opnd0 = PopCntZext; - Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); - if (PreCond->getOperand(0) != Var) - std::swap(Opnd0, Opnd1); - - ICmpInst *NewPreCond = - cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); - PreCondBr->setCondition(NewPreCond); +static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { + uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); + assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && + "Don't overflow unsigned."); + return (unsigned)SizeInBits >> 3; +} - RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); - } +static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { + const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); + return ConstStride->getAPInt().getZExtValue(); +} - // Step 3: Note that the population count is exactly the trip count of the - // loop in question, which enble us to to convert the loop from noncountable - // loop into a countable one. The benefit is twofold: - // - // - If the loop only counts population, the entire loop become dead after - // the transformation. It is lots easier to prove a countable loop dead - // than to prove a noncountable one. (In some C dialects, a infite loop - // isn't dead even if it computes nothing useful. In general, DCE needs - // to prove a noncountable loop finite before safely delete it.) - // - // - If the loop also performs something else, it remains alive. 
- // Since it is transformed to countable form, it can be aggressively - // optimized by some optimizations which are in general not applicable - // to a noncountable loop. - // - // After this step, this loop (conceptually) would look like following: - // newcnt = __builtin_ctpop(x); - // t = newcnt; - // if (x) - // do { cnt++; x &= x-1; t--) } while (t > 0); - BasicBlock *Body = *(CurLoop->block_begin()); - { - BranchInst *LbBr = LIRUtil::getBranch(Body); - ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); - Type *Ty = TripCnt->getType(); +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; - PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = DL->getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size - 1))) + return nullptr; - Builder.SetInsertPoint(LbCond); - Value *Opnd1 = cast<Value>(TcPhi); - Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); - Instruction *TcDec = - cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); + // Don't care enough about darwin/ppc to implement this. + if (DL->isBigEndian()) + return nullptr; - TcPhi->addIncoming(TripCnt, PreHead); - TcPhi->addIncoming(TcDec, Body); + // Convert to size in bytes. + Size /= 8; - CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? - CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; - LbCond->setPredicate(Pred); - LbCond->setOperand(0, TcDec); - LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); - } + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) + return nullptr; - // Step 4: All the references to the original population counter outside - // the loop are replaced with the NewCount -- the value returned from - // __builtin_ctpop(). - CntInst->replaceUsesOutsideBlock(NewCount, Body); + // If the constant is exactly 16 bytes, just use it. + if (Size == 16) + return C; - // step 5: Forget the "non-computable" trip-count SCEV associated with the - // loop. The loop would otherwise not be deleted even if it becomes empty. - SE->forgetLoop(CurLoop); + // Otherwise, we'll use an array of the constants. 
+ unsigned ArraySize = 16 / Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); } -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, - Value *Val, DebugLoc DL) { - Value *Ops[] = { Val }; - Type *Tys[] = { Val->getType() }; - - Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); - Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); - CI->setDebugLoc(DL); - - return CI; -} +bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, + bool &ForMemcpy) { + // Don't touch volatile stores. + if (!SI->isSimple()) + return false; -/// recognize - detect population count idiom in a non-countable loop. If -/// detected, transform the relevant code to popcount intrinsic function -/// call, and return true; otherwise, return false. -bool NclPopcountRecognize::recognize() { + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); - if (!LIR.getTargetTransformInfo()) + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; - LIR.getScalarEvolution(); - - if (!preliminaryScreen()) + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; - Instruction *CntInst; - PHINode *CntPhi; - Value *Val; - if (!detectIdiom(CntInst, CntPhi, Val)) + // Check to see if we have a constant stride. + if (!isa<SCEVConstant>(StoreEv->getOperand(1))) return false; - transform(CntInst, CntPhi, Val); - return true; -} + // See if the store can be turned into a memset. -//===----------------------------------------------------------------------===// -// -// Implementation of LoopIdiomRecognize -// -//===----------------------------------------------------------------------===// + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = nullptr; -bool LoopIdiomRecognize::runOnCountableLoop() { - const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); - assert(!isa<SCEVCouldNotCompute>(BECount) && - "runOnCountableLoop() called on a loop without a predictable" - "backedge-taken count"); + // If we're allowed to form a memset, and the stored value would be + // acceptable for memset, use it. + if (HasMemset && SplatValue && + // Verify that the stored value is loop invariant. If not, we can't + // promote the memset. + CurLoop->isLoopInvariant(SplatValue)) { + // It looks like we can use SplatValue. + ForMemset = true; + return true; + } else if (HasMemsetPattern && + // Don't create memset_pattern16s with address spaces. + StorePtr->getType()->getPointerAddressSpace() == 0 && + (PatternValue = getMemSetPatternValue(StoredVal, DL))) { + // It looks like we can use PatternValue! 
+ ForMemset = true; + return true; + } - // If this loop executes exactly one time, then it should be peeled, not - // optimized by this pass. - if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - if (BECst->getValue()->getValue() == 0) + // Otherwise, see if the store can be turned into a memcpy. + if (HasMemcpy) { + // Check to see if the stride matches the size of the store. If so, then we + // know that every byte is touched in the loop. + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + if (StoreSize != Stride && StoreSize != -Stride) return false; - // set DT - (void)getDominatorTree(); - - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - - // set TLI - (void)getTargetLibraryInfo(); - - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - - DEBUG(dbgs() << "loop-idiom Scanning: F[" - << CurLoop->getHeader()->getParent()->getName() - << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + // The store must be feeding a non-volatile load. + LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!LI || !LI->isSimple()) + return false; - bool MadeChange = false; - // Scan all the blocks in the loop that are not in subloops. - for (auto *BB : CurLoop->getBlocks()) { - // Ignore blocks in subloops. - if (LI.getLoopFor(BB) != CurLoop) - continue; + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); + if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) + return false; - MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); - } - return MadeChange; -} + // The store and load must share the same stride. + if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) + return false; -bool LoopIdiomRecognize::runOnNoncountableLoop() { - NclPopcountRecognize Popcount(*this); - if (Popcount.recognize()) + // Success. This store can be converted into a memcpy. + ForMemcpy = true; return true; - + } + // This store can't be transformed into a memset/memcpy. return false; } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - CurLoop = L; - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!L->getLoopPreheader()) - return false; +void LoopIdiomRecognize::collectStores(BasicBlock *BB) { + StoreRefsForMemset.clear(); + StoreRefsForMemcpy.clear(); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; - // Disable loop idiom recognition if the function's name is a common idiom. - StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") - return false; + bool ForMemset = false; + bool ForMemcpy = false; + // Make sure this is a strided store with a constant stride. + if (!isLegalStore(SI, ForMemset, ForMemcpy)) + continue; - SE = &getAnalysis<ScalarEvolution>(); - if (SE->hasLoopInvariantBackedgeTakenCount(L)) - return runOnCountableLoop(); - return runOnNoncountableLoop(); + // Save the store locations. 
+ if (ForMemset) + StoreRefsForMemset.push_back(SI); + else if (ForMemcpy) + StoreRefsForMemcpy.push_back(SI); + } } /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. -bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks) { +bool LoopIdiomRecognize::runOnLoopBlock( + BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks) { // We can only promote stores in this block if they are unconditionally // executed in the loop. For a block to be unconditionally executed, it has // to dominate all the exit blocks of the loop. Verify this now. @@ -692,25 +427,24 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return false; bool MadeChange = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - // Look for store instructions, which may be optimized to memset/memcpy. - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopStore(SI, BECount)) continue; - MadeChange = true; + // Look for store instructions, which may be optimized to memset/memcpy. + collectStores(BB); - // If processing the store invalidated our iterator, start over from the - // top of the block. - if (!InstPtr) - I = BB->begin(); - continue; - } + // Look for a single store which can be optimized into a memset. + for (auto &SI : StoreRefsForMemset) + MadeChange |= processLoopStore(SI, BECount); + // Optimize the store into a memcpy, if it feeds an similarly strided load. + for (auto &SI : StoreRefsForMemcpy) + MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; // Look for memset instructions, which may be optimized to a larger memset. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopMemSet(MSI, BECount)) continue; + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(&*I); + if (!processLoopMemSet(MSI, BECount)) + continue; MadeChange = true; // If processing the memset invalidated our iterator, start over from the @@ -724,71 +458,34 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return MadeChange; } - -/// processLoopStore - See if this store can be promoted to a memset or memcpy. +/// processLoopStore - See if this store can be promoted to a memset. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (!SI->isSimple()) return false; + assert(SI->isSimple() && "Expected only non-volatile stores."); Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); - // Reject stores that are so large that they overflow an unsigned. - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); - uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType()); - if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided store. If we have something else, it's a - // random store we can't handle. 
- const SCEVAddRecExpr *StoreEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) - return false; - // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. - unsigned StoreSize = (unsigned)SizeInBits >> 3; - const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - - if (!Stride || StoreSize != Stride->getValue()->getValue()) { - // TODO: Could also handle negative stride here someday, that will require - // the validity check in mayLoopAccessLocation to be updated though. - // Enable this to print exact negative strides. - if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { - dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; - dbgs() << "BB: " << *SI->getParent(); - } - + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + if (StoreSize != Stride && StoreSize != -Stride) return false; - } - // See if we can optimize just this store in isolation. - if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount)) - return true; + bool NegStride = StoreSize == -Stride; - // If the stored value is a strided load in the same loop with the same stride - // this this may be transformable into a memcpy. This kicks in for stuff like - // for (i) A[i] = B[i]; - if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { - const SCEVAddRecExpr *LoadEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); - if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) - if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) - return true; - } - //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; - - return false; + // See if we can optimize just this store in isolation. + return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), + StoredVal, SI, StoreEv, BECount, NegStride); } /// processLoopMemSet - See if this memset can be promoted to a large memset. -bool LoopIdiomRecognize:: -processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, + const SCEV *BECount) { // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + return false; // If we're not allowed to hack on memset, we fail. if (!TLI->has(LibFunc::memset)) @@ -817,18 +514,23 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { if (!Stride || MSI->getLength() != Stride->getValue()) return false; + // Verify that the memset value is loop invariant. If not, we can't promote + // the memset. + Value *SplatValue = MSI->getValue(); + if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) + return false; + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), MSI->getValue(), - MSI, Ev, BECount); + MSI->getAlignment(), SplatValue, MSI, Ev, + BECount, /*NegStride=*/false); } - /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). 
-static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, - Loop *L, const SCEV *BECount, - unsigned StoreSize, AliasAnalysis &AA, +static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, Instruction *IgnoredStore) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts @@ -838,7 +540,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; // TODO: For this to be really effective, we have to dive into the pointer // operand in the store. Store to &A[i] of 100 will always return may alias @@ -849,96 +551,55 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && - (AA.getModRefInfo(I, StoreLoc) & Access)) + if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; } -/// getMemSetPatternValue - If a strided store of the specified value is safe to -/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should -/// be passed in. Otherwise, return null. -/// -/// Note that we don't ever attempt to use memset_pattern8 or 4, because these -/// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { - // If the value isn't a constant, we can't promote it to being in a constant - // array. We could theoretically do a store to an alloca or something, but - // that doesn't seem worthwhile. - Constant *C = dyn_cast<Constant>(V); - if (!C) return nullptr; - - // Only handle simple values that are a power of two bytes in size. - uint64_t Size = DL.getTypeSizeInBits(V->getType()); - if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return nullptr; - - // Don't care enough about darwin/ppc to implement this. - if (DL.isBigEndian()) - return nullptr; - - // Convert to size in bytes. - Size /= 8; - - // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see - // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return nullptr; - - // If the constant is exactly 16 bytes, just use it. - if (Size == 16) return C; - - // Otherwise, we'll use an array of the constants. - unsigned ArraySize = 16/Size; - ArrayType *AT = ArrayType::get(V->getType(), ArraySize); - return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +// If we have a negative stride, Start refers to the end of the memory location +// we're trying to memset. Therefore, we need to recompute the base pointer, +// which is just Start - BECount*Size. 
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, + Type *IntPtr, unsigned StoreSize, + ScalarEvolution *SE) { + const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); + if (StoreSize != 1) + Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + SCEV::FlagNUW); + return SE->getMinusSCEV(Start, Index); } - /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount) { - - // If the stored value is a byte-wise value (like i32 -1), then it may be - // turned into a memset of i8 -1, assuming that all the consecutive bytes - // are stored. A store of i32 0x01020304 can never be turned into a memset, - // but it can be turned into memset_pattern if the target supports it. +bool LoopIdiomRecognize::processLoopStridedStore( + Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); - unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); - // If we're allowed to form a memset, and the stored value would be acceptable - // for memset, use it. - if (SplatValue && TLI->has(LibFunc::memset) && - // Verify that the stored value is loop invariant. If not, we can't - // promote the memset. - CurLoop->isLoopInvariant(SplatValue)) { - // Keep and use SplatValue. - PatternValue = nullptr; - } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && - (PatternValue = getMemSetPatternValue(StoredVal, DL))) { - // Don't create memset_pattern16s with address spaces. - // It looks like we can use PatternValue! - SplatValue = nullptr; - } else { - // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store. - return false; - } + if (!SplatValue) + PatternValue = getMemSetPatternValue(StoredVal, DL); + + assert((SplatValue || PatternValue) && + "Expected either splat value or pattern value."); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + + const SCEV *Start = Ev->getStart(); + // Handle negative strided loops. + if (NegStride) + Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -946,12 +607,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. 
Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, - Preheader->getTerminator()); - - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { + Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); + if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, + *AA, TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -962,36 +620,30 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW); if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); } Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = Builder.CreateMemSet(BasePtr, - SplatValue, - NumBytes, - StoreAlignment); + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getParent()->getParent()->getParent(); - Value *MSP = M->getOrInsertFunction("memset_pattern16", - Builder.getVoidTy(), - Int8PtrTy, - Int8PtrTy, - IntPtr, - (void*)nullptr); + Module *M = TheStore->getModule(); + Value *MSP = + M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), + Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. @@ -1015,26 +667,43 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return true; } -/// processLoopStoreOfLoopLoad - We see a strided store whose value is a -/// same-strided load. -bool LoopIdiomRecognize:: -processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount) { - // If we're not allowed to form memcpy, we fail. - if (!TLI->has(LibFunc::memcpy)) - return false; +/// If the stored value is a strided load in the same loop with the same stride +/// this may be transformable into a memcpy. This kicks in for stuff like +/// for (i) A[i] = B[i]; +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, + const SCEV *BECount) { + assert(SI->isSimple() && "Expected only non-volatile stores."); + Value *StorePtr = SI->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + bool NegStride = StoreSize == -Stride; + + // The store must be feeding a non-volatile load. LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + assert(LI->isSimple() && "Expected only non-volatile stores."); + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. 
If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - const DataLayout &DL = Preheader->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); + + const SCEV *StrStart = StoreEv->getStart(); + unsigned StrAS = SI->getPointerAddressSpace(); + Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + + // Handle negative strided loops. + if (NegStride) + StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1042,29 +711,31 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // or write the memory region we're storing to. This includes the load that // feeds the stores. Check for an alias by generating the base address and // checking everything. - Value *StoreBasePtr = - Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), - Preheader->getTerminator()); - - if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) { + Value *StoreBasePtr = Expander.expandCodeFor( + StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + + if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, + StoreSize, *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } + const SCEV *LdStart = LoadEv->getStart(); + unsigned LdAS = LI->getPointerAddressSpace(); + + // Handle negative strided loops. + if (NegStride) + LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - Value *LoadBasePtr = - Expander.expandCodeFor(LoadEv->getStart(), - Builder.getInt8PtrTy(LI->getPointerAddressSpace()), - Preheader->getTerminator()); + Value *LoadBasePtr = Expander.expandCodeFor( + LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), SI)) { + if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, + *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1074,34 +745,368 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, everything is safe, we can transform this! - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
- Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); if (StoreSize != 1) NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); + Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, + std::min(SI->getAlignment(), LI->getAlignment())); NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - // Okay, the memset has been formed. Zap the original store and anything that + // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } + +bool LoopIdiomRecognize::runOnNoncountableLoop() { + return recognizePopcount(); +} + +/// Check if the given conditional branch is based on the comparison between +/// a variable and zero, and if the variable is non-zero, the control yields to +/// the loop entry. If the branch matches the behavior, the variable involved +/// in the comparion is returned. This function will be called to see if the +/// precondition and postcondition of the loop are in desirable form. +static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); + if (!CmpZero || !CmpZero->isZero()) + return nullptr; + + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) + return Cond->getOperand(0); + + return nullptr; +} + +/// Return true iff the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting the population bit. +/// 2) \p CntPhi is set to the corresponding phi node. +/// 3) \p Var is set to the value whose population bits are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 != 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x1 = phi (x0, x2); +/// cnt1 = phi(cnt0, cnt2); +/// +/// cnt2 = cnt1 + 1; +/// ... +/// x2 = x1 & (x1 - 1); +/// ... +/// } while(x != 0); +/// +/// loop-exit: +/// \endcode +static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, + Instruction *&CntInst, PHINode *&CntPhi, + Value *&Var) { + // step 1: Check to see if the look-back branch match this pattern: + // "if (a!=0) goto loop-entry". + BasicBlock *LoopEntry; + Instruction *DefX2, *CountInst; + Value *VarX1, *VarX0; + PHINode *PhiX, *CountPhi; + + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. 
+ { + if (Value *T = matchCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) + DefX2 = dyn_cast<Instruction>(T); + else + return false; + } + + // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" + { + if (!DefX2 || DefX2->getOpcode() != Instruction::And) + return false; + + BinaryOperator *SubOneOp; + + if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) + VarX1 = DefX2->getOperand(1); + else { + VarX1 = DefX2->getOperand(0); + SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); + } + if (!SubOneOp) + return false; + + Instruction *SubInst = cast<Instruction>(SubOneOp); + ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); + if (!Dec || + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubInst->getOpcode() == Instruction::Add && + Dec->isAllOnesValue()))) { + return false; + } + } + + // step 3: Check the recurrence of variable X + { + PhiX = dyn_cast<PHINode>(VarX1); + if (!PhiX || + (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + return false; + } + } + + // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1 + { + CountInst = nullptr; + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); + if (!Phi || Phi->getParent() != LoopEntry) + continue; + + // Check if the result of the instruction is live out of the loop. + bool LiveOutLoop = false; + for (User *U : Inst->users()) { + if ((cast<Instruction>(U))->getParent() != LoopEntry) { + LiveOutLoop = true; + break; + } + } + + if (LiveOutLoop) { + CountInst = Inst; + CountPhi = Phi; + break; + } + } + + if (!CountInst) + return false; + } + + // step 5: check if the precondition is in this form: + // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" + { + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); + if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) + return false; + + CntInst = CountInst; + CntPhi = CountPhi; + Var = T; + } + + return true; +} + +/// Recognizes a population count idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the popcount intrinsic +/// function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizePopcount() { + if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) + return false; + + // Counting population is usually conducted by a few arithmetic instructions. + // Such instructions can be easily "absorbed" by vacant slots in a + // non-compact loop. Therefore, recognizing the popcount idiom only makes + // sense in a compact loop. + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + BasicBlock *LoopBody = *(CurLoop->block_begin()); + if (LoopBody->size() >= 20) { + // The loop is too big, bail out. + return false; + } + + // It should have a preheader containing nothing but an unconditional branch.
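// The CFG shape being checked below, roughly (editorial sketch, hypothetical
// block names):
//
//   precond:                      ; PreCondBB, ends in a conditional branch
//     %guard = icmp ne i32 %x, 0
//     br i1 %guard, label %ph, label %exit
//   ph:                           ; preheader: a lone unconditional branch
//     br label %body
//   body:                         ; the single-block loop
//     ...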
+ BasicBlock *PH = CurLoop->getLoopPreheader(); + if (!PH) + return false; + if (&PH->front() != PH->getTerminator()) + return false; + auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); + if (!EntryBI || EntryBI->isConditional()) + return false; + + // It should have a precondition block where the generated popcount intrinsic + // function can be inserted. + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + if (!PreCondBI || PreCondBI->isUnconditional()) + return false; + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) + return false; + + transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); + return true; +} + +static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + DebugLoc DL) { + Value *Ops[] = {Val}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, + Instruction *CntInst, + PHINode *CntPhi, Value *Var) { + BasicBlock *PreHead = CurLoop->getLoopPreheader(); + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + const DebugLoc DL = CntInst->getDebugLoc(); + + // Assuming that before the transformation, the loop looks like: + // if (x) // the precondition + // do { cnt++; x &= x - 1; } while(x); + + // Step 1: Insert the ctpop instruction at the end of the precondition block + IRBuilder<> Builder(PreCondBr); + Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; + { + PopCnt = createPopcntIntrinsic(Builder, Var, DL); + NewCount = PopCntZext = + Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); + + if (NewCount != PopCnt) + (cast<Instruction>(NewCount))->setDebugLoc(DL); + + // TripCnt is exactly the number of iterations the loop has. + TripCnt = NewCount; + + // If the population counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) { + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + (cast<Instruction>(NewCount))->setDebugLoc(DL); + } + } + + // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to + // "if (NewCount == 0) loop-exit". Without this change, the intrinsic + // function would be partially dead code, and downstream passes will drag + // it back from the precondition block to the preheader. + { + ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); + + Value *Opnd0 = PopCntZext; + Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); + if (PreCond->getOperand(0) != Var) + std::swap(Opnd0, Opnd1); + + ICmpInst *NewPreCond = cast<ICmpInst>( + Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); + PreCondBr->setCondition(NewPreCond); + + RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); + } + + // Step 3: Note that the population count is exactly the trip count of the + // loop in question, which enables us to convert the loop from a noncountable + // loop into a countable one. The benefit is twofold: + // + // - If the loop only counts population, the entire loop becomes dead after + // the transformation.
It is a lot easier to prove a countable loop dead + // than to prove a noncountable one. (In some C dialects, an infinite loop + // isn't dead even if it computes nothing useful. In general, DCE needs + // to prove a noncountable loop finite before safely deleting it.) + // + // - If the loop also performs something else, it remains alive. + // Since it is transformed to countable form, it can be aggressively + // optimized by optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like the following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--; } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = TripCnt->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast<Instruction>( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(TripCnt, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // Step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index e125026..b4102fe 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; @@ -112,7 +112,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify instructions in the current basic block. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9d7e57f..4295235 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -99,7 +99,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; if (St && !St->isSimple()) return false; - MemInstr.push_back(I); + MemInstr.push_back(&*I); } } @@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } } - // We don't have a DepMatrix to check legality return false + // We don't have a DepMatrix to check legality; return false.
if (DepMatrix.size() == 0) return false; return true; @@ -331,9 +331,9 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInterchange *Pass) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), - InnerLoopHasReduction(false) {} + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -357,9 +357,10 @@ private: Loop *OuterLoop; Loop *InnerLoop; - /// Scev analysis. ScalarEvolution *SE; - LoopInterchange *CurrentPass; + LoopInfo *LI; + DominatorTree *DT; + bool PreserveLCSSA; bool InnerLoopHasReduction; }; @@ -371,7 +372,7 @@ public: LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} - /// Check if the loop interchange is profitable + /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); @@ -385,12 +386,12 @@ private: ScalarEvolution *SE; }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop. class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - LoopInterchange *Pass, BasicBlock *LoopNestExit, + BasicBlock *LoopNestExit, bool InnerLoopContainsReductions) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LoopExit(LoopNestExit), @@ -424,21 +425,22 @@ private: bool InnerLoopHasReduction; }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; LoopInfo *LI; DependenceAnalysis *DA; DominatorTree *DT; + bool PreserveLCSSA; LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DependenceAnalysis>(); @@ -447,11 +449,13 @@ struct LoopInterchange : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DependenceAnalysis>(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + // Build up a worklist of loop pairs to analyze. SmallVector<LoopVector, 8> Worklist; @@ -489,7 +493,7 @@ struct LoopInterchange : public FunctionPass { unsigned selectLoopForInterchange(LoopVector LoopList) { // TODO: Add a better heuristic to select the loop to be interchanged based - // on the dependece matrix. Currently we select the innermost loop. + // on the dependence matrix. Currently we select the innermost loop. 
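// For reference (editorial sketch): the classic shape the pass targets is a
// strided inner access under row-major layout, e.g.
//
//   for (int i = 0; i < N; ++i)       // outer
//     for (int j = 0; j < M; ++j)     // inner, stride of a full row
//       A[j][i] += B[j][i];
//
// Interchanging i and j makes the innermost accesses contiguous and improves
// cache reuse, provided the dependence matrix proves the swap legal.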
return LoopList.size() - 1; } @@ -544,7 +548,7 @@ struct LoopInterchange : public FunctionPass { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best posible position. + // Move the selected loop outwards to the best possible position. for (unsigned i = SelecLoopId; i > 0; i--) { bool Interchanged = processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); @@ -574,7 +578,8 @@ struct LoopInterchange : public FunctionPass { Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; - LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, + PreserveLCSSA); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); return false; @@ -586,7 +591,7 @@ struct LoopInterchange : public FunctionPass { return false; } - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); DEBUG(dbgs() << "Loops interchanged\n"); @@ -655,7 +660,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n"); // We do not have any basic block in between; now make sure the outer header - // and outer loop latch doesnt contain any unsafe instructions. + // and outer loop latch don't contain any unsafe instructions. if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || containsUnsafeInstructionsInLatch(OuterLoopLatch)) return false; @@ -698,9 +703,9 @@ bool LoopInterchangeLegality::findInductionAndReductions( return false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { RecurrenceDescriptor RD; + InductionDescriptor ID; PHINode *PHI = cast<PHINode>(I); - ConstantInt *StepValue = nullptr; - if (isInductionPHI(PHI, SE, StepValue)) + if (InductionDescriptor::isInductionPHI(PHI, SE, ID)) Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); @@ -836,7 +841,7 @@ bool LoopInterchangeLegality::currentLimitations() { else FoundInduction = true; } - // The loop latch ended and we didnt find the induction variable; return as + // The loop latch ended and we didn't find the induction variable; return as // a current limitation. if (!FoundInduction) return true; @@ -867,12 +872,14 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || isa<PHINode>(OuterLoopPreHeader->begin()) || !OuterLoopPreHeader->getUniquePredecessor()) { - OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); + OuterLoopPreHeader = + InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); } if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || InnerLoopPreHeader == OuterLoop->getHeader()) { - InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); + InnerLoopPreHeader = + InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); } // TODO: The loops could not be interchanged due to current limitations in the @@ -966,7 +973,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add Better Profitibility checks. + // TODO: Add better profitability checks.
// e.g. // 1) Construct dependency matrix and move the one with no loop carried dep // inside to enable vectorization. @@ -980,7 +987,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, if (Cost < 0) return true; - // It is not profitable as per current cache profitibility model. But check if + // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. bool ImprovesPar = isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix); @@ -996,7 +1003,7 @@ void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, return; } } - assert(false && "Couldn't find loop"); + llvm_unreachable("Couldn't find loop"); } void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, @@ -1045,7 +1052,7 @@ bool LoopInterchangeTransform::transform() { splitInnerLoopLatch(InnerIndexVar); DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); - // Splits the inner loops phi nodes out into a seperate basic block. + // Splits the inner loop's phi nodes out into a separate basic block. splitInnerLoopHeader(); DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); } @@ -1113,8 +1120,8 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { auto &ToList = InsertBefore->getParent()->getInstList(); auto &FromList = FromBB->getInstList(); - ToList.splice(InsertBefore, FromList, FromList.begin(), - FromBB->getTerminator()); + ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), + FromBB->getTerminator()->getIterator()); } void LoopInterchangeTransform::adjustOuterLoopPreheader() { @@ -1181,8 +1188,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) return false; - BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor(); - if (!InnerLoopHeaderSucessor) + BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); + if (!InnerLoopHeaderSuccessor) return false; // Adjust Loop Preheader and headers @@ -1198,11 +1205,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) OuterLoopHeaderBI->setSuccessor(i, LoopExit); else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) - OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor); + OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); } // Adjust reduction PHI's now that the incoming block has changed.
- updateIncomingBlock(InnerLoopHeaderSucessor, InnerLoopHeader, + updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, OuterLoopHeader); BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); @@ -1286,10 +1293,10 @@ bool LoopInterchangeTransform::adjustLoopLinks() { char LoopInterchange::ID = 0; INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp new file mode 100644 index 0000000..1064d08 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -0,0 +1,566 @@ +//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a loop-aware load elimination pass. +// +// It uses LoopAccessAnalysis to identify loop-carried dependences with a +// distance of one between stores and loads. These form the candidates for the +// transformation. The source value of each store is then propagated to the +// user of the corresponding load. This makes the load dead. +// +// The pass can also version the loop and add memchecks in order to prove that +// may-aliasing stores can't change the value in memory before it's read by the +// load. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include <forward_list> + +#define LLE_OPTION "loop-load-elim" +#define DEBUG_TYPE LLE_OPTION + +using namespace llvm; + +static cl::opt<unsigned> CheckPerElim( + "runtime-check-per-loop-load-elim", cl::Hidden, + cl::desc("Max number of memchecks allowed per eliminated load on average"), + cl::init(1)); + +static cl::opt<unsigned> LoadElimSCEVCheckThreshold( + "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Load Elimination")); + + +STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); + +namespace { + +/// \brief Represent a store-to-load forwarding candidate. +struct StoreToLoadForwardingCandidate { + LoadInst *Load; + StoreInst *Store; + + StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) + : Load(Load), Store(Store) {} + + /// \brief Return true if the dependence from the store to the load has a + /// distance of one. E.g.
A[i+1] = A[i] + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + Value *LoadPtr = Load->getPointerOperand(); + Value *StorePtr = Store->getPointerOperand(); + Type *LoadPtrType = LoadPtr->getType(); + Type *LoadType = LoadPtrType->getPointerElementType(); + + assert(LoadPtrType->getPointerAddressSpace() == + StorePtr->getType()->getPointerAddressSpace() && + LoadType == StorePtr->getType()->getPointerElementType() && + "Should be a known dependence"); + + auto &DL = Load->getParent()->getModule()->getDataLayout(); + unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); + + auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); + auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); + + // We don't need to check non-wrapping here because forward/backward + // dependence wouldn't be valid if these weren't monotonic accesses. + auto *Dist = cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + const APInt &Val = Dist->getAPInt(); + return Val.abs() == TypeByteSize; + } + + Value *getLoadPtr() const { return Load->getPointerOperand(); } + +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, + const StoreToLoadForwardingCandidate &Cand) { + OS << *Cand.Store << " -->\n"; + OS.indent(2) << *Cand.Load << "\n"; + return OS; + } +#endif +}; + +/// \brief Check if the store dominates all latches, so as long as there is no +/// intervening store this value will be loaded in the next iteration. +bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, + DominatorTree *DT) { + SmallVector<BasicBlock *, 8> Latches; + L->getLoopLatches(Latches); + return std::all_of(Latches.begin(), Latches.end(), + [&](const BasicBlock *Latch) { + return DT->dominates(StoreBlock, Latch); + }); +} + +/// \brief The per-loop class that does most of the work. +class LoadEliminationForLoop { +public: + LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, + DominatorTree *DT) + : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} + + /// \brief Look through the loop-carried and loop-independent dependences in + /// this loop and find store->load dependences. + /// + /// Note that no candidate is returned if LAA has failed to analyze the loop + /// (e.g. if it's not bottom-tested, contains volatile memops, etc.) + std::forward_list<StoreToLoadForwardingCandidate> + findStoreToLoadDependences(const LoopAccessInfo &LAI) { + std::forward_list<StoreToLoadForwardingCandidate> Candidates; + + const auto *Deps = LAI.getDepChecker().getDependences(); + if (!Deps) + return Candidates; + + // Find store->load dependences (consequently true dep). Both lexically + // forward and backward dependences qualify. Disqualify loads that have + // other unknown dependences. + + SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; + + for (const auto &Dep : *Deps) { + Instruction *Source = Dep.getSource(LAI); + Instruction *Destination = Dep.getDestination(LAI); + + if (Dep.Type == MemoryDepChecker::Dependence::Unknown) { + if (isa<LoadInst>(Source)) + LoadsWithUnknownDepedence.insert(Source); + if (isa<LoadInst>(Destination)) + LoadsWithUnknownDepedence.insert(Destination); + continue; + } + + if (Dep.isBackward()) + // Note that the designations source and destination follow the program + // order, i.e. source is always first. (The direction is given by the + // DepType.) 
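// Editorial sketch of the two shapes, with hypothetical code:
//
//   A[i]     = ...;    // store (source)
//   ...      = A[i];   // load (destination)   -> lexically forward
//
//   ...      = A[i];   // load (source)
//   A[i + 1] = ...;    // store (destination)  -> backward, carried over the
//                      //    backedge; swapped below so the store is Source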
+ std::swap(Source, Destination); + else + assert(Dep.isForward() && "Needs to be a forward dependence"); + + auto *Store = dyn_cast<StoreInst>(Source); + if (!Store) + continue; + auto *Load = dyn_cast<LoadInst>(Destination); + if (!Load) + continue; + Candidates.emplace_front(Load, Store); + } + + if (!LoadsWithUnknownDepedence.empty()) + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) { + return LoadsWithUnknownDepedence.count(C.Load); + }); + + return Candidates; + } + + /// \brief Return the index of the instruction according to program order. + unsigned getInstrIndex(Instruction *Inst) { + auto I = InstOrder.find(Inst); + assert(I != InstOrder.end() && "No index for instruction"); + return I->second; + } + + /// \brief If a load has multiple candidates associated (i.e. different + /// stores), it means that it could be forwarding from multiple stores + /// depending on control flow. Remove these candidates. + /// + /// Here, we rely on LAA to include the relevant loop-independent dependences. + /// LAA is known to omit these in the very simple case when the read and the + /// write within an alias set always take place using the *same* pointer. + /// + /// However, we know that this is not the case here, i.e. we can rely on LAA + /// to provide us with loop-independent dependences for the cases we're + /// interested in. Consider the case for example where a loop-independent + /// dependence S1->S2 invalidates the forwarding S3->S2. + /// + /// A[i] = ... (S1) + /// ... = A[i] (S2) + /// A[i+1] = ... (S3) + /// + /// LAA will perform dependence analysis here because there are two + /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]). + void removeDependencesFromMultipleStores( + std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { + // If Store is nullptr it means that we have multiple stores forwarding to + // this load. + typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> + LoadToSingleCandT; + LoadToSingleCandT LoadToSingleCand; + + for (const auto &Cand : Candidates) { + bool NewElt; + LoadToSingleCandT::iterator Iter; + + std::tie(Iter, NewElt) = + LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand)); + if (!NewElt) { + const StoreToLoadForwardingCandidate *&OtherCand = Iter->second; + // Already multiple stores forward to this load. + if (OtherCand == nullptr) + continue; + + // Handle the very basic case where the two stores are in the same + // block, so deciding which one forwards is easy. The later one forwards + // as long as they both have a dependence distance of one to the load. + if (Cand.Store->getParent() == OtherCand->Store->getParent() && + Cand.isDependenceDistanceOfOne(PSE) && + OtherCand->isDependenceDistanceOfOne(PSE)) { + // They are in the same block; the later one will forward to the load. + if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) + OtherCand = &Cand; + } else + OtherCand = nullptr; + } + } + + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { + if (LoadToSingleCand[Cand.Load] != &Cand) { + DEBUG(dbgs() << "Removing from candidates: \n" << Cand + << " The load may have multiple stores forwarding to " + << "it\n"); + return true; + } + return false; + }); + } + + /// \brief Given two pointer operations by their RuntimePointerChecking + /// indices, return true if they require an alias check. + /// + /// We need a check if one is a pointer for a candidate load and the other is + /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, + const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, + const std::set<Value *> &CandLoadPtrs) { + Value *Ptr1 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue; + Value *Ptr2 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue; + return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) || + (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1))); + } + + /// \brief Return pointers that are possibly written to on the path from a + /// forwarding store to a load. + /// + /// These pointers need to be alias-checked against the forwarding candidates. + SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + // From FirstStore to LastLoad neither of the elimination candidate loads + // should overlap with any of the stores. + // + // E.g.: + // + // st1 C[i] + // ld1 B[i] <-------, + // ld0 A[i] <----, | * LastLoad + // ... | | + // st2 E[i] | | + // st3 B[i+1] -- | -' * FirstStore + // st0 A[i+1] ---' + // st4 D[i] + // + // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with + // ld0. + + LoadInst *LastLoad = + std::max_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Load) < getInstrIndex(B.Load); + }) + ->Load; + StoreInst *FirstStore = + std::min_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Store) < + getInstrIndex(B.Store); + }) + ->Store; + + // We're looking for stores after the first forwarding store until the end + // of the loop, then from the beginning of the loop until the last + // forwarded-to load. Collect the pointers of these stores. + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; + + auto InsertStorePtr = [&](Instruction *I) { + if (auto *S = dyn_cast<StoreInst>(I)) + PtrsWrittenOnFwdingPath.insert(S->getPointerOperand()); + }; + const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions(); + std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1, + MemInstrs.end(), InsertStorePtr); + std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)], + InsertStorePtr); + + return PtrsWrittenOnFwdingPath; + } + + /// \brief Determine the pointer alias checks to prove that there are no + /// intervening stores. + SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = + findPointersWrittenOnForwardingPath(Candidates); + + // Collect the pointers of the candidate loads. + // FIXME: SmallSet does not work with std::inserter.
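// (Editorial note on the FIXME above: std::inserter yields a
// std::insert_iterator whose assignment calls c.insert(pos, value), an
// overload std::set provides but llvm::SmallSet does not. A minimal sketch:
//
//   std::set<int> S;
//   *std::inserter(S, S.begin()) = 42;  // OK: set::insert(const_iterator, v)
//   llvm::SmallSet<int, 4> SS;          // only insert(const T &) and no
//                                       // iterators, so std::inserter(SS,
//                                       // ...) would not compile.)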
+ std::set<Value *> CandLoadPtrs; + std::transform(Candidates.begin(), Candidates.end(), + std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), + std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr)); + + const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + if (needsChecking(PtrIdx1, PtrIdx2, + PtrsWrittenOnFwdingPath, CandLoadPtrs)) + return true; + return false; + }); + + DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + + return Checks; + } + + /// \brief Perform the transformation for a candidate. + void + propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, + SCEVExpander &SEE) { + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + Value *Ptr = Cand.Load->getPointerOperand(); + auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr)); + auto *PH = L->getLoopPreheader(); + Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), + PH->getTerminator()); + Value *Initial = + new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); + PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", + &L->getHeader()->front()); + PHI->addIncoming(Initial, PH); + PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch()); + + Cand.Load->replaceAllUsesWith(PHI); + } + + /// \brief Top-level driver for each loop: find store->load forwarding + /// candidates, add run-time checks and perform transformation. + bool processLoop() { + DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); + // Look for store-to-load forwarding cases across the + // backedge. E.g.: + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + // First start with store->load dependences. + auto StoreToLoadDependences = findStoreToLoadDependences(LAI); + if (StoreToLoadDependences.empty()) + return false; + + // Generate an index for each load and store according to the original + // program order. This will be used later. + InstOrder = LAI.getDepChecker().generateInstructionOrderMap(); + + // To keep things simple for now, remove those where the load is potentially + // fed by multiple stores. + removeDependencesFromMultipleStores(StoreToLoadDependences); + if (StoreToLoadDependences.empty()) + return false; + + // Filter the candidates further. + SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; + unsigned NumForwarding = 0; + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { + DEBUG(dbgs() << "Candidate " << Cand); + // Make sure that the stored value is available everywhere in the loop in + // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) + continue; + + // Check whether the SCEV difference is the same as the induction step, + // thus we load the value in the next iteration. + if (!Cand.isDependenceDistanceOfOne(PSE)) + continue; + + ++NumForwarding; + DEBUG(dbgs() + << NumForwarding + << ". Valid store-to-load forwarding across the loop backedge\n"); + Candidates.push_back(Cand); + } + if (Candidates.empty()) + return false; + + // Check intervening may-alias stores. These need runtime checks for alias + // disambiguation. + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks = + collectMemchecks(Candidates); + + // Too many checks are likely to outweigh the benefits of forwarding. + if (Checks.size() > Candidates.size() * CheckPerElim) { + DEBUG(dbgs() << "Too many run-time checks needed.\n"); + return false; + } + + if (LAI.PSE.getUnionPredicate().getComplexity() > + LoadElimSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + + // Point of no return: start the transformation. First, version the loop if + // necessary. + if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { + LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); + LV.setAliasChecks(std::move(Checks)); + LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LV.versionLoop(); + } + + // Next, propagate the value stored by the store to the users of the load. + // Also for the first iteration, generate the initial value of the load. + SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), + "storeforward"); + for (const auto &Cand : Candidates) + propagateStoredValueToLoadUsers(Cand, SEE); + NumLoopLoadEliminted += NumForwarding; + + return true; + } + +private: + Loop *L; + + /// \brief Maps the load/store instructions to their index according to + /// program order. + DenseMap<Instruction *, unsigned> InstOrder; + + // Analyses used. + LoopInfo *LI; + const LoopAccessInfo &LAI; + DominatorTree *DT; + PredicatedScalarEvolution PSE; +}; + +/// \brief The pass. Most of the work is delegated to the per-loop +/// LoadEliminationForLoop class. +class LoopLoadElimination : public FunctionPass { +public: + LoopLoadElimination() : FunctionPass(ID) { + initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *LAA = &getAnalysis<LoopAccessAnalysis>(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + // Build up a worklist of inner loops to transform. This is necessary as + // the act of versioning a loop creates new loops and can invalidate + // iterators across the loops. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + // The actual work is performed by LoadEliminationForLoop. + LoadEliminationForLoop LEL(L, LI, LAI, DT); + Changed |= LEL.processLoop(); + } + + // All worklist loops have now been processed.
+ return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + static char ID; +}; +} + +char LoopLoadElimination::ID; +static const char LLE_name[] = "Loop Load Elimination"; + +INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) + +namespace llvm { +FunctionPass *createLoopLoadEliminationPass() { + return new LoopLoadElimination(); +} +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index ed103e6..27c2d88 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -147,12 +147,12 @@ namespace { bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -162,11 +162,15 @@ namespace { ScalarEvolution *SE; TargetLibraryInfo *TLI; DominatorTree *DT; + bool PreserveLCSSA; typedef SmallVector<Instruction *, 16> SmallInstructionVector; typedef SmallSet<Instruction *, 16> SmallInstructionSet; - // A chain of isomorphic instructions, indentified by a single-use PHI, + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> IVToIncMap; + + // A chain of isomorphic instructions, identified by a single-use PHI // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { @@ -300,22 +304,6 @@ namespace { // The functions below can be called after we've finished processing all // instructions in the loop, and we know which reductions were selected. - // Is the provided instruction the PHI of a reduction selected for - // rerolling? 
- bool isSelectedPHI(Instruction *J) { - if (!isa<PHINode>(J)) - return false; - - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - if (cast<Instruction>(J) == PossibleReds[i].getPHI()) - return true; - } - - return false; - } - bool validateSelected(); void replaceSelected(); @@ -335,7 +323,7 @@ namespace { // x[i*3+1] = y2 // x[i*3+2] = y3 // - // Base instruction -> i*3 + // Base instruction -> i*3 // +---+----+ // / | \ // ST[y1] +1 +2 <-- Roots @@ -366,8 +354,11 @@ namespace { struct DAGRootTracker { DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} + TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, + DenseMap<Instruction *, int64_t> &IncrMap) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), + PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {} /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); @@ -413,11 +404,14 @@ namespace { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; + DominatorTree *DT; + LoopInfo *LI; + bool PreserveLCSSA; // The loop induction variable. Instruction *IV; // Loop step amount. - uint64_t Inc; + int64_t Inc; // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. @@ -430,6 +424,8 @@ namespace { // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> &IVToIncMap; }; void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); @@ -442,10 +438,10 @@ namespace { char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -477,21 +473,20 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (const SCEVAddRecExpr *PHISCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) { if (PHISCEV->getLoop() != L) continue; if (!PHISCEV->isAffine()) continue; if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { - if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + const APInt &AInt = IncSCEV->getAPInt().abs(); + if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) continue; - if (IncSCEV->getValue()->uge(MaxInc)) - continue; - - DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << - *PHISCEV << "\n"); - PossibleIVs.push_back(I); + IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV + << "\n"); + PossibleIVs.push_back(&*I); } } } @@ -552,7 +547,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, if (!I->getType()->isSingleValueType()) continue; - SimpleLoopReduction SLR(I, L); + SimpleLoopReduction SLR(&*I, L); if (!SLR.valid()) continue; @@ -699,17 +694,11 @@ collectPossibleRoots(Instruction *Base, 
std::map<int64_t,Instruction*> &Roots) { } } - int64_t V = CI->getValue().getSExtValue(); + int64_t V = std::abs(CI->getValue().getSExtValue()); if (Roots.find(V) != Roots.end()) // No duplicates, please. return false; - // FIXME: Add support for negative values. - if (V < 0) { - DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); - return false; - } - Roots[V] = cast<Instruction>(I); } @@ -731,7 +720,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { unsigned NumBaseUses = BaseUsers.size(); if (NumBaseUses == 0) NumBaseUses = Roots.begin()->second->getNumUses(); - + // Check that every node has the same number of users. for (auto &KV : Roots) { if (KV.first == 0) @@ -744,7 +733,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { } } - return true; + return true; } bool LoopReroll::DAGRootTracker:: @@ -787,7 +776,7 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { if (!collectPossibleRoots(IVU, V)) return false; - // If we didn't get a root for index zero, then IVU must be + // If we didn't get a root for index zero, then IVU must be // subsumed. if (V.find(0) == V.end()) SubsumedInsts.insert(IVU); @@ -818,13 +807,10 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { } bool LoopReroll::DAGRootTracker::findRoots() { - - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); + Inc = IVToIncMap[IV]; assert(RootSets.empty() && "Unclean state!"); - if (Inc == 1) { + if (std::abs(Inc) == 1) { for (auto *IVU : IV->users()) { if (isLoopIncrement(IVU, IV)) LoopIncs.push_back(cast<Instruction>(IVU)); @@ -996,6 +982,25 @@ bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, return false; } +static bool isIgnorableInst(const Instruction *I) { + if (isa<DbgInfoIntrinsic>(I)) + return true; + const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + default: + return false; + case llvm::Intrinsic::annotation: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // TODO: the following intrinsics may also be whitelisted: + // lifetime_start, lifetime_end, invariant_start, invariant_end + return true; + } + return false; +} + bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal @@ -1029,7 +1034,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // Make sure all instructions in the loop are in one and only one // set. for (auto &KV : Uses) { - if (KV.second.count() != 1) { + if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " << *KV.first << " (#uses=" << KV.second.count() << ")\n"); return false; @@ -1103,15 +1108,15 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { " vs. " << *RootInst << "\n"); return false; } - + RootIt = TryIt; RootInst = TryIt->first; } // All instructions between the last root and this root - // may belong to some other iteration. If they belong to a + // may belong to some other iteration. If they belong to a // future iteration, then they're dangerous to alias with. 
- // + // // Note that because we allow a limited amount of flexibility in the order // that we visit nodes, LastRootIt might be *before* RootIt, in which // case we've already checked this set of instructions so we shouldn't @@ -1267,6 +1272,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } + bool Negative = IVToIncMap[IV] < 0; const DataLayout &DL = Header->getModule()->getDataLayout(); // We need to create a new induction variable for each different BaseInst. @@ -1275,13 +1281,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); const SCEV *Start = RealIVSCEV->getStart(); - const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> - (SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( + Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, + SCEV::FlagAnyWrap)); { // Limit the lifetime of SCEVExpander. SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front()); for (auto &KV : Uses) { if (KV.second.find_first() == 0) @@ -1294,8 +1299,8 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + const SCEV *ICMinus1SCEV = SE->getMinusSCEV( + ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1)); Value *ICMinus1; // Iteration count minus 1 if (isa<SCEVConstant>(ICMinus1SCEV)) { @@ -1303,7 +1308,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { } else { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) - Preheader = InsertPreheaderForLoop(L, Parent); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), Preheader->getTerminator()); @@ -1444,13 +1449,14 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, + IVToIncMap); if (!DAGRoots.findRoots()) return false; DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV << "\n"); - + if (!DAGRoots.validate(Reductions)) return false; if (!Reductions.validateSelected()) @@ -1469,11 +1475,12 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << @@ -1490,13 +1497,13 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; const SCEV *LIBETC = SE->getBackedgeTakenCount(L); - const SCEV *IterCount = - 
SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). SmallInstructionVector PossibleIVs; + IVToIncMap.clear(); collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a675e12..5e6c2da 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,11 +13,15 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -41,95 +45,6 @@ DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { - - class LoopRotate : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { - initializeLoopRotatePass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); - } - - // LCSSA form makes instruction renaming easier. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L, bool SimplifiedLatch); - - private: - unsigned MaxHeaderSize; - LoopInfo *LI; - const TargetTransformInfo *TTI; - AssumptionCache *AC; - DominatorTree *DT; - }; -} - -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotate(MaxHeaderSize); -} - -/// Rotate Loop L as many times as possible. Return true if -/// the loop is rotated at least once. -bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - // Save the loop metadata. 
- MDNode *LoopMD = L->getLoopID(); - - Function &F = *L->getHeader()->getParent(); - - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); - - // One loop can be rotated multiple times. - bool MadeChange = false; - while (rotateLoop(L, SimplifiedLatch)) { - MadeChange = true; - SimplifiedLatch = false; - } - - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) - L->setLoopID(LoopMD); - - return MadeChange; -} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by @@ -147,7 +62,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // as necessary. SSAUpdater SSA; for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = I; + Value *OrigHeaderVal = &*I; // If there are no uses of the value (e.g. because it returns void), there // is nothing to rewrite. @@ -196,127 +111,6 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; - - if (!L->getExitingBlock()) - MultiExitLoop = true; - - for (BasicBlock::iterator I = Begin; I != End; ++I) { - - if (!isSafeToSpeculativelyExecute(I)) - return false; - - if (isa<DbgInfoIntrinsic>(I)) - continue; - - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) - ? I->getOperand(1) - : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast<Instruction>(UseI); - if (!L->contains(UserInst)) - return false; - } - } - - if (seenIncrement) - return false; - seenIncrement = true; - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; - } - } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. 
In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; - - BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); - if (!BI) - return false; - - if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) - return false; - - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); - - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; -} - /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -327,7 +121,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, + bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -382,7 +179,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + if (SE) SE->forgetLoop(L); DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); @@ -420,7 +217,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); while (I != E) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // If the instruction's operands are invariant and it doesn't read or write // memory, then it is safe to hoist. Doing this doesn't change the order of @@ -465,8 +262,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. 
TerminatorInst *TI = OrigHeader->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); @@ -607,3 +404,221 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { ++NumRotated; return true; } + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast<Instruction>(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. 
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L as many times as possible. Return true if the loop is rotated +/// at least once. +static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, + AssumptionCache *AC, DominatorTree *DT, + ScalarEvolution *SE) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { + MadeChange = true; + SimplifiedLatch = false; + } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange; +} + +namespace { + +class LoopRotate : public LoopPass { + unsigned MaxHeaderSize; + +public: + static char ID; // Pass ID, replacement for typeid + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); + } + + // LCSSA form makes instruction renaming easier. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipOptnoneFunction(L)) + return false; + Function &F = *L->getHeader()->getParent(); + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; + + return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + } +}; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4b59f3d..2101225 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -105,10 +105,33 @@ static bool StressIVChain = false; namespace { -/// RegSortData - This class holds data which is used to order reuse candidates. +struct MemAccessTy { + /// Used in situations where the accessed memory type is unknown. + static const unsigned UnknownAddressSpace = ~0u; + + Type *MemTy; + unsigned AddrSpace; + + MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} + + MemAccessTy(Type *Ty, unsigned AS) : + MemTy(Ty), AddrSpace(AS) {} + + bool operator==(MemAccessTy Other) const { + return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; + } + + bool operator!=(MemAccessTy Other) const { return !(*this == Other); } + + static MemAccessTy getUnknown(LLVMContext &Ctx) { + return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace); + } +}; + +/// This class holds data which is used to order reuse candidates. class RegSortData { public: - /// UsedByIndices - This represents the set of LSRUse indices which reference + /// This represents the set of LSRUse indices which reference /// a particular register. SmallBitVector UsedByIndices; @@ -122,16 +145,14 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// RegUseTracker - Map register candidates to information about how they are -/// used. +/// Map register candidates to information about how they are used. 
class RegUseTracker { typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; @@ -139,9 +160,9 @@ class RegUseTracker { SmallVector<const SCEV *, 16> RegSequence; public: - void CountRegister(const SCEV *Reg, size_t LUIdx); - void DropRegister(const SCEV *Reg, size_t LUIdx); - void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); + void countRegister(const SCEV *Reg, size_t LUIdx); + void dropRegister(const SCEV *Reg, size_t LUIdx); + void swapAndDropUse(size_t LUIdx, size_t LastLUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -160,7 +181,7 @@ public: } void -RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) { std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.insert(std::make_pair(Reg, RegSortData())); RegSortData &RSD = Pair.first->second; @@ -171,7 +192,7 @@ RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) { RegUsesTy::iterator It = RegUsesMap.find(Reg); assert(It != RegUsesMap.end()); RegSortData &RSD = It->second; @@ -180,7 +201,7 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { +RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) { assert(LUIdx <= LastLUIdx); // Update RegUses. The data structure is not optimized for this purpose; @@ -219,9 +240,8 @@ void RegUseTracker::clear() { namespace { -/// Formula - This class holds information that describes a formula for -/// computing satisfying a use. It may include broken-out immediates and scaled -/// registers. +/// This class holds information that describes a formula for computing +/// satisfying a use. It may include broken-out immediates and scaled registers. struct Formula { /// Global base address used for complex addressing. GlobalValue *BaseGV; @@ -235,8 +255,8 @@ struct Formula { /// The scale of any complex addressing. int64_t Scale; - /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty. The canonical representation of a formula is + /// The list of "base" registers for this use. When this is non-empty. The + /// canonical representation of a formula is /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). /// #1 enforces that the scaled register is always used when at least two @@ -247,31 +267,31 @@ struct Formula { /// form. SmallVector<const SCEV *, 4> BaseRegs; - /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when Scale is not zero. + /// The 'scaled' register for this use. This should be non-null when Scale is + /// not zero. const SCEV *ScaledReg; - /// UnfoldedOffset - An additional constant offset which added near the - /// use. This requires a temporary register, but the offset itself can - /// live in an add immediate field rather than a register. + /// An additional constant offset which added near the use. This requires a + /// temporary register, but the offset itself can live in an add immediate + /// field rather than a register. 
int64_t UnfoldedOffset; Formula() : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(nullptr), UnfoldedOffset(0) {} - void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); bool isCanonical() const; - void Canonicalize(); + void canonicalize(); - bool Unscale(); + bool unscale(); size_t getNumRegs() const; Type *getType() const; - void DeleteBaseReg(const SCEV *&S); + void deleteBaseReg(const SCEV *&S); bool referencesReg(const SCEV *S) const; bool hasRegsUsedByUsesOtherThan(size_t LUIdx, @@ -283,7 +303,7 @@ struct Formula { } -/// DoInitialMatch - Recursion helper for InitialMatch. +/// Recursion helper for initialMatch. static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl<const SCEV *> &Good, SmallVectorImpl<const SCEV *> &Bad, @@ -336,10 +356,9 @@ static void DoInitialMatch(const SCEV *S, Loop *L, Bad.push_back(S); } -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { +/// Incorporate loop-variant parts of S into this Formula, attempting to keep +/// all loop-invariant and loop-computable values in a single base register. +void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector<const SCEV *, 4> Good; SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE); @@ -355,7 +374,7 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } - Canonicalize(); + canonicalize(); } /// \brief Check whether or not this formula statisfies the canonical @@ -373,7 +392,7 @@ bool Formula::isCanonical() const { /// field. Otherwise, we would have to do special cases everywhere in LSR /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::Canonicalize() { +void Formula::canonicalize() { if (isCanonical()) return; // So far we did not need this case. This is easy to implement but it is @@ -394,7 +413,7 @@ void Formula::Canonicalize() { /// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. /// \return true if it was possible to get rid of the scale, false otherwise. /// \note After this operation the formula may not be in the canonical form. -bool Formula::Unscale() { +bool Formula::unscale() { if (Scale != 1) return false; Scale = 0; @@ -403,15 +422,14 @@ bool Formula::Unscale() { return true; } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. +/// Return the total number of register operands used by this formula. This does +/// not include register uses implied by non-constant addrec strides. size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. +/// Return the type of this formula, if it has one, or null otherwise. This type +/// is meaningless except for the bit size. Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? 
ScaledReg->getType() : @@ -419,21 +437,21 @@ Type *Formula::getType() const { nullptr; } -/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. -void Formula::DeleteBaseReg(const SCEV *&S) { +/// Delete the given base reg from the BaseRegs list. +void Formula::deleteBaseReg(const SCEV *&S) { if (&S != &BaseRegs.back()) std::swap(S, BaseRegs.back()); BaseRegs.pop_back(); } -/// referencesReg - Test if this formula references the given register. +/// Test if this formula references the given register. bool Formula::referencesReg(const SCEV *S) const { return S == ScaledReg || std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } -/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers -/// which are used by uses other than the use with the given index. +/// Test whether this formula uses registers which are used by uses other than +/// the use with the given index. bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, const RegUseTracker &RegUses) const { if (ScaledReg) @@ -481,30 +499,29 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } -#endif -/// isAddRecSExtable - Return true if the given addrec can be sign-extended -/// without changing its value. +/// Return true if the given addrec can be sign-extended without changing its +/// value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } -/// isAddSExtable - Return true if the given add can be sign-extended -/// without changing its value. +/// Return true if the given add can be sign-extended without changing its +/// value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } -/// isMulSExtable - Return true if the given mul can be sign-extended -/// without changing its value. +/// Return true if the given mul can be sign-extended without changing its +/// value. static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), @@ -512,12 +529,11 @@ static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); } -/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined -/// and if the remainder is known to be zero, or null otherwise. If -/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified -/// to Y, ignoring that the multiplication may overflow, which is useful when -/// the result will be used in a context where the most significant bits are -/// ignored. +/// Return an expression for LHS /s RHS, if it can be determined and if the +/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits +/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that +/// the multiplication may overflow, which is useful when the result will be +/// used in a context where the most significant bits are ignored. 
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits = false) { @@ -528,7 +544,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Handle a few RHS special cases. const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); if (RC) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do // some folding. if (RA.isAllOnesValue()) @@ -542,8 +558,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) return nullptr; - const APInt &LA = C->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); + const APInt &LA = C->getAPInt(); + const APInt &RA = RC->getAPInt(); if (LA.srem(RA) != 0) return nullptr; return SE.getConstant(LA.sdiv(RA)); @@ -603,12 +619,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, return nullptr; } -/// ExtractImmediate - If S involves the addition of a constant integer value, -/// return that integer value, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a constant integer value, return that integer +/// value, and mutate S to point to a new SCEV with that value excluded. static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { - if (C->getValue()->getValue().getMinSignedBits() <= 64) { + if (C->getAPInt().getMinSignedBits() <= 64) { S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } @@ -630,9 +645,8 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { return 0; } -/// ExtractSymbol - If S involves the addition of a GlobalValue address, -/// return that symbol, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a GlobalValue address, return that symbol, and +/// mutate S to point to a new SCEV with that value excluded. static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { @@ -657,8 +671,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { return nullptr; } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. +/// Returns true if the specified instruction is using the specified value as an +/// address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -682,12 +696,15 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { return isAddress; } -/// getAccessType - Return the type of the memory being accessed. -static Type *getAccessType(const Instruction *Inst) { - Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { +/// Return the type of the memory being accessed. 
+static MemAccessTy getAccessType(const Instruction *Inst) { + MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + AccessTy.MemTy = SI->getOperand(0)->getType(); + AccessTy.AddrSpace = SI->getPointerAddressSpace(); + } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + AccessTy.AddrSpace = LI->getPointerAddressSpace(); + } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. switch (II->getIntrinsicID()) { @@ -696,21 +713,21 @@ static Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getArgOperand(0)->getType(); + AccessTy.MemTy = II->getArgOperand(0)->getType(); break; } } // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) - AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), - PTy->getAddressSpace()); + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) + AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), + PTy->getAddressSpace()); return AccessTy; } -/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +/// Return true if this AddRec is already a phi in its loop. static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { @@ -793,9 +810,8 @@ static bool isHighCostExpansion(const SCEV *S, return true; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. +/// If any of the instructions is the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead. static bool DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; @@ -842,7 +858,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, namespace { -/// Cost - This class is used to measure and compare candidate formulae. +/// This class is used to measure and compare candidate formulae. class Cost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -905,7 +921,7 @@ private: } -/// RateRegister - Tally up interesting quantities from the given register. +/// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -951,9 +967,9 @@ void Cost::RateRegister(const SCEV *Reg, SE.hasComputableLoopEvolution(Reg, L); } -/// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. Optional LoserRegs provides a way to declare any formula -/// that refers to one of those regs an instant loser. +/// Record this register in the set. If we haven't seen it before, rate +/// it. Optional LoserRegs provides a way to declare any formula that refers to +/// one of those regs an instant loser. 
void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -1024,7 +1040,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Lose - Set this cost to a losing value. +/// Set this cost to a losing value. void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -1035,7 +1051,7 @@ void Cost::Lose() { ScaleCost = ~0u; } -/// operator< - Choose the lower cost. +/// Choose the lower cost. bool Cost::operator<(const Cost &Other) const { return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, ImmCost, SetupCost) < @@ -1061,37 +1077,35 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// LSRFixup - An operand value in an instruction which is to be replaced -/// with some equivalent, possibly strength-reduced, replacement. +/// An operand value in an instruction which is to be replaced with some +/// equivalent, possibly strength-reduced, replacement. struct LSRFixup { - /// UserInst - The instruction which will be updated. + /// The instruction which will be updated. Instruction *UserInst; - /// OperandValToReplace - The operand of the instruction which will - /// be replaced. The operand may be used more than once; every instance - /// will be replaced. + /// The operand of the instruction which will be replaced. The operand may be + /// used more than once; every instance will be replaced. Value *OperandValToReplace; - /// PostIncLoops - If this user is to use the post-incremented value of an - /// induction variable, this variable is non-null and holds the loop - /// associated with the induction variable. + /// If this user is to use the post-incremented value of an induction + /// variable, this variable is non-null and holds the loop associated with the + /// induction variable. PostIncLoopSet PostIncLoops; - /// LUIdx - The index of the LSRUse describing the expression which - /// this fixup needs, minus an offset (below). + /// The index of the LSRUse describing the expression which this fixup needs, + /// minus an offset (below). size_t LUIdx; - /// Offset - A constant offset to be added to the LSRUse expression. - /// This allows multiple fixups to share the same LSRUse with different - /// offsets, for example in an unrolled loop. + /// A constant offset to be added to the LSRUse expression. This allows + /// multiple fixups to share the same LSRUse with different offsets, for + /// example in an unrolled loop. int64_t Offset; bool isUseFullyOutsideLoop(const Loop *L) const; @@ -1108,8 +1122,7 @@ LSRFixup::LSRFixup() : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), Offset(0) {} -/// isUseFullyOutsideLoop - Test whether this fixup always uses its -/// value outside of the given loop. +/// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. 
if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { @@ -1149,16 +1162,15 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. +/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted +/// SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { static SmallVector<const SCEV *, 4> getEmptyKey() { SmallVector<const SCEV *, 4> V; @@ -1182,17 +1194,17 @@ struct UniquifierDenseMapInfo { } }; -/// LSRUse - This class holds the state that LSR keeps for each use in -/// IVUsers, as well as uses invented by LSR itself. It includes information -/// about what kinds of things can be folded into the user, information about -/// the user itself, and information about how the use may be satisfied. -/// TODO: Represent multiple users of the same expression in common? +/// This class holds the state that LSR keeps for each use in IVUsers, as well +/// as uses invented by LSR itself. It includes information about what kinds of +/// things can be folded into the user, information about the user itself, and +/// information about how the use may be satisfied. TODO: Represent multiple +/// users of the same expression in common? class LSRUse { DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: - /// KindType - An enum for a kind of use, indicating what types of - /// scaled and immediate operands it might support. + /// An enum for a kind of use, indicating what types of scaled and immediate + /// operands it might support. enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. @@ -1204,15 +1216,14 @@ public: typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; KindType Kind; - Type *AccessTy; + MemAccessTy AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; int64_t MaxOffset; - /// AllFixupsOutsideLoop - This records whether all of the fixups using this - /// LSRUse are outside of the loop, in which case some special-case heuristics - /// may be used. + /// This records whether all of the fixups using this LSRUse are outside of + /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop; /// RigidFormula is set to true to guarantee that this use will be associated @@ -1222,26 +1233,24 @@ public: /// changing the formula. bool RigidFormula; - /// WidestFixupType - This records the widest use type for any fixup using - /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different - /// max fixup widths to be equivalent, because the narrower one may be relying - /// on the implicit truncation to truncate away bogus bits. + /// This records the widest use type for any fixup using this + /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max + /// fixup widths to be equivalent, because the narrower one may be relying on + /// the implicit truncation to truncate away bogus bits. Type *WidestFixupType; - /// Formulae - A list of ways to build a value that can satisfy this user. - /// After the list is populated, one of these is selected heuristically and - /// used to formulate a replacement for OperandValToReplace in UserInst. 
+ /// A list of ways to build a value that can satisfy this user. After the + /// list is populated, one of these is selected heuristically and used to + /// formulate a replacement for OperandValToReplace in UserInst. SmallVector<Formula, 12> Formulae; - /// Regs - The set of register candidates used by all formulae in this LSRUse. + /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), - MinOffset(INT64_MAX), - MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), - RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) + : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), RigidFormula(false), + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1254,8 +1263,8 @@ public: } -/// HasFormula - Test whether this use as a formula which has the same -/// registers as the given formula. +/// Test whether this use as a formula which has the same registers as the given +/// formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); @@ -1264,9 +1273,8 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { return Uniquifier.count(Key); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. -/// The formula must be in canonical form. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise. The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { assert(F.isCanonical() && "Invalid canonical representation"); @@ -1300,14 +1308,14 @@ bool LSRUse::InsertFormula(const Formula &F) { return true; } -/// DeleteFormula - Remove the given formula from this use's list. +/// Remove the given formula from this use's list. void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); } -/// RecomputeRegs - Recompute the Regs field, and update RegUses. +/// Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); @@ -1320,7 +1328,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Update the RegTracker. 
for (const SCEV *S : OldRegs) if (!Regs.count(S)) - RegUses.DropRegister(S, LUIdx); + RegUses.dropRegister(S, LUIdx); } void LSRUse::print(raw_ostream &OS) const { @@ -1331,10 +1339,13 @@ void LSRUse::print(raw_ostream &OS) const { case ICmpZero: OS << "ICmpZero"; break; case Address: OS << "Address of "; - if (AccessTy->isPointerTy()) + if (AccessTy.MemTy->isPointerTy()) OS << "pointer"; // the full pointer type could be really verbose - else - OS << *AccessTy; + else { + OS << *AccessTy.MemTy; + } + + OS << " in addrspace(" << AccessTy.AddrSpace << ')'; } OS << ", Offsets={"; @@ -1353,19 +1364,19 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } -#endif static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, + HasBaseReg, Scale, AccessTy.AddrSpace); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1412,7 +1423,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Check for overflow. @@ -1433,7 +1444,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { // For the purpose of isAMCompletelyFolded either having a canonical formula // or a scale not equal to zero is correct. @@ -1447,11 +1458,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } -/// isLegalUse - Test whether we know how to expand the current formula. +/// Test whether we know how to expand the current formula. static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. 
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1463,8 +1474,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const Formula &F) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } @@ -1490,14 +1501,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. - int ScaleCostMinOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MinOffset, - F.HasBaseReg, F.Scale); - int ScaleCostMaxOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MaxOffset, - F.HasBaseReg, F.Scale); + int ScaleCostMinOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); + int ScaleCostMaxOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && "Legal addressing mode has an illegal cost!"); @@ -1515,7 +1524,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1539,7 +1548,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, ScalarEvolution &SE, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, - Type *AccessTy, const SCEV *S, bool HasBaseReg) { + MemAccessTy AccessTy, const SCEV *S, + bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1564,9 +1574,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, namespace { -/// IVInc - An individual increment in a Chain of IV increments. -/// Relate an IV user to an expression that computes the IV it uses from the IV -/// used by the previous link in the Chain. +/// An individual increment in a Chain of IV increments. Relate an IV user to +/// an expression that computes the IV it uses from the IV used by the previous +/// link in the Chain. /// /// For the head of a chain, IncExpr holds the absolute SCEV expression for the /// original IVOperand. The head of the chain's IVOperand is only valid during @@ -1582,8 +1592,8 @@ struct IVInc { UserInst(U), IVOperand(O), IncExpr(E) {} }; -// IVChain - The list of IV increments in program order. -// We typically add the head of a chain without finding subsequent links. +// The list of IV increments in program order. We typically add the head of a +// chain without finding subsequent links. struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; @@ -1595,7 +1605,7 @@ struct IVChain { typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; - // begin - return the first increment in the chain. + // Return the first increment in the chain. 
const_iterator begin() const { assert(!Incs.empty()); return std::next(Incs.begin()); @@ -1604,32 +1614,30 @@ struct IVChain { return Incs.end(); } - // hasIncs - Returns true if this chain contains any increments. + // Returns true if this chain contains any increments. bool hasIncs() const { return Incs.size() >= 2; } - // add - Add an IVInc to the end of this chain. + // Add an IVInc to the end of this chain. void add(const IVInc &X) { Incs.push_back(X); } - // tailUserInst - Returns the last UserInst in the chain. + // Returns the last UserInst in the chain. Instruction *tailUserInst() const { return Incs.back().UserInst; } - // isProfitableIncrement - Returns true if IncExpr can be profitably added to - // this chain. + // Returns true if IncExpr can be profitably added to this chain. bool isProfitableIncrement(const SCEV *OperExpr, const SCEV *IncExpr, ScalarEvolution&); }; -/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. -/// Distinguish between FarUsers that definitely cross IV increments and -/// NearUsers that may be used between IV increments. +/// Helper for CollectChains to track multiple IV increment uses. Distinguish +/// between FarUsers that definitely cross IV increments and NearUsers that may +/// be used between IV increments. struct ChainUsers { SmallPtrSet<Instruction*, 4> FarUsers; SmallPtrSet<Instruction*, 4> NearUsers; }; -/// LSRInstance - This class holds state for the main loop strength reduction -/// logic. +/// This class holds state for the main loop strength reduction logic. class LSRInstance { IVUsers &IU; ScalarEvolution &SE; @@ -1639,25 +1647,25 @@ class LSRInstance { Loop *const L; bool Changed; - /// IVIncInsertPos - This is the insert position that the current loop's - /// induction variable increment should be placed. In simple loops, this is - /// the latch block's terminator. But in more complicated cases, this is a - /// position which will dominate all the in-loop post-increment users. + /// This is the insert position that the current loop's induction variable + /// increment should be placed. In simple loops, this is the latch block's + /// terminator. But in more complicated cases, this is a position which will + /// dominate all the in-loop post-increment users. Instruction *IVIncInsertPos; - /// Factors - Interesting factors between use strides. + /// Interesting factors between use strides. SmallSetVector<int64_t, 8> Factors; - /// Types - Interesting use types, to facilitate truncation reuse. + /// Interesting use types, to facilitate truncation reuse. SmallSetVector<Type *, 4> Types; - /// Fixups - The list of operands which are to be replaced. + /// The list of operands which are to be replaced. SmallVector<LSRFixup, 16> Fixups; - /// Uses - The list of interesting uses. + /// The list of interesting uses. SmallVector<LSRUse, 16> Uses; - /// RegUses - Track which uses use which register candidates. + /// Track which uses use which register candidates. RegUseTracker RegUses; // Limit the number of chains to avoid quadratic behavior. We don't expect to @@ -1665,10 +1673,10 @@ class LSRInstance { // back to normal LSR behavior for those uses. static const unsigned MaxChains = 8; - /// IVChainVec - IV users can form a chain of IV increments. + /// IV users can form a chain of IV increments. SmallVector<IVChain, MaxChains> IVChainVec; - /// IVIncSet - IV users that belong to profitable IVChains. + /// IV users that belong to profitable IVChains. 
SmallPtrSet<Use*, MaxChains> IVIncSet; void OptimizeShadowIV(); @@ -1696,11 +1704,10 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy); + LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair<size_t, int64_t> getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - Type *AccessTy); + std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1769,18 +1776,16 @@ class LSRInstance { void RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; + SmallVectorImpl<WeakVH> &DeadInsts) const; void Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; - void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P); + SmallVectorImpl<WeakVH> &DeadInsts) const; + void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); public: - LSRInstance(Loop *L, Pass *P); + LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, const TargetTransformInfo &TTI); bool getChanged() const { return Changed; } @@ -1793,8 +1798,8 @@ public: } -/// OptimizeShadowIV - If IV is used in a int-to-float cast -/// inside the loop then try to eliminate the cast operation. +/// If IV is used in a int-to-float cast inside the loop then try to eliminate +/// the cast operation. void LSRInstance::OptimizeShadowIV() { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) @@ -1902,9 +1907,8 @@ void LSRInstance::OptimizeShadowIV() { } } -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. +/// If Cond has an operand that is an expression of an IV, set the IV user and +/// stride information and return true, otherwise return false. bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { for (IVStrideUse &U : IU) if (U.getUser() == Cond) { @@ -1917,8 +1921,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { return false; } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. +/// Rewrite the loop's terminating condition if it uses a max computation. /// /// This is a narrow solution to a specific, but acute, problem. For loops /// like this: @@ -2076,8 +2079,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { return NewCond; } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. +/// Change loop terminating condition to use the postinc iv when possible. void LSRInstance::OptimizeLoopTermCond() { SmallPtrSet<Instruction *, 4> PostIncs; @@ -2152,16 +2154,18 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. 
- Type *AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; } } @@ -2180,7 +2184,7 @@ LSRInstance::OptimizeLoopTermCond() { ICmpInst *OldCond = Cond; Cond = cast<ICmpInst>(Cond->clone()); Cond->setName(L->getHeader()->getName() + ".termcond"); - ExitingBlock->getInstList().insert(TermBr, Cond); + ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond); // Clone the IVUse, as the old use still exists! CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); @@ -2213,15 +2217,14 @@ LSRInstance::OptimizeLoopTermCond() { } } -/// reconcileNewOffset - Determine if the given use can accommodate a fixup -/// at the given offset and other details. If so, update the use and -/// return true. -bool -LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy) { +/// Determine if the given use can accommodate a fixup at the given offset and +/// other details. If so, update the use and return true. +bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, + MemAccessTy AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - Type *NewAccessTy = AccessTy; + MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -2232,8 +2235,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Check for a mismatched access type, and fall back conservatively as needed. // TODO: Be less conservative when the type is similar and can use the same // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + if (Kind == LSRUse::Address) { + if (AccessTy != LU.AccessTy) + NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext()); + } // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { @@ -2257,12 +2262,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return true; } -/// getUse - Return an LSRUse index and an offset value for a fixup which -/// needs the given expression, with the given kind and optional access type. -/// Either reuse an existing use or create a new one, as needed. -std::pair<size_t, int64_t> -LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, Type *AccessTy) { +/// Return an LSRUse index and an offset value for a fixup which needs the given +/// expression, with the given kind and optional access type. Either reuse an +/// existing use or create a new one, as needed. 
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2300,18 +2305,18 @@ LSRInstance::getUse(const SCEV *&Expr, return std::make_pair(LUIdx, Offset); } -/// DeleteUse - Delete the given use from the Uses list. +/// Delete the given use from the Uses list. void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); // Update RegUses. - RegUses.SwapAndDropUse(LUIdx, Uses.size()); + RegUses.swapAndDropUse(LUIdx, Uses.size()); } -/// FindUseWithFormula - Look for a use distinct from OrigLU which is has -/// a formula that has the same registers as the given formula. +/// Look for a use distinct from OrigLU which is has a formula that has the same +/// registers as the given formula. LSRUse * LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, const LSRUse &OrigLU) { @@ -2396,14 +2401,14 @@ void LSRInstance::CollectInterestingTypesAndFactors() { if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } else if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, NewStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } } @@ -2415,9 +2420,9 @@ void LSRInstance::CollectInterestingTypesAndFactors() { DEBUG(print_factors_and_types(dbgs())); } -/// findIVOperand - Helper for CollectChains that finds an IV operand (computed -/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped -/// Instructions to IVStrideUses, we could partially skip this. +/// Helper for CollectChains that finds an IV operand (computed by an AddRec in +/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to +/// IVStrideUses, we could partially skip this. static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE) { @@ -2436,29 +2441,28 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE, return OI; } -/// getWideOperand - IVChain logic must consistenctly peek base TruncInst -/// operands, so wrap it in a convenient helper. +/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in +/// a convenient helper. static Value *getWideOperand(Value *Oper) { if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) return Trunc->getOperand(0); return Oper; } -/// isCompatibleIVType - Return true if we allow an IV chain to include both -/// types. +/// Return true if we allow an IV chain to include both types. static bool isCompatibleIVType(Value *LVal, Value *RVal) { Type *LType = LVal->getType(); Type *RType = RVal->getType(); return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); } -/// getExprBase - Return an approximation of this SCEV expression's "base", or -/// NULL for any constant. Returning the expression itself is -/// conservative. Returning a deeper subexpression is more precise and valid as -/// long as it isn't less complex than another subexpression. 
For expressions -/// involving multiple unscaled values, we need to return the pointer-type -/// SCEVUnknown. This avoids forming chains across objects, such as: -/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// Return an approximation of this SCEV expression's "base", or NULL for any +/// constant. Returning the expression itself is conservative. Returning a +/// deeper subexpression is more precise and valid as long as it isn't less +/// complex than another subexpression. For expressions involving multiple +/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids +/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i], +/// IVInc==b-a. /// /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost /// SCEVUnknown, we simply return the rightmost SCEV operand. @@ -2601,8 +2605,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, return cost < 0; } -/// ChainInstruction - Add this IV user to an existing chain or make it the head -/// of a new chain. +/// Add this IV user to an existing chain or make it the head of a new chain. void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, SmallVectorImpl<ChainUsers> &ChainUsersVec) { // When IVs are used as types of varying widths, they are generally converted @@ -2714,7 +2717,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); } -/// CollectChains - Populate the vector of Chains. +/// Populate the vector of Chains. /// /// This decreases ILP at the architecture level. Targets with ample registers, /// multiple memory ports, and no register renaming probably don't want @@ -2755,19 +2758,19 @@ void LSRInstance::CollectChains() { for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); I != E; ++I) { // Skip instructions that weren't seen by IVUsers analysis. - if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I)) continue; // Ignore users that are part of a SCEV expression. This way we only // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. - if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) + if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I))) continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); ChainIdx < NChains; ++ChainIdx) { - ChainUsersVec[ChainIdx].NearUsers.erase(I); + ChainUsersVec[ChainIdx].NearUsers.erase(&*I); } // Search for operands that can be chained. SmallPtrSet<Instruction*, 4> UniqueOperands; @@ -2776,7 +2779,7 @@ void LSRInstance::CollectChains() { while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst).second) - ChainInstruction(I, IVOpInst, ChainUsersVec); + ChainInstruction(&*I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. 
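A mechanical change that recurs throughout these hunks is spelling &*I wherever a BasicBlock::iterator used to convert to an Instruction * implicitly, since the ilist iterators no longer allow that conversion. A generic illustration of the idiom using std::list:

#include <cassert>
#include <list>
int main() {
  std::list<int> Insts = {1, 2, 3};
  std::list<int>::iterator I = Insts.begin();
  // int *P = I;   // ill-formed: no implicit iterator-to-pointer conversion
  int *P = &*I;    // dereference to the element, then take its address
  assert(*P == 1);
  return 0;
}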
@@ -2828,20 +2831,20 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, if (!IncConst || !isAddressUse(UserInst, Operand)) return false; - if (IncConst->getValue()->getValue().getMinSignedBits() > 64) + if (IncConst->getAPInt().getMinSignedBits() > 64) return false; + MemAccessTy AccessTy = getAccessType(UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ nullptr, - IncOffset, /*HaseBaseReg=*/ false)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, + IncOffset, /*HasBaseReg=*/false)) return false; return true; } -/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to -/// materialize the IV user's operand from the previous IV user's operand. +/// Generate an add or subtract for each IVInc in a chain to materialize the IV +/// user's operand from the previous IV user's operand. void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced @@ -2961,7 +2964,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = U.getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = nullptr; + MemAccessTy AccessTy; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -3027,9 +3030,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { DEBUG(print_fixups(dbgs())); } -/// InsertInitialFormula - Insert a formula for the given expression into -/// the given use, separating out loop-variant portions from loop-invariant -/// and loop-computable portions. +/// Insert a formula for the given expression into the given use, separating out +/// loop-variant portions from loop-invariant and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { // Mark uses whose expressions cannot be expanded. @@ -3037,13 +3039,13 @@ LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { LU.RigidFormula = true; Formula F; - F.InitialMatch(S, L, SE); + F.initialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } -/// InsertSupplementalFormula - Insert a simple single-register formula for -/// the given expression into the given use. +/// Insert a simple single-register formula for the given expression into the +/// given use. void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { @@ -3054,17 +3056,16 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } -/// CountRegisters - Note which registers are used by the given formula, -/// updating RegUses. +/// Note which registers are used by the given formula, updating RegUses. void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { if (F.ScaledReg) - RegUses.CountRegister(F.ScaledReg, LUIdx); + RegUses.countRegister(F.ScaledReg, LUIdx); for (const SCEV *BaseReg : F.BaseRegs) - RegUses.CountRegister(BaseReg, LUIdx); + RegUses.countRegister(BaseReg, LUIdx); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true.
Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { // Do not insert formula that we will not be able to expand. assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && @@ -3076,9 +3077,9 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } -/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of -/// loop-invariant values which we're tracking. These other uses will pin these -/// values in registers, making them less profitable for elimination. +/// Check for other uses of loop-invariant values which we're tracking. These +/// other uses will pin these values in registers, making them less profitable +/// for elimination. /// TODO: This currently misses non-constant addrec step registers. /// TODO: Should this give more weight to users inside the loop? void @@ -3124,6 +3125,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; + // Don't bother if the instruction is in a BB which ends in an EHPad. + if (UseBB->getTerminator()->isEHPad()) + continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. if (SE.isSCEVable(UserInst->getType())) { @@ -3148,7 +3152,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); LF.OperandValToReplace = U; - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); + std::pair<size_t, int64_t> P = getUse( + S, LSRUse::Basic, MemAccessTy()); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3165,8 +3170,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } } -/// CollectSubexprs - Split S into subexpressions which can be pulled out into -/// separate registers. If C is non-null, multiply each subexpression by C. +/// Split S into subexpressions which can be pulled out into separate +/// registers. If C is non-null, multiply each subexpression by C. /// /// Return remainder expression after factoring the subexpressions captured by /// Ops. If Ops is complete, return NULL. @@ -3300,7 +3305,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the // formula accordingly. - F.Canonicalize(); + F.canonicalize(); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -3309,8 +3314,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, } } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. +/// Split out subexpressions from adds and the bases of addrecs. void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth) { assert(Base.isCanonical() && "Input must be in the canonical form"); @@ -3326,8 +3330,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, /* Idx */ -1, /* IsScaledReg */ true); } -/// GenerateCombinations - Generate a formula consisting of all of the -/// loop-dominating registers added into a single register. +/// Generate a formula consisting of all of the loop-dominating registers added +/// into a single register. 
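GenerateCombinations, whose new comment appears just above, folds every loop-invariant base register of a formula into a single summed operand so that together they cost only one register. A toy rendering of that partition-and-fold step, with plain ints standing in for SCEV registers and a trivial predicate faking the invariance test:

#include <iostream>
#include <numeric>
#include <vector>
// Pretend the even "registers" are the loop-invariant ones.
static bool isLoopInvariantSketch(int Reg) { return Reg % 2 == 0; }
int main() {
  std::vector<int> BaseRegs = {3, 4, 6, 7};
  std::vector<int> Kept, Invariant;
  for (int Reg : BaseRegs)
    (isLoopInvariantSketch(Reg) ? Invariant : Kept).push_back(Reg);
  // Only worthwhile when two or more registers can be merged; the real code
  // likewise refuses to add a combined register whose sum folds to zero.
  if (Invariant.size() > 1) {
    int Sum = std::accumulate(Invariant.begin(), Invariant.end(), 0);
    if (Sum != 0)
      Kept.push_back(Sum);
  }
  for (int Reg : Kept)
    std::cout << Reg << " "; // prints "3 7 10": two registers became one
  std::cout << "\n";
  return 0;
}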
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. @@ -3336,7 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. - Base.Unscale(); + Base.unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3354,7 +3358,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); - F.Canonicalize(); + F.canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } @@ -3379,7 +3383,7 @@ void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, (void)InsertFormula(LU, LUIdx, F); } -/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +/// Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. @@ -3410,8 +3414,8 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.Scale = 0; F.ScaledReg = nullptr; } else - F.DeleteBaseReg(F.BaseRegs[Idx]); - F.Canonicalize(); + F.deleteBaseReg(F.BaseRegs[Idx]); + F.canonicalize(); } else if (IsScaledReg) F.ScaledReg = NewG; else @@ -3452,8 +3456,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, /* IsScaledReg */ true); } -/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up -/// the comparison. For example, x == y -> x*c == y*c. +/// For ICmpZero, check to see if we can scale up the comparison. For example, x +/// == y -> x*c == y*c. void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (LU.Kind != LSRUse::ICmpZero) return; @@ -3538,8 +3542,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } } -/// GenerateScales - Generate stride factor reuse formulae by making use of -/// scaled-offset address modes, for example. +/// Generate stride factor reuse formulae by making use of scaled-offset address +/// modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. Type *IntTy = Base.getType(); @@ -3547,10 +3551,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // If this Formula already has a scaled register, we can't add another one. // Try to unscale the formula to generate a better scale. - if (Base.Scale != 0 && !Base.Unscale()) + if (Base.Scale != 0 && !Base.unscale()) return; - assert(Base.Scale == 0 && "Unscale did not did its job!"); + assert(Base.Scale == 0 && "unscale did not do its job!"); // Check each interesting stride. for (int64_t Factor : Factors) { @@ -3587,7 +3591,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: This could be optimized to avoid all the copying. Formula F = Base; F.ScaledReg = Quotient; - F.DeleteBaseReg(F.BaseRegs[i]); + F.deleteBaseReg(F.BaseRegs[i]); // The canonical representation of 1*reg is reg, which is already in // Base. In that case, do not try to insert the formula, it will be // rejected anyway. @@ -3599,7 +3603,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } } -/// GenerateTruncates - Generate reuse formulae from different IV types. +/// Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { // Don't bother truncating symbolic values. if (Base.BaseGV) return; @@ -3629,9 +3633,9 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { namespace { -/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to -/// defer modifications so that the search phase doesn't have to worry about -/// the data structures moving underneath it. +/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer +/// modifications so that the search phase doesn't have to worry about the data +/// structures moving underneath it. struct WorkItem { size_t LUIdx; int64_t Imm; @@ -3651,14 +3655,13 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } -#endif -/// GenerateCrossUseConstantOffsets - Look for registers which are a constant -/// distance apart and try to form reuse opportunities between them. +/// Look for registers which are a constant distance apart and try to form reuse +/// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. typedef std::map<int64_t, const SCEV *> ImmMapTy; @@ -3751,7 +3754,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // very similar but slightly different. Investigate if they // could be merged. That way, we would not have to unscale the // Formula. - F.Unscale(); + F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3770,14 +3773,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) - if (C->getValue()->isNegative() != - (NewF.BaseOffset < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3801,15 +3803,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) - if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - std::abs(NewF.BaseOffset)) && - (C->getValue()->getValue() + - NewF.BaseOffset).countTrailingZeros() >= - countTrailingZeros<uint64_t>(NewF.BaseOffset)) + if ((C->getAPInt() + NewF.BaseOffset) + .abs() + .slt(std::abs(NewF.BaseOffset)) && + (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3819,7 +3821,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { } } -/// GenerateAllReuseFormulae - Generate formulae for each use. +/// Generate formulae for each use. 
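The WorkItem::dump change above (and the matching LSRInstance::dump change later in this file) drops the #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) guard in favor of LLVM_DUMP_METHOD, which expands to noinline and used attributes so the symbol is still emitted and callable from a debugger. A generic sketch of that macro pattern, assuming a GCC/Clang-style compiler:

#include <iostream>
#if defined(__GNUC__) || defined(__clang__)
// Keep the method out of line and force it to be emitted even if nothing in
// the program calls it, so "call W.dump()" works from a debugger session.
#define DUMP_METHOD_SKETCH __attribute__((noinline)) __attribute__((used))
#else
#define DUMP_METHOD_SKETCH
#endif
struct WorkItemSketch {
  int LUIdx = 0;
  long Imm = 0;
  DUMP_METHOD_SKETCH void dump() const {
    std::cout << "in=" << LUIdx << " , add offset " << Imm << "\n";
  }
};
int main() {
  WorkItemSketch W{3, 42};
  W.dump(); // normally only a debugger would invoke this
  return 0;
}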
void LSRInstance::GenerateAllReuseFormulae() { // This is split into multiple loops so that hasRegsUsedByUsesOtherThan @@ -3959,10 +3961,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // This is a rough guess that seems to work fairly well. static const size_t ComplexityLimit = UINT16_MAX; -/// EstimateSearchSpaceComplexity - Estimate the worst-case number of -/// solutions the solver might have to consider. It almost never considers -/// this many solutions because it prune the search space, but the pruning -/// isn't always sufficient. +/// Estimate the worst-case number of solutions the solver might have to +/// consider. It almost never considers this many solutions because it prunes the +/// search space, but the pruning isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { size_t Power = 1; for (const LSRUse &LU : Uses) { @@ -3978,10 +3979,9 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { return Power; } -/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset -/// of the registers of another formula, it won't help reduce register -/// pressure (though it may not necessarily hurt register pressure); remove -/// it to simplify the system. +/// When one formula uses a superset of the registers of another formula, it +/// won't help reduce register pressure (though it may not necessarily hurt +/// register pressure); remove it to simplify the system. void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -4042,9 +4042,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } -/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers -/// for expressions like A, A+1, A+2, etc., allocate a single register for -/// them. +/// When there are many registers for expressions like A, A+1, A+2, etc., +/// allocate a single register for them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4121,8 +4120,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } -/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call -/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that /// we've done more filtering, as it may be able to find more formulae to /// eliminate. void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ @@ -4139,9 +4137,9 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ } } -/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely -/// to be profitable, and then in any use which has any reference to that -/// register, delete all formulae which do not reference that register. +/// Pick a register which seems likely to be profitable, and then in any use +/// which has any reference to that register, delete all formulae which do not +/// reference that register. void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // With all other options exhausted, loop until the system is simple // enough to handle.
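EstimateSearchSpaceComplexity above is, in effect, a saturating product of the per-use formula counts, and each of the narrowing heuristics that follow only does work while that product is still at or above ComplexityLimit. A self-contained approximation of the estimate (illustrative names, not the pass's API):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
static const std::size_t ComplexityLimitSketch = UINT16_MAX;
// Worst case the solver visits every combination of formulae, one per use,
// so the estimate is the product of the per-use candidate counts, cut short
// once it reaches the limit.
static std::size_t
estimateSearchSpace(const std::vector<std::size_t> &FormulaeNumPerUse) {
  std::size_t Power = 1;
  for (std::size_t N : FormulaeNumPerUse) {
    if (N <= 1)
      continue; // a single candidate formula adds no choice
    Power *= N;
    if (Power >= ComplexityLimitSketch)
      return Power; // already too complex; stop multiplying
  }
  return Power;
}
int main() {
  // Three uses with 4, 3, and 2 candidate formulae: 24 combinations.
  std::cout << estimateSearchSpace({4, 3, 2}) << "\n";
  return 0;
}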
@@ -4202,10 +4200,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. +/// If there are an extraordinary number of formulae to choose from, use some +/// rough heuristics to prune down the number of formulae. This keeps the main +/// solver from taking an extraordinary amount of time in some worst-case +/// scenarios. void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); @@ -4213,7 +4211,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByPickingWinnerRegs(); } -/// SolveRecurse - This is the recursive solver. +/// This is the recursive solver. void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, Cost &SolutionCost, SmallVectorImpl<const Formula *> &Workspace, @@ -4291,8 +4289,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, } } -/// Solve - Choose one formula from each use. Return the results in the given -/// Solution vector. +/// Choose one formula from each use. Return the results in the given Solution +/// vector. void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; @@ -4326,10 +4324,9 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { assert(Solution.size() == Uses.size() && "Malformed solution!"); } -/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up -/// the dominator tree far as we can go while still being dominated by the -/// input positions. This helps canonicalize the insert position, which -/// encourages sharing. +/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far +/// as we can go while still being dominated by the input positions. This helps +/// canonicalize the insert position, which encourages sharing. BasicBlock::iterator LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) @@ -4365,21 +4362,21 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = std::next(BasicBlock::iterator(Inst)); + BetterPos = &*std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; if (BetterPos) - IP = BetterPos; + IP = BetterPos->getIterator(); else - IP = Tentative; + IP = Tentative->getIterator(); } return IP; } -/// AdjustInsertPositionForExpand - Determine an input position which will be -/// dominated by the operands and which will dominate the result. +/// Determine an input position which will be dominated by the operands and +/// which will dominate the result.
BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, const LSRFixup &LF, @@ -4417,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, } } - assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() && !isa<DbgInfoIntrinsic>(LowestIP) && "Insertion point must be a normal instruction"); @@ -4429,7 +4426,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, while (isa<PHINode>(IP)) ++IP; // Ignore landingpad instructions. - while (isa<LandingPadInst>(IP)) ++IP; + while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP; // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4437,13 +4434,14 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, // Set IP below instructions recently inserted by SCEVExpander. This keeps the // IP consistent across expansions and allows the previously inserted // instructions to be reused by subsequent expansion. - while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) + ++IP; return IP; } -/// Expand - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"). +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"). Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -4487,7 +4485,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); } // Expand the ScaledReg portion. @@ -4505,14 +4503,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4522,11 +4520,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4538,7 +4536,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4548,7 +4546,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4584,7 +4582,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); @@ -4626,15 +4624,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, return FullV; } -/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use -/// of their operands effectively happens in their predecessor blocks, so the -/// expression may need to be expanded in multiple places. +/// Helper for Rewrite. PHI nodes are special because the use of their operands +/// effectively happens in their predecessor blocks, so the expression may need +/// to be expanded in multiple places. void LSRInstance::RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { DenseMap<BasicBlock *, Value *> Inserted; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4658,8 +4655,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, - /*AliasAnalysis*/ nullptr, &DT, &LI); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4685,7 +4681,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN, if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { - Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(), + Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4702,20 +4699,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } } -/// Rewrite - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"), and update the UserInst to reference -/// the newly expanded value. +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"), and update the UserInst to reference the newly +/// expanded value. void LSRInstance::Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { // First, find an insertion point that dominates UserInst. For PHI nodes, // find the nearest block which dominates all the relevant uses. 
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { - RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts); } else { - Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + Value *FullV = + Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4740,11 +4737,10 @@ void LSRInstance::Rewrite(const LSRFixup &LF, DeadInsts.emplace_back(LF.OperandValToReplace); } -/// ImplementSolution - Rewrite all the fixup locations with new values, -/// following the chosen solution. -void -LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P) { +/// Rewrite all the fixup locations with new values, following the chosen +/// solution. +void LSRInstance::ImplementSolution( + const SmallVectorImpl<const Formula *> &Solution) { // Keep track of instructions we may have made dead, so that // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; @@ -4766,7 +4762,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Expand the new value definitions and update the users. for (const LSRFixup &Fixup : Fixups) { - Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts); Changed = true; } @@ -4782,13 +4778,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(Loop *L, Pass *P) - : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), - TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *L->getHeader()->getParent())), - L(L), Changed(false), IVIncInsertPos(nullptr) { +LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI, + const TargetTransformInfo &TTI) + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4879,7 +4873,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // Now that we've decided what we want, make it so. 
- ImplementSolution(Solution, P); + ImplementSolution(Solution); } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4931,11 +4925,10 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { @@ -4956,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -4982,8 +4975,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); // Requiring LoopSimplify a second time here prevents IVUsers from running // twice, since LoopSimplify was invalidated by running ScalarEvolution. AU.addRequiredID(LoopSimplifyID); @@ -4996,17 +4989,24 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { if (skipOptnoneFunction(L)) return false; + auto &IU = getAnalysis<IVUsers>(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(L, this).getChanged(); + Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged(); // Remove any extra phis created by processing inner loops. 
Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); + SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL, + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index d78db6c..ecef6db 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -38,16 +39,16 @@ using namespace llvm; #define DEBUG_TYPE "loop-unroll" static cl::opt<unsigned> - UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, + UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The baseline cost threshold for loop unrolling")); static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::init(20), cl::Hidden, + "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden, cl::desc("The percentage of estimated dynamic cost which must be saved by " "unrolling to allow unrolling up to the max threshold.")); static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::init(2000), cl::Hidden, + "unroll-dynamic-cost-savings-discount", cl::Hidden, cl::desc("This is the amount discounted from the total unroll cost when " "the unrolled form has a high dynamic cost savings (triggered by " "the '-unroll-percent-dynamic-cost-saved-threshold' flag).")); @@ -58,17 +59,17 @@ static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( "iterations when checking full unroll profitability")); static cl::opt<unsigned> -UnrollCount("unroll-count", cl::init(0), cl::Hidden, +UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes")); static cl::opt<bool> -UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, +UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); static cl::opt<bool> -UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, +UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); static cl::opt<unsigned> @@ -76,178 +77,95 @@ PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); -namespace { - class LoopUnroll : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { - CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); - CurrentPercentDynamicCostSavedThreshold = - UnrollPercentDynamicCostSavedThreshold; - CurrentDynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; - CurrentCount = (C == -1) ? UnrollCount : unsigned(C); - CurrentAllowPartial = (P == -1) ?
UnrollAllowPartial : (bool)P; - CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; - - UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - UserPercentDynamicCostSavedThreshold = - (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0); - UserDynamicCostSavingsDiscount = - (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0); - UserAllowPartial = (P != -1) || - (UnrollAllowPartial.getNumOccurrences() > 0); - UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); - UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); - - initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); - } - /// A magic value for use with the Threshold parameter to indicate - /// that the loop unroll should be performed regardless of how much - /// code expansion would result. - static const unsigned NoThreshold = UINT_MAX; - - // Threshold to use when optsize is specified (and there is no - // explicit -unroll-threshold). - static const unsigned OptSizeUnrollThreshold = 50; - - // Default unroll count for loops with run-time trip count if - // -unroll-count is not set - static const unsigned UnrollRuntimeCount = 8; - - unsigned CurrentCount; - unsigned CurrentThreshold; - unsigned CurrentPercentDynamicCostSavedThreshold; - unsigned CurrentDynamicCostSavingsDiscount; - bool CurrentAllowPartial; - bool CurrentRuntime; - - // Flags for whether the 'current' settings are user-specified. - bool UserCount; - bool UserThreshold; - bool UserPercentDynamicCostSavedThreshold; - bool UserDynamicCostSavingsDiscount; - bool UserAllowPartial; - bool UserRuntime; - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG... - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. - // If loop unroll does not preserve dom info then LCSSA pass on next - // loop will receive invalid dom info. - // For now, recreate dom info, if loop is unrolled. - AU.addPreserved<DominatorTreeWrapperPass>(); - } +/// A magic value for use with the Threshold parameter to indicate +/// that the loop unroll should be performed regardless of how much +/// code expansion would result. +static const unsigned NoThreshold = UINT_MAX; - // Fill in the UnrollingPreferences parameter with values from the - // TargetTransformationInfo. 
- void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, - TargetTransformInfo::UnrollingPreferences &UP) { - UP.Threshold = CurrentThreshold; - UP.PercentDynamicCostSavedThreshold = - CurrentPercentDynamicCostSavedThreshold; - UP.DynamicCostSavingsDiscount = CurrentDynamicCostSavingsDiscount; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.PartialThreshold = CurrentThreshold; - UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.MaxCount = UINT_MAX; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - UP.AllowExpensiveTripCount = false; - TTI.getUnrollingPreferences(L, UP); - } +/// Default unroll count for loops with run-time trip count if +/// -unroll-count is not set +static const unsigned DefaultUnrollRuntimeCount = 8; - // Select and return an unroll count based on parameters from - // user, unroll preferences, unroll pragmas, or a heuristic. - // SetExplicitly is set to true if the unroll count is is set by - // the user or a pragma rather than selected heuristically. - unsigned - selectUnrollCount(const Loop *L, unsigned TripCount, bool PragmaFullUnroll, - unsigned PragmaCount, - const TargetTransformInfo::UnrollingPreferences &UP, - bool &SetExplicitly); - - // Select threshold values used to limit unrolling based on a - // total unrolled size. Parameters Threshold and PartialThreshold - // are set to the maximum unrolled size for fully and partially - // unrolled loops respectively. - void selectThresholds(const Loop *L, bool HasPragma, - const TargetTransformInfo::UnrollingPreferences &UP, - unsigned &Threshold, unsigned &PartialThreshold, - unsigned &PercentDynamicCostSavedThreshold, - unsigned &DynamicCostSavingsDiscount) { - // Determine the current unrolling threshold. While this is - // normally set from UnrollThreshold, it is overridden to a - // smaller value if the current function is marked as - // optimize-for-size, and the unroll threshold was not user - // specified. - Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; - PercentDynamicCostSavedThreshold = - UserPercentDynamicCostSavedThreshold - ? CurrentPercentDynamicCostSavedThreshold - : UP.PercentDynamicCostSavedThreshold; - DynamicCostSavingsDiscount = UserDynamicCostSavingsDiscount - ? CurrentDynamicCostSavingsDiscount - : UP.DynamicCostSavingsDiscount; - - if (!UserThreshold && - L->getHeader()->getParent()->hasFnAttribute( - Attribute::OptimizeForSize)) { - Threshold = UP.OptSizeThreshold; - PartialThreshold = UP.PartialOptSizeThreshold; - } - if (HasPragma) { - // If the loop has an unrolling pragma, we want to be more - // aggressive with unrolling limits. Set thresholds to at - // least the PragmaTheshold value which is larger than the - // default limits. - if (Threshold != NoThreshold) - Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold); - if (PartialThreshold != NoThreshold) - PartialThreshold = - std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold); - } - } - bool canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, uint64_t RolledDynamicCost); - }; -} +/// Gather the various unrolling parameters based on the defaults, compiler +/// flags, TTI overrides, pragmas, and user specified parameters. 
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( + Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, + Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, + Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll, + bool PragmaEnableUnroll, unsigned TripCount) { + TargetTransformInfo::UnrollingPreferences UP; -char LoopUnroll::ID = 0; -INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + // Set up the defaults + UP.Threshold = 150; + UP.PercentDynamicCostSavedThreshold = 20; + UP.DynamicCostSavingsDiscount = 2000; + UP.OptSizeThreshold = 50; + UP.PartialThreshold = UP.Threshold; + UP.PartialOptSizeThreshold = UP.OptSizeThreshold; + UP.Count = 0; + UP.MaxCount = UINT_MAX; + UP.Partial = false; + UP.Runtime = false; + UP.AllowExpensiveTripCount = false; + + // Override with any target specific settings + TTI.getUnrollingPreferences(L, UP); + + // Apply size attributes + if (L->getHeader()->getParent()->optForSize()) { + UP.Threshold = UP.OptSizeThreshold; + UP.PartialThreshold = UP.PartialOptSizeThreshold; + } -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime) { - return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); -} + // Apply unroll count pragmas + if (PragmaCount) + UP.Count = PragmaCount; + else if (PragmaFullUnroll) + UP.Count = TripCount; -Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0); + // Apply any user values specified by cl::opt + if (UnrollThreshold.getNumOccurrences() > 0) { + UP.Threshold = UnrollThreshold; + UP.PartialThreshold = UnrollThreshold; + } + if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0) + UP.PercentDynamicCostSavedThreshold = + UnrollPercentDynamicCostSavedThreshold; + if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) + UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; + if (UnrollCount.getNumOccurrences() > 0) + UP.Count = UnrollCount; + if (UnrollAllowPartial.getNumOccurrences() > 0) + UP.Partial = UnrollAllowPartial; + if (UnrollRuntime.getNumOccurrences() > 0) + UP.Runtime = UnrollRuntime; + + // Apply user values provided by argument + if (UserThreshold.hasValue()) { + UP.Threshold = *UserThreshold; + UP.PartialThreshold = *UserThreshold; + } + if (UserCount.hasValue()) + UP.Count = *UserCount; + if (UserAllowPartial.hasValue()) + UP.Partial = *UserAllowPartial; + if (UserRuntime.hasValue()) + UP.Runtime = *UserRuntime; + + if (PragmaCount > 0 || + ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) { + // If the loop has an unrolling pragma, we want to be more aggressive with + // unrolling limits. Set thresholds to at least the PragmaThreshold value + // which is larger than the default limits.
+ if (UP.Threshold != NoThreshold) + UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); + if (UP.PartialThreshold != NoThreshold) + UP.PartialThreshold = + std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); + } + + return UP; } namespace { @@ -278,8 +196,8 @@ class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap<Value *, Constant *> &SimplifiedValues, - const Loop *L, ScalarEvolution &SE) - : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { + ScalarEvolution &SE) + : SimplifiedValues(SimplifiedValues), SE(SE) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -295,13 +213,6 @@ private: /// results saved. DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - /// \brief Number of currently simulated iteration. - /// - /// If an expression is ConstAddress+Constant, then the Constant is - /// Start + Iteration*Step, where Start and Step could be obtained from - /// SCEVGEPCache. - unsigned Iteration; - /// \brief SCEV expression corresponding to number of currently simulated /// iteration. const SCEV *IterationNumber; @@ -316,7 +227,6 @@ private: /// post-unrolling. DenseMap<Value *, Constant *> &SimplifiedValues; - const Loop *L; ScalarEvolution &SE; /// \brief Try to simplify instruction \param I using its SCEV expression. @@ -368,11 +278,9 @@ private: return simplifyInstWithSCEV(&I); } - /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. - /// Try to simplify binary operator I. /// - /// TODO: Probaly it's worth to hoist the code for estimating the + /// TODO: Probably it's worth to hoist the code for estimating the /// simplifications effects to a separate class, since we have a very similar /// code in InlineCost already. bool visitBinaryOperator(BinaryOperator &I) { @@ -412,7 +320,7 @@ private: auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); // We're only interested in loads that can be completely folded to a // constant. - if (!GV || !GV->hasInitializer()) + if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) return false; ConstantDataSequential *CDS = @@ -420,6 +328,12 @@ private: if (!CDS) return false; + // We might have a vector load from an array. FIXME: for now we just bail + // out in this case, but we should be able to resolve and simplify such + // loads. + if(!CDS->isElementTypeCompatible(I.getType())) + return false; + int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && "Unexpectedly large index value."); @@ -436,6 +350,59 @@ private: return true; } + + bool visitCastInst(CastInst &I) { + // Propagate constants through casts. + Constant *COp = dyn_cast<Constant>(I.getOperand(0)); + if (!COp) + COp = SimplifiedValues.lookup(I.getOperand(0)); + if (COp) + if (Constant *C = + ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { + SimplifiedValues[&I] = C; + return true; + } + + return Base::visitCastInst(I); + } + + bool visitCmpInst(CmpInst &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + // First try to handle simplified comparisons. 
+ if (!isa<Constant>(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa<Constant>(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; + + if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { + auto SimplifiedLHS = SimplifiedAddresses.find(LHS); + if (SimplifiedLHS != SimplifiedAddresses.end()) { + auto SimplifiedRHS = SimplifiedAddresses.find(RHS); + if (SimplifiedRHS != SimplifiedAddresses.end()) { + SimplifiedAddress &LHSAddr = SimplifiedLHS->second; + SimplifiedAddress &RHSAddr = SimplifiedRHS->second; + if (LHSAddr.Base == RHSAddr.Base) { + LHS = LHSAddr.Offset; + RHS = RHSAddr.Offset; + } + } + } + } + + if (Constant *CLHS = dyn_cast<Constant>(LHS)) { + if (Constant *CRHS = dyn_cast<Constant>(RHS)) { + if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { + SimplifiedValues[&I] = C; + return true; + } + } + } + + return Base::visitCmpInst(I); + } }; } // namespace @@ -443,11 +410,11 @@ private: namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. - unsigned UnrolledCost; + int UnrolledCost; /// \brief The estimated dynamic cost of executing the instructions in the /// rolled form. - unsigned RolledDynamicCost; + int RolledDynamicCost; }; } @@ -464,10 +431,10 @@ struct EstimatedUnrollCost { /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + int MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -481,24 +448,61 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, SmallSetVector<BasicBlock *, 16> BBWorklist; DenseMap<Value *, Constant *> SimplifiedValues; + SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. - unsigned UnrolledCost = 0; + int UnrolledCost = 0; // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic // instructions due to the simplifications which we expect to occur after // unrolling. - unsigned RolledDynamicCost = 0; + int RolledDynamicCost = 0; + + // Ensure that we don't violate the loop structure invariants relied on by + // this analysis. + assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); + assert(L->isLCSSAForm(DT) && + "Must have loops in LCSSA form to track live-out values."); + + DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); // Simulate execution of each iteration of the loop counting instructions, // which would be simplified. 
// Since the same load will take different values on different iterations, // we literally have to go through all of the loop's iterations. for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + + // Prepare for the iteration by collecting any simplified entry or backedge + // inputs. + for (Instruction &I : *L->getHeader()) { + auto *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + + // The loop header PHI nodes must have exactly two inputs: one from the + // loop preheader and one from the loop latch. + assert( + PHI->getNumIncomingValues() == 2 && + "Must have an incoming value only for the preheader and the latch."); + + Value *V = PHI->getIncomingValueForBlock( + Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Constant *C = dyn_cast<Constant>(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); + } + + // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); + while (!SimplifiedInputValues.empty()) + SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -510,21 +514,67 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - unsigned InstCost = TTI.getUserCost(&I); + int InstCost = TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, // and if the visitor returns false, include this instruction in the // unrolled cost. if (!Analyzer.visit(I)) UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } // Also track this instruction's expected cost when executing the rolled // loop form. RolledDynamicCost += InstCost; // If unrolled body turns out to be too big, bail out. - if (UnrolledCost > MaxUnrolledLoopSize) + if (UnrolledCost > MaxUnrolledLoopSize) { + DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); return None; + } + } + + TerminatorInst *TI = BB->getTerminator(); + + // Add in the live successors by first checking whether we have a terminator + // that may be simplified based on the values simplified by this call. + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional()) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(BI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = BI->getSuccessor(0); + else + Succ = BI->getSuccessor( + cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(SI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = SI->getSuccessor(0); + else + Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) + .getCaseSuccessor(); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } } // Add BB's successors to the worklist.
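The simulation loop above only keeps a successor live when a branch or switch condition has already folded to a constant for the iteration being modeled; otherwise every in-loop successor goes back on the worklist. A toy model of that successor-pruning step, with blocks reduced to indices and conditions to an optional bool (the real pass walks BasicBlocks through a SmallSetVector and gets its constants from SCEV-driven simplification):

#include <cstdio>
#include <optional>
#include <set>
#include <vector>
struct BlockSketch {
  std::optional<bool> FoldedCond; // set when the branch condition simplified
  int TrueSucc = -1;              // -1 marks an edge leaving the loop
  int FalseSucc = -1;
};
int main() {
  // Toy CFG: block 0 branches to 1 or 2; on this simulated iteration its
  // condition folded to false, so only block 2 stays live.
  std::vector<BlockSketch> CFG = {
      {false, 1, 2}, {std::nullopt, -1, -1}, {std::nullopt, -1, -1}};
  std::set<int> Worklist = {0};
  while (!Worklist.empty()) {
    int BB = *Worklist.begin();
    Worklist.erase(Worklist.begin());
    std::printf("simulating block %d\n", BB); // prints 0, then 2
    const BlockSketch &B = CFG[BB];
    if (B.FoldedCond) {
      // Condition known: only the taken successor stays live.
      int Succ = *B.FoldedCond ? B.TrueSucc : B.FalseSucc;
      if (Succ >= 0)
        Worklist.insert(Succ);
    } else {
      // Condition unknown: conservatively keep both successors live.
      if (B.TrueSucc >= 0)
        Worklist.insert(B.TrueSucc);
      if (B.FalseSucc >= 0)
        Worklist.insert(B.FalseSucc);
    }
  }
  return 0;
}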
@@ -535,9 +585,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) + if (UnrolledCost == RolledDynamicCost) { + DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); return None; + } } + DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); return {{UnrolledCost, RolledDynamicCost}}; } @@ -583,6 +639,12 @@ static bool HasUnrollFullPragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} + // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); @@ -644,12 +706,11 @@ static void SetLoopAlreadyUnrolled(Loop *L) { L->setLoopID(NewLoopID); } -bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, - uint64_t RolledDynamicCost) { - +static bool canUnrollCompletely(Loop *L, unsigned Threshold, + unsigned PercentDynamicCostSavedThreshold, + unsigned DynamicCostSavingsDiscount, + uint64_t UnrolledCost, + uint64_t RolledDynamicCost) { if (Threshold == NoThreshold) { DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n"); return true; @@ -697,58 +758,13 @@ bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, return false; } -unsigned LoopUnroll::selectUnrollCount( - const Loop *L, unsigned TripCount, bool PragmaFullUnroll, - unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, - bool &SetExplicitly) { - SetExplicitly = true; - - // User-specified count (either as a command-line option or - // constructor parameter) has highest precedence. - unsigned Count = UserCount ? CurrentCount : 0; - - // If there is no user-specified count, unroll pragmas have the next - // highest precendence. - if (Count == 0) { - if (PragmaCount) { - Count = PragmaCount; - } else if (PragmaFullUnroll) { - Count = TripCount; - } - } - - if (Count == 0) - Count = UP.Count; - - if (Count == 0) { - SetExplicitly = false; - if (TripCount == 0) - // Runtime trip count. - Count = UnrollRuntimeCount; - else - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. 
- Count = TripCount; - } - if (TripCount && Count > TripCount) - return TripCount; - return Count; -} - -bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - Function &F = *L->getHeader()->getParent(); - - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - +static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, bool PreserveLCSSA, + Optional<unsigned> ProvidedCount, + Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, + Optional<bool> ProvidedRuntime) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); @@ -757,11 +773,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } bool PragmaFullUnroll = HasUnrollFullPragma(L); + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaCount > 0; - - TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, TTI, UP); + bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -777,11 +791,18 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); } - // Select an initial unroll count. This may be reduced later based - // on size thresholds. - bool CountSetExplicitly; - unsigned Count = selectUnrollCount(L, TripCount, PragmaFullUnroll, - PragmaCount, UP, CountSetExplicitly); + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, + ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll, + TripCount); + + unsigned Count = UP.Count; + bool CountSetExplicitly = Count != 0; + // Use a heuristic count if we didn't set anything explicitly. + if (!CountSetExplicitly) + Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount; + if (TripCount && Count > TripCount) + Count = TripCount; unsigned NumInlineCandidates; bool notDuplicatable; @@ -803,13 +824,6 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } - unsigned Threshold, PartialThreshold; - unsigned PercentDynamicCostSavedThreshold; - unsigned DynamicCostSavingsDiscount; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, - PercentDynamicCostSavedThreshold, - DynamicCostSavingsDiscount); - // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. enum { Full = 0, Partial = 1, Runtime = 2 }; @@ -817,7 +831,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount && Count == TripCount) { Unrolling = Partial; // If the loop is really small, we don't need to run an expensive analysis. 
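The explicit-count plumbing above replaces the old selectUnrollCount precedence chain. Condensed into one standalone helper, the policy is as follows (a sketch under the names the patch itself uses; selectCount is hypothetical, and DefaultUnrollRuntimeCount is the existing runtime-unroll default):

// Explicit counts (pragma, command line, or TTI preference, already folded
// into UP.Count) win; otherwise fall back to a heuristic, and never unroll
// past a known trip count.
static unsigned selectCount(unsigned ExplicitCount, unsigned TripCount) {
  unsigned Count = ExplicitCount;
  if (Count == 0)
    Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount;
  if (TripCount && Count > TripCount)
    Count = TripCount;
  return Count;
}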
- if (canUnrollCompletely(L, Threshold, 100, DynamicCostSavingsDiscount, + if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { Unrolling = Full; } else { @@ -825,10 +839,12 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) - if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, - DynamicCostSavingsDiscount, Cost->UnrolledCost, - Cost->RolledDynamicCost)) { + L, TripCount, DT, *SE, TTI, + UP.Threshold + UP.DynamicCostSavingsDiscount)) + if (canUnrollCompletely(L, UP.Threshold, + UP.PercentDynamicCostSavedThreshold, + UP.DynamicCostSavingsDiscount, + Cost->UnrolledCost, Cost->RolledDynamicCost)) { Unrolling = Full; } } @@ -840,22 +856,22 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = - (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); + bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime; // Don't unroll a runtime trip count loop with unroll full pragma. if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } if (Unrolling == Partial) { - bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + bool AllowPartial = PragmaEnableUnroll || UP.Partial; if (!AllowPartial && !CountSetExplicitly) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; } - if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + if (UP.PartialThreshold != NoThreshold && + UnrolledSize > UP.PartialThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2); + Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2); while (Count != 0 && TripCount % Count != 0) Count--; } @@ -867,7 +883,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Reduce unroll count to be the largest power-of-two factor of // the original count which satisfies the threshold limit. 
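Throughout this block the size model is UnrolledSize = (LoopSize - 2) * Count + 2, i.e. two instructions of fixed loop overhead plus the replicated body, and both reductions above simply invert it. A worked sketch of the partial-unroll cap (maxPartialCount is a hypothetical helper; the arithmetic matches the code above):

// Largest count whose unrolled size stays under the partial threshold,
// lowered further until it evenly divides the trip count.
static unsigned maxPartialCount(unsigned PartialThreshold, unsigned LoopSize,
                                unsigned TripCount) {
  unsigned Count = (std::max(PartialThreshold, 3u) - 2) / (LoopSize - 2);
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  // E.g. PartialThreshold=150, LoopSize=12, TripCount=100: 148/10 = 14,
  // then stepped down to 10, the largest divisor of 100 not above 14.
  return Count;
}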
- while (Count != 0 && UnrolledSize > PartialThreshold) { + while (Count != 0 && UnrolledSize > UP.PartialThreshold) { Count >>= 1; UnrolledSize = (LoopSize-2) * Count + 2; } @@ -887,23 +903,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (PragmaFullUnroll && PragmaCount == 0) { - if (TripCount && Count != TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because unrolled size is too large."); - } else if (!TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } - } else if (PragmaCount > 0 && Count != OriginalCount) { + if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " "unroll_count pragma because unrolled size is too large."); + } else if (PragmaFullUnroll && !TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma because " + "unrolled size is too large."); + } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); } } @@ -915,8 +935,96 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, this, &LPM, &AC)) + TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) return false; return true; } + +namespace { +class LoopUnroll : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopUnroll(Optional<unsigned> Threshold = None, + Optional<unsigned> Count = None, + Optional<bool> AllowPartial = None, Optional<bool> Runtime = None) + : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold), + ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + Optional<unsigned> ProvidedCount; + Optional<unsigned> ProvidedThreshold; + Optional<bool> ProvidedAllowPartial; + Optional<bool> ProvidedRuntime; + + bool runOnLoop(Loop *L, LPPassManager &) override { + if (skipOptnoneFunction(L)) + return false; + + Function &F = *L->getHeader()->getParent(); + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime); + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. + // If loop unroll does not preserve dom info then LCSSA pass on next + // loop will receive invalid dom info. + // For now, recreate dom info, if loop is unrolled. + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char LoopUnroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + // TODO: It would make more sense for this function to take the optionals + // directly, but that's dangerous since it would silently break out of tree + // callers. + return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold), + Count == -1 ? None : Optional<unsigned>(Count), + AllowPartial == -1 ? None + : Optional<bool>(AllowPartial), + Runtime == -1 ? 
None : Optional<bool>(Runtime)); +} + +Pass *llvm::createSimpleLoopUnrollPass() { + return llvm::createLoopUnrollPass(-1, -1, 0, 0); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index cbc563b..95d7f8a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -37,6 +38,10 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -70,6 +75,19 @@ static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); +static cl::opt<bool> +LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", + cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to access PGO " + "heuristics to minimize code growth in cold regions.")); + +static cl::opt<unsigned> +ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, + cl::desc("Coldness threshold in percentage. The loop header frequency " + "(relative to the entry frequency) is compared with this " + "threshold to determine if non-trivial unswitching should be " + "enabled.")); + namespace { class LUAnalysisCache { @@ -148,12 +166,19 @@ namespace { LPPassManager *LPM; AssumptionCache *AC; - // LoopProcessWorklist - Used to check if second loop needs processing - // after RewriteLoopBodyWithConditionConstant rewrites first loop. + // Used to check if second loop needs processing after + // RewriteLoopBodyWithConditionConstant rewrites first loop. std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; + bool EnabledPGO; + + // BFI and ColdEntryFreq are only used when PGO and + // LoopUnswitchWithBlockFrequency are enabled. + BlockFrequencyInfo BFI; + BlockFrequency ColdEntryFreq; + bool OptimizeForSize; bool redoLoop; @@ -192,9 +217,11 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: @@ -210,7 +237,10 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. 
-    void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+    void SplitExitEdges(Loop *L,
+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+    bool TryTrivialLoopUnswitch(bool &Changed);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
                               TerminatorInst *TI = nullptr);
@@ -229,9 +259,6 @@
                                   TerminatorInst *TI);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
-
-    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr,
-                                    BasicBlock **LoopExit = nullptr);
-
   };
 }
 
@@ -367,9 +394,8 @@ Pass *llvm::createLoopUnswitchPass(bool Os) {
   return new LoopUnswitch(Os);
 }
 
-/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is
-/// invariant in the loop, or has an invariant piece, return the invariant.
-/// Otherwise, return null.
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
 static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
 
   // We have started analyzing a new instruction; increment the counter of
   // scanned instructions.
@@ -411,11 +437,23 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
                                              *L->getHeader()->getParent());
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   LPM = &LPM_Ref;
-  DominatorTreeWrapperPass *DTWP =
-      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   currentLoop = L;
   Function *F = currentLoop->getHeader()->getParent();
+
+  EnabledPGO = F->getEntryCount().hasValue();
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    BranchProbabilityInfo BPI(*F, *LI);
+    BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
+
+    // Use BranchProbability to compute a minimum frequency based on
+    // function entry baseline frequency. Loops with headers below this
+    // frequency are considered as cold.
+    const BranchProbability ColdProb(ColdnessThreshold, 100);
+    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
+  }
+
   bool Changed = false;
   do {
     assert(currentLoop->isLCSSAForm(*DT));
@@ -423,16 +461,13 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
     Changed |= processCurrentLoop();
   } while(redoLoop);
 
-  if (Changed) {
-    // FIXME: Reconstruct dom info, because it is not preserved properly.
-    if (DT)
-      DT->recalculate(*F);
-  }
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  if (Changed)
+    DT->recalculate(*F);
   return Changed;
 }
 
-/// processCurrentLoop - Do actual work and unswitch loop if possible
-/// and profitable.
+/// Do the actual work and unswitch the loop if possible and profitable.
 bool LoopUnswitch::processCurrentLoop() {
   bool Changed = false;
@@ -452,14 +487,48 @@ bool LoopUnswitch::processCurrentLoop() {
 
   LLVMContext &Context = loopHeader->getContext();
 
-  // Probably we reach the quota of branches for this loop. If so
-  // stop unswitching.
+  // Analyze the loop cost, and stop unswitching if the loop content cannot
+  // be duplicated.
  if (!BranchesInfo.countLoop(
          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
                           *currentLoop->getHeader()->getParent()),
          AC))
    return false;
 
+  // Try trivial unswitching first, before looping over the other basic blocks
+  // in the loop.
+  if (TryTrivialLoopUnswitch(Changed)) {
+    return true;
+  }
+
+  // Do not unswitch loops containing convergent operations, as we might be
+  // making them control dependent on the unswitch value when they were not
+  // before.
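The BFI/ColdEntryFreq setup in runOnLoop above feeds a single comparison made later in processCurrentLoop. Pulled out into a free function, it would look roughly like this (a sketch only; the pass keeps BFI and ColdEntryFreq as members rather than recomputing them):

// A loop is considered cold when its header runs less often than
// ColdnessThreshold percent of the function entry frequency; cold loops are
// skipped to avoid code growth that cannot pay off.
static bool isColdLoop(const BlockFrequencyInfo &BFI, const Loop *L,
                       unsigned ColdnessThresholdPercent) {
  BlockFrequency ColdEntryFreq =
      BlockFrequency(BFI.getEntryFreq()) *
      BranchProbability(ColdnessThresholdPercent, 100);
  return BFI.getBlockFreq(L->getHeader()) < ColdEntryFreq;
}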
+ // FIXME: This could be refined to only bail if the convergent operation is + // not already control-dependent on the unswitch value. + for (const auto BB : currentLoop->blocks()) { + for (auto &I : *BB) { + auto CS = CallSite(&I); + if (!CS) continue; + if (CS.hasFnAttr(Attribute::Convergent)) + return false; + } + } + + // Do not do non-trivial unswitch while optimizing for size. + // FIXME: Use Function::optForSize(). + if (OptimizeForSize || + loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) + return false; + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + // Compute the weighted frequency of the hottest block in the + // loop (loopHeader in this case since inner loops should be + // processed before outer loop). If it is less than ColdFrequency, + // we should not unswitch. + BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); + if (LoopEntryFreq < ColdEntryFreq) + return false; + } + // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. @@ -528,8 +597,8 @@ bool LoopUnswitch::processCurrentLoop() { return Changed; } -/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the -/// loop with no side effects (including infinite loops). +/// Check to see if all paths from BB exit the loop with no side effects +/// (including infinite loops). /// /// If true, we return true and set ExitBB to the block we /// exit through. @@ -566,9 +635,9 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, return true; } -/// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the -/// process. If so, return the block that is exited to, otherwise return null. +/// Return true if the specified block unconditionally leads to an exit from +/// the specified loop, and has no side-effects in the process. If so, return +/// the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. @@ -578,105 +647,11 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { return nullptr; } -/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is -/// trivial: that is, that the condition controls whether or not the loop does -/// anything at all. If this is a trivial condition, unswitching produces no -/// code duplications (equivalently, it produces a simpler loop and a new empty -/// loop, which gets deleted). -/// -/// If this is a trivial condition, return true, otherwise return false. When -/// returning true, this sets Cond and Val to the condition that controls the -/// trivial condition: when Cond dynamically equals Val, the loop is known to -/// exit. Finally, this sets LoopExit to the BB that the loop exits to when -/// Cond == Val. -/// -bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, - BasicBlock **LoopExit) { - BasicBlock *Header = currentLoop->getHeader(); - TerminatorInst *HeaderTerm = Header->getTerminator(); - LLVMContext &Context = Header->getContext(); - - BasicBlock *LoopExitBB = nullptr; - if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { - // If the header block doesn't end with a conditional branch on Cond, we - // can't handle it. 
- if (!BI->isConditional() || BI->getCondition() != Cond) - return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(0)))) { - if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(1)))) { - if (Val) *Val = ConstantInt::getFalse(Context); - } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { - // If this isn't a switch on Cond, we can't handle it. - if (SI->getCondition() != Cond) return false; - - // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - // Note that we can't trivially unswitch on the default case or - // on already unswitched cases. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - BasicBlock *LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - i.getCaseSuccessor()))) { - // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt *CaseVal = i.getCaseValue(); - - // Check that it was not unswitched before, since already unswitched - // trivial vals are looks trivial too. - if (BranchesInfo.isUnswitched(SI, CaseVal)) - continue; - LoopExitBB = LoopExitCandidate; - if (Val) *Val = CaseVal; - break; - } - } - } - - // If we didn't find a single unique LoopExit block, or if the loop exit block - // contains phi nodes, this isn't trivial. - if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) - return false; // Can't handle this. - - if (LoopExit) *LoopExit = LoopExitBB; - - // We already know that nothing uses any scalar values defined inside of this - // loop. As such, we just have to check to see if this loop will execute any - // side-effecting instructions (e.g. stores, calls, volatile loads) in the - // part of the loop that the code *would* execute. We already checked the - // tail, check the header now. - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) - if (I->mayHaveSideEffects()) - return false; - return true; -} - -/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when -/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// We have found that we can unswitch currentLoop when LoopCond == Val to +/// simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI) { - Function *F = loopHeader->getParent(); - Constant *CondVal = nullptr; - BasicBlock *ExitBlock = nullptr; - - if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { - // If the condition is trivial, always unswitch. There is no code growth - // for this case. - UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock, TI); - return true; - } - // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { DEBUG(dbgs() << "NOT unswitching loop %" @@ -687,32 +662,27 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, return false; } - // Do not do non-trivial unswitch while optimizing for size. 
- if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) - return false; - UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI); return true; } -/// CloneLoop - Recursively clone the specified loop and all of its children, +/// Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop *New = new Loop(); - LPM->insertLoop(New, PL); + Loop &New = LPM->addLoop(PL); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); + New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - CloneLoop(*I, New, VM, LI, LPM); + CloneLoop(*I, &New, VM, LI, LPM); - return New; + return &New; } static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, @@ -744,15 +714,15 @@ static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, } } // fallthrough. + case LLVMContext::MD_make_implicit: case LLVMContext::MD_dbg: DstInst->setMetadata(MD.first, MD.second); } } } -/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values -/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the -/// code immediately before InsertPt. +/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, +/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, @@ -782,11 +752,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, SplitCriticalEdge(BI, 1, Options); } -/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable -/// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no -/// side-effects), unswitch it. This doesn't involve any code duplication, just -/// moving the conditional branch outside of the loop and updating loop info. +/// Given a loop that has a trivial unswitchable condition in it (a cond branch +/// from its header block to its latch block, where the path through the loop +/// that doesn't execute its body has no side-effects), unswitch it. This +/// doesn't involve any code duplication, just moving the conditional branch +/// outside of the loop and updating loop info. void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, TerminatorInst *TI) { @@ -810,7 +780,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); + BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. 
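TryTrivialLoopUnswitch, introduced below, handles the "free" case: the condition decides whether the loop does anything at all, so hoisting it duplicates no code. In source-level terms (illustrative only; Inv is loop-invariant and work() is a placeholder, not a function from this patch):

// Before: the invariant condition guards an immediate, side-effect-free exit.
for (int I = 0; I < N; ++I) {
  if (Inv)
    break;
  work(I);
}

// After trivial unswitching: the branch moves out of the loop and the loop
// body still exists exactly once.
if (!Inv)
  for (int I = 0; I < N; ++I)
    work(I);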
@@ -829,8 +799,155 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, ++NumTrivial; } -/// SplitExitEdges - Split all of the edges from inside the loop to their exit -/// blocks. Update the appropriate Phi nodes as we do so. +/// Check if the first non-constant condition starting from the loop header is +/// a trivial unswitch condition: that is, a condition controls whether or not +/// the loop does anything at all. If it is a trivial condition, unswitching +/// produces no code duplications (equivalently, it produces a simpler loop and +/// a new empty loop, which gets deleted). Therefore always unswitch trivial +/// condition. +bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { + BasicBlock *CurrentBB = currentLoop->getHeader(); + TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + LLVMContext &Context = CurrentBB->getContext(); + + // If loop header has only one reachable successor (currently via an + // unconditional branch or constant foldable conditional branch, but + // should also consider adding constant foldable switch instruction in + // future), we should keep looking for trivial condition candidates in + // the successor as well. An alternative is to constant fold conditions + // and merge successors into loop header (then we only need to check header's + // terminator). The reason for not doing this in LoopUnswitch pass is that + // it could potentially break LoopPassManager's invariants. Folding dead + // branches could either eliminate the current loop or make other loops + // unreachable. LCSSA form might also not be preserved after deleting + // branches. The following code keeps traversing loop header's successors + // until it finds the trivial condition candidate (condition that is not a + // constant). Since unswitching generates branches with constant conditions, + // this scenario could be very common in practice. + SmallSet<BasicBlock*, 8> Visited; + + while (true) { + // If we exit loop or reach a previous visited block, then + // we can not reach any trivial condition candidates (unfoldable + // branch instructions or switch instructions) and no unswitch + // can happen. Exit and return false. + if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second) + return false; + + // Check if this loop will execute any side-effecting instructions (e.g. + // stores, calls, volatile loads) in the part of the loop that the code + // *would* execute. Check the header first. + for (Instruction &I : *CurrentBB) + if (I.mayHaveSideEffects()) + return false; + + // FIXME: add check for constant foldable switch instructions. + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + if (BI->isUnconditional()) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getTrue(Context)) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getFalse(Context)) { + CurrentBB = BI->getSuccessor(1); + } else { + // Found a trivial condition candidate: non-foldable conditional branch. + break; + } + } else { + break; + } + + CurrentTerm = CurrentBB->getTerminator(); + } + + // CondVal is the condition that controls the trivial condition. + // LoopExitBB is the BasicBlock that loop exits when meets trivial condition. + Constant *CondVal = nullptr; + BasicBlock *LoopExitBB = nullptr; + + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + // If this isn't branching on an invariant condition, we can't unswitch it. 
+ if (!BI->isConditional()) + return false; + + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not + // partial LIV which could occur in and/or) + if (!LoopCond || LoopCond != BI->getCondition()) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes + // it to do this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + CondVal = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + CondVal = ConstantInt::getFalse(Context); + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + CurrentTerm); + ++NumBranches; + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { + // If this isn't switching on an invariant condition, we can't unswitch it. + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not + // partial LIV which could occur in and/or) + if (!LoopCond || LoopCond != SI->getCondition()) + return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through a one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + // Note that we can't trivially unswitch on the default case or + // on already unswitched cases. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *LoopExitCandidate; + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + i.getCaseSuccessor()))) { + // Okay, we found a trivial case, remember the value that is trivial. + ConstantInt *CaseVal = i.getCaseValue(); + + // Check that it was not unswitched before, since already unswitched + // trivial vals are looks trivial too. + if (BranchesInfo.isUnswitched(SI, CaseVal)) + continue; + LoopExitBB = LoopExitCandidate; + CondVal = CaseVal; + break; + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + nullptr); + ++NumSwitches; + return true; + } + return false; +} + +/// Split all of the edges from inside the loop to their exit blocks. +/// Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks){ @@ -841,15 +958,14 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", - /*AliasAnalysis*/ nullptr, DT, LI, + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, /*PreserveLCSSA*/ true); } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val. 
Split it into loop versions and test the -/// condition outside of either loop. Return the loops created as Out1/Out2. +/// We determined that the loop is profitable to unswitch when LIC equal Val. +/// Split it into loop versions and test the condition outside of either loop. +/// Return the loops created as Out1/Out2. void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L, TerminatorInst *TI) { Function *F = loopHeader->getParent(); @@ -858,8 +974,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, << " blocks] in Function " << F->getName() << " when '" << *Val << "' == " << *LIC << "\n"); - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) - SE->forgetLoop(L); + if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) + SEWP->getSE().forgetLoop(L); LoopBlocks.clear(); NewBlocks.clear(); @@ -901,8 +1017,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // Splice the newly inserted blocks into the function right before the // original preheader. - F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), - NewBlocks[0], F->end()); + F->getBasicBlockList().splice(NewPreheader->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -944,7 +1061,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { PHINode *PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + &*ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -960,7 +1077,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + RemapInstruction(&*I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -994,8 +1112,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } -/// RemoveFromWorklist - Remove all instances of I from the worklist vector -/// specified. +/// Remove all instances of I from the worklist vector specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { @@ -1003,7 +1120,7 @@ static void RemoveFromWorklist(Instruction *I, Worklist.end()); } -/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// When we find that I really equals V, remove I from the /// program, replacing all uses with V and update the worklist. static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, @@ -1025,9 +1142,9 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has -// the value specified by Val in the specified loop, or we know it does NOT have -// that value. Rewrite any uses of LIC or of properties correlated to it. 
+/// We know either that the value LIC has the value specified by Val in the +/// specified loop, or we know it does NOT have that value. +/// Rewrite any uses of LIC or of properties correlated to it. void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { @@ -1138,18 +1255,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // domtree here -- instead we force it to do a full recomputation // after the pass is complete -- but we do need to inform it of // new blocks. - if (DT) - DT->addNewBlock(Abort, NewSISucc); + DT->addNewBlock(Abort, NewSISucc); } SimplifyCode(Worklist, L); } -/// SimplifyCode - Okay, now that we have simplified some instructions in the -/// loop, walk over it and constant prop, dce, and fold control flow where -/// possible. Note that this is effectively a very simple loop-structure-aware -/// optimizer. During processing of this loop, L could very well be deleted, so -/// it must not be used. +/// Now that we have simplified some instructions in the loop, walk over it and +/// constant prop, dce, and fold control flow where possible. Note that this is +/// effectively a very simple loop-structure-aware optimizer. During processing +/// of this loop, L could very well be deleted, so it must not be used. /// /// FIXME: When the loop optimizer is more mature, separate this out to a new /// pass. @@ -1207,8 +1322,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { Succ->replaceAllUsesWith(Pred); // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), - Succ->end()); + Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), + Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 3314e1e..41511bc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "loweratomic" static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { - IRBuilder<> Builder(CXI->getParent(), CXI); + IRBuilder<> Builder(CXI); Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); @@ -41,7 +41,7 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { } static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { - IRBuilder<> Builder(RMWI->getParent(), RMWI); + IRBuilder<> Builder(RMWI); Value *Ptr = RMWI->getPointerOperand(); Value *Val = RMWI->getValOperand(); @@ -120,7 +120,7 @@ namespace { return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) Changed |= LowerFenceInst(FI); else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0c47cbd..2ace902 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -139,7 +139,7 @@ static bool lowerExpectIntrinsic(Function &F) { ExpectIntrinsicsHandled++; } - // remove llvm.expect intrinsics. 
+ // Remove llvm.expect intrinsics. for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 85012af..6b43b0f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -30,7 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include <list> +#include <algorithm> using namespace llvm; #define DEBUG_TYPE "memcpyopt" @@ -71,9 +72,9 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, return Offset; } -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset. For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and +/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 +/// might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); @@ -125,7 +126,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, } -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// Represents a range of memset'd bytes with the ByteVal value. /// This allows us to analyze stores like: /// store 0 -> P+1 /// store 0 -> P+0 @@ -164,8 +165,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If any of the stores are a memset, then it is always good to extend the // memset. - for (unsigned i = 0, e = TheStores.size(); i != e; ++i) - if (!isa<StoreInst>(TheStores[i])) + for (Instruction *SI : TheStores) + if (!isa<StoreInst>(SI)) return true; // Assume that the code generator is capable of merging pairs of stores @@ -189,7 +190,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; + unsigned NumByteStores = Bytes % MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -200,15 +201,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { - /// Ranges - A sorted list of the memset ranges. We use std::list here - /// because each element is relatively large and expensive to copy. - std::list<MemsetRange> Ranges; - typedef std::list<MemsetRange>::iterator range_iterator; + /// A sorted list of the memset ranges. 
+ SmallVector<MemsetRange, 8> Ranges; + typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef std::list<MemsetRange>::const_iterator const_iterator; + typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -240,26 +240,20 @@ public: } // end anon namespace -/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. -/// -/// Do a linear search of the ranges to see if this can be joined and/or to -/// find the insertion point in the list. We keep the ranges sorted for -/// simplicity here. This is a linear search of a linked list, which is ugly, -/// however the number of ranges is limited, so this won't get crazy slow. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst) { int64_t End = Start+Size; - range_iterator I = Ranges.begin(), E = Ranges.end(); - while (I != E && Start > I->End) - ++I; + range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, + [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; }); // We now know that I == E, in which case we didn't find anything to merge // with, or that Start <= I->End. If End < I->Start or I == E, then we need // to insert a new range. Handle this now. - if (I == E || End < I->Start) { + if (I == Ranges.end() || End < I->Start) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); R.Start = Start; R.End = End; @@ -295,7 +289,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, if (End > I->End) { I->End = End; range_iterator NextI = I; - while (++NextI != E && End >= NextI->Start) { + while (++NextI != Ranges.end() && End >= NextI->Start) { // Merge the range in. I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); if (NextI->End > I->End) @@ -331,9 +325,9 @@ namespace { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -357,7 +351,7 @@ namespace { char MemCpyOpt::ID = 0; } -// createMemCpyOptPass - The public interface to this file... +/// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", @@ -366,14 +360,15 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -/// tryMergingIntoMemset - When scanning forward over instructions, we look for -/// some other patterns to fold away. 
In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consecutive ones, it -/// attempts to merge them together into a memcpy/memset. +/// When scanning forward over instructions, we look for some other patterns to +/// fold away. In particular, this looks for stores to neighboring locations of +/// memory. If it sees enough consecutive ones, it attempts to merge them +/// together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { const DataLayout &DL = StartInst->getModule()->getDataLayout(); @@ -384,7 +379,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // are stored. MemsetRanges Ranges(DL); - BasicBlock::iterator BI = StartInst; + BasicBlock::iterator BI(StartInst); for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { // If the instruction is readnone, ignore it, otherwise bail out. We @@ -439,14 +434,12 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we create any memsets, we put it right before the first instruction that // isn't part of the memset block. This ensure that the memset is dominated // by any addressing instruction needed by the start of the block. - IRBuilder<> Builder(BI); + IRBuilder<> Builder(&*BI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. Instruction *AMemSet = nullptr; - for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); - I != E; ++I) { - const MemsetRange &Range = *I; + for (const MemsetRange &Range : Ranges) { if (Range.TheStores.size() == 1) continue; @@ -470,19 +463,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); DEBUG(dbgs() << "Replace stores:\n"; - for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i] << '\n'; + for (Instruction *SI : Range.TheStores) + dbgs() << *SI << '\n'; dbgs() << "With: " << *AMemSet << '\n'); if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVectorImpl<Instruction *>::const_iterator - SI = Range.TheStores.begin(), - SE = Range.TheStores.end(); SI != SE; ++SI) { - MD->removeInstruction(*SI); - (*SI)->eraseFromParent(); + for (Instruction *SI : Range.TheStores) { + MD->removeInstruction(SI); + SI->eraseFromParent(); } ++NumMemSetInfer; } @@ -490,17 +481,111 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } +static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + unsigned StoreAlign = SI->getAlignment(); + if (!StoreAlign) + StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); + unsigned LoadAlign = LI->getAlignment(); + if (!LoadAlign) + LoadAlign = DL.getABITypeAlignment(LI->getType()); + + return std::min(StoreAlign, LoadAlign); +} bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; + + // Avoid merging nontemporal stores since the resulting + // memcpy/memset would not be able to preserve the nontemporal hint. + // In theory we could teach how to propagate the !nontemporal metadata to + // memset calls. 
However, that change would force the backend to + // conservatively expand !nontemporal memset calls back to sequences of + // store instructions (effectively undoing the merging). + if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); - // Detect cases where we're performing call slot forwarding, but - // happen to be using a load-store pair to implement it, rather than - // a memcpy. + // Load to store forwarding can be interpreted as memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { + + auto *T = LI->getType(); + if (T->isAggregateType()) { + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + MemoryLocation LoadLoc = MemoryLocation::get(LI); + + // We use alias analysis to check if an instruction may store to + // the memory we load from in between the load and the store. If + // such an instruction is found, we try to promote there instead + // of at the store position. + Instruction *P = SI; + for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator(); + I != E; ++I) { + if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod)) + continue; + + // We found an instruction that may write to the loaded memory. + // We can try to promote at this position instead of the store + // position if nothing alias the store memory after this and the store + // destination is not in the range. + P = &*I; + for (; I != E; ++I) { + MemoryLocation StoreLoc = MemoryLocation::get(SI); + if (&*I == SI->getOperand(1) || + AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { + P = nullptr; + break; + } + } + + break; + } + + // If a valid insertion position is found, then we can promote + // the load/store pair to a memcpy. + if (P) { + // If we load from memory that may alias the memory we store to, + // memmove must be used to preserve semantic. If not, memcpy can + // be used. + bool UseMemMove = false; + if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc)) + UseMemMove = true; + + unsigned Align = findCommonAlignment(DL, SI, LI); + uint64_t Size = DL.getTypeStoreSize(T); + + IRBuilder<> Builder(P); + Instruction *M; + if (UseMemMove) + M = Builder.CreateMemMove(SI->getPointerOperand(), + LI->getPointerOperand(), Size, + Align, SI->isVolatile()); + else + M = Builder.CreateMemCpy(SI->getPointerOperand(), + LI->getPointerOperand(), Size, + Align, SI->isVolatile()); + + DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI + << " => " << *M << "\n"); + + MD->removeInstruction(SI); + SI->eraseFromParent(); + MD->removeInstruction(LI); + LI->eraseFromParent(); + ++NumMemCpyInstr; + + // Make sure we do not invalidate the iterator. + BBI = M->getIterator(); + return true; + } + } + + // Detect cases where we're performing call slot forwarding, but + // happen to be using a load-store pair to implement it, rather than + // a memcpy. MemDepResult ldep = MD->getDependency(LI); CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) @@ -509,11 +594,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. 
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); - for (BasicBlock::iterator I = --BasicBlock::iterator(SI), - E = C; I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); + I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } @@ -521,18 +606,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } if (C) { - unsigned storeAlign = SI->getAlignment(); - if (!storeAlign) - storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - unsigned loadAlign = LI->getAlignment(); - if (!loadAlign) - loadAlign = DL.getABITypeAlignment(LI->getType()); - bool changed = performCallSlotOptzn( LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - std::min(storeAlign, loadAlign), C); + findCommonAlignment(DL, SI, LI), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -551,13 +629,39 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. - if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) + auto *V = SI->getOperand(0); + if (Value *ByteVal = isBytewiseValue(V)) { if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } + // If we have an aggregate, we try to promote it to memset regardless + // of opportunity for merging as it can expose optimization opportunities + // in subsequent passes. + auto *T = V->getType(); + if (T->isAggregateType()) { + uint64_t Size = DL.getTypeStoreSize(T); + unsigned Align = SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(T); + IRBuilder<> Builder(SI); + auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, + Size, Align, SI->isVolatile()); + + DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); + + MD->removeInstruction(SI); + SI->eraseFromParent(); + NumMemSetInfer++; + + // Make sure we do not invalidate the iterator. + BBI = M->getIterator(); + return true; + } + } + return false; } @@ -567,14 +671,14 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; } -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, @@ -710,12 +814,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // unexpected manner, for example via a global, which we deduce from // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. 
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return false; // All the checks have passed, so do the transformation. @@ -749,11 +853,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Update AA metadata // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_invariant_group}; combineMetadata(C, cpy, KnownIDs); // Remove the memcpy. @@ -763,10 +865,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return true; } -/// processMemCpyMemCpyDependence - We've found that the (upward scanning) -/// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to -/// copy from MDep's input if we can. -/// +/// We've found that the (upward scanning) memory dependence of memcpy 'M' is +/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // We can only transforms memcpy's where the dest of one is the source of the // other. @@ -788,7 +888,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: @@ -802,8 +902,9 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = MD->getPointerDependencyFrom( - MemoryLocation::getForSource(MDep), false, M, M->getParent()); + MemDepResult SourceDep = + MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, + M->getIterator(), M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -860,8 +961,9 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return false; // Check that there are no other dependencies on the memset destination. - MemDepResult DstDepInfo = MD->getPointerDependencyFrom( - MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); + MemDepResult DstDepInfo = + MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false, + MemCpy->getIterator(), MemCpy->getParent()); if (DstDepInfo.getInst() != MemSet) return false; @@ -936,7 +1038,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// Perform simplification of memcpy's. 
If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy @@ -998,8 +1100,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { } MemoryLocation SrcLoc = MemoryLocation::getForSource(M); - MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, - M, M->getParent()); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( + SrcLoc, true, M->getIterator(), M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) @@ -1037,10 +1139,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; } -/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst -/// are guaranteed not to alias. +/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed +/// not to alias. bool MemCpyOpt::processMemMove(MemMoveInst *M) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); if (!TLI->has(LibFunc::memmove)) return false; @@ -1053,12 +1155,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); // If not, then we know we can transform this. - Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, - ArgTys)); + M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), + Intrinsic::memcpy, ArgTys)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. @@ -1068,7 +1169,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { return true; } -/// processByValArgument - This is called on every byval argument in call sites. +/// This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. @@ -1076,8 +1177,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom( - MemoryLocation(ByValArg, ByValSize), true, CS.getInstruction(), - CS.getInstruction()->getParent()); + MemoryLocation(ByValArg, ByValSize), true, + CS.getInstruction()->getIterator(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; @@ -1119,9 +1220,9 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - CS.getInstruction(), MDep->getParent()); + MemDepResult SourceDep = MD->getPointerDependencyFrom( + MemoryLocation::getForSource(MDep), false, + CS.getInstruction()->getIterator(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1140,7 +1241,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { return true; } -/// iterateOnFunction - Executes one iteration of MemCpyOpt. 
+/// Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; @@ -1148,7 +1249,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. - Instruction *I = BI++; + Instruction *I = &*BI++; bool RepeatInstruction = false; @@ -1177,9 +1278,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { return MadeChange; } -// MemCpyOpt::runOnFunction - This is the main transformation entry point for a -// function. -// +/// This is the main transformation entry point for a function. bool MemCpyOpt::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 643f374..c812d61 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -78,6 +78,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -91,6 +92,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mldst-motion" @@ -106,7 +108,7 @@ class MergedLoadStoreMotion : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit MergedLoadStoreMotion(void) + MergedLoadStoreMotion() : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); } @@ -116,10 +118,11 @@ public: private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); } // Helper routines @@ -156,7 +159,7 @@ private: }; char MergedLoadStoreMotion::ID = 0; -} +} // anonymous namespace /// /// \brief createMergedLoadStoreMotionPass - The public interface to this file. @@ -169,7 +172,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -236,12 +240,11 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. 
/// - bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { MemoryLocation Loc = MemoryLocation::get(LI); - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); } /// @@ -256,7 +259,7 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { - Instruction *Inst = BBI; + Instruction *Inst = &*BBI; // Only merge and hoist loads when their result in used only in BB if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) @@ -293,7 +296,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, // Intersect optional metadata. HoistCand->intersectOptionalDataWith(ElseInst); - HoistCand->dropUnknownMetadata(); + HoistCand->dropUnknownNonDebugMetadata(); // Prepend point for instruction insert Instruction *HoistPt = BB->getTerminator(); @@ -363,8 +366,7 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { int NLoads = 0; for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); BBI != BBE;) { - - Instruction *I = BBI; + Instruction *I = &*BBI; ++BBI; // Only move non-simple (atomic, volatile) loads. @@ -394,11 +396,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { /// value being stored or protect against the store from /// happening it is considered a sink barrier. /// - bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc) { - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); } /// @@ -438,23 +439,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. - PHINode *NewPN = 0; + PHINode *NewPN = nullptr; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); if (Opd1 != Opd2) { NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", - BB->begin()); + &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (NewPN->getType()->getScalarType()->isPointerTy()) { - // AA needs to be informed when a PHI-use of the pointer value is added - for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { - unsigned J = PHINode::getOperandNumForIncomingValue(I); - AA->addEscapingUse(NewPN->getOperandUse(J)); - } - if (MD) - MD->invalidateCachedPointerInfo(NewPN); - } + if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(NewPN); } return NewPN; } @@ -479,12 +473,12 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. S0->intersectOptionalDataWith(S1); - S0->dropUnknownMetadata(); + S0->dropUnknownNonDebugMetadata(); // Create the new store to be inserted at the join point. 
StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); - SNew->insertBefore(InsertPt); + SNew->insertBefore(&*InsertPt); ANew->insertBefore(SNew); assert(S0->getParent() == A0->getParent()); @@ -566,12 +560,13 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { } return MergedStores; } + /// /// \brief Run the transformation for each function /// bool MergedLoadStoreMotion::runOnFunction(Function &F) { MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); @@ -579,7 +574,7 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index f42f830..c8f885e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -71,8 +71,8 @@ // // Limitations and TODO items: // -// 1) We only considers n-ary adds for now. This should be extended and -// generalized. +// 1) We only considers n-ary adds and muls for now. This should be extended +// and generalized. // //===----------------------------------------------------------------------===// @@ -110,11 +110,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.setPreservesCFG(); @@ -145,12 +145,23 @@ private: unsigned I, Value *LHS, Value *RHS, Type *IndexedType); - // Reassociate Add for better CSE. - Instruction *tryReassociateAdd(BinaryOperator *I); - // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. - Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); - // Rewrites I to LHS + RHS if LHS is computed already. - Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + // Reassociate binary operators for better CSE. + Instruction *tryReassociateBinaryOp(BinaryOperator *I); + + // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly + // passed. + Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I); + // Rewrites I to (LHS op RHS) if LHS is computed already. + Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS, + BinaryOperator *I); + + // Tries to match Op1 and Op2 by using V. + bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2); + + // Gets SCEV for (LHS op RHS). + const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS); // Returns the closest dominator of \c Dominatee that computes // \c CandidateExpr. Returns null if not found. 
@@ -161,11 +172,6 @@ private: // GEP's pointer size, i.e., whether Index needs to be sign-extended in order // to be an index of GEP. bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); - // Returns whether V is known to be non-negative at context \c Ctxt. - bool isKnownNonNegative(Value *V, Instruction *Ctxt); - // Returns whether AO may sign overflow at context \c Ctxt. It computes a - // conservative result -- it answers true when not sure. - bool maySignOverflow(AddOperator *AO, Instruction *Ctxt); AssumptionCache *AC; const DataLayout *DL; @@ -182,7 +188,7 @@ private: // foo(a + b); // if (p2) // bar(a + b); - DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; + DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs; }; } // anonymous namespace @@ -191,7 +197,7 @@ INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", @@ -207,7 +213,7 @@ bool NaryReassociate::runOnFunction(Function &F) { AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -224,6 +230,7 @@ static bool isPotentiallyNaryReassociable(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::GetElementPtr: + case Instruction::Mul: return true; default: return false; @@ -239,19 +246,21 @@ bool NaryReassociate::doOneIteration(Function &F) { Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { BasicBlock *BB = Node->getBlock(); for (auto I = BB->begin(); I != BB->end(); ++I) { - if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { - const SCEV *OldSCEV = SE->getSCEV(I); - if (Instruction *NewI = tryReassociate(I)) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) { + const SCEV *OldSCEV = SE->getSCEV(&*I); + if (Instruction *NewI = tryReassociate(&*I)) { Changed = true; - SE->forgetValue(I); + SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - I = NewI; + // If SeenExprs constains I's WeakVH, that entry will be replaced with + // nullptr. + RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + I = NewI->getIterator(); } // Add the rewritten instruction to SeenExprs; the original instruction // is deleted. - const SCEV *NewSCEV = SE->getSCEV(I); - SeenExprs[NewSCEV].push_back(I); + const SCEV *NewSCEV = SE->getSCEV(&*I); + SeenExprs[NewSCEV].push_back(WeakVH(&*I)); // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) // is equivalent to I. However, ScalarEvolution::getSCEV may // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -271,7 +280,7 @@ bool NaryReassociate::doOneIteration(Function &F) { // // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll. 
if (NewSCEV != OldSCEV) - SeenExprs[OldSCEV].push_back(I); + SeenExprs[OldSCEV].push_back(WeakVH(&*I)); } } } @@ -281,7 +290,8 @@ bool NaryReassociate::doOneIteration(Function &F) { Instruction *NaryReassociate::tryReassociate(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: - return tryReassociateAdd(cast<BinaryOperator>(I)); + case Instruction::Mul: + return tryReassociateBinaryOp(cast<BinaryOperator>(I)); case Instruction::GetElementPtr: return tryReassociateGEP(cast<GetElementPtrInst>(I)); default: @@ -352,27 +362,6 @@ bool NaryReassociate::requiresSignExtension(Value *Index, return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits; } -bool NaryReassociate::isKnownNonNegative(Value *V, Instruction *Ctxt) { - bool NonNegative, Negative; - // TODO: ComputeSignBits is expensive. Consider caching the results. - ComputeSignBit(V, NonNegative, Negative, *DL, 0, AC, Ctxt, DT); - return NonNegative; -} - -bool NaryReassociate::maySignOverflow(AddOperator *AO, Instruction *Ctxt) { - if (AO->hasNoSignedWrap()) - return false; - - Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); - // If LHS or RHS has the same sign as the sum, AO doesn't sign overflow. - // TODO: handle the negative case as well. - if (isKnownNonNegative(AO, Ctxt) && - (isKnownNonNegative(LHS, Ctxt) || isKnownNonNegative(RHS, Ctxt))) - return false; - - return true; -} - GetElementPtrInst * NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, Type *IndexedType) { @@ -381,7 +370,7 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, IndexToSplit = SExt->getOperand(0); } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) { // zext can be treated as sext if the source is non-negative. - if (isKnownNonNegative(ZExt->getOperand(0), GEP)) + if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT)) IndexToSplit = ZExt->getOperand(0); } @@ -389,8 +378,11 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, // If the I-th index needs sext and the underlying add is not equipped with // nsw, we cannot split the add because // sext(LHS + RHS) != sext(LHS) + sext(RHS). - if (requiresSignExtension(IndexToSplit, GEP) && maySignOverflow(AO, GEP)) + if (requiresSignExtension(IndexToSplit, GEP) && + computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) != + OverflowResult::NeverOverflows) return nullptr; + Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); // IndexToSplit = LHS + RHS. if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) @@ -415,7 +407,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( IndexExprs.push_back(SE->getSCEV(*Index)); // Replace the I-th index with LHS. IndexExprs[I] = SE->getSCEV(LHS); - if (isKnownNonNegative(LHS, GEP) && + if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) && DL->getTypeSizeInBits(LHS->getType()) < DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) { // Zero-extend LHS if it is non-negative. 
InstCombine canonicalizes sext to @@ -429,19 +421,20 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()), IndexExprs, GEP->isInBounds()); - auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); + Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); if (Candidate == nullptr) return nullptr; - PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType()); - // Pretty rare but theoretically possible when a numeric value happens to - // share CandidateExpr. - if (TypeOfCandidate == nullptr) - return nullptr; + IRBuilder<> Builder(GEP); + // Candidate does not necessarily have the same pointer type as GEP. Use + // bitcast or pointer cast to make sure they have the same type, so that the + // later RAUW doesn't complain. + Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType()); + assert(Candidate->getType() == GEP->getType()); // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); - Type *ElementType = TypeOfCandidate->getElementType(); + Type *ElementType = GEP->getType()->getElementType(); uint64_t ElementSize = DL->getTypeAllocSize(ElementType); // Another less rare case: because I is not necessarily the last index of the // GEP, the size of the type at the I-th index (IndexedSize) is not @@ -461,8 +454,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return nullptr; // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); - IRBuilder<> Builder(GEP); - Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (RHS->getType() != IntPtrTy) RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy); if (IndexedSize != ElementSize) { @@ -476,54 +468,89 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return NewGEP; } -Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); - if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) + if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I)) return NewI; - if (auto *NewI = tryReassociateAdd(RHS, LHS, I)) + if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I)) return NewI; return nullptr; } -Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS, - Instruction *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I) { Value *A = nullptr, *B = nullptr; - // To be conservative, we reassociate I only when it is the only user of A+B. - if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) { - // I = (A + B) + RHS - // = (A + RHS) + B or (B + RHS) + A + // To be conservative, we reassociate I only when it is the only user of (A op + // B). 
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) { + // I = (A op B) op RHS + // = (A op RHS) op B or (B op RHS) op A const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B); const SCEV *RHSExpr = SE->getSCEV(RHS); if (BExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I)) return NewI; } if (AExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I)) return NewI; } } return nullptr; } -Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr, - Value *RHS, Instruction *I) { - auto Pos = SeenExprs.find(LHSExpr); - // Bail out if LHSExpr is not previously seen. - if (Pos == SeenExprs.end()) - return nullptr; - +Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr, + Value *RHS, + BinaryOperator *I) { // Look for the closest dominator LHS of I that computes LHSExpr, and replace - // I with LHS + RHS. + // I with LHS op RHS. auto *LHS = findClosestMatchingDominator(LHSExpr, I); if (LHS == nullptr) return nullptr; - Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + Instruction *NewI = nullptr; + switch (I->getOpcode()) { + case Instruction::Add: + NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + break; + case Instruction::Mul: + NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); + break; + default: + llvm_unreachable("Unexpected instruction."); + } NewI->takeName(I); return NewI; } +bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, + Value *&Op2) { + switch (I->getOpcode()) { + case Instruction::Add: + return match(V, m_Add(m_Value(Op1), m_Value(Op2))); + case Instruction::Mul: + return match(V, m_Mul(m_Value(Op1), m_Value(Op2))); + default: + llvm_unreachable("Unexpected instruction."); + } + return false; +} + +const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS) { + switch (I->getOpcode()) { + case Instruction::Add: + return SE->getAddExpr(LHS, RHS); + case Instruction::Mul: + return SE->getMulExpr(LHS, RHS); + default: + llvm_unreachable("Unexpected instruction."); + } + return nullptr; +} + Instruction * NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, Instruction *Dominatee) { @@ -537,9 +564,13 @@ NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). while (!Candidates.empty()) { - Instruction *Candidate = Candidates.back(); - if (DT->dominates(Candidate, Dominatee)) - return Candidate; + // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed + // during rewriting. 
+ if (Value *Candidate = Candidates.back()) { + Instruction *CandidateInstruction = cast<Instruction>(Candidate); + if (DT->dominates(CandidateInstruction, Dominatee)) + return CandidateInstruction; + } Candidates.pop_back(); } return nullptr; diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 31d7df3..9f26f78 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -154,7 +154,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, Phi->addIncoming(Call, &CurrBB); Phi->addIncoming(LibCall, LibCallBB); - BB = JoinBB; + BB = JoinBB->getIterator(); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 366301a..b56b355 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -27,7 +27,7 @@ // well defined state for inspection by the collector. In the current // implementation, this is done via the insertion of poll sites at method entry // and the backedge of most loops. We try to avoid inserting more polls than -// are neccessary to ensure a finite period between poll sites. This is not +// are necessary to ensure a finite period between poll sites. This is not // because the poll itself is expensive in the generated code; it's not. Polls // do tend to impact the optimizer itself in negative ways; we'd like to avoid // perturbing the optimization of the method as much as we can. @@ -91,13 +91,15 @@ STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); using namespace llvm; -// Ignore oppurtunities to avoid placing safepoints on backedges, useful for +// Ignore opportunities to avoid placing safepoints on backedges, useful for // validation static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden, cl::init(false)); -/// If true, do not place backedge safepoints in counted loops. -static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); +/// How narrow does the trip count of a loop have to be to have to be considered +/// "counted"? Counted loops do not get safepoints at backedges. +static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width", + cl::Hidden, cl::init(32)); // If true, split the backedge of a loop when placing the safepoint, otherwise // split the latch block itself. Both are useful to support for @@ -121,7 +123,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { std::vector<TerminatorInst *> PollLocations; /// True unless we're running spp-no-calls in which case we need to disable - /// the call dependend placement opts. + /// the call-dependent placement opts. 
bool CallSafepointsEnabled; ScalarEvolution *SE = nullptr; @@ -142,7 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (auto I = LI->begin(), E = LI->end(); I != E; I++) { @@ -153,7 +155,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. @@ -190,10 +192,8 @@ static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/); -static bool isGCLeafFunction(const CallSite &CS); - static bool needsStatepoint(const CallSite &CS) { - if (isGCLeafFunction(CS)) + if (callsGCLeafFunction(CS)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -206,7 +206,7 @@ static bool needsStatepoint(const CallSite &CS) { return true; } -static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); +static Value *ReplaceWithStatepoint(const CallSite &CS); /// Returns true if this loop is known to contain a call safepoint which /// must unconditionally execute on any iteration of the loop which returns @@ -220,7 +220,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // For the moment, we look only for the 'cuts' that consist of a single call // instruction in a block which is dominated by the Header and dominates the // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain - // of such dominating blocks gets substaintially more occurences than just + // of such dominating blocks gets substantially more occurrences than just // checking the Pred and Header blocks themselves. This may be due to the // density of loop exit conditions caused by range and null checks. // TODO: structure this as an analysis pass, cache the result for subloops, @@ -255,18 +255,12 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, /// conservatism in the analysis. static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { - // Only used when SkipCounted is off - const unsigned upperTripBound = 8192; - // A conservative bound on the loop as a whole. const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); - if (MaxTrips != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) - return true; - } + if (MaxTrips != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( + CountedLoopTripWidth)) + return true; // If this is a conditional branch to the header with the alternate path // being outside the loop, we can ask questions about the execution frequency @@ -275,13 +269,10 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, // This returns an exact expression only. TODO: We really only need an // upper bound here, but SE doesn't expose that. 
const SCEV *MaxExec = SE->getExitCount(L, Pred); - if (MaxExec != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + if (MaxExec != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN( + CountedLoopTripWidth)) return true; - } } return /* not finite */ false; @@ -432,14 +423,14 @@ static Instruction *findLocationForEntrySafepoint(Function &F, assert(hasNextInstruction(I) && "first check if there is a next instruction!"); if (I->isTerminator()) { - return I->getParent()->getUniqueSuccessor()->begin(); + return &I->getParent()->getUniqueSuccessor()->front(); } else { - return std::next(BasicBlock::iterator(I)); + return &*++I->getIterator(); } }; Instruction *cursor = nullptr; - for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor); cursor = nextInstruction(cursor)) { // We need to ensure a safepoint poll occurs before any 'real' call. The @@ -466,7 +457,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F, static void findCallSafepoints(Function &F, std::vector<CallSite> &Found /*rval*/) { assert(Found.empty() && "must be empty!"); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { Instruction *inst = &I; if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { CallSite CS(inst); @@ -508,7 +499,7 @@ static bool isGCSafepointPoll(Function &F) { static bool shouldRewriteFunction(Function &F) { // TODO: This should check the GCStrategy if (F.hasGC()) { - const char *FunctionGCName = F.getGC(); + const auto &FunctionGCName = F.getGC(); const StringRef StatepointExampleName("statepoint-example"); const StringRef CoreCLRName("coreclr"); return (StatepointExampleName == FunctionGCName) || @@ -713,7 +704,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { Invoke->getParent()); } - Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Value *GCResult = ReplaceWithStatepoint(CS); Results.push_back(GCResult); } assert(Results.size() == ParsePointNeeded.size()); @@ -747,7 +738,7 @@ FunctionPass *llvm::createPlaceSafepointsPass() { INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, "place-backedge-safepoints-impl", "Place Backedge Safepoints", false, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, @@ -759,31 +750,6 @@ INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", false, false) -static bool isGCLeafFunction(const CallSite &CS) { - Instruction *inst = CS.getInstruction(); - if (isa<IntrinsicInst>(inst)) { - // Most LLVM intrinsics are things which can never take a safepoint. - // As a result, we don't need to have the stack parsable at the - // callsite. This is a highly useful optimization since intrinsic - // calls are fairly prevelent, particularly in debug builds. - return true; - } - - // If this function is marked explicitly as a leaf call, we don't need to - // place a safepoint of it. In fact, for correctness we *can't* in many - // cases. 
Note: Indirect calls return Null for the called function, - // these obviously aren't runtime functions with attributes - // TODO: Support attributes on the call site as well. - const Function *F = CS.getCalledFunction(); - bool isLeaf = - F && - F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); - if (isLeaf) { - return true; - } - return false; -} - static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/) { @@ -796,6 +762,7 @@ InsertSafepointPoll(Instruction *InsertBefore, // path call - where we need to insert a safepoint (parsepoint). auto *F = M->getFunction(GCSafepointPollName); + assert(F && "gc.safepoint_poll function is missing"); assert(F->getType()->getElementType() == FunctionType::get(Type::getVoidTy(M->getContext()), false) && "gc.safepoint_poll declared with wrong type"); @@ -864,10 +831,8 @@ InsertSafepointPoll(Instruction *InsertBefore, /// Replaces the given call site (Call or Invoke) with a gc.statepoint /// intrinsic with an empty deoptimization arguments list. This does /// NOT do explicit relocation for GC support. -static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ - Pass *P) { - assert(CS.getInstruction()->getParent()->getParent()->getParent() && - "must be set"); +static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { + assert(CS.getInstruction()->getModule() && "must be set"); // TODO: technically, a pass is not allowed to get functions from within a // function pass since it might trigger a new function addition. Refactor @@ -917,15 +882,10 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ CS.getInstruction()->getContext(), AttributeSet::FunctionIndex, AttrsToRemove); - Value *StatepointTarget = NumPatchBytes == 0 - ? CS.getCalledValue() - : ConstantPointerNull::get(cast<PointerType>( - CS.getCalledValue()->getType())); - if (CS.isCall()) { CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( - ID, NumPatchBytes, StatepointTarget, + ID, NumPatchBytes, CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); Call->setTailCall(ToReplace->isTailCall()); @@ -938,7 +898,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ Token = Call; - // Put the following gc_result and gc_relocate calls immediately after the + // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete). assert(ToReplace->getNextNode() && "not a terminator, must have next"); Builder.SetInsertPoint(ToReplace->getNextNode()); @@ -951,7 +911,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // original block. Builder.SetInsertPoint(ToReplace->getParent()); InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( - ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(), + ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(), ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); @@ -967,7 +927,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // We'll insert the gc.result into the normal block BasicBlock *NormalDest = ToReplace->getNormalDest(); // Can not insert gc.result in case of phi nodes preset. 
- // Should have removed this cases prior to runnning this function + // Should have removed this cases prior to running this function assert(!isa<PHINode>(NormalDest->begin())); Instruction *IP = &*(NormalDest->getFirstInsertionPt()); Builder.SetInsertPoint(IP); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index d1acf78..bcadd4e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -62,7 +64,7 @@ namespace { /// Print out the expression identified in the Ops list. /// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { - Module *M = I->getParent()->getParent()->getParent(); + Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { @@ -82,20 +84,6 @@ namespace { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} - /// \brief Sort factors by their Base. - struct BaseSorter { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base < RHS.Base; - } - }; - - /// \brief Compare factors for equal bases. - struct BaseEqual { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base == RHS.Base; - } - }; - /// \brief Sort factors in descending order by their power. struct PowerDescendingSorter { bool operator()(const Factor &LHS, const Factor &RHS) { @@ -172,6 +160,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: void BuildRankMap(Function &F); @@ -194,6 +183,8 @@ namespace { Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); void EraseInst(Instruction *I); + void RecursivelyEraseDeadInsts(Instruction *I, + SetVector<AssertingVH<Instruction>> &Insts); void OptimizeInst(Instruction *I); Instruction *canonicalizeNegConstExpr(Instruction *I); }; @@ -255,27 +246,6 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, return nullptr; } -static bool isUnmovableInstruction(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::PHI: - case Instruction::LandingPad: - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Invoke: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - return true; - case Instruction::Call: - return !isa<DbgInfoIntrinsic>(I); - default: - return false; - } -} - void Reassociate::BuildRankMap(Function &F) { unsigned i = 2; @@ -295,7 +265,7 @@ void Reassociate::BuildRankMap(Function &F) { // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (isUnmovableInstruction(I)) + if (mayBeMemoryDependent(*I)) ValueRankMap[&*I] = ++BBRank; } } @@ -913,7 +883,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// that computes the negative version of the value specified. 
The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. -static Value *NegateValue(Value *V, Instruction *BI) { +/// Also add intermediate instructions to the redo list that are modified while +/// pushing the negates through adds. These will be revisited to see if +/// additional opportunities have been exposed. +static Value *NegateValue(Value *V, Instruction *BI, + SetVector<AssertingVH<Instruction>> &ToRedo) { if (Constant *C = dyn_cast<Constant>(V)) { if (C->getType()->isFPOrFPVectorTy()) { return ConstantExpr::getFNeg(C); @@ -934,8 +908,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (BinaryOperator *I = isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); + I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); + I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo)); if (I->getOpcode() == Instruction::Add) { I->setHasNoUnsignedWrap(false); I->setHasNoSignedWrap(false); @@ -948,6 +922,10 @@ static Value *NegateValue(Value *V, Instruction *BI) { // I->moveBefore(BI); I->setName(I->getName()+".neg"); + + // Add the intermediate negates to the redo list as processing them later + // could expose more reassociating opportunities. + ToRedo.insert(I); return I; } @@ -972,26 +950,28 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { InsertPt = II->getNormalDest()->begin(); } else { - InsertPt = InstInput; - ++InsertPt; + InsertPt = ++InstInput->getIterator(); } while (isa<PHINode>(InsertPt)) ++InsertPt; } else { InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); } - TheNeg->moveBefore(InsertPt); + TheNeg->moveBefore(&*InsertPt); if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); } else { TheNeg->andIRFlags(BI); } + ToRedo.insert(TheNeg); return TheNeg; } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - return CreateNeg(V, V->getName() + ".neg", BI, BI); + BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); + ToRedo.insert(NewNeg); + return NewNeg; } /// Return true if we should break up this subtract of X-Y into (X + -Y). @@ -1025,14 +1005,15 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. // - Value *NegVal = NegateValue(Sub->getOperand(1), Sub); + Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. 
@@ -1166,7 +1147,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; } - BasicBlock::iterator InsertPt = BO; ++InsertPt; + BasicBlock::iterator InsertPt = ++BO->getIterator(); // If this was just a single multiply, remove the multiply and return the only // remaining operand. @@ -1179,7 +1160,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } if (NeedsNegate) - V = CreateNeg(V, "neg", InsertPt, BO); + V = CreateNeg(V, "neg", &*InsertPt, BO); return V; } @@ -1250,7 +1231,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, return nullptr; } -/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// Helper function of CombineXorOpnd(). It creates a bitwise-and /// instruction with the given two operands, and return the resulting /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will @@ -1947,6 +1928,22 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, return nullptr; } +// Remove dead instructions and if any operands are trivially dead add them to +// Insts so they will be removed as well. +void Reassociate::RecursivelyEraseDeadInsts( + Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end()); + ValueRankMap.erase(I); + Insts.remove(I); + RedoInsts.remove(I); + I->eraseFromParent(); + for (auto Op : Ops) + if (Instruction *OpInst = dyn_cast<Instruction>(Op)) + if (OpInst->use_empty()) + Insts.insert(OpInst); +} + /// Zap the given instruction, adding interesting operands to the work list. void Reassociate::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); @@ -2083,7 +2080,7 @@ void Reassociate::OptimizeInst(Instruction *I) { return; // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) + if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; // Do not reassociate boolean (i1) expressions. We want to preserve the @@ -2099,7 +2096,7 @@ void Reassociate::OptimizeInst(Instruction *I) { // see if we can convert it to X+-Y. if (I->getOpcode() == Instruction::Sub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2110,6 +2107,12 @@ void Reassociate::OptimizeInst(Instruction *I) { (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); + // If the negate was simplified, revisit the users to see if we can + // reassociate further. 
+ for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2117,7 +2120,7 @@ void Reassociate::OptimizeInst(Instruction *I) { } } else if (I->getOpcode() == Instruction::FSub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2127,7 +2130,13 @@ void Reassociate::OptimizeInst(Instruction *I) { if (isReassociableOp(I->getOperand(1), Instruction::FMul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::FMul))) { + // If the negate was simplified, revisit the users to see if we can + // reassociate further. Instruction *NI = LowerNegateToMultiply(I); + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2142,8 +2151,15 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { + // During the initial run we will get to the root of the tree. + // But if we get here while we are redoing instructions, there is no + // guarantee that the root will be visited. So Redo later + if (BO->user_back() != BO && + BO->getParent() == BO->user_back()->getParent()) + RedoInsts.insert(BO->user_back()); return; + } // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. @@ -2250,15 +2266,29 @@ bool Reassociate::runOnFunction(Function &F) { for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Optimize every instruction in the basic block. for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) - if (isInstructionTriviallyDead(II)) { - EraseInst(II++); + if (isInstructionTriviallyDead(&*II)) { + EraseInst(&*II++); } else { - OptimizeInst(II); + OptimizeInst(&*II); assert(II->getParent() == BI && "Moved to a different block!"); ++II; } - // If this produced extra instructions to optimize, handle them now. + // Make a copy of all the instructions to be redone so we can remove dead + // instructions. + SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts); + // Iterate over all instructions to be reevaluated and remove trivially dead + // instructions. If any operand of the trivially dead instruction becomes + // dead mark it for deletion as well. Continue this process until all + // trivially dead instructions have been removed. + while (!ToRedo.empty()) { + Instruction *I = ToRedo.pop_back_val(); + if (isInstructionTriviallyDead(I)) + RecursivelyEraseDeadInsts(I, ToRedo); + } + + // Now that we have removed dead instructions, we can reoptimize the + // remaining instructions. 
while (!RedoInsts.empty()) { Instruction *I = RedoInsts.pop_back_val(); if (isInstructionTriviallyDead(I)) diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 1b46727..915f897 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -82,10 +82,9 @@ bool RegToMem::runOnFunction(Function &F) { BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - CastInst *AllocaInsertionPoint = - new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), - Type::getInt32Ty(F.getContext()), - "reg2mem alloca point", I); + CastInst *AllocaInsertionPoint = new BitCastInst( + Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I); // Find the escaped instructions. But don't create stack slots for // allocas in entry block. @@ -95,7 +94,7 @@ bool RegToMem::runOnFunction(Function &F) { for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); iib != iie; ++iib) { if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && - valueEscapes(iib)) { + valueEscapes(&*iib)) { WorkList.push_front(&*iib); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ae2ae3a..d77d574 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,12 +14,14 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -46,10 +48,6 @@ using namespace llvm; -// Print tracing output -static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, - cl::init(false)); - // Print the liveset found at the insert location static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); @@ -74,6 +72,19 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); +static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, + cl::init(false)); +static cl::opt<bool> + AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", + cl::Hidden, cl::init(true)); + +/// Should we split vectors of pointers into their individual elements? This +/// is known to be buggy, but the alternate implementation isn't yet ready. +/// This is purely to provide a debugging and dianostic hook until the vector +/// split is replaced with vector relocations. +static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden, + cl::init(true)); + namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -88,10 +99,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn + // stripNonValidAttributes asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. 
Since at least // one function changed, we know that the precondition is satisfied. - stripDereferenceabilityInfo(M); + stripNonValidAttributes(M); } return Changed; @@ -108,15 +119,16 @@ struct RewriteStatepointsForGC : public ModulePass { /// dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripDereferenceabilityInfo (conservatively) restores correctness + /// heap. stripNonValidAttributes (conservatively) restores correctness /// by erasing all attributes in the module that externally imply /// dereferenceability. - /// - void stripDereferenceabilityInfo(Module &M); + /// Similar reasoning also applies to the noalias attributes. gc.statepoint + /// can touch the entire heap including noalias objects. + void stripNonValidAttributes(Module &M); - // Helpers for stripDereferenceabilityInfo - void stripDereferenceabilityInfoFromBody(Function &F); - void stripDereferenceabilityInfoFromPrototype(Function &F); + // Helpers for stripNonValidAttributes + void stripNonValidAttributesFromBody(Function &F); + void stripNonValidAttributesFromPrototype(Function &F); }; } // namespace @@ -160,15 +172,16 @@ struct GCPtrLivenessData { // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type typedef DenseMap<Value *, Value *> DefiningValueMapTy; -typedef DenseSet<llvm::Value *> StatepointLiveSetTy; -typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy; +typedef DenseSet<Value *> StatepointLiveSetTy; +typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>> + RematerializedValueMapTy; struct PartiallyConstructedSafepointRecord { - /// The set of values known to be live accross this safepoint - StatepointLiveSetTy liveset; + /// The set of values known to be live across this safepoint + StatepointLiveSetTy LiveSet; /// Mapping from live pointers to a base-defining-value - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; + DenseMap<Value *, Value *> PointerToBase; /// The *new* gc.statepoint instruction itself. This produces the token /// that normal path gc.relocates and the gc.result are tied to. @@ -179,12 +192,26 @@ struct PartiallyConstructedSafepointRecord { Instruction *UnwindToken; /// Record live values we are rematerialized instead of relocating. - /// They are not included into 'liveset' field. + /// They are not included into 'LiveSet' field. /// Maps rematerialized copy to it's original value. 
RematerializedValueMapTy RematerializedValues; }; } +static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { + assert(UseDeoptBundles && "Should not be called otherwise!"); + + Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + + if (!DeoptBundle.hasValue()) { + assert(AllowStatepointWithNoDeoptInfo && + "Found non-leaf call without deopt info!"); + return None; + } + + return DeoptBundle.getValue().Inputs; +} + /// Compute the live-in set for every basic block in the function static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data); @@ -195,10 +222,10 @@ static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, StatepointLiveSetTy &out); // TODO: Once we can get to the GCStrategy, this becomes -// Optional<bool> isGCManagedPointer(const Value *V) const override { +// Optional<bool> isGCManagedPointer(const Type *Ty) const override { -static bool isGCPointerType(const Type *T) { - if (const PointerType *PT = dyn_cast<PointerType>(T)) +static bool isGCPointerType(Type *T) { + if (auto *PT = dyn_cast<PointerType>(T)) // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. @@ -233,9 +260,8 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return std::any_of( - ST->subtypes().begin(), ST->subtypes().end(), - [](Type *SubType) { return containsGCPtrType(SubType); }); + return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), + containsGCPtrType); return false; } @@ -247,7 +273,7 @@ static bool isUnhandledGCPointerType(Type *Ty) { } #endif -static bool order_by_name(llvm::Value *a, llvm::Value *b) { +static bool order_by_name(Value *a, Value *b) { if (a->hasName() && b->hasName()) { return -1 == a->getName().compare(b->getName()); } else if (a->hasName() && !b->hasName()) { @@ -260,6 +286,13 @@ static bool order_by_name(llvm::Value *a, llvm::Value *b) { } } +// Return the name of the value suffixed with the provided value, or if the +// value didn't have a name, the default value specified. +static std::string suffixed_name_or(Value *V, StringRef Suffix, + StringRef DefaultName) { + return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); +} + // Conservatively identifies any definitions which might be live at the // given instruction. The analysis is performed immediately before the // given instruction. 
Values defined by that instruction are not considered @@ -269,30 +302,56 @@ static void analyzeParsePointLiveness( const CallSite &CS, PartiallyConstructedSafepointRecord &result) { Instruction *inst = CS.getInstruction(); - StatepointLiveSetTy liveset; - findLiveSetAtInst(inst, OriginalLivenessData, liveset); + StatepointLiveSetTy LiveSet; + findLiveSetAtInst(inst, OriginalLivenessData, LiveSet); if (PrintLiveSet) { // Note: This output is used by several of the test cases - // The order of elemtns in a set is not stable, put them in a vec and sort + // The order of elements in a set is not stable, put them in a vec and sort // by name - SmallVector<Value *, 64> temp; - temp.insert(temp.end(), liveset.begin(), liveset.end()); - std::sort(temp.begin(), temp.end(), order_by_name); + SmallVector<Value *, 64> Temp; + Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); + std::sort(Temp.begin(), Temp.end(), order_by_name); errs() << "Live Variables:\n"; - for (Value *V : temp) { - errs() << " " << V->getName(); // no newline - V->dump(); - } + for (Value *V : Temp) + dbgs() << " " << V->getName() << " " << *V << "\n"; } if (PrintLiveSetSize) { errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; - errs() << "Number live values: " << liveset.size() << "\n"; + errs() << "Number live values: " << LiveSet.size() << "\n"; + } + result.LiveSet = LiveSet; +} + +static bool isKnownBaseResult(Value *V); +namespace { +/// A single base defining value - An immediate base defining value for an +/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. +/// For instructions which have multiple pointer [vector] inputs or that +/// transition between vector and scalar types, there is no immediate base +/// defining value. The 'base defining value' for 'Def' is the transitive +/// closure of this relation stopping at the first instruction which has no +/// immediate base defining value. The b.d.v. might itself be a base pointer, +/// but it can also be an arbitrary derived pointer. +struct BaseDefiningValueResult { + /// Contains the value which is the base defining value. + Value * const BDV; + /// True if the base defining value is also known to be an actual base + /// pointer. + const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) + : BDV(BDV), IsKnownBase(IsKnownBase) { +#ifndef NDEBUG + // Check consistency between new and old means of checking whether a BDV is + // a base. + bool MustBeBase = isKnownBaseResult(BDV); + assert(!MustBeBase || MustBeBase == IsKnownBase); +#endif } - result.liveset = liveset; +}; } -static Value *findBaseDefiningValue(Value *I); +static BaseDefiningValueResult findBaseDefiningValue(Value *I); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector @@ -303,61 +362,27 @@ static Value *findBaseDefiningValue(Value *I); /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the later, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'. 
-static std::pair<Value *, bool> -findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { - assert(I->getType()->isVectorTy() && - cast<VectorType>(I->getType())->getElementType()->isPointerTy() && - "Illegal to ask for the base pointer of a non-pointer type"); - +static BaseDefiningValueResult +findBaseDefiningValueOfVector(Value *I) { // Each case parallels findBaseDefiningValue below, see that code for // detailed motivation. if (isa<Argument>(I)) // An incoming argument to the function is a base pointer - return std::make_pair(I, true); - - // We shouldn't see the address of a global as a vector value? - assert(!isa<GlobalVariable>(I) && - "unexpected global variable found in base of vector"); - - // inlining could possibly introduce phi node that contains - // undef if callee has multiple returns - if (isa<UndefValue>(I)) - // utterly meaningless, but useful for dealing with partially optimized - // code. - return std::make_pair(I, true); - - // Due to inheritance, this must be _after_ the global variable and undef - // checks - if (Constant *Con = dyn_cast<Constant>(I)) { - assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && - "order of checks wrong!"); - assert(Con->isNullValue() && "null is the only case which makes sense"); - return std::make_pair(Con, true); - } - + return BaseDefiningValueResult(I, true); + + if (isa<Constant>(I)) + // Constant vectors consist only of constant pointers. + return BaseDefiningValueResult(I, true); + if (isa<LoadInst>(I)) - return std::make_pair(I, true); - - // For an insert element, we might be able to look through it if we know - // something about the indexes. - if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) { - if (Index) { - Value *InsertIndex = IEI->getOperand(2); - // This index is inserting the value, look for its BDV - if (InsertIndex == Index) - return std::make_pair(findBaseDefiningValue(IEI->getOperand(1)), false); - // Both constant, and can't be equal per above. This insert is definitely - // not relevant, look back at the rest of the vector and keep trying. - if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex)) - return findBaseDefiningValueOfVector(IEI->getOperand(0), Index); - } - + return BaseDefiningValueResult(I, true); + + if (isa<InsertElementInst>(I)) // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return std::make_pair(IEI, false); - } + return BaseDefiningValueResult(I, false); if (isa<ShuffleVectorInst>(I)) // We don't know whether this vector contains entirely base pointers or @@ -365,105 +390,47 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { // duplicate code as needed to construct a parallel vector of bases. // TODO: There a number of local optimizations which could be applied here // for particular sufflevector patterns. - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. 
assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "unknown vector instruction - no base found for vector element"); - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); } -static bool isKnownBaseResult(Value *V); - /// Helper function for findBasePointer - Will return a value which either a) -/// defines the base pointer for the input or b) blocks the simple search -/// (i.e. a PHI or Select of two derived pointers) -static Value *findBaseDefiningValue(Value *I) { - if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I).first; - - assert(I->getType()->isPointerTy() && +/// defines the base pointer for the input, b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers), or c) involves a change +/// from pointer to vector type or back. +static BaseDefiningValueResult findBaseDefiningValue(Value *I) { + assert(I->getType()->isPtrOrPtrVectorTy() && "Illegal to ask for the base pointer of a non-pointer type"); - // This case is a bit of a hack - it only handles extracts from vectors which - // trivially contain only base pointers or cases where we can directly match - // the index of the original extract element to an insertion into the vector. - // See note inside the function for how to improve this. - if (auto *EEI = dyn_cast<ExtractElementInst>(I)) { - Value *VectorOperand = EEI->getVectorOperand(); - Value *Index = EEI->getIndexOperand(); - std::pair<Value *, bool> pair = - findBaseDefiningValueOfVector(VectorOperand, Index); - Value *VectorBase = pair.first; - if (VectorBase->getType()->isPointerTy()) - // We found a BDV for this specific element with the vector. This is an - // optimization, but in practice it covers most of the useful cases - // created via scalarization. - return VectorBase; - else { - assert(VectorBase->getType()->isVectorTy()); - if (pair.second) - // If the entire vector returned is known to be entirely base pointers, - // then the extractelement is valid base for this value. - return EEI; - else { - // Otherwise, we have an instruction which potentially produces a - // derived pointer and we need findBasePointers to clone code for us - // such that we can create an instruction which produces the - // accompanying base pointer. - // Note: This code is currently rather incomplete. We don't currently - // support the general form of shufflevector of insertelement. - // Conceptually, these are just 'base defining values' of the same - // variety as phi or select instructions. We need to update the - // findBasePointers algorithm to insert new 'base-only' versions of the - // original instructions. This is relative straight forward to do, but - // the case which would motivate the work hasn't shown up in real - // workloads yet. - assert((isa<PHINode>(VectorBase) || isa<SelectInst>(VectorBase)) && - "need to extend findBasePointers for generic vector" - "instruction cases"); - return VectorBase; - } - } - } + if (I->getType()->isVectorTy()) + return findBaseDefiningValueOfVector(I); if (isa<Argument>(I)) // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't an gc value - return I; - - if (isa<GlobalVariable>(I)) - // base case - return I; - - // inlining could possibly introduce phi node that contains - // undef if callee has multiple returns - if (isa<UndefValue>(I)) - // utterly meaningless, but useful for dealing with - // partially optimized code. 
- return I; - - // Due to inheritance, this must be _after_ the global variable and undef - // checks - if (Constant *Con = dyn_cast<Constant>(I)) { - assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && - "order of checks wrong!"); - // Note: Finding a constant base for something marked for relocation - // doesn't really make sense. The most likely case is either a) some - // screwed up the address space usage or b) your validating against - // compiled C++ code w/o the proper separation. The only real exception - // is a null pointer. You could have generic code written to index of - // off a potentially null value and have proven it null. We also use - // null pointers in dead paths of relocation phis (which we might later - // want to find a base pointer for). - assert(isa<ConstantPointerNull>(Con) && - "null is the only case which makes sense"); - return Con; - } + return BaseDefiningValueResult(I, true); + + if (isa<Constant>(I)) + // We assume that objects with a constant base (e.g. a global) can't move + // and don't need to be reported to the collector because they are always + // live. All constants have constant bases. Besides global references, all + // kinds of constants (e.g. undef, constant expressions, null pointers) can + // be introduced by the inliner or the optimizer, especially on dynamically + // dead paths. See e.g. test4 in constants.ll. + return BaseDefiningValueResult(I, true); if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Def = CI->stripPointerCasts(); + // If stripping pointer casts changes the address space there is an + // addrspacecast in between. + assert(cast<PointerType>(Def->getType())->getAddressSpace() == + cast<PointerType>(CI->getType())->getAddressSpace() && + "unsupported addrspacecast"); // If we find a cast instruction here, it means we've found a cast which is // not simply a pointer cast (i.e. an inttoptr). We don't know how to // handle int->ptr conversion. @@ -472,7 +439,9 @@ static Value *findBaseDefiningValue(Value *I) { } if (isa<LoadInst>(I)) - return I; // The value loaded is an gc base itself + // The value loaded is an gc base itself + return BaseDefiningValueResult(I, true); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) // The base of this GEP is the base @@ -480,14 +449,11 @@ static Value *findBaseDefiningValue(Value *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_gc_result_ptr: default: // fall through to general call handling break; case Intrinsic::experimental_gc_statepoint: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_int: - llvm_unreachable("these don't produce pointers"); + llvm_unreachable("statepoints don't produce pointers"); case Intrinsic::experimental_gc_relocate: { // Rerunning safepoint insertion after safepoints are already // inserted is not supported. It could probably be made to work, @@ -506,17 +472,17 @@ static Value *findBaseDefiningValue(Value *I) { // pointers. This should probably be generalized via attributes to support // both source language and internal functions. if (isa<CallInst>(I) || isa<InvokeInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // I have absolutely no idea how to implement this part yet. It's not - // neccessarily hard, I just haven't really looked at it yet. + // necessarily hard, I just haven't really looked at it yet. 
assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented"); if (isa<AtomicCmpXchgInst>(I)) // A CAS is effectively a atomic store and load combined under a // predicate. From the perspective of base pointers, we just treat it // like a load. - return I; + return BaseDefiningValueResult(I, true); assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are " "binary ops which don't apply to pointers"); @@ -525,34 +491,41 @@ static Value *findBaseDefiningValue(Value *I) { // stack, but in either case, this is simply a field load. As a result, // this is a defining definition of the base just like a load is. if (isa<ExtractValueInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // We should never see an insert vector since that would require we be // tracing back a struct value not a pointer value. assert(!isa<InsertValueInst>(I) && "Base pointer for a struct is meaningless"); + // An extractelement produces a base result exactly when it's input does. + // We may need to insert a parallel instruction to extract the appropriate + // element out of the base vector corresponding to the input. Given this, + // it's analogous to the phi and select case even though it's not a merge. + if (isa<ExtractElementInst>(I)) + // Note: There a lot of obvious peephole cases here. This are deliberately + // handled after the main base pointer inference algorithm to make writing + // test cases to exercise that code easier. + return BaseDefiningValueResult(I, false); + // The last two cases here don't return a base pointer. Instead, they - // return a value which dynamically selects from amoung several base + // return a value which dynamically selects from among several base // derived pointers (each with it's own base potentially). It's the job of // the caller to resolve these. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "missing instruction case in findBaseDefiningValing"); - return I; + return BaseDefiningValueResult(I, false); } /// Returns the base defining value for this value. static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { Value *&Cached = Cache[I]; if (!Cached) { - Cached = findBaseDefiningValue(I); + Cached = findBaseDefiningValue(I).BDV; + DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " + << Cached->getName() << "\n"); } assert(Cache[I] != nullptr); - - if (TraceLSP) { - dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName() - << "\n"; - } return Cached; } @@ -572,7 +545,9 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { /// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, /// is it known to be a base pointer? Or do we need to continue searching. static bool isKnownBaseResult(Value *V) { - if (!isa<PHINode>(V) && !isa<SelectInst>(V)) { + if (!isa<PHINode>(V) && !isa<SelectInst>(V) && + !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) && + !isa<ShuffleVectorInst>(V)) { // no recursion possible return true; } @@ -587,17 +562,19 @@ static bool isKnownBaseResult(Value *V) { return false; } -// TODO: find a better name for this namespace { -class PhiState { +/// Models the state of a single base defining value in the findBasePointer +/// algorithm for determining where a new instruction is needed to propagate +/// the base of this BDV. 
+class BDVState { public: enum Status { Unknown, Base, Conflict }; - PhiState(Status s, Value *b = nullptr) : status(s), base(b) { + BDVState(Status s, Value *b = nullptr) : status(s), base(b) { assert(status != Base || b); } - PhiState(Value *b) : status(Base), base(b) {} - PhiState() : status(Unknown), base(nullptr) {} + explicit BDVState(Value *b) : status(Base), base(b) {} + BDVState() : status(Unknown), base(nullptr) {} Status getStatus() const { return status; } Value *getBase() const { return base; } @@ -606,72 +583,80 @@ public: bool isUnknown() const { return getStatus() == Unknown; } bool isConflict() const { return getStatus() == Conflict; } - bool operator==(const PhiState &other) const { + bool operator==(const BDVState &other) const { return base == other.base && status == other.status; } - bool operator!=(const PhiState &other) const { return !(*this == other); } + bool operator!=(const BDVState &other) const { return !(*this == other); } - void dump() { - errs() << status << " (" << base << " - " - << (base ? base->getName() : "nullptr") << "): "; + LLVM_DUMP_METHOD + void dump() const { print(dbgs()); dbgs() << '\n'; } + + void print(raw_ostream &OS) const { + switch (status) { + case Unknown: + OS << "U"; + break; + case Base: + OS << "B"; + break; + case Conflict: + OS << "C"; + break; + }; + OS << " (" << base << " - " + << (base ? base->getName() : "nullptr") << "): "; } private: Status status; - Value *base; // non null only if status == base + AssertingVH<Value> base; // non null only if status == base }; +} -typedef DenseMap<Value *, PhiState> ConflictStateMapTy; -// Values of type PhiState form a lattice, and this is a helper +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { + State.print(OS); + return OS; +} +#endif + +namespace { +// Values of type BDVState form a lattice, and this is a helper // class that implementes the meet operation. The meat of the meet -// operation is implemented in MeetPhiStates::pureMeet -class MeetPhiStates { +// operation is implemented in MeetBDVStates::pureMeet +class MeetBDVStates { public: - // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. - explicit MeetPhiStates(const ConflictStateMapTy &phiStates) - : phiStates(phiStates) {} - - // Destructively meet the current result with the base V. V can - // either be a merge instruction (SelectInst / PHINode), in which - // case its status is looked up in the phiStates map; or a regular - // SSA value, in which case it is assumed to be a base. - void meetWith(Value *V) { - PhiState otherState = getStateForBDV(V); - assert((MeetPhiStates::pureMeet(otherState, currentResult) == - MeetPhiStates::pureMeet(currentResult, otherState)) && - "math is wrong: meet does not commute!"); - currentResult = MeetPhiStates::pureMeet(otherState, currentResult); + /// Initializes the currentResult to the TOP state so that if can be met with + /// any other state to produce that state. + MeetBDVStates() {} + + // Destructively meet the current result with the given BDVState + void meetWith(BDVState otherState) { + currentResult = meet(otherState, currentResult); } - PhiState getResult() const { return currentResult; } + BDVState getResult() const { return currentResult; } private: - const ConflictStateMapTy &phiStates; - PhiState currentResult; - - /// Return a phi state for a base defining value. 
We'll generate a new - /// base state for known bases and expect to find a cached state otherwise - PhiState getStateForBDV(Value *baseValue) { - if (isKnownBaseResult(baseValue)) { - return PhiState(baseValue); - } else { - return lookupFromMap(baseValue); - } - } + BDVState currentResult; - PhiState lookupFromMap(Value *V) { - auto I = phiStates.find(V); - assert(I != phiStates.end() && "lookup failed!"); - return I->second; + /// Perform a meet operation on two elements of the BDVState lattice. + static BDVState meet(BDVState LHS, BDVState RHS) { + assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && + "math is wrong: meet does not commute!"); + BDVState Result = pureMeet(LHS, RHS); + DEBUG(dbgs() << "meet of " << LHS << " with " << RHS + << " produced " << Result << "\n"); + return Result; } - static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) { switch (stateA.getStatus()) { - case PhiState::Unknown: + case BDVState::Unknown: return stateB; - case PhiState::Base: + case BDVState::Base: assert(stateA.getBase() && "can't be null"); if (stateB.isUnknown()) return stateA; @@ -681,18 +666,20 @@ private: assert(stateA == stateB && "equality broken!"); return stateA; } - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); } assert(stateB.isConflict() && "only three states!"); - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); - case PhiState::Conflict: + case BDVState::Conflict: return stateA; } llvm_unreachable("only three states!"); } }; } + + /// For a given value or instruction, figure out what base ptr it's derived /// from. For gc objects, this is simply itself. On success, returns a value /// which is the base pointer. (This is reliable and can be used for @@ -723,171 +710,252 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // // Note: A simpler form of this would be to add the conflict form of all // PHIs without running the optimistic algorithm. This would be - // analougous to pessimistic data flow and would likely lead to an + // analogous to pessimistic data flow and would likely lead to an // overall worse solution. - ConflictStateMapTy states; - states[def] = PhiState(); - // Recursively fill in all phis & selects reachable from the initial one - // for which we don't already know a definite base value for - // TODO: This should be rewritten with a worklist - bool done = false; - while (!done) { - done = true; - // Since we're adding elements to 'states' as we run, we can't keep - // iterators into the set. 
- SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - for (Value *v : Keys) { - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (PHINode *phi = dyn_cast<PHINode>(v)) { - assert(phi->getNumIncomingValues() > 0 && - "zero input phis are illegal"); - for (Value *InVal : phi->incoming_values()) { - Value *local = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - } - } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { - Value *local = findBaseOrBDV(sel->getTrueValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - local = findBaseOrBDV(sel->getFalseValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } +#ifndef NDEBUG + auto isExpectedBDVType = [](Value *BDV) { + return isa<PHINode>(BDV) || isa<SelectInst>(BDV) || + isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV); + }; +#endif + + // Once populated, will contain a mapping from each potentially non-base BDV + // to a lattice value (described above) which corresponds to that BDV. + // We use the order of insertion (DFS over the def/use graph) to provide a + // stable deterministic ordering for visiting DenseMaps (which are unordered) + // below. This is important for deterministic compilation. + MapVector<Value *, BDVState> States; + + // Recursively fill in all base defining values reachable from the initial + // one for which we don't already know a definite base value for + /* scope */ { + SmallVector<Value*, 16> Worklist; + Worklist.push_back(def); + States.insert(std::make_pair(def, BDVState())); + while (!Worklist.empty()) { + Value *Current = Worklist.pop_back_val(); + assert(!isKnownBaseResult(Current) && "why did it get added?"); + + auto visitIncomingValue = [&](Value *InVal) { + Value *Base = findBaseOrBDV(InVal, cache); + if (isKnownBaseResult(Base)) + // Known bases won't need new instructions introduced and can be + // ignored safely + return; + assert(isExpectedBDVType(Base) && "the only non-base values " + "we see should be base defining values"); + if (States.insert(std::make_pair(Base, BDVState())).second) + Worklist.push_back(Base); + }; + if (PHINode *Phi = dyn_cast<PHINode>(Current)) { + for (Value *InVal : Phi->incoming_values()) + visitIncomingValue(InVal); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) { + visitIncomingValue(Sel->getTrueValue()); + visitIncomingValue(Sel->getFalseValue()); + } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) { + visitIncomingValue(EE->getVectorOperand()); + } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) { + visitIncomingValue(IE->getOperand(0)); // vector operand + visitIncomingValue(IE->getOperand(1)); // scalar operand + } else { + // There is one known class of instructions we know we don't handle. 
+ assert(isa<ShuffleVectorInst>(Current)); + llvm_unreachable("unimplemented instruction case"); } } } - if (TraceLSP) { - errs() << "States after initialization:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after initialization:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } +#endif - // TODO: come back and revisit the state transitions around inputs which - // have reached conflict state. The current version seems too conservative. + // Return a phi state for a base defining value. We'll generate a new + // base state for known bases and expect to find a cached state otherwise. + auto getStateForBDV = [&](Value *baseValue) { + if (isKnownBaseResult(baseValue)) + return BDVState(baseValue); + auto I = States.find(baseValue); + assert(I != States.end() && "lookup failed!"); + return I->second; + }; bool progress = true; while (progress) { #ifndef NDEBUG - size_t oldSize = states.size(); + const size_t oldSize = States.size(); #endif progress = false; - // We're only changing keys in this loop, thus safe to keep iterators - for (auto Pair : states) { - MeetPhiStates calculateMeet(states); - Value *v = Pair.first; - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (SelectInst *select = dyn_cast<SelectInst>(v)) { - calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); - calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); - } else - for (Value *Val : cast<PHINode>(v)->incoming_values()) - calculateMeet.meetWith(findBaseOrBDV(Val, cache)); - - PhiState oldState = states[v]; - PhiState newState = calculateMeet.getResult(); + // We're only changing values in this loop, thus safe to keep iterators. + // Since this is computing a fixed point, the order of visit does not + // effect the result. TODO: We could use a worklist here and make this run + // much faster. + for (auto Pair : States) { + Value *BDV = Pair.first; + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + + // Given an input value for the current instruction, return a BDVState + // instance which represents the BDV of that value. + auto getStateForInput = [&](Value *V) mutable { + Value *BDV = findBaseOrBDV(V, cache); + return getStateForBDV(BDV); + }; + + MeetBDVStates calculateMeet; + if (SelectInst *select = dyn_cast<SelectInst>(BDV)) { + calculateMeet.meetWith(getStateForInput(select->getTrueValue())); + calculateMeet.meetWith(getStateForInput(select->getFalseValue())); + } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) { + for (Value *Val : Phi->incoming_values()) + calculateMeet.meetWith(getStateForInput(Val)); + } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) { + // The 'meet' for an extractelement is slightly trivial, but it's still + // useful in that it drives us to conflict if our input is. + calculateMeet.meetWith(getStateForInput(EE->getVectorOperand())); + } else { + // Given there's a inherent type mismatch between the operands, will + // *always* produce Conflict. 
+ auto *IE = cast<InsertElementInst>(BDV); + calculateMeet.meetWith(getStateForInput(IE->getOperand(0))); + calculateMeet.meetWith(getStateForInput(IE->getOperand(1))); + } + + BDVState oldState = States[BDV]; + BDVState newState = calculateMeet.getResult(); if (oldState != newState) { progress = true; - states[v] = newState; + States[BDV] = newState; } } - assert(oldSize <= states.size()); - assert(oldSize == states.size() || progress); + assert(oldSize == States.size() && + "fixed point shouldn't be adding any new nodes to state"); } - if (TraceLSP) { - errs() << "States after meet iteration:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after meet iteration:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } - +#endif + // Insert Phis for all conflicts - // We want to keep naming deterministic in the loop that follows, so - // sort the keys before iteration. This is useful in allowing us to - // write stable tests. Note that there is no invalidation issue here. - SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - std::sort(Keys.begin(), Keys.end(), order_by_name); // TODO: adjust naming patterns to avoid this order of iteration dependency - for (Value *V : Keys) { - Instruction *v = cast<Instruction>(V); - PhiState state = states[V]; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + for (auto Pair : States) { + Instruction *I = cast<Instruction>(Pair.first); + BDVState State = Pair.second; + assert(!isKnownBaseResult(I) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + + // extractelement instructions are a bit special in that we may need to + // insert an extract even when we know an exact base for the instruction. + // The problem is that we need to convert from a vector base to a scalar + // base for the particular indice we're interested in. + if (State.isBase() && isa<ExtractElementInst>(I) && + isa<VectorType>(State.getBase()->getType())) { + auto *EE = cast<ExtractElementInst>(I); + // TODO: In many cases, the new instruction is just EE itself. We should + // exploit this, but can't do it here since it would break the invariant + // about the BDV not being known to be a base. + auto *BaseInst = ExtractElementInst::Create(State.getBase(), + EE->getIndexOperand(), + "base_ee", EE); + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Base, BaseInst); + } + + // Since we're joining a vector and scalar base, they can never be the + // same. As a result, we should always see insert element having reached + // the conflict state. 
+ if (isa<InsertElementInst>(I)) { + assert(State.isConflict()); + } + + if (!State.isConflict()) continue; - if (isa<PHINode>(v)) { - int num_preds = - std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); - assert(num_preds > 0 && "how did we reach here"); - PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - phi->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, phi); + /// Create and insert a new instruction which will represent the base of + /// the given instruction 'I'. + auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* { + if (isa<PHINode>(I)) { + BasicBlock *BB = I->getParent(); + int NumPreds = std::distance(pred_begin(BB), pred_end(BB)); + assert(NumPreds > 0 && "how did we reach here"); + std::string Name = suffixed_name_or(I, ".base", "base_phi"); + return PHINode::Create(I->getType(), NumPreds, Name, I); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) { + // The undef will be replaced later + UndefValue *Undef = UndefValue::get(Sel->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_select"); + return SelectInst::Create(Sel->getCondition(), Undef, + Undef, Name, Sel); + } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { + UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ee"); + return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name, + EE); + } else { + auto *IE = cast<InsertElementInst>(I); + UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType()); + UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ie"); + return InsertElementInst::Create(VecUndef, ScalarUndef, + IE->getOperand(2), Name, IE); + } + + }; + Instruction *BaseInst = MakeBaseInstPlaceholder(I); + // Add metadata marking this as a base value + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Conflict, BaseInst); + } + + // Returns a instruction which produces the base pointer for a given + // instruction. The instruction is assumed to be an input to one of the BDVs + // seen in the inference algorithm above. As such, we must either already + // know it's base defining value is a base, or have inserted a new + // instruction to propagate the base of it's BDV and have entered that newly + // introduced instruction into the state table. In either case, we are + // assured to be able to determine an instruction which produces it's base + // pointer. 
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { + Value *BDV = findBaseOrBDV(Input, cache); + Value *Base = nullptr; + if (isKnownBaseResult(BDV)) { + Base = BDV; } else { - SelectInst *sel = cast<SelectInst>(v); - // The undef will be replaced later - UndefValue *undef = UndefValue::get(sel->getType()); - SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, - undef, "base_select", sel); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - basesel->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, basesel); + // Either conflict or base. + assert(States.count(BDV)); + Base = States[BDV].getBase(); } - } + assert(Base && "can't be null"); + // The cast is needed since base traversal may strip away bitcasts + if (Base->getType() != Input->getType() && + InsertPt) { + Base = new BitCastInst(Base, Input->getType(), "cast", + InsertPt); + } + return Base; + }; - // Fixup all the inputs of the new PHIs - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; + // Fixup all the inputs of the new PHIs. Visit order needs to be + // deterministic and predictable because we're naming newly created + // instructions. + for (auto Pair : States) { + Instruction *BDV = cast<Instruction>(Pair.first); + BDVState State = Pair.second; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!State.isConflict()) continue; - if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { - PHINode *phi = cast<PHINode>(v); + if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { + PHINode *phi = cast<PHINode>(BDV); unsigned NumPHIValues = phi->getNumIncomingValues(); for (unsigned i = 0; i < NumPHIValues; i++) { Value *InVal = phi->getIncomingValue(i); @@ -906,104 +974,145 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (blockIndex != -1) { Value *oldBase = basephi->getIncomingValue(blockIndex); basephi->addIncoming(oldBase, InBB); + #ifndef NDEBUG - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - - // In essense this assert states: the only way two + Value *Base = getBaseForInput(InVal, nullptr); + // In essence this assert states: the only way two // values incoming from the same basic block may be // different is by being different bitcasts of the same // value. A cleanup that remains TODO is changing // findBaseOrBDV to return an llvm::Value of the correct // type (and still remain pure). This will remove the // need to add bitcasts. 
- assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() && "sanity -- findBaseOrBDV should be pure!"); #endif continue; } - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basephi->getType()) { - base = new BitCastInst(base, basephi->getType(), "cast", - InBB->getTerminator()); - } - basephi->addIncoming(base, InBB); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast in the incoming block. + // TODO: Need to split critical edges if insertion is needed + Value *Base = getBaseForInput(InVal, InBB->getTerminator()); + basephi->addIncoming(Base, InBB); } assert(basephi->getNumIncomingValues() == NumPHIValues); - } else { - SelectInst *basesel = cast<SelectInst>(state.getBase()); - SelectInst *sel = cast<SelectInst>(v); + } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) { + SelectInst *Sel = cast<SelectInst>(BDV); // Operand 1 & 2 are true, false path respectively. TODO: refactor to // something more safe and less hacky. for (int i = 1; i <= 2; i++) { - Value *InVal = sel->getOperand(i); - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basesel->getType()) { - base = new BitCastInst(base, basesel->getType(), "cast", basesel); - } - basesel->setOperand(i, base); + Value *InVal = Sel->getOperand(i); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseSel); + BaseSel->setOperand(i, Base); } + } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) { + Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand(); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseEE); + BaseEE->setOperand(0, Base); + } else { + auto *BaseIE = cast<InsertElementInst>(State.getBase()); + auto *BdvIE = cast<InsertElementInst>(BDV); + auto UpdateOperand = [&](int OperandIdx) { + Value *InVal = BdvIE->getOperand(OperandIdx); + Value *Base = getBaseForInput(InVal, BaseIE); + BaseIE->setOperand(OperandIdx, Base); + }; + UpdateOperand(0); // vector operand + UpdateOperand(1); // scalar operand + } + + } + + // Now that we're done with the algorithm, see if we can optimize the + // results slightly by reducing the number of new instructions needed. + // Arguably, this should be integrated into the algorithm above, but + // doing as a post process step is easier to reason about for the moment. 
+ DenseMap<Value *, Value *> ReverseMap; + SmallPtrSet<Instruction *, 16> NewInsts; + SmallSetVector<AssertingVH<Instruction>, 16> Worklist; + // Note: We need to visit the states in a deterministic order. We uses the + // Keys we sorted above for this purpose. Note that we are papering over a + // bigger problem with the algorithm above - it's visit order is not + // deterministic. A larger change is needed to fix this. + for (auto Pair : States) { + auto *BDV = Pair.first; + auto State = Pair.second; + Value *Base = State.getBase(); + assert(BDV && Base); + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(isKnownBaseResult(Base) && + "must be something we 'know' is a base pointer"); + if (!State.isConflict()) + continue; + + ReverseMap[Base] = BDV; + if (auto *BaseI = dyn_cast<Instruction>(Base)) { + NewInsts.insert(BaseI); + Worklist.insert(BaseI); + } + } + auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, + Value *Replacement) { + // Add users which are new instructions (excluding self references) + for (User *U : BaseI->users()) + if (auto *UI = dyn_cast<Instruction>(U)) + if (NewInsts.count(UI) && UI != BaseI) + Worklist.insert(UI); + // Then do the actual replacement + NewInsts.erase(BaseI); + ReverseMap.erase(BaseI); + BaseI->replaceAllUsesWith(Replacement); + assert(States.count(BDV)); + assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); + States[BDV] = BDVState(BDVState::Conflict, Replacement); + BaseI->eraseFromParent(); + }; + const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout(); + while (!Worklist.empty()) { + Instruction *BaseI = Worklist.pop_back_val(); + assert(NewInsts.count(BaseI)); + Value *Bdv = ReverseMap[BaseI]; + if (auto *BdvI = dyn_cast<Instruction>(Bdv)) + if (BaseI->isIdenticalTo(BdvI)) { + DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, Bdv); + continue; + } + if (Value *V = SimplifyInstruction(BaseI, DL)) { + DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, V); + continue; } } // Cache all of our results so we can cheaply reuse them // NOTE: This is actually two caches: one of the base defining value // relation and one of the base pointer relation! FIXME - for (auto item : states) { - Value *v = item.first; - Value *base = item.second.getBase(); - assert(v && base); - assert(!isKnownBaseResult(v) && "why did it get added?"); - - if (TraceLSP) { - std::string fromstr = - cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") - : "none"; - errs() << "Updating base value cache" - << " for: " << (v->hasName() ? v->getName() : "") - << " from: " << fromstr - << " to: " << (base->hasName() ? base->getName() : "") << "\n"; - } - - assert(isKnownBaseResult(base) && - "must be something we 'know' is a base pointer"); - if (cache.count(v)) { + for (auto Pair : States) { + auto *BDV = Pair.first; + Value *base = Pair.second.getBase(); + assert(BDV && base); + + std::string fromstr = cache.count(BDV) ? 
cache[BDV]->getName() : "none"; + DEBUG(dbgs() << "Updating base value cache" + << " for: " << BDV->getName() + << " from: " << fromstr + << " to: " << base->getName() << "\n"); + + if (cache.count(BDV)) { // Once we transition from the BDV relation being store in the cache to // the base relation being stored, it must be stable - assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && + assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) && "base relation should be stable"); } - cache[v] = base; + cache[BDV] = base; } - assert(cache.find(def) != cache.end()); + assert(cache.count(def)); return cache[def]; } @@ -1024,7 +1133,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, - DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, + DenseMap<Value *, Value *> &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache) { // For the naming of values inserted to be deterministic - which makes for // much cleaner and more stable tests - we need to assign an order to the @@ -1043,7 +1152,7 @@ findBasePointers(const StatepointLiveSetTy &live, // If you see this trip and like to live really dangerously, the code should // be correct, just with idioms the verifier can't handle. You can try - // disabling the verifier at your own substaintial risk. + // disabling the verifier at your own substantial risk. assert(!isa<ConstantPointerNull>(base) && "the relocation code needs adjustment to handle the relocation of " "a null pointer constant without causing false positives in the " @@ -1056,8 +1165,8 @@ findBasePointers(const StatepointLiveSetTy &live, static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, const CallSite &CS, PartiallyConstructedSafepointRecord &result) { - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; - findBasePointers(result.liveset, PointerToBase, &DT, DVCache); + DenseMap<Value *, Value *> PointerToBase; + findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache); if (PrintBasePointers) { // Note: Need to print these in a stable order since this is checked in @@ -1071,8 +1180,11 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, std::sort(Temp.begin(), Temp.end(), order_by_name); for (Value *Ptr : Temp) { Value *Base = PointerToBase[Ptr]; - errs() << " derived %" << Ptr->getName() << " base %" << Base->getName() - << "\n"; + errs() << " derived "; + Ptr->printAsOperand(errs(), false); + errs() << " base "; + Base->printAsOperand(errs(), false); + errs() << "\n";; } } @@ -1086,10 +1198,10 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, PartiallyConstructedSafepointRecord &result); static void recomputeLiveInValues( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { // TODO-PERF: reuse the original liveness, then simply run the dataflow - // again. The old values are still live and will help it stablize quickly. + // again. The old values are still live and will help it stabilize quickly. 
GCPtrLivenessData RevisedLivenessData; computeLiveInValues(DT, F, RevisedLivenessData); for (size_t i = 0; i < records.size(); i++) { @@ -1099,69 +1211,66 @@ static void recomputeLiveInValues( } } -// When inserting gc.relocate calls, we need to ensure there are no uses -// of the original value between the gc.statepoint and the gc.relocate call. -// One case which can arise is a phi node starting one of the successor blocks. -// We also need to be able to insert the gc.relocates only on the path which -// goes through the statepoint. We might need to split an edge to make this -// possible. +// When inserting gc.relocate and gc.result calls, we need to ensure there are +// no uses of the original value / return value between the gc.statepoint and +// the gc.relocate / gc.result call. One case which can arise is a phi node +// starting one of the successor blocks. We also need to be able to insert the +// gc.relocates only on the path which goes through the statepoint. We might +// need to split an edge to make this possible. static BasicBlock * normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, DominatorTree &DT) { BasicBlock *Ret = BB; - if (!BB->getUniquePredecessor()) { - Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT); - } + if (!BB->getUniquePredecessor()) + Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT); - // Now that 'ret' has unique predecessor we can safely remove all phi nodes + // Now that 'Ret' has unique predecessor we can safely remove all phi nodes // from it FoldSingleEntryPHINodes(Ret); - assert(!isa<PHINode>(Ret->begin())); + assert(!isa<PHINode>(Ret->begin()) && + "All PHI nodes should have been removed!"); - // At this point, we can safely insert a gc.relocate as the first instruction - // in Ret if needed. + // At this point, we can safely insert a gc.relocate or gc.result as the first + // instruction in Ret if needed. return Ret; } -static int find_index(ArrayRef<Value *> livevec, Value *val) { - auto itr = std::find(livevec.begin(), livevec.end(), val); - assert(livevec.end() != itr); - size_t index = std::distance(livevec.begin(), itr); - assert(index < livevec.size()); - return index; -} - -// Create new attribute set containing only attributes which can be transfered +// Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeSet legalizeCallAttributes(AttributeSet AS) { - AttributeSet ret; + AttributeSet Ret; for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { - unsigned index = AS.getSlotIndex(Slot); + unsigned Index = AS.getSlotIndex(Slot); - if (index == AttributeSet::ReturnIndex || - index == AttributeSet::FunctionIndex) { + if (Index == AttributeSet::ReturnIndex || + Index == AttributeSet::FunctionIndex) { - for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end; - ++it) { - Attribute attr = *it; + for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) { // Do not allow certain attributes - just skip them // Safepoint can not be read only or read none. - if (attr.hasAttribute(Attribute::ReadNone) || - attr.hasAttribute(Attribute::ReadOnly)) + if (Attr.hasAttribute(Attribute::ReadNone) || + Attr.hasAttribute(Attribute::ReadOnly)) + continue; + + // These attributes control the generation of the gc.statepoint call / + // invoke itself; and once the gc.statepoint is in place, they're of no + // use. 
+ if (Attr.hasAttribute("statepoint-num-patch-bytes") || + Attr.hasAttribute("statepoint-id")) continue; - ret = ret.addAttributes( - AS.getContext(), index, - AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); + Ret = Ret.addAttributes( + AS.getContext(), Index, + AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); } } // Just skip parameter attributes for now } - return ret; + return Ret; } /// Helper function to place all gc relocates necessary for the given @@ -1173,225 +1282,306 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - Llvm IR builder to be used to construct new calls. -static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables, +static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, const int LiveStart, - ArrayRef<llvm::Value *> BasePtrs, + ArrayRef<Value *> BasePtrs, Instruction *StatepointToken, IRBuilder<> Builder) { - SmallVector<Instruction *, 64> NewDefs; - NewDefs.reserve(LiveVariables.size()); + if (LiveVariables.empty()) + return; + + auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { + auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); + assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); + size_t Index = std::distance(LiveVec.begin(), ValIt); + assert(Index < LiveVec.size() && "Bug in std::find?"); + return Index; + }; + Module *M = StatepointToken->getModule(); + + // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose + // element type is i8 addrspace(1)*). We originally generated unique + // declarations for each pointer type, but this proved problematic because + // the intrinsic mangling code is incomplete and fragile. Since we're moving + // towards a single unified pointer type anyways, we can just cast everything + // to an i8* of the right address space. A bitcast is added later to convert + // gc_relocate to the actual value's type. + auto getGCRelocateDecl = [&] (Type *Ty) { + assert(isHandledGCPointerType(Ty)); + auto AS = Ty->getScalarType()->getPointerAddressSpace(); + Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS); + if (auto *VT = dyn_cast<VectorType>(Ty)) + NewTy = VectorType::get(NewTy, VT->getNumElements()); + return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, + {NewTy}); + }; - Module *M = StatepointToken->getParent()->getParent()->getParent(); + // Lazily populated map from input types to the canonicalized form mentioned + // in the comment above. This should probably be cached somewhere more + // broadly. + DenseMap<Type*, Value*> TypeToDeclMap; for (unsigned i = 0; i < LiveVariables.size(); i++) { - // We generate a (potentially) unique declaration for every pointer type - // combination. This results is some blow up the function declarations in - // the IR, but removes the need for argument bitcasts which shrinks the IR - // greatly and makes it much more readable. - SmallVector<Type *, 1> Types; // one per 'any' type - // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid - // cases where the actual value's type mangling is not supported by llvm. A - // bitcast is added later to convert gc_relocate to the actual value's type. 
- Types.push_back(Type::getInt8PtrTy(M->getContext(), 1)); - Value *GCRelocateDecl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_gc_relocate, Types); - // Generate the gc.relocate call and save the result Value *BaseIdx = - ConstantInt::get(Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, BasePtrs[i])); - Value *LiveIdx = ConstantInt::get( - Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, LiveVariables[i])); + Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i])); + Value *LiveIdx = Builder.getInt32(LiveStart + i); + + Type *Ty = LiveVariables[i]->getType(); + if (!TypeToDeclMap.count(Ty)) + TypeToDeclMap[Ty] = getGCRelocateDecl(Ty); + Value *GCRelocateDecl = TypeToDeclMap[Ty]; // only specify a debug name if we can give a useful one - Value *Reloc = Builder.CreateCall( + CallInst *Reloc = Builder.CreateCall( GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx}, - LiveVariables[i]->hasName() ? LiveVariables[i]->getName() + ".relocated" - : ""); + suffixed_name_or(LiveVariables[i], ".relocated", "")); // Trick CodeGen into thinking there are lots of free registers at this // fake call. - cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold); + Reloc->setCallingConv(CallingConv::Cold); + } +} + +namespace { + +/// This struct is used to defer RAUWs and `eraseFromParent` s. Using this +/// avoids having to worry about keeping around dangling pointers to Values. +class DeferredReplacement { + AssertingVH<Instruction> Old; + AssertingVH<Instruction> New; + +public: + explicit DeferredReplacement(Instruction *Old, Instruction *New) : + Old(Old), New(New) { + assert(Old != New && "Not allowed!"); + } + + /// Does the task represented by this instance. + void doReplacement() { + Instruction *OldI = Old; + Instruction *NewI = New; + + assert(OldI != NewI && "Disallowed at construction?!"); + + Old = nullptr; + New = nullptr; - NewDefs.push_back(cast<Instruction>(Reloc)); + if (NewI) + OldI->replaceAllUsesWith(NewI); + OldI->eraseFromParent(); } - assert(NewDefs.size() == LiveVariables.size() && - "missing or extra redefinition at safepoint"); +}; } static void -makeStatepointExplicitImpl(const CallSite &CS, /* to replace */ - const SmallVectorImpl<llvm::Value *> &basePtrs, - const SmallVectorImpl<llvm::Value *> &liveVariables, - Pass *P, - PartiallyConstructedSafepointRecord &result) { - assert(basePtrs.size() == liveVariables.size()); - assert(isStatepoint(CS) && +makeStatepointExplicitImpl(const CallSite CS, /* to replace */ + const SmallVectorImpl<Value *> &BasePtrs, + const SmallVectorImpl<Value *> &LiveVariables, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + assert(BasePtrs.size() == LiveVariables.size()); + assert((UseDeoptBundles || isStatepoint(CS)) && "This method expects to be rewriting a statepoint"); - BasicBlock *BB = CS.getInstruction()->getParent(); - assert(BB); - Function *F = BB->getParent(); - assert(F && "must be set"); - Module *M = F->getParent(); - (void)M; - assert(M && "must be set"); - - // We're not changing the function signature of the statepoint since the gc - // arguments go into the var args section. - Function *gc_statepoint_decl = CS.getCalledFunction(); - // Then go ahead and use the builder do actually do the inserts. We insert // immediately before the previous instruction under the assumption that all // arguments will be available here. We can't insert afterwards since we may // be replacing a terminator. 
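A usage sketch for the DeferredReplacement helper above (OldCall, DeadCall, and NewToken are hypothetical names): replacements are queued while other safepoint records may still hold raw Instruction pointers, then applied in one batch.

    std::vector<DeferredReplacement> Replacements;
    // While rewriting each call site:
    //   Replacements.emplace_back(OldCall, NewToken); // RAUW, then erase
    //   Replacements.emplace_back(DeadCall, nullptr); // no uses; just erase
    // Once no PartiallyConstructedSafepointRecord refers to the old calls:
    for (DeferredReplacement &R : Replacements)
      R.doReplacement();
    Replacements.clear();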
- Instruction *insertBefore = CS.getInstruction(); - IRBuilder<> Builder(insertBefore); - // Copy all of the arguments from the original statepoint - this includes the - // target, call args, and deopt args - SmallVector<llvm::Value *, 64> args; - args.insert(args.end(), CS.arg_begin(), CS.arg_end()); - // TODO: Clear the 'needs rewrite' flag - - // add all the pointers to be relocated (gc arguments) - // Capture the start of the live variable list for use in the gc_relocates - const int live_start = args.size(); - args.insert(args.end(), liveVariables.begin(), liveVariables.end()); + Instruction *InsertBefore = CS.getInstruction(); + IRBuilder<> Builder(InsertBefore); + + ArrayRef<Value *> GCArgs(LiveVariables); + uint64_t StatepointID = 0xABCDEF00; + uint32_t NumPatchBytes = 0; + uint32_t Flags = uint32_t(StatepointFlags::None); + + ArrayRef<Use> CallArgs; + ArrayRef<Use> DeoptArgs; + ArrayRef<Use> TransitionArgs; + + Value *CallTarget = nullptr; + + if (UseDeoptBundles) { + CallArgs = {CS.arg_begin(), CS.arg_end()}; + DeoptArgs = GetDeoptBundleOperands(CS); + // TODO: we don't fill in TransitionArgs or Flags in this branch, but we + // could have an operand bundle for that too. + AttributeSet OriginalAttrs = CS.getAttributes(); + + Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, + "statepoint-id"); + if (AttrID.isStringAttribute()) + AttrID.getValueAsString().getAsInteger(10, StatepointID); + + Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( + AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); + if (AttrNumPatchBytes.isStringAttribute()) + AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); + + CallTarget = CS.getCalledValue(); + } else { + // This branch will be gone soon, and we will soon only support the + // UseDeoptBundles == true configuration. + Statepoint OldSP(CS); + StatepointID = OldSP.getID(); + NumPatchBytes = OldSP.getNumPatchBytes(); + Flags = OldSP.getFlags(); + + CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; + DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; + TransitionArgs = {OldSP.gc_transition_args_begin(), + OldSP.gc_transition_args_end()}; + CallTarget = OldSP.getCalledValue(); + } // Create the statepoint given all the arguments - Instruction *token = nullptr; - AttributeSet return_attributes; + Instruction *Token = nullptr; + AttributeSet ReturnAttrs; if (CS.isCall()) { - CallInst *toReplace = cast<CallInst>(CS.getInstruction()); - CallInst *call = - Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token"); - call->setTailCall(toReplace->isTailCall()); - call->setCallingConv(toReplace->getCallingConv()); + CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); + CallInst *Call = Builder.CreateGCStatepointCall( + StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, + TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); + + Call->setTailCall(ToReplace->isTailCall()); + Call->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. - AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); - // In case if we can handle this set of sttributes - set up function attrs + AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); + // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. 
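The two integer-valued statepoint attributes above travel as string attributes; a sketch of the parsing pattern, with a hypothetical call site CS (StringRef::getAsInteger returns true on a malformed integer, and that result is deliberately ignored here, as in the patch):

    uint64_t StatepointID = 0xABCDEF00; // default when the attribute is absent
    AttributeSet Attrs = CS.getAttributes();
    Attribute A =
        Attrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
    if (A.isStringAttribute())
      A.getValueAsString().getAsInteger(10, StatepointID); // "42" -> 42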
- call->setAttributes(new_attrs.getFnAttributes()); - return_attributes = new_attrs.getRetAttributes(); + Call->setAttributes(NewAttrs.getFnAttributes()); + ReturnAttrs = NewAttrs.getRetAttributes(); - token = call; + Token = Call; // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete) - BasicBlock::iterator next(toReplace); - assert(BB->end() != next && "not a terminator, must have next"); - next++; - Instruction *IP = &*(next); - Builder.SetInsertPoint(IP); - Builder.SetCurrentDebugLocation(IP->getDebugLoc()); - + assert(ToReplace->getNextNode() && "Not a terminator, must have next!"); + Builder.SetInsertPoint(ToReplace->getNextNode()); + Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc()); } else { - InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); + InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction()); // Insert the new invoke into the old block. We'll remove the old one in a // moment at which point this will become the new terminator for the // original block. - InvokeInst *invoke = InvokeInst::Create( - gc_statepoint_decl, toReplace->getNormalDest(), - toReplace->getUnwindDest(), args, "", toReplace->getParent()); - invoke->setCallingConv(toReplace->getCallingConv()); + InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( + StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(), + ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, + GCArgs, "statepoint_token"); + + Invoke->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. - AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); - // In case if we can handle this set of sttributes - set up function attrs + AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); + // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. - invoke->setAttributes(new_attrs.getFnAttributes()); - return_attributes = new_attrs.getRetAttributes(); + Invoke->setAttributes(NewAttrs.getFnAttributes()); + ReturnAttrs = NewAttrs.getRetAttributes(); - token = invoke; + Token = Invoke; // Generate gc relocates in exceptional path - BasicBlock *unwindBlock = toReplace->getUnwindDest(); - assert(!isa<PHINode>(unwindBlock->begin()) && - unwindBlock->getUniquePredecessor() && + BasicBlock *UnwindBlock = ToReplace->getUnwindDest(); + assert(!isa<PHINode>(UnwindBlock->begin()) && + UnwindBlock->getUniquePredecessor() && "can't safely insert in this block!"); - Instruction *IP = &*(unwindBlock->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); - Builder.SetCurrentDebugLocation(toReplace->getDebugLoc()); + Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); - // Extract second element from landingpad return value. We will attach - // exceptional gc relocates to it. - const unsigned idx = 1; - Instruction *exceptional_token = - cast<Instruction>(Builder.CreateExtractValue( - unwindBlock->getLandingPadInst(), idx, "relocate_token")); - result.UnwindToken = exceptional_token; + // Attach exceptional gc relocates to the landingpad. + Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst(); + Result.UnwindToken = ExceptionalToken; - // Just throw away return value. We will use the one we got for normal - // block.
- (void)CreateGCRelocates(liveVariables, live_start, basePtrs, - exceptional_token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, + Builder); // Generate gc relocates and returns for normal block - BasicBlock *normalDest = toReplace->getNormalDest(); - assert(!isa<PHINode>(normalDest->begin()) && - normalDest->getUniquePredecessor() && + BasicBlock *NormalDest = ToReplace->getNormalDest(); + assert(!isa<PHINode>(NormalDest->begin()) && + NormalDest->getUniquePredecessor() && "can't safely insert in this block!"); - IP = &*(normalDest->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); + Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt()); // gc relocates will be generated later as if it were regular call // statepoint } - assert(token); - - // Take the name of the original value call if it had one. - token->takeName(CS.getInstruction()); + assert(Token && "Should be set in one of the above branches!"); + + if (UseDeoptBundles) { + Token->setName("statepoint_token"); + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + StringRef Name = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); + GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + + // We cannot RAUW or delete CS.getInstruction() because it could be in the + // live set of some other safepoint, in which case that safepoint's + // PartiallyConstructedSafepointRecord will hold a raw pointer to this + // llvm::Instruction. Instead, we defer the replacement and deletion to + // after the live sets have been made explicit in the IR, and we no longer + // have raw pointers to worry about. + Replacements.emplace_back(CS.getInstruction(), GCResult); + } else { + Replacements.emplace_back(CS.getInstruction(), nullptr); + } + } else { + assert(!CS.getInstruction()->hasNUsesOrMore(2) && + "only valid use before rewrite is gc.result"); + assert(!CS.getInstruction()->hasOneUse() || + isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); -// The GCResult is already inserted, we just need to find it -#ifndef NDEBUG - Instruction *toReplace = CS.getInstruction(); - assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && - "only valid use before rewrite is gc.result"); - assert(!toReplace->hasOneUse() || - isGCResult(cast<Instruction>(*toReplace->user_begin()))); -#endif + // Take the name of the original statepoint token if there was one. + Token->takeName(CS.getInstruction()); - // Update the gc.result of the original statepoint (if any) to use the newly - // inserted statepoint. This is safe to do here since the token can't be - // considered a live reference. - CS.getInstruction()->replaceAllUsesWith(token); + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. 
+ CS.getInstruction()->replaceAllUsesWith(Token); + CS.getInstruction()->eraseFromParent(); + } - result.StatepointToken = token; + Result.StatepointToken = Token; // Second, create a gc.relocate for every live variable - CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } namespace { -struct name_ordering { - Value *base; - Value *derived; - bool operator()(name_ordering const &a, name_ordering const &b) { - return -1 == a.derived->getName().compare(b.derived->getName()); +struct NameOrdering { + Value *Base; + Value *Derived; + + bool operator()(NameOrdering const &a, NameOrdering const &b) { + return -1 == a.Derived->getName().compare(b.Derived->getName()); } }; } -static void stablize_order(SmallVectorImpl<Value *> &basevec, - SmallVectorImpl<Value *> &livevec) { - assert(basevec.size() == livevec.size()); - - SmallVector<name_ordering, 64> temp; - for (size_t i = 0; i < basevec.size(); i++) { - name_ordering v; - v.base = basevec[i]; - v.derived = livevec[i]; - temp.push_back(v); - } - std::sort(temp.begin(), temp.end(), name_ordering()); - for (size_t i = 0; i < basevec.size(); i++) { - basevec[i] = temp[i].base; - livevec[i] = temp[i].derived; + +static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, + SmallVectorImpl<Value *> &LiveVec) { + assert(BaseVec.size() == LiveVec.size()); + + SmallVector<NameOrdering, 64> Temp; + for (size_t i = 0; i < BaseVec.size(); i++) { + NameOrdering v; + v.Base = BaseVec[i]; + v.Derived = LiveVec[i]; + Temp.push_back(v); + } + + std::sort(Temp.begin(), Temp.end(), NameOrdering()); + for (size_t i = 0; i < BaseVec.size(); i++) { + BaseVec[i] = Temp[i].Base; + LiveVec[i] = Temp[i].Derived; } } @@ -1401,71 +1591,63 @@ static void stablize_order(SmallVectorImpl<Value *> &basevec, // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, - PartiallyConstructedSafepointRecord &result) { - auto liveset = result.liveset; - auto PointerToBase = result.PointerToBase; +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + const auto &LiveSet = Result.LiveSet; + const auto &PointerToBase = Result.PointerToBase; // Convert to vector for efficient cross referencing. - SmallVector<Value *, 64> basevec, livevec; - livevec.reserve(liveset.size()); - basevec.reserve(liveset.size()); - for (Value *L : liveset) { - livevec.push_back(L); - - assert(PointerToBase.find(L) != PointerToBase.end()); - Value *base = PointerToBase[L]; - basevec.push_back(base); + SmallVector<Value *, 64> BaseVec, LiveVec; + LiveVec.reserve(LiveSet.size()); + BaseVec.reserve(LiveSet.size()); + for (Value *L : LiveSet) { + LiveVec.push_back(L); + assert(PointerToBase.count(L)); + Value *Base = PointerToBase.find(L)->second; + BaseVec.push_back(Base); } - assert(livevec.size() == basevec.size()); + assert(LiveVec.size() == BaseVec.size()); // To make the output IR slightly more stable (for use in diffs), ensure a // fixed order of the values in the safepoint (by sorting the value name). // The order is otherwise meaningless. 
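The same stabilization in self-contained form (plain C++, illustrative only, sorting name strings rather than Value*): order the (base, derived) pairs by the derived name so repeated runs of the pass emit operands in one fixed order.

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    using NamePair = std::pair<std::string, std::string>; // (base, derived)

    static void stabilize(std::vector<NamePair> &Pairs) {
      // Compare on the derived name only, matching NameOrdering::operator().
      std::sort(Pairs.begin(), Pairs.end(),
                [](const NamePair &A, const NamePair &B) {
                  return A.second < B.second;
                });
    }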
- stablize_order(basevec, livevec); + StabilizeOrder(BaseVec, LiveVec); // Do the actual rewriting and delete the old statepoint - makeStatepointExplicitImpl(CS, basevec, livevec, P, result); - CS.getInstruction()->eraseFromParent(); + makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements); } // Helper function for the relocationViaAlloca. -// It receives iterator to the statepoint gc relocates and emits store to the -// assigned -// location (via allocaMap) for the each one of them. -// Add visited values into the visitedLiveValues set we will later use them -// for sanity check. +// +// It receives an iterator to the statepoint gc relocates and emits a store to +// the assigned location (via allocaMap) for each one of them. It adds the +// visited values into the visitedLiveValues set, which we will later use +// for sanity checking. static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { for (User *U : GCRelocs) { - if (!isa<IntrinsicInst>(U)) + GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U); + if (!Relocate) continue; - IntrinsicInst *RelocatedValue = cast<IntrinsicInst>(U); - - // We only care about relocates - if (RelocatedValue->getIntrinsicID() != - Intrinsic::experimental_gc_relocate) { - continue; - } - - GCRelocateOperands RelocateOperands(RelocatedValue); - Value *OriginalValue = - const_cast<Value *>(RelocateOperands.getDerivedPtr()); + Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr()); assert(AllocaMap.count(OriginalValue)); Value *Alloca = AllocaMap[OriginalValue]; // Emit store into the related alloca - // All gc_relocate are i8 addrspace(1)* typed, and it must be bitcasted to + // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to // the correct type according to alloca. - assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator"); - IRBuilder<> Builder(RelocatedValue->getNextNode()); + assert(Relocate->getNextNode() && + "Should always have one since it's not a terminator"); + IRBuilder<> Builder(Relocate->getNextNode()); Value *CastedRelocatedValue = - Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(), - RelocatedValue->hasName() ? RelocatedValue->getName() + ".casted" : "") ; + Builder.CreateBitCast(Relocate, + cast<AllocaInst>(Alloca)->getAllocatedType(), + suffixed_name_or(Relocate, ".casted", "")); StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca); Store->insertAfter(cast<Instruction>(CastedRelocatedValue)); @@ -1501,10 +1683,10 @@ insertRematerializationStores( } } -/// do all the relocation update via allocas and mem2reg +/// Do all the relocation update via allocas and mem2reg static void relocationViaAlloca( Function &F, DominatorTree &DT, ArrayRef<Value *> Live, - ArrayRef<struct PartiallyConstructedSafepointRecord> Records) { + ArrayRef<PartiallyConstructedSafepointRecord> Records) { #ifndef NDEBUG // record initial number of (static) allocas; we'll check we have the same // number when we get done.
@@ -1531,15 +1713,12 @@ static void relocationViaAlloca( PromotableAllocas.push_back(Alloca); }; - // emit alloca for each live gc pointer - for (unsigned i = 0; i < Live.size(); i++) { - emitAllocaFor(Live[i]); - } - - // emit allocas for rematerialized values - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Emit alloca for each live gc pointer + for (Value *V : Live) + emitAllocaFor(V); + // Emit allocas for rematerialized values + for (const auto &Info : Records) for (auto RematerializedValuePair : Info.RematerializedValues) { Value *OriginalValue = RematerializedValuePair.second; if (AllocaMap.count(OriginalValue) != 0) @@ -1548,20 +1727,17 @@ static void relocationViaAlloca( emitAllocaFor(OriginalValue); ++NumRematerializedValues; } - } // The next two loops are part of the same conceptual operation. We need to // insert a store to the alloca after the original def and at each // redefinition. We need to insert a load before each use. These are split // into distinct loops for performance reasons. - // update gc pointer after each statepoint - // either store a relocated value or null (if no relocated value found for - // this gc pointer and it is not a gc_result) - // this must happen before we update the statepoint with load of alloca - // otherwise we lose the link between statepoint and old def - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Update gc pointer after each statepoint: either store a relocated value or + // null (if no relocated value was found for this gc pointer and it is not a + // gc_result). This must happen before we update the statepoint with load of + // alloca otherwise we lose the link between statepoint and old def. + for (const auto &Info : Records) { Value *Statepoint = Info.StatepointToken; // This will be used for consistency check @@ -1582,7 +1758,7 @@ static void relocationViaAlloca( VisitedLiveValues); if (ClobberNonLive) { - // As a debuging aid, pretend that an unrelocated pointer becomes null at + // As a debugging aid, pretend that an unrelocated pointer becomes null at // the gc.statepoint. This will turn some subtle GC problems into // slightly easier to debug SEGVs. Note that on large IR files with // lots of gc.statepoints this is extremely costly both memory and time @@ -1612,23 +1788,22 @@ static void relocationViaAlloca( // Insert the clobbering stores. These may get intermixed with the // gc.results and gc.relocates, but that's fine. if (auto II = dyn_cast<InvokeInst>(Statepoint)) { - InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); - InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt()); } else { - BasicBlock::iterator Next(cast<CallInst>(Statepoint)); - Next++; - InsertClobbersAt(Next); + InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode()); } } } - // update use with load allocas and add store for gc_relocated + + // Update use with load allocas and add store for gc_relocated. for (auto Pair : AllocaMap) { Value *Def = Pair.first; Value *Alloca = Pair.second; - // we pre-record the uses of allocas so that we dont have to worry about - // later update - // that change the user information. + // We pre-record the uses of allocas so that we don't have to worry about + // later updates that change the user information.
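The shape of relocationViaAlloca as a whole, condensed into a sketch (LLVM API; Def, AllSlots, and the insertion points are assumed names): demote each live pointer to a stack slot, store into the slot at every definition and relocation, load before every use, and let mem2reg rebuild SSA.

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/PromoteMemToReg.h"
    using namespace llvm;

    // Step 1: one entry-block slot per live GC pointer.
    static AllocaInst *demoteToSlot(Value *Def, Function &F) {
      Instruction *IP = &*F.getEntryBlock().getFirstInsertionPt();
      return new AllocaInst(Def->getType(), Def->getName() + ".slot", IP);
    }
    // Step 2: after each statepoint, store the (bitcast) relocated value into
    // the slot. Step 3: rewrite each pre-existing use to load from the slot.
    // Step 4: PromoteMemToReg(AllSlots, DT) folds the slots back into SSA,
    // inserting exactly the PHI nodes the relocations made necessary.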
+ SmallVector<Instruction *, 20> Uses; // PERF: trade a linear scan for repeated reallocation Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); @@ -1663,9 +1838,9 @@ static void relocationViaAlloca( } } - // emit store for the initial gc value - // store must be inserted after load, otherwise store will be in alloca's - // use list and an extra load will be inserted before it + // Emit store for the initial gc value. Store must be inserted after load, + // otherwise store will be in alloca's use list and an extra load will be + // inserted before it. StoreInst *Store = new StoreInst(Def, Alloca); if (Instruction *Inst = dyn_cast<Instruction>(Def)) { if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) { @@ -1688,14 +1863,13 @@ static void relocationViaAlloca( assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); if (!PromotableAllocas.empty()) { - // apply mem2reg to promote alloca to SSA + // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); } #ifndef NDEBUG - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; - I++) - if (isa<AllocaInst>(*I)) + for (auto &I : F.getEntryBlock()) + if (isa<AllocaInst>(I)) InitialAllocaNum--; assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas"); #endif @@ -1719,28 +1893,27 @@ static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values, // No values to hold live, might as well not insert the empty holder return; - Module *M = CS.getInstruction()->getParent()->getParent()->getParent(); + Module *M = CS.getInstruction()->getModule(); // Use a dummy vararg function to actually hold the values live Function *Func = cast<Function>(M->getOrInsertFunction( "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true))); if (CS.isCall()) { // For call safepoints insert dummy calls right after safepoint - BasicBlock::iterator Next(CS.getInstruction()); - Next++; - Holders.push_back(CallInst::Create(Func, Values, "", Next)); + Holders.push_back(CallInst::Create(Func, Values, "", + &*++CS.getInstruction()->getIterator())); return; } // For invoke safepoints insert dummy calls both in normal and // exceptional destination blocks auto *II = cast<InvokeInst>(CS.getInstruction()); Holders.push_back(CallInst::Create( - Func, Values, "", II->getNormalDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt())); Holders.push_back(CallInst::Create( - Func, Values, "", II->getUnwindDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt())); } static void findLiveReferences( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { GCPtrLivenessData OriginalLivenessData; computeLiveInValues(DT, F, OriginalLivenessData); @@ -1751,12 +1924,12 @@ static void findLiveReferences( } } -/// Remove any vector of pointers from the liveset by scalarizing them over the -/// statepoint instruction. Adds the scalarized pieces to the liveset. It -/// would be preferrable to include the vector in the statepoint itself, but +/// Remove any vector of pointers from the live set by scalarizing them over the +/// statepoint instruction. Adds the scalarized pieces to the live set.
It +/// would be preferable to include the vector in the statepoint itself, but /// the lowering code currently does not handle that. Extending it would be /// slightly non-trivial since it requires a format change. Given how rare -/// such cases are (for the moment?) scalarizing is an acceptable comprimise. +/// such cases are (for the moment?) scalarizing is an acceptable compromise. static void splitVectorValues(Instruction *StatepointInst, StatepointLiveSetTy &LiveSet, DenseMap<Value *, Value *>& PointerToBase, DominatorTree &DT) { @@ -1887,7 +2060,7 @@ static void splitVectorValues(Instruction *StatepointInst, // Helper function for the "rematerializeLiveValues". It walks use chain // starting from the "CurrentValue" until it meets "BaseValue". Only "simple" // values are visited (currently it is GEP's and casts). Returns true if it -// sucessfully reached "BaseValue" and false otherwise. +// successfully reached "BaseValue" and false otherwise. // Fills "ChainToBase" array with all visited values. "BaseValue" is not // recorded. static bool findRematerializableChainToBasePointer( @@ -1907,16 +2080,12 @@ static bool findRematerializableChainToBasePointer( } if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) { - Value *Def = CI->stripPointerCasts(); - - // This two checks are basically similar. First one is here for the - // consistency with findBasePointers logic. - assert(!isa<CastInst>(Def) && "not a pointer cast found"); if (!CI->isNoopCast(CI->getModule()->getDataLayout())) return false; ChainToBase.push_back(CI); - return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue); + return findRematerializableChainToBasePointer(ChainToBase, + CI->getOperand(0), BaseValue); } // Not supported instruction in the chain @@ -1957,8 +2126,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, return Cost; } -// From the statepoint liveset pick values that are cheaper to recompute then to -// relocate. Remove this values from the liveset, rematerialize them after +// From the statepoint live set pick values that are cheaper to recompute than +// to relocate. Remove these values from the live set, rematerialize them after // statepoint and record them in "Info" structure. Note that similar to // relocated values we don't do any user adjustments here. static void rematerializeLiveValues(CallSite CS, @@ -1970,10 +2139,10 @@ static void rematerializeLiveValues(CallSite CS, // We cannot do this in the following loop due to iterator invalidation.
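An aside on findRematerializableChainToBasePointer above: the recursion flattened into a loop (a sketch, assuming the usual LLVM headers, with the same accepted instruction kinds) makes the walk easier to see. Follow GEPs and no-op casts from the derived pointer back toward the base, recording each link, and give up on anything else.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    static bool chainToBase(llvm::Value *Cur, llvm::Value *Base,
                            llvm::SmallVectorImpl<llvm::Instruction *> &Chain) {
      using namespace llvm;
      while (Cur != Base) {
        if (auto *GEP = dyn_cast<GetElementPtrInst>(Cur)) {
          Chain.push_back(GEP);
          Cur = GEP->getPointerOperand();
        } else if (auto *CI = dyn_cast<CastInst>(Cur)) {
          if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
            return false; // value-changing cast: not rematerializable
          Chain.push_back(CI);
          Cur = CI->getOperand(0);
        } else {
          return false; // unsupported link in the chain
        }
      }
      return true;
    }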
SmallVector<Value *, 32> LiveValuesToBeDeleted; - for (Value *LiveValue: Info.liveset) { + for (Value *LiveValue: Info.LiveSet) { // For each live pointer find its defining chain SmallVector<Instruction *, 3> ChainToBase; - assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end()); + assert(Info.PointerToBase.count(LiveValue)); bool FoundChain = findRematerializableChainToBasePointer(ChainToBase, LiveValue, @@ -2059,9 +2228,9 @@ static void rematerializeLiveValues(CallSite CS, InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction()); Instruction *NormalInsertBefore = - Invoke->getNormalDest()->getFirstInsertionPt(); + &*Invoke->getNormalDest()->getFirstInsertionPt(); Instruction *UnwindInsertBefore = - Invoke->getUnwindDest()->getFirstInsertionPt(); + &*Invoke->getUnwindDest()->getFirstInsertionPt(); Instruction *NormalRematerializedValue = rematerializeChain(NormalInsertBefore); @@ -2075,22 +2244,23 @@ static void rematerializeLiveValues(CallSite CS, // Remove rematerialized values from the live set for (auto LiveValue: LiveValuesToBeDeleted) { - Info.liveset.erase(LiveValue); + Info.LiveSet.erase(LiveValue); } } -static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, - SmallVectorImpl<CallSite> &toUpdate) { +static bool insertParsePoints(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI, + SmallVectorImpl<CallSite> &ToUpdate) { #ifndef NDEBUG // sanity check the input - std::set<CallSite> uniqued; - uniqued.insert(toUpdate.begin(), toUpdate.end()); - assert(uniqued.size() == toUpdate.size() && "no duplicates please!"); + std::set<CallSite> Uniqued; + Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); + assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); - for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; + for (CallSite CS : ToUpdate) { assert(CS.getInstruction()->getParent()->getParent() == &F); - assert(isStatepoint(CS) && "expected to already be a deopt statepoint"); + assert((UseDeoptBundles || isStatepoint(CS)) && + "expected to already be a deopt statepoint"); } #endif @@ -2098,50 +2268,45 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the top of the successor blocks. See the comment on // normalizeForInvokeSafepoint on exactly what is needed. Note that this step // may restructure the CFG. - for (CallSite CS : toUpdate) { + for (CallSite CS : ToUpdate) { if (!CS.isInvoke()) continue; - InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction()); - normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(), - DT); - normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(), - DT); + auto *II = cast<InvokeInst>(CS.getInstruction()); + normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT); + normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT); } // A list of dummy calls added to the IR to keep various values obviously // live in the IR. We'll remove all of these when done. - SmallVector<CallInst *, 64> holders; + SmallVector<CallInst *, 64> Holders; // Insert a dummy call with all of the arguments to the vm_state we'll need // for the actual safepoint insertion. This ensures reference arguments in // the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.)
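The holder trick just described, isolated (sketch; M, Values, and InsertBefore are assumed to be in scope): a declared-but-never-defined vararg callee gives every value an artificial use that liveness must respect, and the dummy call is trivially erased afterwards.

    Function *TmpUse = cast<Function>(M->getOrInsertFunction(
        "__tmp_use",
        FunctionType::get(Type::getVoidTy(M->getContext()), /*isVarArg=*/true)));
    CallInst *Holder = CallInst::Create(TmpUse, Values, "", InsertBefore);
    // ... recompute liveness while the pin is in place ...
    Holder->eraseFromParent(); // drop the pin once the records are built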
- for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; - Statepoint StatepointCS(CS); - + for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; - for (Use &U : StatepointCS.vm_state_args()) { - Value *Arg = cast<Value>(&U); + + iterator_range<const Use *> DeoptStateRange = + UseDeoptBundles + ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) + : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); + + for (Value *Arg : DeoptStateRange) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) DeoptValues.push_back(Arg); } - insertUseHolderAfter(CS, DeoptValues, holders); - } - SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; - records.reserve(toUpdate.size()); - for (size_t i = 0; i < toUpdate.size(); i++) { - struct PartiallyConstructedSafepointRecord info; - records.push_back(info); + insertUseHolderAfter(CS, DeoptValues, Holders); } - assert(records.size() == toUpdate.size()); - // A) Identify all gc pointers which are staticly live at the given call + SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size()); + + // A) Identify all gc pointers which are statically live at the given call // site. - findLiveReferences(F, DT, P, toUpdate, records); + findLiveReferences(F, DT, ToUpdate, Records); // B) Find the base pointers for each live pointer /* scope for caching */ { @@ -2150,10 +2315,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // large numbers of duplicate base_phis. DefiningValueMapTy DVCache; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - findBasePointers(DT, DVCache, CS, info); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &info = Records[i]; + findBasePointers(DT, DVCache, ToUpdate[i], info); } } // end of cache scope @@ -2170,63 +2334,79 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the base pointers which were identified for that safepoint. We'll then // ask liveness for _every_ base inserted to see what is now live. Then we // remove the dummy calls. - holders.reserve(holders.size() + records.size()); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + Holders.reserve(Holders.size() + Records.size()); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; SmallVector<Value *, 128> Bases; - for (auto Pair : info.PointerToBase) { + for (auto Pair : Info.PointerToBase) Bases.push_back(Pair.second); - } - insertUseHolderAfter(CS, Bases, holders); + + insertUseHolderAfter(ToUpdate[i], Bases, Holders); } // By selecting base pointers, we've effectively inserted new uses. Thus, we // need to rerun liveness. We may *also* have inserted new defs, but that's // not the key issue. 
- recomputeLiveInValues(F, DT, P, toUpdate, records); + recomputeLiveInValues(F, DT, ToUpdate, Records); if (PrintBasePointers) { - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + for (auto &Info : Records) { errs() << "Base Pairs: (w/Relocation)\n"; - for (auto Pair : info.PointerToBase) { - errs() << " derived %" << Pair.first->getName() << " base %" - << Pair.second->getName() << "\n"; + for (auto Pair : Info.PointerToBase) { + errs() << " derived "; + Pair.first->printAsOperand(errs(), false); + errs() << " base "; + Pair.second->printAsOperand(errs(), false); + errs() << "\n"; } } } - for (size_t i = 0; i < holders.size(); i++) { - holders[i]->eraseFromParent(); - holders[i] = nullptr; - } - holders.clear(); + + // It is possible that non-constant live variables have a constant base. For + // example, a GEP with a variable offset from a global. In this case we can + // remove it from the liveset. We already don't add constants to the liveset + // because we assume they won't move at runtime and the GC doesn't need to be + // informed about them. The same reasoning applies if the base is constant. + // Note that the relocation placement code relies on this filtering for + // correctness as it expects the base to be in the liveset, which isn't true + // if the base is constant. + for (auto &Info : Records) + for (auto &BasePair : Info.PointerToBase) + if (isa<Constant>(BasePair.second)) + Info.LiveSet.erase(BasePair.first); + + for (CallInst *CI : Holders) + CI->eraseFromParent(); + + Holders.clear(); // Do a limited scalarization of any live at safepoint vector values which // contain pointers. This enables this pass to run after vectorization at - // the cost of some possible performance loss. TODO: it would be nice to - // natively support vectors all the way through the backend so we don't need - // to scalarize here. - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - Instruction *statepoint = toUpdate[i].getInstruction(); - splitVectorValues(cast<Instruction>(statepoint), info.liveset, - info.PointerToBase, DT); - } + // the cost of some possible performance loss. Note: This is known to not + // handle updating of the side tables correctly which can lead to relocation + // bugs when the same vector is live at multiple statepoints. We're in the + // process of implementing the alternate lowering - relocating the + // vector-of-pointers as first class item and updating the backend to + // understand that - but that's not yet complete. + if (UseVectorSplit) + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + Instruction *Statepoint = ToUpdate[i].getInstruction(); + splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, + Info.PointerToBase, DT); + } // In order to reduce live set of statepoint we might choose to rematerialize - // some values instead of relocating them. This is purelly an optimization and + // some values instead of relocating them. This is purely an optimization and // does not influence correctness. 
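Returning to the constant-base filtering a few hunks up, a concrete instance (hypothetical IR) may help:

    // Hypothetical IR, statepoint-example GC:
    //   %p = getelementptr i8, i8 addrspace(1)* @G, i64 %i
    //
    // %p varies at runtime, but its base @G is a Constant: it never moves, is
    // never relocated, and so never appears in the statepoint's gc section.
    // Hence the (derived, base) pair is erased from the live set before the
    // relocation placement code runs.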
- TargetTransformInfo &TTI = - P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + for (size_t i = 0; i < Records.size(); i++) + rematerializeLiveValues(ToUpdate[i], Records[i], TTI); - rematerializeLiveValues(CS, info, TTI); - } + // We need this to safely RAUW and delete call or invoke return values that + // may themselves be live over a statepoint. For details, please see usage in + // makeStatepointExplicitImpl. + std::vector<DeferredReplacement> Replacements; // Now run through and replace the existing statepoints with new ones with // the live variables listed. We do not yet update uses of the values being @@ -2234,61 +2414,78 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // survive to the last iteration of this loop. (By construction, the // previous statepoint can not be a live variable, thus we can and do remove // the old statepoint calls as we go.) - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - makeStatepointExplicit(DT, CS, P, info); + for (size_t i = 0; i < Records.size(); i++) + makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements); + + ToUpdate.clear(); // prevent accidental use of invalid CallSites + + for (auto &PR : Replacements) + PR.doReplacement(); + + Replacements.clear(); + + for (auto &Info : Records) { + // These live sets may contain stale Value pointers, since we replaced calls + // with operand bundles with calls wrapped in gc.statepoint, and some of + // those calls may have been def'ing live gc pointers. Clear these out to + // avoid accidentally using them. + // + // TODO: We should create a separate data structure that does not contain + // these live sets, and migrate to using that data structure from this point + // onward. + Info.LiveSet.clear(); + Info.PointerToBase.clear(); } - toUpdate.clear(); // prevent accident use of invalid CallSites // Do all the fixups of the original live variables to their relocated selves - SmallVector<Value *, 128> live; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + SmallVector<Value *, 128> Live; + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + // We can't simply save the live set from the original insertion. One of // the live values might be the result of a call which needs a safepoint. // That Value* no longer exists and we need to use the new gc_result. - // Thankfully, the liveset is embedded in the statepoint (and updated), so + // Thankfully, the live set is embedded in the statepoint (and updated), so // we just grab that. - Statepoint statepoint(info.StatepointToken); - live.insert(live.end(), statepoint.gc_args_begin(), - statepoint.gc_args_end()); + Statepoint Statepoint(Info.StatepointToken); + Live.insert(Live.end(), Statepoint.gc_args_begin(), + Statepoint.gc_args_end()); #ifndef NDEBUG // Do some basic sanity checks on our liveness results before performing // relocation. Relocation can and will turn mistakes in liveness results // into non-sensical code which is much harder to debug.
// TODO: It would be nice to test consistency as well - assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) && + assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); - for (Value *V : statepoint.gc_args()) { + for (Value *V : Statepoint.gc_args()) { if (!isa<Instruction>(V)) // Non-instruction values trivially dominate all possible uses continue; - auto LiveInst = cast<Instruction>(V); + auto *LiveInst = cast<Instruction>(V); assert(DT.isReachableFromEntry(LiveInst->getParent()) && "unreachable values should never be live"); - assert(DT.dominates(LiveInst, info.StatepointToken) && + assert(DT.dominates(LiveInst, Info.StatepointToken) && "basic SSA liveness expectation violated by liveness analysis"); } #endif } - unique_unsorted(live); + unique_unsorted(Live); #ifndef NDEBUG // sanity check - for (auto ptr : live) { - assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type"); - } + for (auto *Ptr : Live) + assert(isHandledGCPointerType(Ptr->getType()) && + "must be a gc pointer type"); #endif - relocationViaAlloca(F, DT, live, records); - return !records.empty(); + relocationViaAlloca(F, DT, Live, Records); + return !Records.empty(); } // Handles both return values and arguments for Functions and CallSites. template <typename AttrHolder> -static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, + unsigned Index) { AttrBuilder R; if (AH.getDereferenceableBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, @@ -2296,6 +2493,8 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, AH.getDereferenceableOrNullBytes(Index))); + if (AH.doesNotAlias(Index)) + R.addAttribute(Attribute::NoAlias); if (!R.empty()) AH.setAttributes(AH.getAttributes().removeAttributes( @@ -2303,25 +2502,25 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, } void -RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) { +RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1); + RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1); if (isa<PointerType>(F.getReturnType())) - RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); } -void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2344,9 +2543,9 @@ void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) - RemoveDerefAttrAtIndex(Ctx, CS, i + 1); + RemoveNonValidAttrAtIndex(Ctx, CS, i + 1); if (isa<PointerType>(CS.getType())) - RemoveDerefAttrAtIndex(Ctx, CS,
AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); } } } @@ -2356,7 +2555,7 @@ void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { static bool shouldRewriteStatepointsIn(Function &F) { // TODO: This should check the GCStrategy if (F.hasGC()) { - const char *FunctionGCName = F.getGC(); + const auto &FunctionGCName = F.getGC(); const StringRef StatepointExampleName("statepoint-example"); const StringRef CoreCLRName("coreclr"); return (StatepointExampleName == FunctionGCName) || @@ -2365,17 +2564,17 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { #ifndef NDEBUG assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) - stripDereferenceabilityInfoFromPrototype(F); + stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripDereferenceabilityInfoFromBody(F); + stripNonValidAttributesFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { @@ -2389,15 +2588,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + auto NeedsRewrite = [](Instruction &I) { + if (UseDeoptBundles) { + if (ImmutableCallSite CS = ImmutableCallSite(&I)) + return !callsGCLeafFunction(CS); + return false; + } + + return isStatepoint(I); + }; // Gather all the statepoints which need rewritten. Be careful to only // consider those in reachable code since we need to ask dominance queries // when rewriting. We'll delete the unreachable ones in a moment. SmallVector<CallSite, 64> ParsePointNeeded; bool HasUnreachableStatepoint = false; - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { // TODO: only the ones with the flag set! - if (isStatepoint(I)) { + if (NeedsRewrite(I)) { if (DT.isReachableFromEntry(I.getParent())) ParsePointNeeded.push_back(CallSite(&I)); else @@ -2428,7 +2639,38 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { FoldSingleEntryPHINodes(&BB); } - MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded); + // Before we start introducing relocations, we want to tweak the IR a bit to + // avoid unfortunate code generation effects. The main example is that we + // want to try to make sure the comparison feeding a branch is after any + // safepoints. Otherwise, we end up with a comparison of pre-relocation + // values feeding a branch after relocation. This is semantically correct, + // but results in extra register pressure since both the pre-relocation and + // post-relocation copies must be available in registers. For code without + // relocations this is handled elsewhere, but teaching the scheduler to + // reverse the transform we're about to do would be slightly complex. + // Note: This may extend the live range of the inputs to the icmp and thus + // increase the liveset of any statepoint we move over. This is profitable + // as long as all statepoints are in rare blocks. If we had in-register + // lowering for live values this would be a much safer transform. 
+ auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { + if (auto *BI = dyn_cast<BranchInst>(TI)) + if (BI->isConditional()) + return dyn_cast<Instruction>(BI->getCondition()); + // TODO: Extend this to handle switches + return nullptr; + }; + for (BasicBlock &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (auto *Cond = getConditionInst(TI)) + // TODO: Handle more than just ICmps here. We should be able to move + // most instructions without side effects or memory access. + if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) { + MadeChange = true; + Cond->moveBefore(TI); + } + } + + MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded); return MadeChange; } @@ -2461,7 +2703,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, "support for FCA unimplemented"); if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) { // The choice to exclude all things constant here is slightly subtle. - // There are two idependent reasons: + // There are two independent reasons: // - We assume that things which are constant (from LLVM's definition) // do not move at runtime. For example, the address of a global // variable is fixed, even though it's contents may not be. @@ -2599,7 +2841,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, } // while( !worklist.empty() ) #ifndef NDEBUG - // Sanity check our ouput against SSA properties. This helps catch any + // Sanity check our output against SSA properties. This helps catch any // missing kills during the above iteration. for (BasicBlock &BB : F) { checkBasicSSA(DT, Data, BB); @@ -2620,7 +2862,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, // call result is not live (normal), nor are it's arguments // (unless they're used again later). This adjustment is // specifically what we need to relocate - BasicBlock::reverse_iterator rend(Inst); + BasicBlock::reverse_iterator rend(Inst->getIterator()); computeLiveInValues(BB->rbegin(), rend, LiveOut); LiveOut.erase(Inst); Out.insert(LiveOut.begin(), LiveOut.end()); @@ -2669,5 +2911,5 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, assert(Updated.count(KVPair.first) && "record for non-live value"); #endif - Info.liveset = Updated; + Info.LiveSet = Updated; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 4d3a708..8569e08 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" @@ -479,6 +480,13 @@ private: void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } + void visitFuncletPadInst(FuncletPadInst &FPI) { + markAnythingOverdefined(&FPI); + } + void visitCatchSwitchInst(CatchSwitchInst &CPI) { + markAnythingOverdefined(&CPI); + visitTerminatorInst(CPI); + } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -539,9 +547,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - if (isa<InvokeInst>(TI)) { - // Invoke instructions successors are always executable. 
- Succs[0] = Succs[1] = true; + // Unwinding instructions successors are always executable. + if (TI.isExceptional()) { + Succs.assign(TI.getNumSuccessors(), true); return; } @@ -605,8 +613,8 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return BI->getSuccessor(CI->isZero()) == To; } - // Invoke instructions successors are always executable. - if (isa<InvokeInst>(TI)) + // Unwinding instructions successors are always executable. + if (TI->isExceptional()) return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { @@ -630,7 +638,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(nullptr); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an @@ -749,9 +757,14 @@ void SCCPSolver::visitCastInst(CastInst &I) { LatticeVal OpSt = getValueState(I.getOperand(0)); if (OpSt.isOverdefined()) // Inherit overdefinedness of operand markOverdefined(&I); - else if (OpSt.isConstant()) // Propagate constant value - markConstant(&I, ConstantExpr::getCast(I.getOpcode(), - OpSt.getConstant(), I.getType())); + else if (OpSt.isConstant()) { + Constant *C = + ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType()); + if (isa<UndefValue>(C)) + return; + // Propagate constant value + markConstant(&I, C); + } } @@ -851,10 +864,14 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; - if (V1State.isConstant() && V2State.isConstant()) - return markConstant(IV, &I, - ConstantExpr::get(I.getOpcode(), V1State.getConstant(), - V2State.getConstant())); + if (V1State.isConstant() && V2State.isConstant()) { + Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(), + V2State.getConstant()); + // X op Y -> undef. + if (isa<UndefValue>(C)) + return; + return markConstant(IV, &I, C); + } // If something is undef, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) @@ -909,10 +926,13 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; - if (V1State.isConstant() && V2State.isConstant()) - return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(), - V1State.getConstant(), - V2State.getConstant())); + if (V1State.isConstant() && V2State.isConstant()) { + Constant *C = ConstantExpr::getCompare( + I.getPredicate(), V1State.getConstant(), V2State.getConstant()); + if (isa<UndefValue>(C)) + return; + return markConstant(IV, &I, C); + } // If operands are still undefined, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) @@ -1012,8 +1032,11 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { Constant *Ptr = Operands[0]; auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end()); - markConstant(&I, ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, - Indices)); + Constant *C = + ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices); + if (isa<UndefValue>(C)) + return; + markConstant(&I, C); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1053,9 +1076,9 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { Constant *Ptr = PtrVal.getConstant(); - // load null -> null + // load null is undefined. 
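Each hunk above adds the same guard. In isolated form (a sketch using the solver's helpers; LHS and RHS stand for the folded operand constants): when folding yields undef, leave the lattice cell untouched so ResolvedUndefsIn can force a concrete value later, rather than pinning the cell to an undef "constant".

    Constant *C = ConstantExpr::get(I.getOpcode(), LHS, RHS);
    if (isa<UndefValue>(C))
      return;            // stay 'undefined'; ResolvedUndefsIn decides later
    markConstant(&I, C); // otherwise propagate the folded constant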
if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0) - return markConstant(IV, &I, UndefValue::get(I.getType())); + return; // Transform load (constant global) into the value loaded. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { @@ -1071,8 +1094,11 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) { + if (isa<UndefValue>(C)) + return; return markConstant(IV, &I, C); + } // Otherwise we cannot say for certain what value this load will produce. // Bail out. @@ -1114,8 +1140,12 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands, TLI)) + if (Constant *C = ConstantFoldCall(F, Operands, TLI)) { + // call -> undef. + if (isa<UndefValue>(C)) + return; return markConstant(I, C); + } } // Otherwise, we don't know anything about this call, mark it overdefined. @@ -1126,7 +1156,7 @@ CallOverdefined: // entry block executable and merge in the actual arguments to the call into // the formal arguments of the function. if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ - MarkBlockExecutable(F->begin()); + MarkBlockExecutable(&F->front()); // Propagate information from this call site into the callee. CallSite::arg_iterator CAI = CS.arg_begin(); @@ -1135,17 +1165,17 @@ CallOverdefined: // If this argument is byval, and if the function is not readonly, there // will be an implicit copy formed of the input aggregate. if (AI->hasByValAttr() && !F->onlyReadsMemory()) { - markOverdefined(AI); + markOverdefined(&*AI); continue; } if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); - mergeInValue(getStructValueState(AI, i), AI, CallArg); + mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { - mergeInValue(AI, getValueState(*CAI)); + mergeInValue(&*AI, getValueState(*CAI)); } } } @@ -1246,18 +1276,18 @@ void SCCPSolver::Solve() { /// even if X isn't defined. bool SCCPSolver::ResolvedUndefsIn(Function &F) { for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BBExecutable.count(BB)) + if (!BBExecutable.count(&*BB)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Look for instructions which produce undef values. - if (I->getType()->isVoidTy()) continue; + if (I.getType()->isVoidTy()) continue; - if (StructType *STy = dyn_cast<StructType>(I->getType())) { + if (StructType *STy = dyn_cast<StructType>(I.getType())) { // Only a few things that can be structs matter for undef. // Tracked calls must never be marked overdefined in ResolvedUndefsIn. - if (CallSite CS = CallSite(I)) + if (CallSite CS = CallSite(&I)) if (Function *F = CS.getCalledFunction()) if (MRVFunctionsTracked.count(F)) continue; @@ -1270,14 +1300,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Send the results of everything else to overdefined. We could be // more precise than this but it isn't worth bothering. 
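// [Editorial sketch -- not part of this patch.] The mechanical `&*AI`,
// `&*BB`, `CallSite(&I)` and `&F->front()` changes throughout these hunks
// appear to share one cause: LLVM's intrusive-list iterators stopped
// converting implicitly to the element pointer around this import, so code
// that passed an iterator where a `T*` was expected must now dereference and
// take the address explicitly. The same idiom in plain C++:
#include <list>

static void take(int *P) { (void)P; }

int main() {
  std::list<int> L = {1, 2, 3};
  for (std::list<int>::iterator I = L.begin(); I != L.end(); ++I)
    take(&*I); // not `take(I)`: the iterator is no longer a pointer
}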
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); + LatticeVal &LV = getStructValueState(&I, i); if (LV.isUndefined()) - markOverdefined(LV, I); + markOverdefined(LV, &I); } continue; } - LatticeVal &LV = getValueState(I); + LatticeVal &LV = getValueState(&I); if (!LV.isUndefined()) continue; // extractvalue is safe; check here because the argument is a struct. @@ -1287,24 +1317,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Compute the operand LatticeVals, for convenience below. // Anything taking a struct is conservatively assumed to require // overdefined markings. - if (I->getOperand(0)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getOperand(0)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - LatticeVal Op0LV = getValueState(I->getOperand(0)); + LatticeVal Op0LV = getValueState(I.getOperand(0)); LatticeVal Op1LV; - if (I->getNumOperands() == 2) { - if (I->getOperand(1)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getNumOperands() == 2) { + if (I.getOperand(1)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); } // If this is an instructions whose result is defined even if the input is // not fully defined, propagate the information. - Type *ITy = I->getType(); - switch (I->getOpcode()) { + Type *ITy = I.getType(); + switch (I.getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Trunc: @@ -1318,9 +1348,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::FRem: // Floating-point binary operation: be conservative. if (Op0LV.isUndefined() && Op1LV.isUndefined()) - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::ZExt: case Instruction::SExt: @@ -1332,7 +1362,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::SIToFP: case Instruction::UIToFP: // undef -> 0; some outputs are impossible - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: @@ -1341,7 +1371,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Or: @@ -1349,7 +1379,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op0LV.isUndefined() && Op1LV.isUndefined()) break; // undef | X -> -1. X could be -1. - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::Xor: @@ -1357,7 +1387,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // necessary, but we try to be nice to people who expect this // behavior in simple cases if (Op0LV.isUndefined() && Op1LV.isUndefined()) { - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; } // undef ^ X -> undef @@ -1371,9 +1401,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // X % undef -> undef. No change. if (Op1LV.isUndefined()) break; + // X / 0 -> undef. No change. + // X % 0 -> undef. No change. 
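// [Editorial sketch -- not part of this patch.] The new early `break` just
// below handles a divisor that is a *known zero*: the division result is
// undef, so the cell is left unresolved rather than forced. Only when the
// divisor is a real value (and not itself undefined) is an undefined
// dividend plugged with 0, since 0 / X == 0 and 0 % X == 0 for any nonzero
// X. A toy decision table, with invented names:
enum class Action { LeaveAlone, ForceZero };

Action resolveUndefDiv(bool DivisorUndefined, bool DivisorKnownZero) {
  if (DivisorUndefined || DivisorKnownZero)
    return Action::LeaveAlone; // X / undef and X / 0 are both undef.
  return Action::ForceZero;    // undef / X: picking 0 for undef is valid.
}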
+ if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue()) + break; + // undef / X -> 0. X could be maxint. // undef % X -> 0. X could be 1. - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::AShr: @@ -1381,7 +1416,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op1LV.isUndefined()) break; // undef >>a X -> all ones - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: @@ -1391,17 +1426,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef << X -> 0 // undef >> X -> 0 - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Select: - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); } else if (Op1LV.isUndefined()) { // c ? undef : undef -> undef. No change. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); if (Op1LV.isUndefined()) break; // Otherwise, c ? undef : x -> x. @@ -1410,9 +1445,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } if (Op1LV.isConstant()) - markForcedConstant(I, Op1LV.getConstant()); + markForcedConstant(&I, Op1LV.getConstant()); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Load: // A load here means one of two things: a load of undef from a global, @@ -1421,9 +1456,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; case Instruction::ICmp: // X == undef -> undef. Other comparisons get more complicated. - if (cast<ICmpInst>(I)->isEquality()) + if (cast<ICmpInst>(&I)->isEquality()) break; - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Call: case Instruction::Invoke: { @@ -1432,19 +1467,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // 2. It could be constant-foldable. // Because of the way we solve return values, tracked calls must // never be marked overdefined in ResolvedUndefsIn. - if (Function *F = CallSite(I).getCalledFunction()) + if (Function *F = CallSite(&I).getCalledFunction()) if (TrackedRetVals.count(F)) break; // If the call is constant-foldable, we mark it overdefined because // we do not know what return values are valid. - markOverdefined(I); + markOverdefined(&I); return true; } default: // If we don't know what should happen here, conservatively mark it // overdefined. - markOverdefined(I); + markOverdefined(&I); return true; } } @@ -1462,7 +1497,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // false. if (isa<UndefValue>(BI->getCondition())) { BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(BB, TI->getSuccessor(1)); + markEdgeExecutable(&*BB, TI->getSuccessor(1)); return true; } @@ -1484,7 +1519,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // the first constant. 
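// [Editorial sketch -- not part of this patch.] In the branch case above and
// the switch case below, a condition that is still undefined after the solve
// converges blocks all progress: no successor edge is feasible. The code
// breaks the deadlock by pinning the condition to an arbitrary but fixed
// concrete value (false for branches, the first case for switches), marking
// that one edge executable, and re-running the solver. Toy model, invented
// names; successor index 1 is the false edge, as in BranchInst:
#include <vector>

struct ToyBranch {
  bool ConditionKnown = false;
  bool Condition = false;
};

// Returns true if anything changed and the solver should re-run.
// EdgeExecutable must have one entry per successor (at least two here).
bool resolveUndefBranch(ToyBranch &B, std::vector<bool> &EdgeExecutable) {
  if (B.ConditionKnown)
    return false;
  B.ConditionKnown = true;
  B.Condition = false;      // Arbitrary but fixed choice, like the patch.
  EdgeExecutable[1] = true; // Only the false edge becomes executable.
  return true;
}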
if (isa<UndefValue>(SI->getCondition())) { SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); + markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor()); return true; } @@ -1506,6 +1541,7 @@ namespace { struct SCCP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { @@ -1541,11 +1577,10 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; + Instruction *Inst = &*--EndInst->getIterator(); if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad()) { EndInst = Inst; continue; } @@ -1568,11 +1603,11 @@ bool SCCP::runOnFunction(Function &F) { SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. - Solver.MarkBlockExecutable(F.begin()); + Solver.MarkBlockExecutable(&F.front()); // Mark all arguments to the function as being overdefined. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F.args()) + Solver.markAnythingOverdefined(&AI); // Solve for constants. bool ResolvedUndefs = true; @@ -1589,8 +1624,8 @@ bool SCCP::runOnFunction(Function &F) { // as we cannot modify the CFG of the function. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; continue; } @@ -1599,7 +1634,7 @@ bool SCCP::runOnFunction(Function &F) { // constants if we have found them to be of constant values. // for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; @@ -1713,36 +1748,34 @@ bool IPSCCP::runOnModule(Module &M) { // If this is a strong or ODR definition of this function, then we can // propagate information about its result into callsites of it. if (!F->mayBeOverridden()) - Solver.AddTrackedFunction(F); + Solver.AddTrackedFunction(&*F); // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F->hasLocalLinkage()) { - if (AddressIsTaken(F)) - AddressTakenFunctions.insert(F); + if (AddressIsTaken(&*F)) + AddressTakenFunctions.insert(&*F); else { - Solver.AddArgumentTrackedFunction(F); + Solver.AddArgumentTrackedFunction(&*F); continue; } } // Assume the function is called. - Solver.MarkBlockExecutable(F->begin()); + Solver.MarkBlockExecutable(&F->front()); // Assume nothing about the incoming arguments. - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F->args()) + Solver.markAnythingOverdefined(&AI); } // Loop over global variables. We inform the solver about any internal global // variables that do not have their 'addresses taken'. 
If they don't have // their addresses taken, we can propagate constants through them. - for (Module::global_iterator G = M.global_begin(), E = M.global_end(); - G != E; ++G) - if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) - Solver.TrackValueOfGlobalVariable(G); + for (GlobalVariable &G : M.globals()) + if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) + Solver.TrackValueOfGlobalVariable(&G); // Solve for constants. bool ResolvedUndefs = true; @@ -1763,7 +1796,10 @@ bool IPSCCP::runOnModule(Module &M) { SmallVector<BasicBlock*, 512> BlocksToErase; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (Solver.isBlockExecutable(F->begin())) { + if (F->isDeclaration()) + continue; + + if (Solver.isBlockExecutable(&F->front())) { for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { if (AI->use_empty() || AI->getType()->isStructTy()) continue; @@ -1771,7 +1807,7 @@ bool IPSCCP::runOnModule(Module &M) { // TODO: Could use getStructLatticeValueFor to find out if the entire // result is a constant and replace it entirely if so. - LatticeVal IV = Solver.getLatticeValueFor(AI); + LatticeVal IV = Solver.getLatticeValueFor(&*AI); if (IV.isOverdefined()) continue; Constant *CST = IV.isConstant() ? @@ -1786,28 +1822,27 @@ bool IPSCCP::runOnModule(Module &M) { } for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { if (!Succ->empty() && isa<PHINode>(Succ->begin())) - TI->getSuccessor(i)->removePredecessor(BB); + Succ->removePredecessor(&*BB); } if (!TI->use_empty()) TI->replaceAllUsesWith(UndefValue::get(TI->getType())); TI->eraseFromParent(); - new UnreachableInst(M.getContext(), BB); + new UnreachableInst(M.getContext(), &*BB); if (&*BB != &F->front()) - BlocksToErase.push_back(BB); + BlocksToErase.push_back(&*BB); continue; } for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) continue; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 947513a..a7361b5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,12 +23,12 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -37,8 +37,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -53,9 +51,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TimeValue.h" #include 
"llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 @@ -63,6 +61,7 @@ #endif using namespace llvm; +using namespace llvm::sroa; #define DEBUG_TYPE "sroa" @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), - cl::Hidden); - /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -205,7 +199,6 @@ template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; } -namespace { /// \brief Representation of the alloca slices. /// /// This class represents the slices of an alloca which are formed by its @@ -213,7 +206,7 @@ namespace { /// for the slices used and we reflect that in this structure. The uses are /// stored, sorted by increasing beginning offset and with unsplittable slices /// starting at a particular offset before splittable slices. -class AllocaSlices { +class llvm::sroa::AllocaSlices { public: /// \brief Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); @@ -253,281 +246,10 @@ public: std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } - // Forward declare an iterator to befriend it. + // Forward declare the iterator and range accessor for walking the + // partitions. class partition_iterator; - - /// \brief A partition of the slices. - /// - /// An ephemeral representation for a range of slices which can be viewed as - /// a partition of the alloca. This range represents a span of the alloca's - /// memory which cannot be split, and provides access to all of the slices - /// overlapping some part of the partition. - /// - /// Objects of this type are produced by traversing the alloca's slices, but - /// are only ephemeral and not persistent. - class Partition { - private: - friend class AllocaSlices; - friend class AllocaSlices::partition_iterator; - - /// \brief The begining and ending offsets of the alloca for this partition. - uint64_t BeginOffset, EndOffset; - - /// \brief The start end end iterators of this partition. - iterator SI, SJ; - - /// \brief A collection of split slice tails overlapping the partition. - SmallVector<Slice *, 4> SplitTails; - - /// \brief Raw constructor builds an empty partition starting and ending at - /// the given iterator. - Partition(iterator SI) : SI(SI), SJ(SI) {} - - public: - /// \brief The start offset of this partition. - /// - /// All of the contained slices start at or after this offset. - uint64_t beginOffset() const { return BeginOffset; } - - /// \brief The end offset of this partition. - /// - /// All of the contained slices end at or before this offset. - uint64_t endOffset() const { return EndOffset; } - - /// \brief The size of the partition. - /// - /// Note that this can never be zero. 
- uint64_t size() const { - assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); - return EndOffset - BeginOffset; - } - - /// \brief Test whether this partition contains no slices, and merely spans - /// a region occupied by split slices. - bool empty() const { return SI == SJ; } - - /// \name Iterate slices that start within the partition. - /// These may be splittable or unsplittable. They have a begin offset >= the - /// partition begin offset. - /// @{ - // FIXME: We should probably define a "concat_iterator" helper and use that - // to stitch together pointee_iterators over the split tails and the - // contiguous iterators of the partition. That would give a much nicer - // interface here. We could then additionally expose filtered iterators for - // split, unsplit, and unsplittable splices based on the usage patterns. - iterator begin() const { return SI; } - iterator end() const { return SJ; } - /// @} - - /// \brief Get the sequence of split slice tails. - /// - /// These tails are of slices which start before this partition but are - /// split and overlap into the partition. We accumulate these while forming - /// partitions. - ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } - }; - - /// \brief An iterator over partitions of the alloca's slices. - /// - /// This iterator implements the core algorithm for partitioning the alloca's - /// slices. It is a forward iterator as we don't support backtracking for - /// efficiency reasons, and re-use a single storage area to maintain the - /// current set of split slices. - /// - /// It is templated on the slice iterator type to use so that it can operate - /// with either const or non-const slice iterators. - class partition_iterator - : public iterator_facade_base<partition_iterator, - std::forward_iterator_tag, Partition> { - friend class AllocaSlices; - - /// \brief Most of the state for walking the partitions is held in a class - /// with a nice interface for examining them. - Partition P; - - /// \brief We need to keep the end of the slices to know when to stop. - AllocaSlices::iterator SE; - - /// \brief We also need to keep track of the maximum split end offset seen. - /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; - - /// \brief Sets the partition to be empty at given iterator, and sets the - /// end iterator. - partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { - // If not already at the end, advance our state to form the initial - // partition. - if (SI != SE) - advance(); - } - - /// \brief Advance the iterator to the next partition. - /// - /// Requires that the iterator not be at the end of the slices. - void advance() { - assert((P.SI != SE || !P.SplitTails.empty()) && - "Cannot advance past the end of the slices!"); - - // Clear out any split uses which have ended. - if (!P.SplitTails.empty()) { - if (P.EndOffset >= MaxSplitSliceEndOffset) { - // If we've finished all splits, this is easy. - P.SplitTails.clear(); - MaxSplitSliceEndOffset = 0; - } else { - // Remove the uses which have ended in the prior partition. This - // cannot change the max split slice end because we just checked that - // the prior partition ended prior to that max. 
- P.SplitTails.erase( - std::remove_if( - P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && - "Could not find the current max split slice offset!"); - assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && - "Max split slice end offset is not actually the max!"); - } - } - - // If P.SI is already at the end, then we've cleared the split tail and - // now have an end iterator. - if (P.SI == SE) { - assert(P.SplitTails.empty() && "Failed to clear the split slices!"); - return; - } - - // If we had a non-empty partition previously, set up the state for - // subsequent partitions. - if (P.SI != P.SJ) { - // Accumulate all the splittable slices which started in the old - // partition into the split list. - for (Slice &S : P) - if (S.isSplittable() && S.endOffset() > P.EndOffset) { - P.SplitTails.push_back(&S); - MaxSplitSliceEndOffset = - std::max(S.endOffset(), MaxSplitSliceEndOffset); - } - - // Start from the end of the previous partition. - P.SI = P.SJ; - - // If P.SI is now at the end, we at most have a tail of split slices. - if (P.SI == SE) { - P.BeginOffset = P.EndOffset; - P.EndOffset = MaxSplitSliceEndOffset; - return; - } - - // If the we have split slices and the next slice is after a gap and is - // not splittable immediately form an empty partition for the split - // slices up until the next slice begins. - if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && - !P.SI->isSplittable()) { - P.BeginOffset = P.EndOffset; - P.EndOffset = P.SI->beginOffset(); - return; - } - } - - // OK, we need to consume new slices. Set the end offset based on the - // current slice, and step SJ past it. The beginning offset of the - // parttion is the beginning offset of the next slice unless we have - // pre-existing split slices that are continuing, in which case we begin - // at the prior end offset. - P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; - P.EndOffset = P.SI->endOffset(); - ++P.SJ; - - // There are two strategies to form a partition based on whether the - // partition starts with an unsplittable slice or a splittable slice. - if (!P.SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at - // the first slice and will extend through its end. - assert(P.BeginOffset == P.SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - if (!P.SJ->isSplittable()) - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // We have a partition across a set of overlapping unsplittable - // partitions. - return; - } - - // If we're starting with a splittable slice, then we need to form - // a synthetic partition spanning it and any other overlapping splittable - // splices. - assert(P.SI->isSplittable() && "Forming a splittable partition!"); - - // Collect all of the overlapping splittable slices. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && - P.SJ->isSplittable()) { - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // Back upiP.EndOffset if we ended the span early when encountering an - // unsplittable slice. 
This synthesizes the early end offset of - // a partition spanning only splittable slices. - if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - assert(!P.SJ->isSplittable()); - P.EndOffset = P.SJ->beginOffset(); - } - } - - public: - bool operator==(const partition_iterator &RHS) const { - assert(SE == RHS.SE && - "End iterators don't match between compared partition iterators!"); - - // The observed positions of partitions is marked by the P.SI iterator and - // the emptyness of the split slices. The latter is only relevant when - // P.SI == SE, as the end iterator will additionally have an empty split - // slices list, but the prior may have the same P.SI and a tail of split - // slices. - if (P.SI == RHS.P.SI && - P.SplitTails.empty() == RHS.P.SplitTails.empty()) { - assert(P.SJ == RHS.P.SJ && - "Same set of slices formed two different sized partitions!"); - assert(P.SplitTails.size() == RHS.P.SplitTails.size() && - "Same slice position with differently sized non-empty split " - "slice tails!"); - return true; - } - return false; - } - - partition_iterator &operator++() { - advance(); - return *this; - } - - Partition &operator*() { return P; } - }; - - /// \brief A forward range over the partitions of the alloca's slices. - /// - /// This accesses an iterator range over the partitions of the alloca's - /// slices. It computes these partitions on the fly based on the overlapping - /// offsets of the slices and the ability to split them. It will visit "empty" - /// partitions to cover regions of the alloca only accessed via split - /// slices. - iterator_range<partition_iterator> partitions() { - return make_range(partition_iterator(begin(), end()), - partition_iterator(end(), end())); - } + iterator_range<partition_iterator> partitions(); /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -595,6 +317,280 @@ private: /// the alloca. SmallVector<Use *, 8> DeadOperands; }; + +/// \brief A partition of the slices. +/// +/// An ephemeral representation for a range of slices which can be viewed as +/// a partition of the alloca. This range represents a span of the alloca's +/// memory which cannot be split, and provides access to all of the slices +/// overlapping some part of the partition. +/// +/// Objects of this type are produced by traversing the alloca's slices, but +/// are only ephemeral and not persistent. +class llvm::sroa::Partition { +private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + typedef AllocaSlices::iterator iterator; + + /// \brief The beginning and ending offsets of the alloca for this + /// partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + +public: + /// \brief The start offset of this partition. + /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } + + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. 
+ uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. + /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } +}; + +/// \brief An iterator over partitions of the alloca's slices. +/// +/// This iterator implements the core algorithm for partitioning the alloca's +/// slices. It is a forward iterator as we don't support backtracking for +/// efficiency reasons, and re-use a single storage area to maintain the +/// current set of split slices. +/// +/// It is templated on the slice iterator type to use so that it can operate +/// with either const or non-const slice iterators. +class AllocaSlices::partition_iterator + : public iterator_facade_base<partition_iterator, std::forward_iterator_tag, + Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. + P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. 
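// [Editorial sketch -- not part of this patch.] The statement just below is
// the standard erase/remove idiom: std::remove_if compacts the elements to
// keep at the front and returns the new logical end, and erase() then drops
// the tail. Here it prunes split-slice tails whose end falls at or before
// the current partition's end offset. The same idiom on plain data:
#include <algorithm>
#include <cstdint>
#include <vector>

void pruneEnded(std::vector<uint64_t> &EndOffsets, uint64_t CurrentEnd) {
  EndOffsets.erase(std::remove_if(EndOffsets.begin(), EndOffsets.end(),
                                  [&](uint64_t E) { return E <= CurrentEnd; }),
                   EndOffsets.end());
}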
+ P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. + if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // partition is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. 
This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + +public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptiness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. + if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } +}; + +/// \brief A forward range over the partitions of the alloca's slices. +/// +/// This accesses an iterator range over the partitions of the alloca's +/// slices. It computes these partitions on the fly based on the overlapping +/// offsets of the slices and the ability to split them. It will visit "empty" +/// partitions to cover regions of the alloca only accessed via split +/// slices. +iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); } static Value *foldSelectInst(SelectInst &SI) { @@ -1072,217 +1068,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst &AI; - DIBuilder &DIB; - - SmallVector<DbgDeclareInst *, 4> DDIs; - SmallVector<DbgValueInst *, 4> DVIs; - -public: - AllocaPromoter(ArrayRef<const Instruction *> Insts, - SSAUpdater &S, - AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - - void run(const SmallVectorImpl<Instruction *> &Insts) { - // Retain the debug information attached to the alloca for use when - // rewriting loads and stores. - if (auto *L = LocalAsMetadata::getIfExists(&AI)) { - if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) { - for (User *U : DINode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - - // While we have the debug information, clear it off of the alloca. The - // caller takes care of deleting the alloca. 
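// [Editorial sketch -- not part of this patch.] A condensed model of the
// sweep that partition_iterator::advance, defined above, performs: slices
// are sorted by begin offset, and each partition greedily absorbs every
// later slice that starts before the partition's current end. The real code
// additionally distinguishes splittable from unsplittable slices and emits
// empty partitions for split tails; this model merges unconditionally:
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Interval = std::pair<uint64_t, uint64_t>; // [begin, end), pre-sorted

std::vector<Interval> sweepPartitions(const std::vector<Interval> &Slices) {
  std::vector<Interval> Parts;
  size_t I = 0;
  while (I != Slices.size()) {
    uint64_t Begin = Slices[I].first, End = Slices[I].second;
    // Absorb every later slice that starts before the current end, growing
    // the partition to cover it.
    size_t J = I + 1;
    while (J != Slices.size() && Slices[J].first < End) {
      End = std::max(End, Slices[J].second);
      ++J;
    }
    Parts.push_back({Begin, End});
    I = J;
  }
  return Parts;
}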
- while (!DDIs.empty()) - DDIs.pop_back_val()->eraseFromParent(); - while (!DVIs.empty()) - DVIs.pop_back_val()->eraseFromParent(); - } - - bool - isInstInList(Instruction *I, - const SmallVectorImpl<Instruction *> &Insts) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast<StoreInst>(I)->getPointerOperand(); - - // Only used to detect cycles, which will be rare and quickly found as - // we're walking up a chain of defs rather than down through uses. - SmallPtrSet<Value *, 4> Visited; - - do { - if (Ptr == &AI) - return true; - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) - Ptr = BCI->getOperand(0); - else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) - Ptr = GEPI->getPointerOperand(); - else - return false; - - } while (Visited.insert(Ptr).second); - - return false; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (DbgDeclareInst *DDI : DDIs) - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - for (DbgValueInst *DVI : DVIs) { - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getValueOperand(); - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getPointerOperand(); - } else { - continue; - } - DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), - DVI->getExpression(), DVI->getDebugLoc(), - Inst); - } - } -}; -} // end anon namespace - -namespace { -/// \brief An optimization pass providing Scalar Replacement of Aggregates. -/// -/// This pass takes allocations which can be completely analyzed (that is, they -/// don't escape) and tries to turn them into scalar SSA values. There are -/// a few steps to this process. -/// -/// 1) It takes allocations of aggregates and analyzes the ways in which they -/// are used to try to split them into smaller allocations, ideally of -/// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate individual scalar accesses. -/// 2) It will transform accesses into forms which are suitable for SSA value -/// promotion. This can be replacing a memset with a scalar store of an -/// integer value, or it can involve speculating operations on a PHI or -/// select to be a PHI or select of the results. -/// 3) Finally, this will try to detect a pattern of accesses which map cleanly -/// onto insert and extract operations on a vector value, and convert them to -/// this form. By doing so, it will enable promotion of vector aggregates to -/// SSA vector values. -class SROA : public FunctionPass { - const bool RequiresDomTree; - - LLVMContext *C; - DominatorTree *DT; - AssumptionCache *AC; - - /// \brief Worklist of alloca instructions to simplify. - /// - /// Each alloca in the function is added to this. Each new alloca formed gets - /// added to it as well to recursively simplify unless that alloca can be - /// directly promoted. 
Finally, each time we rewrite a use of an alloca other - /// the one being actively rewritten, we add it back onto the list if not - /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; - - /// \brief A collection of instructions to delete. - /// We try to batch deletions to simplify code and make things a bit more - /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; - - /// \brief Post-promotion worklist. - /// - /// Sometimes we discover an alloca which has a high probability of becoming - /// viable for SROA after a round of promotion takes place. In those cases, - /// the alloca is enqueued here for re-processing. - /// - /// Note that we have to be very careful to clear allocas out of this list in - /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; - - /// \brief A collection of alloca instructions we can directly promote. - std::vector<AllocaInst *> PromotableAllocas; - - /// \brief A worklist of PHIs to speculate prior to promoting allocas. - /// - /// All of these PHIs have been checked for the safety of speculation and by - /// being speculated will allow promoting allocas currently in the promotable - /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; - - /// \brief A worklist of select instructions to speculate prior to promoting - /// allocas. - /// - /// All of these select instructions have been checked for the safety of - /// speculation and by being speculated will allow promoting allocas - /// currently in the promotable queue. - SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; - -public: - SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DT(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - - const char *getPassName() const override { return "SROA"; } - static char ID; - -private: - friend class PHIOrSelectSpeculator; - friend class AllocaSliceRewriter; - - bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); - AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P); - bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); - bool runOnAlloca(AllocaInst &AI); - void clobberUse(Use &U); - void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); - bool promoteAllocas(Function &F); -}; -} - -char SROA::ID = 0; - -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { - return new SROA(RequiresDomTree); -} - -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) - /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. static Type *findCommonType(AllocaSlices::const_iterator B, @@ -1373,7 +1158,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
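// [Editorial sketch -- not part of this patch.] The loop in the hunk just
// below walks the instructions from the PHI down to the load and rejects
// speculation if any of them may write to memory: hoisting the load up into
// the predecessors is only sound if nothing in between could change the
// loaded value. The shape of that scan, modeled on a plain sequence:
#include <vector>

struct ToyInst { bool MayWrite = false; };

// PhiIdx precedes LoadIdx within the same block.
bool nothingWritesBetween(const std::vector<ToyInst> &Block, size_t PhiIdx,
                          size_t LoadIdx) {
  for (size_t I = PhiIdx; I != LoadIdx; ++I)
    if (Block[I].MayWrite)
      return false;
  return true;
}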
- for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; @@ -1934,10 +1719,10 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// \brief Test whether the given slice use can be promoted to a vector. /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated /// for a single slice. -static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, - const Slice &S, VectorType *Ty, +static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, + VectorType *Ty, uint64_t ElementSize, const DataLayout &DL) { // First validate the slice offsets. @@ -2012,8 +1797,7 @@ static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, - const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -2130,7 +1914,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. + // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; @@ -2199,7 +1983,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, +static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. @@ -2368,14 +2152,14 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, return V; } -namespace { /// \brief Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { +class llvm::sroa::AllocaSliceRewriter + : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. 
friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; @@ -2583,9 +2367,19 @@ private: V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) - V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, - "extract"); + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { + IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); + V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); + } + // It is possible that the extracted type is not the load type. This + // happens if there is a load past the end of the alloca, and as + // a consequence the slice is narrower but still a candidate for integer + // lowering. To handle this case, we just zero extend the extracted + // integer. + assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && + "Can only handle an extract for an overly wide load"); + if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) + V = IRB.CreateZExt(V, LI.getType()); return V; } @@ -2648,7 +2442,7 @@ private: DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -3126,7 +2920,7 @@ private: // dominate the PHI. IRBuilderTy PtrBuilder(IRB); if (isa<PHINode>(OldPtr)) - PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); + PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); else PtrBuilder.SetInsertPoint(OldPtr); PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); @@ -3169,7 +2963,6 @@ private: return true; } }; -} namespace { /// \brief Visitor to rewrite aggregate loads and stores as scalar. @@ -3181,8 +2974,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &DL; - /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3194,8 +2985,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} - /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { @@ -3711,7 +3500,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; }), Stores.end()); - // Now we have to go *back* through all te stores, because a later store may + // Now we have to go *back* through all the stores, because a later store may // have caused an earlier store's load to become unsplittable and if it is // unsplittable for the later store, then we can't rely on it being split in // the earlier store either. 
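// [Editorial sketch -- not part of this patch.] The visitLoadInst hunk above
// handles a load that reads past the end of the alloca: the slice is
// narrower than the load's type, so only SliceSize * 8 bits are extracted
// from the rewritten alloca and the result is zero-extended back to the
// load's full width. The same bit manipulation on plain integers
// (little-endian layout, extraction at offset 0 assumed):
#include <cstdint>

uint64_t extractAndExtend(uint64_t WideValue, unsigned SliceBits) {
  // Keep only the low SliceBits (the "extract"); the value is then already
  // zero-extended within the wider uint64_t (the "zext").
  uint64_t Mask = SliceBits >= 64 ? ~0ULL : ((1ULL << SliceBits) - 1);
  return WideValue & Mask;
}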
@@ -3773,7 +3562,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); @@ -3825,7 +3614,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } Value *StoreBasePtr = SI->getPointerOperand(); - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); @@ -3914,7 +3703,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (SplitLoads) { PLoad = (*SplitLoads)[Idx]; } else { - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3924,7 +3713,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } // And store this partition. - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3972,7 +3761,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Mark the original store as dead now that we've split it up and kill its // slice. Note that we leave the original load in place unless this store - // was its ownly use. It may in turn be split up if it is an alloca load + // was its only use. It may in turn be split up if it is an alloca load // for some other alloca, but it may be a normal load. This may introduce // redundant loads, but where those can be merged the rest of the optimizer // should handle the merging, and this uncovers SSA splits which is more @@ -4024,7 +3813,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// at enabling promotion and if it was successful queues the alloca to be /// promoted. AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P) { + Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. @@ -4230,12 +4019,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); // Migrate debug information from the old alloca to the new alloca(s) - // and the individial partitions. + // and the individual partitions. if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { auto *Var = DbgDecl->getVariable(); auto *Expr = DbgDecl->getExpression(); - DIBuilder DIB(*AI.getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); bool IsSplit = Pieces.size() > 1; for (auto Piece : Pieces) { // Create a piece expression describing the new partition or reuse AI's @@ -4308,7 +4096,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(DL); + AggLoadStoreRewriter AggRewriter; Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. 
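// [Editorial sketch -- not part of this patch.] presplitLoadsAndStores,
// rewritten above, replaces one wide load/store pair with one pair per
// partition, each at an adjusted pointer offset. On plain integers, the
// per-partition value that each new store writes is just a shifted, masked
// chunk of the original wide value (little-endian, PartOffsetBytes < 8 and
// PartSizeBytes <= 8 assumed for this model):
#include <cstdint>

uint64_t partitionChunk(uint64_t WideValue, unsigned PartOffsetBytes,
                        unsigned PartSizeBytes) {
  uint64_t Shifted = WideValue >> (8 * PartOffsetBytes);
  unsigned Bits = 8 * PartSizeBytes;
  uint64_t Mask = Bits >= 64 ? ~0ULL : ((1ULL << Bits) - 1);
  return Shifted & Mask;
}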
@@ -4388,107 +4176,29 @@ void SROA::deleteDeadInstructions( } } -static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl<Instruction *> &Worklist, - SmallPtrSetImpl<Instruction *> &Visited) { - for (User *U : I.users()) - if (Visited.insert(cast<Instruction>(U)).second) - Worklist.push_back(cast<Instruction>(U)); -} - /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; NumPromoted += PromotableAllocas.size(); - if (DT && !ForceSSAUpdater) { - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); - PromotableAllocas.clear(); - return true; - } - - DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); - SSAUpdater SSA; - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - SmallVector<Instruction *, 64> Insts; - - // We need a worklist to walk the uses of each alloca. - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<Instruction *, 32> DeadInsts; - - for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { - AllocaInst *AI = PromotableAllocas[Idx]; - Insts.clear(); - Worklist.clear(); - Visited.clear(); - - enqueueUsersInWorklist(*AI, Worklist, Visited); - - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - // FIXME: Currently the SSAUpdater infrastructure doesn't reason about - // lifetime intrinsics and so we strip them (and the bitcasts+GEPs - // leading to them) here. Eventually it should use them to optimize the - // scalar values produced. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - assert(II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end); - II->eraseFromParent(); - continue; - } - - // Push the loads and stores we find onto the list. SROA will already - // have validated that all loads and stores are viable candidates for - // promotion. - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - assert(LI->getType() == AI->getAllocatedType()); - Insts.push_back(LI); - continue; - } - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); - Insts.push_back(SI); - continue; - } - - // For everything else, we know that only no-op bitcasts and GEPs will - // make it this far, just recurse through them and recall them for later - // removal. 
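The deleted worklist machinery continues below; once it is gone, the entire promotion strategy is the single mem2reg call shown in the next hunk. A minimal sketch of the surviving path (hypothetical free function; PromoteMemToReg is called with the same signature the hunk uses):

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;

// Hand every promotable alloca to mem2reg in one shot. DT and AC must be
// valid for the function being transformed; no SSAUpdater fallback remains.
static bool promoteAll(SmallVectorImpl<AllocaInst *> &Allocas,
                       DominatorTree &DT, AssumptionCache *AC) {
  if (Allocas.empty())
    return false;
  PromoteMemToReg(Allocas, DT, /*AST=*/nullptr, AC);
  Allocas.clear();
  return true;
}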
- DeadInsts.push_back(I); - enqueueUsersInWorklist(*I, Worklist, Visited); - } - AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - while (!DeadInsts.empty()) - DeadInsts.pop_back_val()->eraseFromParent(); - AI->eraseFromParent(); - } - + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } -bool SROA::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DT = &RunDT; + AC = &RunAC; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4527,12 +4237,55 @@ bool SROA::runOnFunction(Function &F) { PostPromotionWorklist.clear(); } while (!Worklist.empty()); - return Changed; + // FIXME: Even when promoting allocas we should preserve some abstract set of + // CFG-specific analyses. + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionCacheTracker>(); - if (RequiresDomTree) - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { + return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F)); } + +/// A legacy pass for the legacy pass manager that wraps the \c SROA pass. +/// +/// This is in the llvm namespace purely to allow it to be a friend of the \c +/// SROA pass. +class llvm::sroa::SROALegacyPass : public FunctionPass { + /// The SROA implementation. 
+ SROA Impl; + +public: + SROALegacyPass() : FunctionPass(ID) { + initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + auto PA = Impl.runImpl( + F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + return !PA.areAllPreserved(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.setPreservesCFG(); + } + + const char *getPassName() const override { return "SROA"; } + static char ID; +}; + +char SROALegacyPass::ID = 0; + +FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); } + +INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", + "Scalar Replacement Of Aggregates", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", + false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index d5d3605..52d477c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,7 +16,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -27,10 +30,9 @@ using namespace llvm; /// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { - initializeADCEPass(Registry); + initializeADCELegacyPassPass(Registry); initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -66,7 +68,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); - initializeSROAPass(Registry); + initializeSROALegacyPassPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); @@ -81,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopLoadEliminationPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -225,15 +228,15 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { } void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); + unwrap(PM)->add(createTypeBasedAAWrapperPass()); } void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScopedNoAliasAAPass()); + unwrap(PM)->add(createScopedNoAliasAAWrapperPass()); } void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBasicAliasAnalysisPass()); + unwrap(PM)->add(createBasicAAWrapperPass()); } void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d955da7..114d22d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -60,6 +60,7 @@ STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); namespace { +#define SROA SROA_ struct SROA : public FunctionPass { SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { @@ -382,8 +383,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", - AI->getParent()->begin()); + AllocaInst *NewAI = + new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -1195,7 +1196,7 @@ static bool isSafePHIToSpeculate(PHINode *PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
- for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 0493003..054bacd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -253,10 +253,10 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { - Instruction *I = II; + assert(Gathered.empty() && Scattered.empty()); + for (BasicBlock &BB : F) { + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II; bool Done = visit(I); ++II; if (Done && I->getType()->isVoidTy()) @@ -285,7 +285,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point, V); + return Scatterer(Point->getParent(), Point->getIterator(), V); } // Replace Op with the gathered form of the components in CV. Defer the @@ -377,7 +377,7 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(I.getParent(), &I); + IRBuilder<> Builder(&I); Scatterer Op0 = scatter(&I, I.getOperand(0)); Scatterer Op1 = scatter(&I, I.getOperand(1)); assert(Op0.size() == NumElems && "Mismatched binary operation"); @@ -397,7 +397,7 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Op1 = scatter(&SI, SI.getOperand(1)); Scatterer Op2 = scatter(&SI, SI.getOperand(2)); assert(Op1.size() == NumElems && "Mismatched select"); @@ -438,7 +438,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { if (!VT) return false; - IRBuilder<> Builder(GEPI.getParent(), &GEPI); + IRBuilder<> Builder(&GEPI); unsigned NumElems = VT->getNumElements(); unsigned NumIndices = GEPI.getNumIndices(); @@ -472,7 +472,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(CI.getParent(), &CI); + IRBuilder<> Builder(&CI); Scatterer Op0 = scatter(&CI, CI.getOperand(0)); assert(Op0.size() == NumElems && "Mismatched cast"); ValueVector Res; @@ -492,7 +492,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { unsigned DstNumElems = DstVT->getNumElements(); unsigned SrcNumElems = SrcVT->getNumElements(); - IRBuilder<> Builder(BCI.getParent(), &BCI); + IRBuilder<> Builder(&BCI); Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); ValueVector Res; Res.resize(DstNumElems); @@ -569,7 +569,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(PHI.getParent(), &PHI); + IRBuilder<> Builder(&PHI); ValueVector Res; Res.resize(NumElems); @@ -600,7 +600,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(LI.getParent(), &LI); + IRBuilder<> Builder(&LI); Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); ValueVector Res; Res.resize(NumElems); @@ -625,7 +625,7 @@ bool 
Scalarizer::visitStoreInst(StoreInst &SI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); Scatterer Val = scatter(&SI, FullValue); @@ -642,7 +642,9 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { // Delete the instructions that we scalarized. If a full vector result // is still needed, recreate it using InsertElements. bool Scalarizer::finish() { - if (Gathered.empty()) + // The presence of data in Gathered or Scattered indicates changes + // made to the Function. + if (Gathered.empty() && Scattered.empty()) return false; for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); GMI != GME; ++GMI) { @@ -655,7 +657,7 @@ bool Scalarizer::finish() { Value *Res = UndefValue::get(Ty); BasicBlock *BB = Op->getParent(); unsigned Count = Ty->getVectorNumElements(); - IRBuilder<> Builder(BB, Op); + IRBuilder<> Builder(Op); if (isa<PHINode>(Op)) Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); for (unsigned I = 0; I < Count; ++I) diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a87531..86a10d2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -164,6 +168,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -174,6 +179,7 @@ #include "llvm/IR/IRBuilder.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<bool> DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,8 +325,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } bool doInitialization(Module &M) override { @@ -373,15 +382,42 @@ private: /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. + /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting + /// the constant offset. After extraction, it becomes desirable to reunite the + /// distributed sexts. For example, + /// + /// &a[sext(i +nsw (j +nsw 5))] + /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)] + /// => constant extraction &a[sext(i) + sext(j)] + 5 + /// => reunion &a[sext(i +nsw j)] + 5 + bool reuniteExts(Function &F); + /// A helper that reunites sexts in an instruction. + bool reuniteExts(Instruction *I); + /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(const SCEV *Key, + Instruction *Dominatee); /// Verify F is free of dead code. void verifyNoDeadCode(Function &F); + bool hasMoreThanOneUseInLoop(Value *V, Loop *L); + // Swap the index operands of two GEPs. + void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second); + // Check if it is safe to swap the operands of two GEPs. + bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second, + Loop *CurLoop); + const DataLayout *DL; - const DominatorTree *DT; + DominatorTree *DT; + ScalarEvolution *SE; const TargetMachine *TM; + + LoopInfo *LI; + TargetLibraryInfo *TLI; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; } // anonymous namespace @@ -391,7 +427,10 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -734,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); Value *ResultPtr = Variadic->getOperand(0); + Loop *L = LI->getLoopFor(Variadic->getParent()); + // The base is a swap candidate only if it is loop-invariant and has at most + // one use inside the loop. + bool isSwapCandidate = + L && L->isLoopInvariant(ResultPtr) && + !hasMoreThanOneUseInLoop(ResultPtr, L); + Value *FirstResult = nullptr; + if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); @@ -762,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( // Create an ugly GEP with a single index for each index. ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); + if (FirstResult == nullptr) + FirstResult = ResultPtr; } } @@ -770,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } + } else + isSwapCandidate = false; + + // If we created a GEP with a constant index, and the base is loop-invariant, + // swap the first GEP with it so LICM can later hoist the constant GEP out of + // the loop. + GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); + GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); + if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) + swapGEPOperand(FirstGEP, SecondGEP); + if (ResultPtr->getType() != Variadic->getType()) ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -891,13 +949,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Clear the inbounds attribute because the new index may be off-bound.
// e.g., // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b + // b = add i64 a, 5 + // addr = gep inbounds float, float* p, i64 b // // is transformed to: // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 + // addr2 = gep float, float* p, i64 a ; inbounds removed + // addr = gep inbounds float, float* addr2, i64 5 // // If a is -4, although the old index b is in bounds, the new index a is // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -907,6 +965,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. + bool GEPWasInBounds = GEP->isInBounds(); GEP->setIsInBounds(false); // Lowers a GEP to either GEPs with a single index or arithmetic operations. @@ -968,6 +1027,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); } else { // Unlikely but possible. For example, // #pragma pack(1) @@ -990,6 +1051,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Type::getInt8Ty(GEP->getContext()), NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); if (GEP->getType() != I8PtrTy) NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); } @@ -1008,24 +1071,96 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { return false; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } + // No need to split GEP ConstantExprs because all its indices are constant + // already. } + Changed |= reuniteExts(F); + if (VerifyNoDeadCode) verifyNoDeadCode(F); return Changed; } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( + const SCEV *Key, Instruction *Dominatee) { + auto Pos = DominatingExprs.find(Key); + if (Pos == DominatingExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a + // candidate that doesn't dominate the current instruction won't dominate any + // future instruction either. Therefore, we pop it out of the stack. This + // optimization makes the algorithm O(n). 
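The loop that follows implements the comment above. Taken together with it, the whole reunification step can be condensed into one hypothetical routine (the subtraction case and the dead-code cleanup of the real patch are omitted); it is correct only because blocks are visited in dominator-tree pre-order, so a popped candidate can never be needed again:

// In the context of this file (PatternMatch is already in scope): collapse
// sext(LHS) + sext(RHS) to sext(Dom) when a dominating "add nsw LHS, RHS"
// exists under the same SCEV key.
static bool tryReunite(Instruction *I, ScalarEvolution *SE, DominatorTree *DT,
                       DenseMap<const SCEV *, SmallVector<Instruction *, 2>>
                           &Exprs) {
  Value *LHS = nullptr, *RHS = nullptr;
  if (!match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
      LHS->getType() != RHS->getType())
    return false;
  const SCEV *Key = SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  auto It = Exprs.find(Key);
  if (It == Exprs.end())
    return false;
  auto &Candidates = It->second;
  while (!Candidates.empty() && !DT->dominates(Candidates.back(), I))
    Candidates.pop_back(); // pre-order traversal makes every pop final
  if (Candidates.empty())
    return false;
  Instruction *NewSExt = new SExtInst(Candidates.back(), I->getType(), "", I);
  NewSExt->takeName(I);
  I->replaceAllUsesWith(NewSExt);
  return true;
}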
+ while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); + } + return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Dom: LHS+RHS + // I: sext(LHS)+sext(RHS) + // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). + // TODO: handle zext + Value *LHS = nullptr, *RHS = nullptr; + if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || + match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { + if (LHS->getType() == RHS->getType()) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + if (auto *Dom = findClosestMatchingDominator(Key, I)) { + Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + NewSExt->takeName(I); + I->replaceAllUsesWith(NewSExt); + RecursivelyDeleteTriviallyDeadInstructions(I); + return true; + } + } + } + + // Add I to DominatingExprs if it's an add/sub that can't sign overflow. + if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || + match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { + if (isKnownNotFullPoison(I)) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + DominatingExprs[Key].push_back(I); + } + } + return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { + bool Changed = false; + DominatingExprs.clear(); + for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); + Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ) { + Instruction *Cur = &*I++; + Changed |= reuniteExts(Cur); + } + } + return Changed; +} + void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { for (auto &B : F) { for (auto &I : B) { @@ -1038,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { } } } + +bool SeparateConstOffsetFromGEP::isLegalToSwapOperand( + GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) { + if (!FirstGEP || !FirstGEP->hasOneUse()) + return false; + + if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent()) + return false; + + if (FirstGEP == SecondGEP) + return false; + + unsigned FirstNum = FirstGEP->getNumOperands(); + unsigned SecondNum = SecondGEP->getNumOperands(); + // Give up if the number of operands is not 2. + if (FirstNum != SecondNum || FirstNum != 2) + return false; + + Value *FirstBase = FirstGEP->getOperand(0); + Value *SecondBase = SecondGEP->getOperand(0); + Value *FirstOffset = FirstGEP->getOperand(1); + // Give up if the index of the first GEP is loop invariant. + if (CurLoop->isLoopInvariant(FirstOffset)) + return false; + + // Give up if the bases don't have the same type. + if (FirstBase->getType() != SecondBase->getType()) + return false; + + Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset); + + // Check if the second operand of the first GEP has a constant coefficient. + // For example, in the following code, we won't gain anything by + // hoisting the second GEP out because the second GEP can be folded away. + // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256 + // %67 = shl i64 %scevgep.sum.ur159, 2 + // %uglygep160 = getelementptr i8* %65, i64 %67 + // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024 + + // Skip a constant shift instruction which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() && + isa<ConstantInt>(FirstOffsetDef->getOperand(1))) + FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0)); + + // Give up if FirstOffsetDef is an Add or Sub with a constant, because it + // may not be profitable at all due to constant folding. + if (FirstOffsetDef) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) { + unsigned opc = BO->getOpcode(); + if ((opc == Instruction::Add || opc == Instruction::Sub) && + (isa<ConstantInt>(BO->getOperand(0)) || + isa<ConstantInt>(BO->getOperand(1)))) + return false; + } + return true; +} + +bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) { + int UsesInLoop = 0; + for (User *U : V->users()) { + if (Instruction *User = dyn_cast<Instruction>(U)) + if (L->contains(User)) + if (++UsesInLoop > 1) + return true; + } + return false; +} + +void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, + GetElementPtrInst *Second) { + Value *Offset1 = First->getOperand(1); + Value *Offset2 = Second->getOperand(1); + First->setOperand(1, Offset2); + Second->setOperand(1, Offset1); + + // We changed p+o+c to p+c+o, p+c may not be inbounds anymore. + const DataLayout &DAL = First->getModule()->getDataLayout(); + APInt Offset(DAL.getPointerSizeInBits( + cast<PointerType>(First->getType())->getAddressSpace()), + 0); + Value *NewBase = + First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset); + uint64_t ObjectSize; + if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || + Offset.ugt(ObjectSize)) { + First->setIsInBounds(false); + Second->setIsInBounds(false); + } else + First->setIsInBounds(true); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 231411a..63c8836 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" @@ -67,15 +68,14 @@ static bool mergeEmptyReturnBlocks(Function &F) { // single PHI node that is the operand to the return. if (Ret != &BB.front()) { // Check for something else in the block. - BasicBlock::iterator I = Ret; + BasicBlock::iterator I(Ret); --I; // Skip over debug info. while (isa<DbgInfoIntrinsic>(I) && I != BB.begin()) --I; if (!isa<DbgInfoIntrinsic>(I) && - (!isa<PHINode>(I) || I != BB.begin() || - Ret->getNumOperands() == 0 || - Ret->getOperand(0) != I)) + (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 || + Ret->getOperand(0) != &*I)) continue; } @@ -136,7 +136,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -217,6 +217,7 @@ struct CFGSimplifyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index f49f4ea..64109b2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -48,7 +48,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -66,7 +66,7 @@ char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -99,7 +99,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool MadeChange, EverMadeChange = false; @@ -119,7 +119,7 @@ bool Sinking::runOnFunction(Function &F) { bool Sinking::ProcessBlock(BasicBlock &BB) { // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; + if (BB.getTerminator()->getNumSuccessors() <= 1) return false; // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an @@ -134,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { bool ProcessedBegin = false; SmallPtrSet<Instruction *, 8> Stores; do { - Instruction *Inst = I; // The instruction to sink. + Instruction *Inst = &*I; // The instruction to sink. // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. @@ -165,14 +165,16 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod) + if (AA->getModRefInfo(S, Loc) & MRI_Mod) return false; } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() || + Inst->mayThrow()) return false; - // Convergent operations can only be moved to control equivalent blocks. + // Convergent operations cannot be made control-dependent on additional + // values. 
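The checks in this hunk compose into a single predicate; restated as a hypothetical helper that condenses isSafeToMove (the caller gathers the block's stores, as ProcessBlock does above, and the convergent check appears in the code that follows):

// A load may sink only past stores that provably do not modify its location;
// writes, EH pads, potentially-throwing instructions, terminators, PHIs, and
// convergent calls never sink.
static bool safeToSink(Instruction *Inst, AliasAnalysis *AA,
                       SmallPtrSetImpl<Instruction *> &Stores) {
  if (Inst->mayWriteToMemory())
    return false;
  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
    MemoryLocation Loc = MemoryLocation::get(L);
    for (Instruction *S : Stores)
      if (AA->getModRefInfo(S, Loc) & MRI_Mod)
        return false;
  }
  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() ||
      Inst->mayThrow())
    return false;
  if (auto CS = CallSite(Inst))
    if (CS.hasFnAttr(Attribute::Convergent))
      return false;
  return true;
}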
if (auto CS = CallSite(Inst)) { if (CS.hasFnAttr(Attribute::Convergent)) return false; @@ -193,6 +195,11 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (Inst->getParent() == SuccToSinkTo) return false; + // It's never legal to sink an instruction into a block which terminates in an + // EH-pad. + if (SuccToSinkTo->getTerminator()->isExceptional()) + return false; + // If the block has multiple predecessors, this would introduce computation // on different code paths. We could split the critical edge, but for now we // just punt. @@ -278,6 +285,6 @@ bool Sinking::SinkInstruction(Instruction *Inst, dbgs() << ")\n"); // Move the instruction. - Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); + Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ff3f00a..147d615 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,7 +227,7 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock, // changes the list that I is iterating through. auto Current = I; ++I; - if (!NotHoisted.count(Current)) { + if (!NotHoisted.count(&*Current)) { Current->moveBefore(ToBlock.getTerminator()); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417..1faa65e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -131,7 +131,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // We do not modify the shape of the CFG. AU.setPreservesCFG(); @@ -212,7 +212,7 @@ char StraightLineStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } +// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, const TargetTransformInfo *TTI, const DataLayout *DL) { @@ -523,7 +524,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( continue; const SCEV *OrigIndexExpr = IndexExprs[I - 1]; - IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); + IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType()); // The base of this candidate is GEP's base plus the offsets of all // indices except this current one. @@ -689,7 +690,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Traverse the dominator tree in the depth-first order. 
This order makes sure // all bases of a candidate are in Candidates when we process it. for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 4f23e20..662513c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -358,13 +358,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { BasicBlock *BB = N->getNodeAs<BasicBlock>(); BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); - - if (Visited.count(Succ)) { + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) Loops[Succ] = BB; - } - } } } @@ -903,14 +899,14 @@ void StructurizeCFG::rebuildSSA() { continue; } - if (DT->dominates(II, User)) + if (DT->dominates(&*II, User)) continue; if (!Initialized) { Value *Undef = UndefValue::get(II->getType()); Updater.Initialize(II->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); - Updater.AddAvailableValue(BB, II); + Updater.AddAvailableValue(BB, &*II); Initialized = true; } Updater.RewriteUseAfterInsertions(U); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c7de2e2..4e84d72 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -136,6 +137,7 @@ FunctionPass *llvm::createTailCallEliminationPass() { void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } /// \brief Scan the specified function for alloca instructions. @@ -195,8 +197,8 @@ struct AllocaDerivedValueTracker { case Instruction::Call: case Instruction::Invoke: { CallSite CS(I); - bool IsNocapture = !CS.isCallee(U) && - CS.doesNotCapture(CS.getArgumentNo(U)); + bool IsNocapture = + CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U)); callUsesLocalStack(CS, IsNocapture); if (IsNocapture) { // If the alloca-derived argument is passed in as nocapture, then it @@ -302,7 +304,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { if (!CI || CI->isTailCall()) continue; - if (CI->doesNotAccessMemory()) { + bool IsNoTail = CI->isNoTailCall(); + + if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed // outside this function can be marked tail. Even if you stored the // alloca address into a global, a readnone function can't load the @@ -330,7 +334,7 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { } } - if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { DeferredTails.push_back(CI); } else { AllCallsAreTailCalls = false; @@ -404,7 +408,7 @@ bool TailCallElim::runTRE(Function &F) { // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
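Stepping back to the capture analysis earlier in this hunk: the old code special-cased the callee operand by hand, while the new code asks the more precise data-operand question. The distinction, as a hypothetical helper:

// True when use U inside call/invoke I is an argument (or bundle operand)
// marked nocapture. The callee operand of a call is a use too, but it is not
// a data operand, so it can never count as a nocapture argument.
static bool useIsNocaptureOperand(Instruction *I, const Use *U) {
  CallSite CS(I);
  return CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
}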
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { - BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. + BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); @@ -421,9 +425,7 @@ bool TailCallElim::runTRE(Function &F) { // with themselves. Check to see if we did and clean up our mess if so. This // occurs when a function passes an argument straight through to its tail // call. - for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { - PHINode *PN = ArgumentPHIs[i]; - + for (PHINode *PN : ArgumentPHIs) { // If the PHI Node is a dynamic constant, replace it with the value it is. if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { PN->replaceAllUsesWith(PNV); @@ -464,10 +466,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // return value of the call, it must only use things that are defined before // the call, or movable instructions between the call and the instruction // itself. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (I->getOperand(i) == CI) - return false; - return true; + return std::find(I->op_begin(), I->op_end(), CI) == I->op_end(); } /// Return true if the specified value is the same when the return would exit @@ -574,7 +573,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = nullptr; - BasicBlock::iterator BBI = TI; + BasicBlock::iterator BBI(TI); while (true) { CI = dyn_cast<CallInst>(BBI); if (CI && CI->getCalledFunction() == F) @@ -595,9 +594,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - FirstNonDbg(BB->front()) == CI && - FirstNonDbg(std::next(BB->begin())) == TI && - CI->getCalledFunction() && + FirstNonDbg(BB->front().getIterator()) == CI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. @@ -636,19 +634,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - BasicBlock::iterator BBI = CI; + BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (CanMoveAboveCall(BBI, CI)) continue; + if (CanMoveAboveCall(&*BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(BBI, CI))) { + CanTransformAccumulatorRecursion(&*BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction // accumulates. - AccumulatorRecursionInstr = BBI; + AccumulatorRecursionInstr = &*BBI; } else { return false; // Otherwise, we cannot eliminate the tail recursion! 
} @@ -698,19 +696,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, NEBI = NewEntry->begin(); OEBI != E; ) if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) if (isa<ConstantInt>(AI->getArraySize())) - AI->moveBefore(NEBI); + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. // For now, we initialize each PHI to only have the real arguments // which are passed in. - Instruction *InsertPos = OldEntry->begin(); + Instruction *InsertPos = &OldEntry->front(); for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos); I->replaceAllUsesWith(PN); // Everyone use the PHI node now! - PN->addIncoming(I, NewEntry); + PN->addIncoming(&*I, NewEntry); ArgumentPHIs.push_back(PN); } } @@ -739,10 +737,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, Instruction *AccRecInstr = AccumulatorRecursionInstr; // Start by inserting a new PHI node for the accumulator. pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); - PHINode *AccPN = - PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), - std::distance(PB, PE) + 1, - "accumulator.tr", OldEntry->begin()); + PHINode *AccPN = PHINode::Create( + AccumulatorRecursionEliminationInitVal->getType(), + std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front()); // Loop over all of the predecessors of the tail recursion block. For the // real entry into the function we seed the PHI with the initial value, diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index 03c3a80..409326e 100644 --- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index e9f6239..0262358f 100644 --- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -52,32 +52,34 @@ // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "add-discriminators" namespace { - struct AddDiscriminators : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - AddDiscriminators() : FunctionPass(ID) { - initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); - } +struct 
AddDiscriminators : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + AddDiscriminators() : FunctionPass(ID) { + initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; - }; + bool runOnFunction(Function &F) override; +}; } char AddDiscriminators::ID = 0; @@ -89,17 +91,17 @@ INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators", // Command line option to disable discriminator generation even in the // presence of debug information. This is only needed when debugging // debug info generation issues. -static cl::opt<bool> -NoDiscriminators("no-discriminators", cl::init(false), - cl::desc("Disable generation of discriminator information.")); +static cl::opt<bool> NoDiscriminators( + "no-discriminators", cl::init(false), + cl::desc("Disable generation of discriminator information.")); FunctionPass *llvm::createAddDiscriminatorsPass() { return new AddDiscriminators(); } static bool hasDebugInfo(const Function &F) { - NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); - return CUNodes != nullptr; + DISubprogram *S = getDISubprogram(&F); + return S != nullptr; } /// \brief Assign DWARF discriminators. @@ -159,8 +161,7 @@ bool AddDiscriminators::runOnFunction(Function &F) { // Similarly, if the function has no debug info, do nothing. // Finally, if this module is built with dwarf versions earlier than 4, // do nothing (discriminator support is a DWARF 4 feature). - if (NoDiscriminators || - !hasDebugInfo(F) || + if (NoDiscriminators || !hasDebugInfo(F) || F.getParent()->getDwarfVersion() < 4) return false; @@ -169,59 +170,77 @@ bool AddDiscriminators::runOnFunction(Function &F) { LLVMContext &Ctx = M->getContext(); DIBuilder Builder(*M, /*AllowUnresolved*/ false); - // Traverse all the blocks looking for instructions in different - // blocks that are at the same file:line location. - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *B = I; - TerminatorInst *Last = B->getTerminator(); - const DILocation *LastDIL = Last->getDebugLoc(); - if (!LastDIL) - continue; - - for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { - BasicBlock *Succ = Last->getSuccessor(I); - Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); - const DILocation *FirstDIL = First->getDebugLoc(); - if (!FirstDIL) + typedef std::pair<StringRef, unsigned> Location; + typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap; + typedef DenseMap<Location, BBScopeMap> LocationBBMap; + + LocationBBMap LBM; + + // Traverse all instructions in the function. If the source line location + // of the instruction appears in another basic block, assign a new + // discriminator to this instruction. + for (BasicBlock &B : F) { + for (auto &I : B.getInstList()) { + if (isa<DbgInfoIntrinsic>(&I)) + continue; + const DILocation *DIL = I.getDebugLoc(); + if (!DIL) + continue; + Location L = std::make_pair(DIL->getFilename(), DIL->getLine()); + auto &BBMap = LBM[L]; + auto R = BBMap.insert(std::make_pair(&B, (Metadata *)nullptr)); + if (BBMap.size() == 1) + continue; + bool InsertSuccess = R.second; + Metadata *&NewScope = R.first->second; + // If we could insert a different block in the same location, a + // discriminator is needed to distinguish both instructions.
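The test that follows is the heart of the new bookkeeping: instructions are keyed by (file, line), the first basic block seen at a location needs no discriminator, and only a newly inserted block at an already-seen location allocates one. As a hypothetical helper:

// Returns true exactly when BB is a *new* basic block at a source location
// some other block already used, i.e. when a fresh discriminator scope is due.
static bool needsDiscriminator(
    const DILocation *DIL, const BasicBlock *BB,
    DenseMap<std::pair<StringRef, unsigned>,
             DenseMap<const BasicBlock *, Metadata *>> &LBM) {
  auto &BBMap = LBM[std::make_pair(DIL->getFilename(), DIL->getLine())];
  bool Inserted = BBMap.insert(std::make_pair(BB, (Metadata *)nullptr)).second;
  return BBMap.size() > 1 && Inserted;
}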
+ if (InsertSuccess) { + auto *Scope = DIL->getScope(); + auto *File = + Builder.createFile(DIL->getFilename(), Scope->getDirectory()); + NewScope = Builder.createLexicalBlockFile( + Scope, File, DIL->computeNewDiscriminator()); + } + I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(), + NewScope, DIL->getInlinedAt())); + DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" + << DIL->getColumn() << ":" + << dyn_cast<DILexicalBlockFile>(NewScope)->getDiscriminator() + << I << "\n"); + Changed = true; + } + } + + // Traverse all instructions and assign new discriminators to call + // instructions with the same line number that are in the same basic block. + // Sample-based profiling needs to distinguish different function calls within + // the same source line for correct profile annotation. + for (BasicBlock &B : F) { + const DILocation *FirstDIL = nullptr; + for (auto &I : B.getInstList()) { + CallInst *Current = dyn_cast<CallInst>(&I); + if (!Current || isa<DbgInfoIntrinsic>(&I)) continue; - // If the first instruction (First) of Succ is at the same file - // location as B's last instruction (Last), add a new - // discriminator for First's location and all the instructions - // in Succ that share the same location with First. - if (!FirstDIL->canDiscriminate(*LastDIL)) { - // Create a new lexical scope and compute a new discriminator - // number for it. - StringRef Filename = FirstDIL->getFilename(); - auto *Scope = FirstDIL->getScope(); - auto *File = Builder.createFile(Filename, Scope->getDirectory()); - - // FIXME: Calculate the discriminator here, based on local information, - // and delete DILocation::computeNewDiscriminator(). The current - // solution gives different results depending on other modules in the - // same context. All we really need is to discriminate between - // FirstDIL and LastDIL -- a local map would suffice. - unsigned Discriminator = FirstDIL->computeNewDiscriminator(); - auto *NewScope = - Builder.createLexicalBlockFile(Scope, File, Discriminator); - auto *NewDIL = - DILocation::get(Ctx, FirstDIL->getLine(), FirstDIL->getColumn(), - NewScope, FirstDIL->getInlinedAt()); - DebugLoc newDebugLoc = NewDIL; - - // Attach this new debug location to First and every - // instruction following First that shares the same location.
- for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; - ++I1) { - if (I1->getDebugLoc().get() != FirstDIL) - break; - I1->setDebugLoc(newDebugLoc); - DEBUG(dbgs() << NewDIL->getFilename() << ":" << NewDIL->getLine() - << ":" << NewDIL->getColumn() << ":" - << NewDIL->getDiscriminator() << *I1 << "\n"); + DILocation *CurrentDIL = Current->getDebugLoc(); + if (FirstDIL) { + if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() && + CurrentDIL->getFilename() == FirstDIL->getFilename()) { + auto *Scope = FirstDIL->getScope(); + auto *File = Builder.createFile(FirstDIL->getFilename(), + Scope->getDirectory()); + auto *NewScope = Builder.createLexicalBlockFile( + Scope, File, FirstDIL->computeNewDiscriminator()); + Current->setDebugLoc(DILocation::get( + Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope, + CurrentDIL->getInlinedAt())); + Changed = true; + } else { + FirstDIL = CurrentDIL; } - DEBUG(dbgs() << "\n"); - Changed = true; + } else { + FirstDIL = CurrentDIL; } } } diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index ef7daca..72db980 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -41,8 +41,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { // Loop through all of our successors and make sure they know that one // of their predecessors is going away. - for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) - BBTerm->getSuccessor(i)->removePredecessor(BB); + for (BasicBlock *Succ : BBTerm->successors()) + Succ->removePredecessor(BB); // Zap all the instructions in the block. while (!BB->empty()) { @@ -65,7 +65,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. -void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, MemoryDependenceAnalysis *MemDep) { if (!isa<PHINode>(BB->begin())) return; @@ -77,8 +77,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, if (MemDep) MemDep->removeInstruction(PN); // Memdep updates AA itself. - else if (AA && isa<PointerType>(PN->getType())) - AA->deleteValue(PN); PN->eraseFromParent(); } @@ -108,7 +106,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, - LoopInfo *LI, AliasAnalysis *AA, + LoopInfo *LI, MemoryDependenceAnalysis *MemDep) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; @@ -119,8 +117,9 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Don't break self-loops. if (PredBB == BB) return false; - // Don't break invokes. - if (isa<InvokeInst>(PredBB->getTerminator())) return false; + // Don't break unwinding instructions. + if (PredBB->getTerminator()->isExceptional()) + return false; succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); BasicBlock *OnlySucc = BB; @@ -145,7 +144,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Begin by getting rid of unneeded PHIs. 
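With the AliasAnalysis parameter gone, the PHI folding invoked just below is conceptually tiny; a sketch that ignores the MemoryDependenceAnalysis update the real function performs:

// In a block with a single predecessor every PHI has exactly one incoming
// value, so each PHI is just a copy and can be replaced outright.
static void foldSingleEntryPHIs(BasicBlock *BB) {
  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
    PN->replaceAllUsesWith(PN->getIncomingValue(0));
    PN->eraseFromParent();
  }
}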
if (isa<PHINode>(BB->front())) - FoldSingleEntryPHINodes(BB, AA, MemDep); + FoldSingleEntryPHINodes(BB, MemDep); // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); @@ -253,7 +252,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(SP == BB && "CFG broken"); SP = nullptr; - return SplitBlock(Succ, Succ->begin(), DT, LI); + return SplitBlock(Succ, &Succ->front(), DT, LI); } // Otherwise, if BB has a single successor, split it at the bottom of the @@ -284,8 +283,8 @@ llvm::SplitAllCriticalEdges(Function &F, /// BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI) { - BasicBlock::iterator SplitIt = SplitPt; - while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) + BasicBlock::iterator SplitIt = SplitPt->getIterator(); + while (isa<PHINode>(SplitIt) || SplitIt->isEHPad()) ++SplitIt; BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); @@ -393,7 +392,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, /// from NewBB. This also updates AliasAnalysis, if available. static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, ArrayRef<BasicBlock *> Preds, BranchInst *BI, - AliasAnalysis *AA, bool HasLoopExit) { + bool HasLoopExit) { // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB. SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end()); for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { @@ -474,17 +473,20 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, - const char *Suffix, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA) { + const char *Suffix, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + // Do not attempt to split that which cannot be split. + if (!BB->canSplitPredecessors()) + return nullptr; + // For the landingpads we need to act a bit differently. // Delegate this work to the SplitLandingPadPredecessors. if (BB->isLandingPad()) { SmallVector<BasicBlock*, 2> NewBBs; std::string NewName = std::string(Suffix) + ".split-lp"; - SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), - NewBBs, AA, DT, LI, PreserveLCSSA); + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs, DT, + LI, PreserveLCSSA); return NewBBs[0]; } @@ -523,7 +525,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit); return NewBB; } @@ -544,8 +546,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef<BasicBlock *> Preds, const char *Suffix1, const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -574,7 +576,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB1. 
- UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit); // Move the remaining edges from OrigBB to point to NewBB2. SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -611,7 +613,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, PreserveLCSSA, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB2. - UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit); } LandingPadInst *LPad = OrigBB->getLandingPadInst(); @@ -624,11 +626,17 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, Clone2->setName(Twine("lpad") + Suffix2); NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2); - // Create a PHI node for the two cloned landingpad instructions. - PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad); - PN->addIncoming(Clone1, NewBB1); - PN->addIncoming(Clone2, NewBB2); - LPad->replaceAllUsesWith(PN); + // Create a PHI node for the two cloned landingpad instructions only + // if the original landingpad instruction has some uses. + if (!LPad->use_empty()) { + assert(!LPad->getType()->isTokenTy() && + "Split cannot be applied if LPad is token type. Otherwise an " + "invalid PHINode of token type would be created."); + PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad); + PN->addIncoming(Clone1, NewBB1); + PN->addIncoming(Clone2, NewBB2); + LPad->replaceAllUsesWith(PN); + } LPad->eraseFromParent(); } else { // There is no second clone. Just replace the landing pad with the first @@ -661,7 +669,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, // return instruction. V = BCI->getOperand(0); NewBC = BCI->clone(); - Pred->getInstList().insert(NewRet, NewBC); + Pred->getInstList().insert(NewRet->getIterator(), NewBC); *i = NewBC; } if (PHINode *PN = dyn_cast<PHINode>(V)) { @@ -707,7 +715,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond, MDNode *BranchWeights, DominatorTree *DT) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); @@ -757,7 +765,7 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, TerminatorInst **ElseTerm, MDNode *BranchWeights) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 7e83c9e..9582599 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -101,10 +101,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, continue; // Otherwise a new PHI is needed. Create one and populate it. - PHINode *NewPN = - PHINode::Create(PN->getType(), Preds.size(), "split", - SplitBB->isLandingPad() ? 
- SplitBB->begin() : SplitBB->getTerminator()); + PHINode *NewPN = PHINode::Create( + PN->getType(), Preds.size(), "split", + SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); @@ -141,9 +140,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); - // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. - if (DestBB->isLandingPad()) return nullptr; + if (DestBB->isEHPad()) return nullptr; // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), @@ -157,7 +156,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); - Function::iterator FBBI = TIBB; + Function::iterator FBBI = TIBB->getIterator(); F.getBasicBlockList().insert(++FBBI, NewBB); // If there are any PHI nodes in DestBB, we need to update them so that they @@ -197,7 +196,6 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } // If we have nothing to update, just return. - auto *AA = Options.AA; auto *DT = Options.DT; auto *LI = Options.LI; if (!DT && !LI) @@ -319,10 +317,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, LoopPreds.push_back(P); } if (!LoopPreds.empty()) { - assert(!DestBB->isLandingPad() && - "We don't split edges to landing pads!"); + assert(!DestBB->isEHPad() && "We don't split edges to EH pads!"); BasicBlock *NewExitBB = SplitBlockPredecessors( - DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); + DestBB, LoopPreds, "split", DT, LI, Options.PreserveLCSSA); if (Options.PreserveLCSSA) createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 8aa7b2a..64b44a6 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -21,7 +22,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -55,32 +55,6 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, return CI; } -/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the -/// specified pointer. Ptr is required to be some pointer type, MaxLen must -/// be of size_t type, and the return value has 'intptr_t' type. 
-Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, - const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc::strnlen)) - return nullptr; - - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[2]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); - - LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrNLen = - M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS), - DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), nullptr); - CallInst *CI = B.CreateCall(StrNLen, {CastToCStr(Ptr, B), MaxLen}, "strnlen"); - if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; -} - /// EmitStrChr - Emit a call to the strchr function to the builder, for the /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index f2d5e07..42287d3 100644 --- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -74,17 +74,13 @@ namespace llvm { // insertFastDiv - Substitutes the div/rem instruction with code that checks the // value of the operands and uses a shorter-faster div/rem instruction when // possible and the longer-slower div/rem instruction otherwise. -static bool insertFastDiv(Function &F, - Function::iterator &I, - BasicBlock::iterator &J, - IntegerType *BypassType, - bool UseDivOp, - bool UseSignedOp, +static bool insertFastDiv(Instruction *I, IntegerType *BypassType, + bool UseDivOp, bool UseSignedOp, DivCacheTy &PerBBDivCache) { + Function *F = I->getParent()->getParent(); // Get instruction operands - Instruction *Instr = J; - Value *Dividend = Instr->getOperand(0); - Value *Divisor = Instr->getOperand(1); + Value *Dividend = I->getOperand(0); + Value *Divisor = I->getOperand(1); if (isa<ConstantInt>(Divisor) || (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) { @@ -94,13 +90,12 @@ static bool insertFastDiv(Function &F, } // Basic Block is split before divide - BasicBlock *MainBB = I; - BasicBlock *SuccessorBB = I->splitBasicBlock(J); - ++I; //advance iterator I to successorBB + BasicBlock *MainBB = &*I->getParent(); + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I); // Add new basic block for slow divide operation - BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "", - MainBB->getParent(), SuccessorBB); + BasicBlock *SlowBB = + BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB); SlowBB->moveBefore(SuccessorBB); IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin()); Value *SlowQuotientV; @@ -115,8 +110,8 @@ static bool insertFastDiv(Function &F, SlowBuilder.CreateBr(SuccessorBB); // Add new basic block for fast divide operation - BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "", - MainBB->getParent(), SuccessorBB); + BasicBlock *FastBB = + BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB); FastBB->moveBefore(SlowBB); IRBuilder<> FastBuilder(FastBB, FastBB->begin()); Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor, @@ -139,19 +134,19 @@ static bool insertFastDiv(Function 
&F, // Phi nodes for result of div and rem IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin()); - PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2); QuoPhi->addIncoming(SlowQuotientV, SlowBB); QuoPhi->addIncoming(FastQuotientV, FastBB); - PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2); RemPhi->addIncoming(SlowRemainderV, SlowBB); RemPhi->addIncoming(FastRemainderV, FastBB); - // Replace Instr with appropriate phi node + // Replace I with appropriate phi node if (UseDivOp) - Instr->replaceAllUsesWith(QuoPhi); + I->replaceAllUsesWith(QuoPhi); else - Instr->replaceAllUsesWith(RemPhi); - Instr->eraseFromParent(); + I->replaceAllUsesWith(RemPhi); + I->eraseFromParent(); // Combine operands into a single value with OR for value testing below MainBB->getInstList().back().eraseFromParent(); @@ -168,9 +163,6 @@ static bool insertFastDiv(Function &F, Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV); MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB); - // point iterator J at first instruction of successorBB - J = I->begin(); - // Cache phi nodes to be used later in place of other instances // of div or rem with the same sign, dividend, and divisor DivOpInfo Key(UseSignedOp, Dividend, Divisor); @@ -179,57 +171,54 @@ static bool insertFastDiv(Function &F, return true; } -// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if -// operands and operation are identical. Otherwise call insertFastDiv to perform -// the optimization and cache the resulting dividend and remainder. -static bool reuseOrInsertFastDiv(Function &F, - Function::iterator &I, - BasicBlock::iterator &J, - IntegerType *BypassType, - bool UseDivOp, - bool UseSignedOp, +// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from +// the current BB if operands and operation are identical. Otherwise calls +// insertFastDiv to perform the optimization and caches the resulting dividend +// and remainder. +static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType, + bool UseDivOp, bool UseSignedOp, DivCacheTy &PerBBDivCache) { // Get instruction operands - Instruction *Instr = J; - DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1)); + DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1)); DivCacheTy::iterator CacheI = PerBBDivCache.find(Key); if (CacheI == PerBBDivCache.end()) { // If previous instance does not exist, insert fast div - return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp, - PerBBDivCache); + return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache); } // Replace operation value with previously generated phi node DivPhiNodes &Value = CacheI->second; if (UseDivOp) { // Replace all uses of div instruction with quotient phi node - J->replaceAllUsesWith(Value.Quotient); + I->replaceAllUsesWith(Value.Quotient); } else { // Replace all uses of rem instruction with remainder phi node - J->replaceAllUsesWith(Value.Remainder); + I->replaceAllUsesWith(Value.Remainder); } - // Advance to next operation - ++J; - // Remove redundant operation - Instr->eraseFromParent(); + I->eraseFromParent(); return true; } -// bypassSlowDivision - This optimization identifies DIV instructions that can -// be profitably bypassed and carried out with a shorter, faster divide. 
-bool llvm::bypassSlowDivision(Function &F, - Function::iterator &I, - const DenseMap<unsigned int, unsigned int> &BypassWidths) { +// bypassSlowDivision - This optimization identifies DIV instructions in a BB +// that can be profitably bypassed and carried out with a shorter, faster +// divide. +bool llvm::bypassSlowDivision( + BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) { DivCacheTy DivCache; bool MadeChange = false; - for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) { + Instruction* Next = &*BB->begin(); + while (Next != nullptr) { + // We may add instructions immediately after I, but we want to skip over + // them. + Instruction* I = Next; + Next = Next->getNextNode(); // Get instruction details - unsigned Opcode = J->getOpcode(); + unsigned Opcode = I->getOpcode(); bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv; bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem; bool UseSignedOp = Opcode == Instruction::SDiv || @@ -240,11 +229,11 @@ bool llvm::bypassSlowDivision(Function &F, continue; // Skip division on vector types, only optimize integer instructions - if (!J->getType()->isIntegerTy()) + if (!I->getType()->isIntegerTy()) continue; // Get bitwidth of div/rem instruction - IntegerType *T = cast<IntegerType>(J->getType()); + IntegerType *T = cast<IntegerType>(I->getType()); unsigned int bitwidth = T->getBitWidth(); // Continue if bitwidth is not bypassed @@ -253,10 +242,9 @@ bool llvm::bypassSlowDivision(Function &F, continue; // Get type for div/rem instruction with bypass bitwidth - IntegerType *BT = IntegerType::get(J->getContext(), BI->second); + IntegerType *BT = IntegerType::get(I->getContext(), BI->second); - MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp, - UseSignedOp, DivCache); + MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache); } return MadeChange; diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index cc4d6c6..6454afb 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -52,8 +52,8 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - VMap[II] = NewInst; // Add instruction map to value. - + VMap[&*II] = NewInst; // Add instruction map to value. + hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -85,9 +85,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG - for (Function::const_arg_iterator I = OldFunc->arg_begin(), - E = OldFunc->arg_end(); I != E; ++I) - assert(VMap.count(I) && "No mapping from source argument specified!"); + for (const Argument &I : OldFunc->args()) + assert(VMap.count(&I) && "No mapping from source argument specified!"); #endif // Copy all attributes other than those stored in the AttributeSet. We need @@ -96,6 +95,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + AttributeSet OldAttrs = OldFunc->getAttributes(); // Clone any argument attributes that are present in the VMap. for (const Argument &OldArg : OldFunc->args()) @@ -136,7 +142,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, if (BB.hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc), const_cast<BasicBlock*>(&BB)); - VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); } // Note return instructions for the caller. @@ -146,11 +152,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Loop over all of the instructions in the function, fixing up operand // references as we go. This uses VMap to do all the hard work. - for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), - BE = NewFunc->end(); BB != BE; ++BB) + for (Function::iterator BB = + cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(), + BE = NewFunc->end(); + BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... - for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) - RemapInstruction(II, VMap, + for (Instruction &II : *BB) + RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); } @@ -187,11 +195,9 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); if (!OldSubprogramMDNode) return; - // Ensure that OldFunc appears in the map. - // (if it's already there it must point to NewFunc anyway) - VMap[OldFunc] = NewFunc; auto *NewSubprogram = cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); + NewFunc->setSubprogram(NewSubprogram); for (auto *CU : Finder.compile_units()) { auto Subprograms = CU->getSubprograms(); @@ -222,10 +228,9 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // The user might be deleting arguments to the function by specifying them in // the VMap. If so, we need to not add the arguments to the arg ty vector // - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) // Haven't mapped the argument to anything yet? - ArgTypes.push_back(I->getType()); + for (const Argument &I : F->args()) + if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet? + ArgTypes.push_back(I.getType()); // Create a new function type... FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), @@ -236,11 +241,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // Loop over the arguments, copying the names of the mapped arguments over... Function::arg_iterator DestI = NewF->arg_begin(); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) { // Is this argument preserved? - DestI->setName(I->getName()); // Copy the name over... - VMap[I] = DestI++; // Add mapping to VMap + for (const Argument & I : F->args()) + if (VMap.count(&I) == 0) { // Is this argument preserved? + DestI->setName(I.getName()); // Copy the name over... 
+ VMap[&I] = &*DestI++; // Add mapping to VMap } if (ModuleLevelChanges) @@ -262,27 +266,14 @@ namespace { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; - CloningDirector *Director; - ValueMapTypeRemapper *TypeMapper; - ValueMaterializer *Materializer; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, - const char *nameSuffix, ClonedCodeInfo *codeInfo, - CloningDirector *Director) + const char *nameSuffix, ClonedCodeInfo *codeInfo) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), - CodeInfo(codeInfo), Director(Director) { - // These are optional components. The Director may return null. - if (Director) { - TypeMapper = Director->getTypeRemapper(); - Materializer = Director->getValueMaterializer(); - } else { - TypeMapper = nullptr; - Materializer = nullptr; - } - } + CodeInfo(codeInfo) {} /// The specified block is found to be reachable, clone it and /// anything that it can reach. @@ -328,23 +319,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // loop doesn't include the terminator. for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { - // If the "Director" remaps the instruction, don't clone it. - if (Director) { - CloningDirector::CloningAction Action - = Director->handleInstruction(VMap, II, NewBB); - // If the cloning director says stop, we want to stop everything, not - // just break out of the loop (which would cause the terminator to be - // cloned). The cloning director is responsible for inserting a proper - // terminator into the new basic block in this case. - if (Action == CloningDirector::StopCloningBB) - return; - // If the cloning director says skip, continue to the next instruction. - // In this case, the cloning director is responsible for mapping the - // skipped instruction to some value that is defined in the new - // basic block. - if (Action == CloningDirector::SkipInstruction) - continue; - } Instruction *NewInst = II->clone(); @@ -352,8 +326,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // nodes for which we defer processing until we update the CFG. if (!isa<PHINode>(NewInst)) { RemapInstruction(NewInst, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into @@ -365,7 +338,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (Value *MappedV = VMap.lookup(V)) V = MappedV; - VMap[II] = V; + VMap[&*II] = V; delete NewInst; continue; } @@ -373,9 +346,15 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - VMap[II] = NewInst; // Add instruction map to value. + VMap[&*II] = NewInst; // Add instruction map to value. NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + + if (CodeInfo) + if (auto CS = ImmutableCallSite(&*II)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) hasStaticAllocas = true; @@ -387,26 +366,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // Finally, clone over the terminator. 
const TerminatorInst *OldTI = BB->getTerminator(); bool TerminatorDone = false; - if (Director) { - CloningDirector::CloningAction Action - = Director->handleInstruction(VMap, OldTI, NewBB); - // If the cloning director says stop, we want to stop everything, not - // just break out of the loop (which would cause the terminator to be - // cloned). The cloning director is responsible for inserting a proper - // terminator into the new basic block in this case. - if (Action == CloningDirector::StopCloningBB) - return; - if (Action == CloningDirector::CloneSuccessors) { - // If the director says to skip with a terminate instruction, we still - // need to clone this block's successors. - const TerminatorInst *TI = NewBB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); - return; - } - assert(Action != CloningDirector::SkipInstruction && - "SkipInstruction is not valid for terminators."); - } if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) { if (BI->isConditional()) { // If the condition was a known constant in the callee... @@ -447,11 +406,16 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, NewInst->setName(OldTI->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); VMap[OldTI] = NewInst; // Add instruction map to value. - + + if (CodeInfo) + if (auto CS = ImmutableCallSite(OldTI)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + // Recursively clone any reachable successor blocks. const TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); + for (const BasicBlock *Succ : TI->successors()) + ToClone.push_back(Succ); } if (CodeInfo) { @@ -470,41 +434,34 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst *> &Returns, - const char *NameSuffix, - ClonedCodeInfo *CodeInfo, - CloningDirector *Director) { + const char *NameSuffix, + ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); ValueMapTypeRemapper *TypeMapper = nullptr; ValueMaterializer *Materializer = nullptr; - if (Director) { - TypeMapper = Director->getTypeRemapper(); - Materializer = Director->getValueMaterializer(); - } - #ifndef NDEBUG - // If the cloning starts at the begining of the function, verify that + // If the cloning starts at the beginning of the function, verify that // the function arguments are mapped. if (!StartingInst) - for (Function::const_arg_iterator II = OldFunc->arg_begin(), - E = OldFunc->arg_end(); II != E; ++II) - assert(VMap.count(II) && "No mapping from source argument specified!"); + for (const Argument &II : OldFunc->args()) + assert(VMap.count(&II) && "No mapping from source argument specified!"); #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - NameSuffix, CodeInfo, Director); + NameSuffix, CodeInfo); const BasicBlock *StartingBB; if (StartingInst) StartingBB = StartingInst->getParent(); else { StartingBB = &OldFunc->getEntryBlock(); - StartingInst = StartingBB->begin(); + StartingInst = &StartingBB->front(); } // Clone the entry block, and anything recursively reachable from it. 
std::vector<const BasicBlock*> CloneWorklist; - PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist); + PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); @@ -517,9 +474,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // // Defer PHI resolution until rest of function is resolved. SmallVector<const PHINode*, 16> PHIToResolve; - for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); - BI != BE; ++BI) { - Value *V = VMap[BI]; + for (const BasicBlock &BI : *OldFunc) { + Value *V = VMap[&BI]; BasicBlock *NewBB = cast_or_null<BasicBlock>(V); if (!NewBB) continue; // Dead block. @@ -528,7 +484,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Handle PHI nodes specially, as we have to remove references to dead // blocks. - for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) { + for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { // PHI nodes may have been remapped to non-PHI nodes by the caller or // during the cloning process. if (const PHINode *PN = dyn_cast<PHINode>(I)) { @@ -621,8 +577,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, while ((PN = dyn_cast<PHINode>(I++))) { Value *NV = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NV); - assert(VMap[OldI] == PN && "VMap mismatch"); - VMap[OldI] = NV; + assert(VMap[&*OldI] == PN && "VMap mismatch"); + VMap[&*OldI] = NV; PN->eraseFromParent(); ++OldI; } @@ -644,15 +600,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); + Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator(); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other // simplifications. Note that the first block will appear dead, as it has // not yet been wired up properly. - if (I != Begin && (pred_begin(I) == pred_end(I) || - I->getSinglePredecessor() == I)) { - BasicBlock *DeadBB = I++; + if (I != Begin && (pred_begin(&*I) == pred_end(&*I) || + I->getSinglePredecessor() == &*I)) { + BasicBlock *DeadBB = &*I++; DeleteDeadBlock(DeadBB); continue; } @@ -662,7 +618,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // simplification required looking through PHI nodes, those are only // available after forming the full basic block. That may leave some here, // and we still want to prune the dead code as early as possible. - ConstantFoldTerminator(I); + ConstantFoldTerminator(&*I); BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } @@ -681,7 +637,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, BI->eraseFromParent(); // Make all PHI nodes that referred to Dest now refer to I as their source. - Dest->replaceAllUsesWith(I); + Dest->replaceAllUsesWith(&*I); // Move all the instructions in the succ to the pred. 
I->getInstList().splice(I->end(), Dest->getInstList()); @@ -695,7 +651,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Make a final pass over the basic blocks from the old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. - for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), + for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) @@ -717,9 +673,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall) { - CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap, - ModuleLevelChanges, Returns, NameSuffix, CodeInfo, - nullptr); + CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, + ModuleLevelChanges, Returns, NameSuffix, CodeInfo); } /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. @@ -780,9 +735,10 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, } // Move them physically from the end of the block list. - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), - NewLoop->getHeader(), F->end()); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewPH); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewLoop->getHeader()->getIterator(), F->end()); return NewLoop; } diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index 61f1811..ab08335 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -20,21 +20,28 @@ #include "llvm-c/Core.h" using namespace llvm; -/// CloneModule - Return an exact copy of the specified module. This is not as -/// easy as it might seem because we have to worry about making copies of global -/// variables and functions, and making their (initializers and references, -/// respectively) refer to the right globals. +/// This is not as easy as it might seem because we have to worry about making +/// copies of global variables and functions, and making their (initializers and +/// references, respectively) refer to the right globals. /// -Module *llvm::CloneModule(const Module *M) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M) { // Create the value map that maps things from the old module over to the new // module. ValueToValueMapTy VMap; return CloneModule(M, VMap); } -Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M, + ValueToValueMapTy &VMap) { + return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; }); +} + +std::unique_ptr<Module> llvm::CloneModule( + const Module *M, ValueToValueMapTy &VMap, + std::function<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. 
- Module *New = new Module(M->getModuleIdentifier(), M->getContext()); + std::unique_ptr<Module> New = + llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext()); New->setDataLayout(M->getDataLayout()); New->setTargetTriple(M->getTargetTriple()); New->setModuleInlineAsm(M->getModuleInlineAsm()); @@ -52,26 +59,48 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { (GlobalVariable*) nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); - GV->copyAttributesFrom(I); - VMap[I] = GV; + GV->copyAttributesFrom(&*I); + VMap[&*I] = GV; } // Loop over the functions in the module, making external functions as before for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { Function *NF = - Function::Create(cast<FunctionType>(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New); - NF->copyAttributesFrom(I); - VMap[I] = NF; + Function::Create(cast<FunctionType>(I->getType()->getElementType()), + I->getLinkage(), I->getName(), New.get()); + NF->copyAttributesFrom(&*I); + VMap[&*I] = NF; } // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - auto *PTy = cast<PointerType>(I->getType()); - auto *GA = GlobalAlias::create(PTy, I->getLinkage(), I->getName(), New); - GA->copyAttributesFrom(I); - VMap[I] = GA; + if (!ShouldCloneDefinition(&*I)) { + // An alias cannot act as an external reference, so we need to create + // either a function or a global variable depending on the value type. + // FIXME: Once pointee types are gone we can probably pick one or the + // other. + GlobalValue *GV; + if (I->getValueType()->isFunctionTy()) + GV = Function::Create(cast<FunctionType>(I->getValueType()), + GlobalValue::ExternalLinkage, I->getName(), + New.get()); + else + GV = new GlobalVariable( + *New, I->getValueType(), false, GlobalValue::ExternalLinkage, + (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, + I->getThreadLocalMode(), I->getType()->getAddressSpace()); + VMap[&*I] = GV; + // We do not copy attributes (mainly because copying between different + // kinds of globals is forbidden), but this is generally not required for + // correctness. + continue; + } + auto *GA = GlobalAlias::create(I->getValueType(), + I->getType()->getPointerAddressSpace(), + I->getLinkage(), I->getName(), New.get()); + GA->copyAttributesFrom(&*I); + VMap[&*I] = GA; } // Now that all of the things that global variable initializer can refer to @@ -80,7 +109,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { - GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); + GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. + GV->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (I->hasInitializer()) GV->setInitializer(MapValue(I->getInitializer(), VMap)); } @@ -88,18 +122,22 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // Similarly, copy over function bodies now... // for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *F = cast<Function>(VMap[I]); + Function *F = cast<Function>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. 
+ F->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (!I->isDeclaration()) { Function::arg_iterator DestI = F->arg_begin(); for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); ++J) { DestI->setName(J->getName()); - VMap[J] = DestI++; + VMap[&*J] = &*DestI++; } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); - + CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns); } if (I->hasPersonalityFn()) @@ -109,7 +147,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // And aliases for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); + // We already dealt with undefined aliases above. + if (!ShouldCloneDefinition(&*I)) + continue; + GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]); if (const Constant *C = I->getAliasee()) GA->setAliasee(MapValue(C, VMap)); } @@ -129,7 +170,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { extern "C" { LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { - return wrap(CloneModule(unwrap(M))); + return wrap(CloneModule(unwrap(M)).release()); } } diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index ab89b41..823696d 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -51,7 +51,7 @@ AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, /// \brief Test whether a block is valid for extraction. static bool isBlockValidForExtraction(const BasicBlock &BB) { // Landing pads must be in the function where they were inserted for cleanup. - if (BB.isLandingPad()) + if (BB.isEHPad()) return false; // Don't hoist code containing allocas, invokes, or vastarts. @@ -175,7 +175,7 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, for (User *U : II->users()) if (!definedInRegion(Blocks, U)) { - Outputs.insert(II); + Outputs.insert(&*II); break; } } @@ -211,7 +211,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI(); + BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator(); BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs, Header->getName()+".ce"); @@ -246,7 +246,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Create a new PHI node in the new region, which has an incoming value // from OldPred of PN. PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion, - PN->getName()+".ce", NewBB->begin()); + PN->getName() + ".ce", &NewBB->front()); NewPN->addIncoming(PN, OldPred); // Loop over all of the incoming value in PN, moving them to NewPN if they @@ -266,7 +266,8 @@ void CodeExtractor::splitReturnBlocks() { for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { - BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); + BasicBlock *New = + (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret"); if (DT) { // Old dominates New. New node dominates all other nodes dominated // by Old. 
@@ -365,10 +366,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); TerminatorInst *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI); + StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); } else - RewriteVal = AI++; + RewriteVal = &*AI++; std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); @@ -440,8 +441,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, StructValues.push_back(*i); } else { AllocaInst *alloca = - new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc", - codeReplacer->getParent()->begin()->begin()); + new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc", + &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca); params.push_back(alloca); } @@ -457,9 +458,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Allocate a struct at the beginning of this function StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); - Struct = - new AllocaInst(StructArgTy, nullptr, "structArg", - codeReplacer->getParent()->begin()->begin()); + Struct = new AllocaInst(StructArgTy, nullptr, "structArg", + &codeReplacer->getParent()->front().front()); params.push_back(Struct); for (unsigned i = 0, e = inputs.size(); i != e; ++i) { @@ -566,8 +566,12 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, bool DominatesDef = true; - if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) { - DefBlock = Invoke->getNormalDest(); + BasicBlock *NormalDest = nullptr; + if (auto *Invoke = dyn_cast<InvokeInst>(outputs[out])) + NormalDest = Invoke->getNormalDest(); + + if (NormalDest) { + DefBlock = NormalDest; // Make sure we are looking at the original successor block, not // at a newly inserted exit block, which won't be in the dominator @@ -606,11 +610,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut+out); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(), + StructArgTy, &*OAI, Idx, "gep_" + outputs[out]->getName(), NTRet); new StoreInst(outputs[out], GEP, NTRet); } else { - new StoreInst(outputs[out], OAI, NTRet); + new StoreInst(outputs[out], &*OAI, NTRet); } } // Advance output iterator even if we don't emit a store diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp index dc95089..b56ff68 100644 --- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -50,7 +50,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", GCL->getThreadLocalMode()); - GCL->getParent()->getGlobalList().insert(GCL, NGV); + GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV); NGV->takeName(GCL); // Nuke the old list, replacing any uses with the new one. 
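// [Illustrative sketch added by the editor; not part of the upstream diff.]
// Much of the mechanical churn in the hunks above and below (DemoteRegToStack,
// CodeExtractor, CloneFunction, BasicBlockUtils) comes from the ilist change
// that made the conversion between Instruction* and BasicBlock::iterator
// explicit. The hypothetical helper below, using only calls that appear in
// these hunks (getIterator(), &front(), splitBasicBlock), shows the new idiom.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Split BB immediately before SplitPt and return the tail block. Older code
// could pass the Instruction* straight to splitBasicBlock; it now converts.
static BasicBlock *splitBefore(BasicBlock *BB, Instruction *SplitPt) {
  // Pointer -> iterator is now an explicit getIterator() call...
  BasicBlock::iterator It = SplitPt->getIterator();
  // ...and the reverse direction is an explicit &*It (e.g. &BB->front()).
  return BB->splitBasicBlock(It, BB->getName() + ".split");
}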
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 003da58..75a1dde 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -35,8 +35,8 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, I.getName()+".reg2mem", AllocaPoint); } else { Function *F = I.getParent()->getParent(); - Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem", + &F->getEntryBlock().front()); } // We cannot demote invoke instructions to the stack if their normal edge @@ -89,16 +89,15 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, // AFTER the terminator instruction. BasicBlock::iterator InsertPt; if (!isa<TerminatorInst>(I)) { - InsertPt = &I; - ++InsertPt; - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + InsertPt = ++I.getIterator(); + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. } else { InvokeInst &II = cast<InvokeInst>(I); InsertPt = II.getNormalDest()->getFirstInsertionPt(); } - new StoreInst(&I, Slot, InsertPt); + new StoreInst(&I, Slot, &*InsertPt); return Slot; } @@ -118,8 +117,8 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { P->getName()+".reg2mem", AllocaPoint); } else { Function *F = P->getParent()->getParent(); - Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem", + &F->getEntryBlock().front()); } // Iterate over each operand inserting a store in each predecessor. @@ -133,12 +132,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { } // Insert a load in place of the PHI and replace all uses. - BasicBlock::iterator InsertPt = P; + BasicBlock::iterator InsertPt = P->getIterator(); - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. - Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt); + Value *V = new LoadInst(Slot, P->getName() + ".reload", &*InsertPt); P->replaceAllUsesWith(V); // Delete PHI. diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 4eb3e3d..492ae9f 100644 --- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -28,12 +28,11 @@ class FlattenCFGOpt { AliasAnalysis *AA; /// \brief Use parallel-and or parallel-or to generate conditions for /// conditional branches. - bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P = nullptr); + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder); /// \brief If \param BB is the merge block of an if-region, attempt to merge /// the if-region with an adjacent if-region upstream if two if-regions /// contain identical instructions. - bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder); /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which /// are from two if-regions whose entry blocks are \p Head1 and \p /// Head2. 
\returns true if \p Block1 and \p Block2 contain identical @@ -122,8 +121,7 @@ public: /// its predecessor. In Case 2, \param BB (BB3) only has conditional branches /// as its predecessors. /// -bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { PHINode *PHI = dyn_cast<PHINode>(BB->begin()); if (PHI) return false; // For simplicity, avoid cases containing PHI nodes. @@ -177,8 +175,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, // Instructions in the internal condition blocks should be safe // to hoist up. - for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { - Instruction *CI = BI++; + for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator(); + BI != BE;) { + Instruction *CI = &*BI++; if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI)) return false; } @@ -315,7 +314,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, BasicBlock *Block1, BasicBlock *Block2) { TerminatorInst *PTI2 = Head2->getTerminator(); - Instruction *PBI2 = Head2->begin(); + Instruction *PBI2 = &Head2->front(); bool eq1 = (Block1 == Head1); bool eq2 = (Block2 == Head2); @@ -327,9 +326,9 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, // Check whether instructions in Block1 and Block2 are identical // and do not alias with instructions in Head2. BasicBlock::iterator iter1 = Block1->begin(); - BasicBlock::iterator end1 = Block1->getTerminator(); + BasicBlock::iterator end1 = Block1->getTerminator()->getIterator(); BasicBlock::iterator iter2 = Block2->begin(); - BasicBlock::iterator end2 = Block2->getTerminator(); + BasicBlock::iterator end2 = Block2->getTerminator()->getIterator(); while (1) { if (iter1 == end1) { @@ -338,7 +337,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, break; } - if (!iter1->isIdenticalTo(iter2)) + if (!iter1->isIdenticalTo(&*iter2)) return false; // Illegal to remove instructions with side effects except @@ -356,10 +355,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, return false; if (iter1->mayWriteToMemory()) { - for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) { // Check alias with Head2. - if (!AA || AA->alias(iter1, BI)) + if (!AA || AA->alias(&*iter1, &*BI)) return false; } } @@ -386,8 +385,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, /// if (a || b) /// statement; /// -bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { BasicBlock *IfTrue2, *IfFalse2; Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2); Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2); @@ -413,7 +411,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, return false; TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); - Instruction *PBI2 = SecondEntryBlock->begin(); + Instruction *PBI2 = &SecondEntryBlock->front(); if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, IfTrue2)) @@ -425,8 +423,8 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, // Check whether \param SecondEntryBlock has side-effect and is safe to // speculate. 
- for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { - Instruction *CI = BI; + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { + Instruction *CI = &*BI; if (isa<PHINode>(CI) || CI->mayHaveSideEffects() || !isSafeToSpeculativelyExecute(CI)) return false; diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 44b7d25..3893a75 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -49,6 +49,10 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) { static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, SmallPtrSetImpl<const PHINode *> &PhiUsers) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (GV->isExternallyInitialized()) + GS.StoredType = GlobalStatus::StoredOnce; + for (const Use &U : V->uses()) { const User *UR = U.getUser(); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index d2d60d7..1457411 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -13,14 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" @@ -41,6 +42,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" #include <algorithm> + using namespace llvm; static cl::opt<bool> @@ -54,17 +56,17 @@ PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", cl::desc("Convert align attributes to assumptions during inlining.")); bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(CI), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(CI), IFI, CalleeAAR, InsertLifetime); } bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(II), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime); } namespace { - /// A class for recording information about inlining through an invoke. - class InvokeInliningInfo { + /// A class for recording information about inlining a landing pad. + class LandingPadInliningInfo { BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. 
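// [Illustrative sketch added by the editor; not part of the upstream diff.]
// The InlineFunction.cpp hunks that follow refactor, but keep, the core
// rewrite of HandleCallsInBlockInlinedThroughInvoke: a possibly-throwing call
// inlined through an invoke must itself become an invoke whose unwind edge
// reaches the caller's unwind destination. A minimal sketch of that rewrite,
// condensed from the code in the hunks below (the hypothetical helper name
// and the omission of operand bundles are the editor's simplifications):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void convertCallToInvoke(CallInst *CI, BasicBlock *UnwindEdge) {
  BasicBlock *BB = CI->getParent();
  // Split so that everything after the call becomes the "normal" successor.
  BasicBlock *Split =
      BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
  // Drop the unconditional branch that splitBasicBlock inserted.
  BB->getInstList().pop_back();
  SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
  // Re-issue the call as an invoke with an explicit unwind edge.
  InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge,
                                      Args, CI->getName(), BB);
  II->setDebugLoc(CI->getDebugLoc());
  II->setCallingConv(CI->getCallingConv());
  II->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(II);
  // The original call is now the first instruction of Split; delete it.
  Split->getInstList().pop_front();
}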
@@ -72,7 +74,7 @@ namespace { SmallVector<Value*, 8> UnwindDestPHIValues; public: - InvokeInliningInfo(InvokeInst *II) + LandingPadInliningInfo(InvokeInst *II) : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep @@ -121,14 +123,14 @@ namespace { } } }; -} +} // anonymous namespace /// Get or create a target for the branch from ResumeInsts. -BasicBlock *InvokeInliningInfo::getInnerResumeDest() { +BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; // Split the landing pad. - BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint; + BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator(); InnerResumeDest = OuterResumeDest->splitBasicBlock(SplitPoint, OuterResumeDest->getName() + ".body"); @@ -137,7 +139,7 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { const unsigned PHICapacity = 2; // Create corresponding new PHIs for all the PHIs in the outer landing pad. - BasicBlock::iterator InsertPoint = InnerResumeDest->begin(); + Instruction *InsertPoint = &InnerResumeDest->front(); BasicBlock::iterator I = OuterResumeDest->begin(); for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { PHINode *OuterPHI = cast<PHINode>(I); @@ -162,8 +164,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { /// When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. -void InvokeInliningInfo::forwardResume(ResumeInst *RI, - SmallPtrSetImpl<LandingPadInst*> &InlinedLPads) { +void LandingPadInliningInfo::forwardResume( + ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); BasicBlock *Src = RI->getParent(); @@ -182,33 +184,39 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, /// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. -static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, - InvokeInliningInfo &Invoke) { +static BasicBlock * +HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *I = BBI++; + Instruction *I = &*BBI++; // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); - // If this call cannot unwind, don't convert it to an invoke. - // Inline asm calls cannot throw. if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; // Convert this function call into an invoke instruction. First, split the // basic block. - BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); + BasicBlock *Split = + BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc"); // Delete the unconditional branch inserted by splitBasicBlock BB->getInstList().pop_back(); // Create the new invoke instruction. 
- ImmutableCallSite CS(CI); - SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); - InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, - Invoke.getOuterResumeDest(), - InvokeArgs, CI->getName(), BB); + SmallVector<Value*, 8> InvokeArgs(CI->arg_begin(), CI->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + + CI->getOperandBundlesAsDefs(OpBundles); + + // Note: we're round tripping operand bundles through memory here, and that + // can potentially be avoided with a cleverer API design that we do not have + // as of this time. + + InvokeInst *II = + InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs, + OpBundles, CI->getName(), BB); II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); @@ -219,12 +227,9 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Delete the original call Split->getInstList().pop_front(); - - // Update any PHI nodes in the exceptional block to indicate that there is - // now a new entry in them. - Invoke.addIncomingPHIValuesFor(BB); - return; + return BB; } + return nullptr; } /// If we inlined an invoke site, we need to convert calls @@ -233,8 +238,8 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. -static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, - ClonedCodeInfo &InlinedCodeInfo) { +static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); @@ -242,11 +247,12 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // The inlined code is currently at the end of the function, scan from the // start of the inlined code to its end, checking for stuff we need to // rewrite. - InvokeInliningInfo Invoke(II); + LandingPadInliningInfo Invoke(II); // Get all of the inlined landing pad instructions. SmallPtrSet<LandingPadInst*, 16> InlinedLPads; - for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I) + for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); + I != E; ++I) if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) InlinedLPads.insert(II->getLandingPadInst()); @@ -262,9 +268,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InlinedLPad->setCleanup(true); } - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ + for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { if (InlinedCodeInfo.ContainsCalls) - HandleCallsInBlockInlinedThroughInvoke(BB, Invoke); + if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( + &*BB, Invoke.getOuterResumeDest())) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + Invoke.addIncomingPHIValuesFor(NewBB); // Forward any resumes that are remaining here. 
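The rewrite above is the recurring call-to-invoke recipe. Condensed into a hypothetical helper, assuming the caller has already split the call's block (NormalDest being the split-off half, InsertAtEnd the now-branchless front half) and deletes CI afterwards, exactly as the hunk does:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static InvokeInst *convertToInvoke(CallInst *CI, BasicBlock *InsertAtEnd,
                                   BasicBlock *NormalDest,
                                   BasicBlock *UnwindEdge) {
  SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
  SmallVector<OperandBundleDef, 1> Bundles;
  CI->getOperandBundlesAsDefs(Bundles); // bundles must survive the rewrite
  InvokeInst *II =
      InvokeInst::Create(CI->getCalledValue(), NormalDest, UnwindEdge, Args,
                         Bundles, CI->getName(), InsertAtEnd);
  // Carry over everything observable about the original call.
  II->setDebugLoc(CI->getDebugLoc());
  II->setCallingConv(CI->getCallingConv());
  II->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(II);
  return II;
}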
if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) @@ -278,6 +289,99 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InvokeDest->removePredecessor(II->getParent()); } +/// If we inlined an invoke site, we need to convert calls +/// in the body of the inlined function into invokes. +/// +/// II is the invoke instruction being inlined. FirstNewBlock is the first +/// block of the inlined code (the last block is the end of the function), +/// and InlineCodeInfo is information about the code that got inlined. +static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Function *Caller = FirstNewBlock->getParent(); + + assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the invoke before removing the + // edge from this block. + SmallVector<Value *, 8> UnwindDestPHIValues; + llvm::BasicBlock *InvokeBB = II->getParent(); + for (Instruction &I : *UnwindDest) { + // Save the value to use for this edge. + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); + } + + // Add incoming-PHI values to the unwind destination block for the given basic + // block, using the values for the original invoke's source block. + auto UpdatePHINodes = [&](BasicBlock *Src) { + BasicBlock::iterator I = UnwindDest->begin(); + for (Value *V : UnwindDestPHIValues) { + PHINode *PHI = cast<PHINode>(I); + PHI->addIncoming(V, Src); + ++I; + } + }; + + // This connects all the instructions which 'unwind to caller' to the invoke + // destination. + for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { + if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (CRI->unwindsToCaller()) { + CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); + CRI->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + Instruction *Replacement = nullptr; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (CatchSwitch->unwindsToCaller()) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), UnwindDest, + CatchSwitch->getNumHandlers(), CatchSwitch->getName(), + CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + Replacement = NewCatchSwitch; + } + } else if (!isa<FuncletPadInst>(I)) { + llvm_unreachable("unexpected EHPad!"); + } + + if (Replacement) { + Replacement->takeName(I); + I->replaceAllUsesWith(Replacement); + I->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + if (InlinedCodeInfo.ContainsCalls) + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) + if (BasicBlock *NewBB = + HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + UpdatePHINodes(NewBB); + + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. 
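The UpdatePHINodes lambda above captures a pattern worth spelling out: whatever the original invoke fed into the unwind destination's PHIs must be replayed for every predecessor the rewrite introduces. A sketch as a hypothetical helper, under the same API assumptions:

#include "llvm/IR/Instructions.h"
using namespace llvm;

static void replayPHIValues(BasicBlock *UnwindDest, BasicBlock *InvokeBB,
                            BasicBlock *NewPred) {
  for (Instruction &I : *UnwindDest) {
    auto *PHI = dyn_cast<PHINode>(&I);
    if (!PHI)
      break; // PHIs sit contiguously at the front of the block
    // The patched code caches these incoming values up front, because the
    // original invoke edge is removed before the final replay happens.
    PHI->addIncoming(PHI->getIncomingValueForBlock(InvokeBB), NewPred);
  }
}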
+ UnwindDest->removePredecessor(InvokeBB); +} + /// When inlining a function that contains noalias scope metadata, /// this metadata needs to be cloned so that the inlined blocks /// have different "unique scopes" at every call site. Were this not done, then @@ -395,17 +499,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, - const DataLayout &DL, AliasAnalysis *AA) { + const DataLayout &DL, AAResults *CalleeAAR) { if (!EnableNoAliasConversion) return; const Function *CalledFunc = CS.getCalledFunction(); SmallVector<const Argument *, 4> NoAliasArgs; - for (Function::const_arg_iterator I = CalledFunc->arg_begin(), - E = CalledFunc->arg_end(); I != E; ++I) { - if (I->hasNoAliasAttr() && !I->hasNUses(0)) - NoAliasArgs.push_back(I); + for (const Argument &I : CalledFunc->args()) { + if (I.hasNoAliasAttr() && !I.hasNUses(0)) + NoAliasArgs.push_back(&I); } if (NoAliasArgs.empty()) @@ -480,10 +583,10 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, continue; IsFuncCall = true; - if (AA) { - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(ICS); - if (MRB == AliasAnalysis::OnlyAccessesArgumentPointees || - MRB == AliasAnalysis::OnlyReadsArgumentPointees) + if (CalleeAAR) { + FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS); + if (MRB == FMRB_OnlyAccessesArgumentPointees || + MRB == FMRB_OnlyReadsArgumentPointees) IsArgMemOnlyCall = true; } @@ -518,7 +621,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { SmallVector<Value *, 4> Objects; GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]), - Objects, DL, /* MaxLookup = */ 0); + Objects, DL, /* LI = */ nullptr); for (Value *O : Objects) ObjSet.insert(O); @@ -646,7 +749,7 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); if (getKnownAlignment(Arg, DL, CS.getInstruction(), - &IFI.ACT->getAssumptionCache(*CalledFunc), + &IFI.ACT->getAssumptionCache(*CS.getCaller()), &DT) >= Align) continue; @@ -731,7 +834,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, BasicBlock *InsertBlock, InlineFunctionInfo &IFI) { Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); - IRBuilder<> Builder(InsertBlock->begin()); + IRBuilder<> Builder(InsertBlock, InsertBlock->begin()); Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); @@ -851,9 +954,8 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, // Starting from the top, rebuild the nodes to point to the new inlined-at // location (then rebuilding the rest of the chain behind it) and update the // map of already-constructed inlined-at nodes. - for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); - I != E; ++I) { - const DILocation *MD = *I; + for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), + InlinedAtLocations.rend())) { Last = IANodes[MD] = DILocation::getDistinct( Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } @@ -917,7 +1019,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, /// exists in the instruction stream.
Similarly this will inline a recursive /// function by one level. bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, - bool InsertLifetime) { + AAResults *CalleeAAR, bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); @@ -930,6 +1032,22 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; + // The inliner does not know how to inline through calls with operand bundles + // in general ... + if (CS.hasOperandBundles()) { + for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { + uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); + // ... but it knows how to inline through "deopt" operand bundles ... + if (Tag == LLVMContext::OB_deopt) + continue; + // ... and "funclet" operand bundles. + if (Tag == LLVMContext::OB_funclet) + continue; + + return false; + } + } + // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); @@ -950,13 +1068,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Get the personality function from the callee if it contains a landing pad. Constant *CalledPersonality = - CalledFunc->hasPersonalityFn() ? CalledFunc->getPersonalityFn() : nullptr; + CalledFunc->hasPersonalityFn() + ? CalledFunc->getPersonalityFn()->stripPointerCasts() + : nullptr; // Find the personality function used by the landing pads of the caller. If it // exists, then check to see that it matches the personality function used in // the callee. Constant *CallerPersonality = - Caller->hasPersonalityFn() ? Caller->getPersonalityFn() : nullptr; + Caller->hasPersonalityFn() + ? Caller->getPersonalityFn()->stripPointerCasts() + : nullptr; if (CalledPersonality) { if (!CallerPersonality) Caller->setPersonalityFn(CalledPersonality); @@ -968,9 +1090,46 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, return false; } + // We need to figure out which funclet the callsite was in so that we may + // properly nest the callee. + Instruction *CallSiteEHPad = nullptr; + if (CallerPersonality) { + EHPersonality Personality = classifyEHPersonality(CallerPersonality); + if (isFuncletEHPersonality(Personality)) { + Optional<OperandBundleUse> ParentFunclet = + CS.getOperandBundle(LLVMContext::OB_funclet); + if (ParentFunclet) + CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + + // OK, the inlining site is legal. What about the target function? + + if (CallSiteEHPad) { + if (Personality == EHPersonality::MSVC_CXX) { + // The MSVC personality cannot tolerate catches getting inlined into + // cleanup funclets. + if (isa<CleanupPadInst>(CallSiteEHPad)) { + // Ok, the call site is within a cleanuppad. Let's check the callee + // for catchpads. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI())) + return false; + } + } + } else if (isAsynchronousEHPersonality(Personality)) { + // SEH is even less tolerant, there may not be any sort of exceptional + // funclet in the callee. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (CalledBB.isEHPad()) + return false; + } + } + } + } + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. 
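The operand-bundle gate near the top of InlineFunction reduces to a small predicate; a sketch, assuming (as the hunk states) that only "deopt" and "funclet" bundles are inlinable:

#include "llvm/IR/CallSite.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>
using namespace llvm;

static bool hasOnlyInlinableBundles(CallSite CS) {
  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
    uint32_t Tag = CS.getOperandBundleAt(i).getTagID();
    if (Tag != LLVMContext::OB_deopt && Tag != LLVMContext::OB_funclet)
      return false; // unknown bundle kind: refuse to inline
  }
  return true;
}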
- Function::iterator LastBlock = &Caller->back(); + Function::iterator LastBlock = --Caller->end(); // Make sure to capture all of the return instructions from the cloned // function. @@ -1007,7 +1166,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } - VMap[I] = ActualArg; + VMap[&*I] = ActualArg; } // Add alignment assumptions if necessary. We do this before the inlined @@ -1029,7 +1188,61 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Inject byval arguments initialization. for (std::pair<Value*, Value*> &Init : ByValInit) HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), - FirstNewBlock, IFI); + &*FirstNewBlock, IFI); + + Optional<OperandBundleUse> ParentDeopt = + CS.getOperandBundle(LLVMContext::OB_deopt); + if (ParentDeopt) { + SmallVector<OperandBundleDef, 2> OpDefs; + + for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { + Instruction *I = dyn_cast_or_null<Instruction>(VH); + if (!I) continue; // instruction was DCE'd or RAUW'ed to undef + + OpDefs.clear(); + + CallSite ICS(I); + OpDefs.reserve(ICS.getNumOperandBundles()); + + for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { + auto ChildOB = ICS.getOperandBundleAt(i); + if (ChildOB.getTagID() != LLVMContext::OB_deopt) { + // If the inlined call has other operand bundles, let them be + OpDefs.emplace_back(ChildOB); + continue; + } + + // It may be useful to separate this logic (of handling operand + // bundles) out to a separate "policy" component if this gets crowded. + // Prepend the parent's deoptimization continuation to the newly + // inlined call's deoptimization continuation. + std::vector<Value *> MergedDeoptArgs; + MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() + + ChildOB.Inputs.size()); + + MergedDeoptArgs.insert(MergedDeoptArgs.end(), + ParentDeopt->Inputs.begin(), + ParentDeopt->Inputs.end()); + MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), + ChildOB.Inputs.end()); + + OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); + } + + Instruction *NewI = nullptr; + if (isa<CallInst>(I)) + NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I); + else + NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I); + + // Note: the RAUW does the appropriate fixup in VMap, so we need to do + // this even if the call returns void. + I->replaceAllUsesWith(NewI); + + VH = nullptr; + I->eraseFromParent(); + } + } // Update the callgraph if requested. if (IFI.CG) @@ -1042,7 +1255,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CloneAliasScopeMetadata(CS, VMap); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CS, VMap, DL, IFI.AA); + AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -1085,9 +1298,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - Caller->getEntryBlock().getInstList().splice(InsertPoint, - FirstNewBlock->getInstList(), - AI, I); + Caller->getEntryBlock().getInstList().splice( + InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. 
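The deopt-merging loop above boils down to prepending the parent call site's deoptimization continuation to the child's, leaving all other bundles untouched. Just that step, as a hypothetical helper:

#include "llvm/IR/InstrTypes.h"
#include <vector>
using namespace llvm;

static OperandBundleDef mergeDeoptStates(const OperandBundleUse &Parent,
                                         const OperandBundleUse &Child) {
  std::vector<Value *> Merged;
  Merged.reserve(Parent.Inputs.size() + Child.Inputs.size());
  // Parent state first: the caller's continuation is outermost.
  Merged.insert(Merged.end(), Parent.Inputs.begin(), Parent.Inputs.end());
  Merged.insert(Merged.end(), Child.Inputs.begin(), Child.Inputs.end());
  return OperandBundleDef("deopt", std::move(Merged));
}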
DIBuilder DIB(*Caller->getParent()); @@ -1137,7 +1349,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. if (InsertLifetime && !IFI.StaticAllocas.empty()) { - IRBuilder<> builder(FirstNewBlock->begin()); + IRBuilder<> builder(&FirstNewBlock->front()); for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { AllocaInst *AI = IFI.StaticAllocas[ai]; @@ -1189,7 +1401,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); // Insert the llvm.stacksave. - CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) + CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) .CreateCall(StackSave, {}, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the @@ -1203,10 +1415,74 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + // Update the lexical scopes of the new funclets and callsites. + // Anything that had 'none' as its parent is now nested inside the callsite's + // EHPad. + + if (CallSiteEHPad) { + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) { + // Add bundle operands to any top-level call sites. + SmallVector<OperandBundleDef, 1> OpBundles; + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + CallSite CS(I); + if (!CS) + continue; + + // Skip call sites which are nounwind intrinsics. + auto *CalledFn = + dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) + continue; + + // Skip call sites which already have a "funclet" bundle. + if (CS.getOperandBundle(LLVMContext::OB_funclet)) + continue; + + CS.getOperandBundlesAsDefs(OpBundles); + OpBundles.emplace_back("funclet", CallSiteEHPad); + + Instruction *NewInst; + if (CS.isCall()) + NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); + else + NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); + NewInst->setDebugLoc(I->getDebugLoc()); + NewInst->takeName(I); + I->replaceAllUsesWith(NewInst); + I->eraseFromParent(); + + OpBundles.clear(); + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) + CatchSwitch->setParentPad(CallSiteEHPad); + } else { + auto *FPI = cast<FuncletPadInst>(I); + if (isa<ConstantTokenNone>(FPI->getParentPad())) + FPI->setParentPad(CallSiteEHPad); + } + } + } + // If we are inlining for an invoke instruction, we must make sure to rewrite // any call instructions into invoke instructions. - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) - HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); + if (auto *II = dyn_cast<InvokeInst>(TheCall)) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + if (isa<LandingPadInst>(FirstNonPHI)) { + HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } else { + HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } + } // Handle any inlined musttail call sites. 
In order for a new call site to be // musttail, the source of the clone and the inlined call site must have been @@ -1250,7 +1526,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the calling basic block. if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) { // Move all of the instructions right before the call. - OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(), + OrigBB->getInstList().splice(TheCall->getIterator(), + FirstNewBlock->getInstList(), FirstNewBlock->begin(), FirstNewBlock->end()); // Remove the cloned basic block. Caller->getBasicBlockList().pop_back(); @@ -1297,15 +1574,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more // symmetric to the call case. - AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest, - CalledFunc->getName()+".exit"); + AfterCallBB = + OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), + CalledFunc->getName() + ".exit"); } else { // It's a call // If this is a call instruction, we need to split the basic block that // the call lives in. // - AfterCallBB = OrigBB->splitBasicBlock(TheCall, - CalledFunc->getName()+".exit"); + AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), + CalledFunc->getName() + ".exit"); } // Change the branch that used to go to AfterCallBB to branch to the first @@ -1314,14 +1592,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, TerminatorInst *Br = OrigBB->getTerminator(); assert(Br && Br->getOpcode() == Instruction::Br && "splitBasicBlock broken!"); - Br->setOperand(0, FirstNewBlock); - + Br->setOperand(0, &*FirstNewBlock); // Now that the function is correct, make it a little bit nicer. In // particular, move the basic blocks inserted from the end of the function // into the space made by splitting the source basic block. - Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(), - FirstNewBlock, Caller->end()); + Caller->getBasicBlockList().splice(AfterCallBB->getIterator(), + Caller->getBasicBlockList(), FirstNewBlock, + Caller->end()); // Handle all of the return instructions that we just cloned in, and eliminate // any users of the original call/invoke instruction. @@ -1333,7 +1611,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // possible incoming values. if (!TheCall->use_empty()) { PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), - AfterCallBB->begin()); + &AfterCallBB->front()); // Anything that used the result of the function call should now use the // PHI node as their operand. TheCall->replaceAllUsesWith(PHI); @@ -1350,7 +1628,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } - // Add a branch to the merge points and remove return instructions. DebugLoc Loc; for (unsigned i = 0, e = Returns.size(); i != e; ++i) { @@ -1413,7 +1690,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Splice the code entry block into calling block, right before the // unconditional branch. CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes - OrigBB->getInstList().splice(Br, CalleeEntry->getInstList()); + OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList()); // Remove the unconditional branch. 
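The single-return fast path above is just an instruction splice; a reduced sketch of the same getInstList()/splice dance (hypothetical helper, assuming exactly one cloned block):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void mergeSingleInlinedBlock(Instruction *TheCall,
                                    BasicBlock *Inlined) {
  BasicBlock *OrigBB = TheCall->getParent();
  // Move every inlined instruction to just before the (soon-dead) call;
  // splicing keeps the instructions registered in the symbol table.
  OrigBB->getInstList().splice(TheCall->getIterator(),
                               Inlined->getInstList(), Inlined->begin(),
                               Inlined->end());
  // The now-empty clone can be dropped from the caller.
  Inlined->eraseFromParent();
}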
OrigBB->getInstList().erase(Br); diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 30edf3b..5687afa 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -380,14 +380,10 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { IRBuilder<> Builder(Rem); - Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - - if (RemTyBitWidth != 32 && RemTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Rem->getType()->getIntegerBitWidth() == 32 || + Rem->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed remainder if (Rem->getOpcode() == Instruction::SRem) { @@ -401,7 +397,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { // If we didn't actually generate an urem instruction, we're done // This happens for example if the input were constant. In this case the // Builder insertion point was unchanged - if (Rem == Builder.GetInsertPoint()) + if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -440,14 +436,10 @@ bool llvm::expandDivision(BinaryOperator *Div) { IRBuilder<> Builder(Div); - Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - - if (DivTyBitWidth != 32 && DivTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Div->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Div->getType()->getIntegerBitWidth() == 32 || + Div->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed division if (Div->getOpcode() == Instruction::SDiv) { @@ -461,7 +453,7 @@ bool llvm::expandDivision(BinaryOperator *Div) { // If we didn't actually generate an udiv instruction, we're done // This happens for example if the input were constant. 
In this case the // Builder insertion point was unchanged - if (Div == Builder.GetInsertPoint()) + if (Div == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -492,15 +484,14 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(RemTyBitWidth <= 32 && + "Div of bitwidth greater than 32 not supported"); - if (RemTyBitWidth == 32) + if (RemTyBitWidth == 32) return expandRemainder(Rem); // If bitwidth smaller than 32 extend inputs, extend output and proceed @@ -542,15 +533,13 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported"); - if (RemTyBitWidth == 64) + if (RemTyBitWidth == 64) return expandRemainder(Rem); // If bitwidth smaller than 64 extend inputs, extend output and proceed @@ -593,13 +582,11 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported"); if (DivTyBitWidth == 32) return expandDivision(Div); @@ -643,13 +630,12 @@ bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(DivTyBitWidth <= 64 && + "Div of bitwidth greater than 64 not supported"); if (DivTyBitWidth == 64) return expandDivision(Div); diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9d40b69..b4b2e14 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -31,8 +31,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -64,6 +66,13 @@ static bool processInstruction(Loop 
&L, Instruction &Inst, DominatorTree &DT, PredIteratorCache &PredCache, LoopInfo *LI) { SmallVector<Use *, 16> UsesToRewrite; + // Tokens cannot be used in PHI nodes, so we skip over them. + // We can run into tokens which are live out of a loop with catchswitch + // instructions in Windows EH if the catchswitch has one catchpad which + // is inside the loop and another which is not. + if (Inst.getType()->isTokenTy()) + return false; + BasicBlock *InstBB = Inst.getParent(); for (Use &U : Inst.uses()) { @@ -84,9 +93,8 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Invoke instructions are special in that their result value is not available // along their unwind edge. The code below tests to see whether DomBB - // dominates - // the value, so adjust DomBB to the normal destination block, which is - // effectively where the value is first usable. + // dominates the value, so adjust DomBB to the normal destination block, + // which is effectively where the value is first usable. BasicBlock *DomBB = Inst.getParent(); if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst)) DomBB = Inv->getNormalDest(); @@ -101,10 +109,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. - for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(), - BBE = ExitBlocks.end(); - BBI != BBE; ++BBI) { - BasicBlock *ExitBB = *BBI; + for (BasicBlock *ExitBB : ExitBlocks) { if (!DT.dominates(DomNode, DT.getNode(ExitBB))) continue; @@ -113,7 +118,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, continue; PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB), - Inst.getName() + ".lcssa", ExitBB->begin()); + Inst.getName() + ".lcssa", &ExitBB->front()); // Add inputs from inside the loop for this PHI. for (BasicBlock *Pred : PredCache.get(ExitBB)) { @@ -148,26 +153,26 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Rewrite all uses outside the loop in terms of the new PHIs we just // inserted. - for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) { + for (Use *UseToRewrite : UsesToRewrite) { // If this use is in an exit block, rewrite to use the newly inserted PHI. // This is required for correctness because SSAUpdate doesn't handle uses in // the same block. It assumes the PHI we inserted is at the end of the // block. - Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser()); + Instruction *User = cast<Instruction>(UseToRewrite->getUser()); BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast<PHINode>(User)) - UserBB = PN->getIncomingBlock(*UsesToRewrite[i]); + UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UsesToRewrite[i]->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); - UsesToRewrite[i]->set(UserBB->begin()); + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); + UseToRewrite->set(&UserBB->front()); continue; } // Otherwise, do full PHI insertion. 
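Each exit block dominated by the def receives one PHI, which is what keeps the form loop-closed. That insertion in isolation, as a hypothetical helper using plain predecessor iteration in place of PredIteratorCache:

#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *insertLCSSAPhi(Instruction &Inst, BasicBlock *ExitBB) {
  PHINode *PN = PHINode::Create(Inst.getType(), /*NumReservedValues=*/4,
                                Inst.getName() + ".lcssa", &ExitBB->front());
  // Every in-loop predecessor contributes the same value; the PHI exists
  // only so that uses outside the loop see a loop-closed def.
  for (BasicBlock *Pred : predecessors(ExitBB))
    PN->addIncoming(&Inst, Pred);
  return PN;
}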
- SSAUpdate.RewriteUse(*UsesToRewrite[i]); + SSAUpdate.RewriteUse(*UseToRewrite); } // Post process PHI instructions that were inserted into another disjoint loop @@ -190,10 +195,9 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, } // Remove PHI nodes that did not have any uses rewritten. - for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) { - if (AddedPHIs[i]->use_empty()) - AddedPHIs[i]->eraseFromParent(); - } + for (PHINode *PN : AddedPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); return true; } @@ -205,8 +209,8 @@ blockDominatesAnExit(BasicBlock *BB, DominatorTree &DT, const SmallVectorImpl<BasicBlock *> &ExitBlocks) { DomTreeNode *DomNode = DT.getNode(BB); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (DT.dominates(DomNode, DT.getNode(ExitBlocks[i]))) + for (BasicBlock *ExitBB : ExitBlocks) + if (DT.dominates(DomNode, DT.getNode(ExitBB))) return true; return false; @@ -227,25 +231,22 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, // Look at all the instructions in the loop, checking to see if they have uses // outside the loop. If so, rewrite those uses. - for (Loop::block_iterator BBI = L.block_begin(), BBE = L.block_end(); - BBI != BBE; ++BBI) { - BasicBlock *BB = *BBI; - + for (BasicBlock *BB : L.blocks()) { // For large loops, avoid use-scanning by using dominance information: In // particular, if a block does not dominate any of the loop exits, then none // of the values defined in the block could be used outside the loop. if (!blockDominatesAnExit(BB, DT, ExitBlocks)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Reject two common cases fast: instructions with no uses (like stores) // and instructions with one use that is in the same block as this. - if (I->use_empty() || - (I->hasOneUse() && I->user_back()->getParent() == BB && - !isa<PHINode>(I->user_back()))) + if (I.use_empty() || + (I.hasOneUse() && I.user_back()->getParent() == BB && + !isa<PHINode>(I.user_back()))) continue; - Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI); + Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI); } } @@ -266,8 +267,8 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, bool Changed = false; // Recurse depth-first through inner loops. 
- for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I) - Changed |= formLCSSARecursively(**I, DT, LI, SE); + for (Loop *SubLoop : L.getSubLoops()) + Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE); Changed |= formLCSSA(L, DT, LI, SE); return Changed; @@ -296,8 +297,10 @@ struct LCSSA : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); } }; } @@ -306,6 +309,8 @@ char LCSSA::ID = 0; INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -317,7 +322,8 @@ bool LCSSA::runOnFunction(Function &F) { bool Changed = false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index ba8af47..d2793e5 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -17,11 +17,13 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -188,9 +190,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BasicBlock *BB = SI->getParent(); // Remove entries from PHI nodes which we no longer branch to... - for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { + for (BasicBlock *Succ : SI->successors()) { // Found case matching a constant operand? - BasicBlock *Succ = SI->getSuccessor(i); if (Succ == TheOnlyDest) TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else @@ -230,6 +231,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SIDef->getValue().getZExtValue())); } + // Update make.implicit metadata to the newly-created conditional branch. + MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit); + if (MakeImplicitMD) + NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD); + // Delete the old switch. 
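One subtle point in the switch-to-branch fold above: metadata such as !make.implicit has to be carried over by hand, or the implicit-null-check information vanishes along with the switch. In isolation:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static void transferMakeImplicit(SwitchInst *SI, BranchInst *NewBr) {
  // Only the tag the hunk preserves; other kinds need their own policy.
  if (MDNode *MD = SI->getMetadata(LLVMContext::MD_make_implicit))
    NewBr->setMetadata(LLVMContext::MD_make_implicit, MD);
}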
SI->eraseFromParent(); return true; @@ -283,8 +289,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (!I->use_empty() || isa<TerminatorInst>(I)) return false; - // We don't want the landingpad instruction removed by anything this general. - if (isa<LandingPadInst>(I)) + // We don't want the landingpad-like instructions removed by anything this + // general. + if (I->isEHPad()) return false; // We don't want debug info removed by anything this general, unless @@ -414,6 +421,49 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, return false; } +static bool +simplifyAndDCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const DataLayout &DL, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + + return true; + } + + if (Value *SimpleV = SimplifyInstruction(I, DL)) { + // Add the users to the worklist. CAREFUL: an instruction can use itself, + // in the case of a phi node. + for (User *U : I->users()) + if (U != I) + WorkList.insert(cast<Instruction>(U)); + + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); + I->eraseFromParent(); + return true; + } + return false; +} + /// SimplifyInstructionsInBlock - Scan the specified basic block and try to /// simplify any instructions in it and recursively delete dead instructions. /// @@ -422,30 +472,34 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool MadeChange = false; + const DataLayout &DL = BB->getModule()->getDataLayout(); #ifndef NDEBUG // In debug builds, ensure that the terminator of the block is never replaced // or deleted by these simplifications. The idea of simplification is that it // cannot introduce new instructions, and there is no way to replace the // terminator of a block without introducing a new instruction. - AssertingVH<Instruction> TerminatorVH(--BB->end()); + AssertingVH<Instruction> TerminatorVH(&BB->back()); #endif - for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) { assert(!BI->isTerminator()); - Instruction *Inst = BI++; + Instruction *I = &*BI; + ++BI; - WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TLI)) { - MadeChange = true; - if (BIHandle != BI) - BI = BB->begin(); - continue; - } + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. 
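The reworked SimplifyInstructionsInBlock below drives everything off a SmallSetVector, giving LIFO visitation while deduplicating re-queued instructions. A stripped-down driver; the Process callback is hypothetical, standing in for simplifyAndDCEInstruction:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

template <typename FnTy>
static bool drainWorkList(SmallSetVector<Instruction *, 16> &WorkList,
                          FnTy Process) {
  bool Changed = false;
  while (!WorkList.empty()) {
    // pop_back_val visits the most recently added instruction first; the
    // set half of the structure keeps anything from being queued twice.
    Instruction *I = WorkList.pop_back_val();
    Changed |= Process(I, WorkList);
  }
  return Changed;
}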
+ if (!WorkList.count(I)) + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); + } - MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); - if (BIHandle != BI) - BI = BB->begin(); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); } return MadeChange; } @@ -808,7 +862,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { // Copy over any phi, debug or lifetime instruction. BB->getTerminator()->eraseFromParent(); - Succ->getInstList().splice(Succ->getFirstNonPHI(), BB->getInstList()); + Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(), + BB->getInstList()); } else { while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) { // We explicitly check for such uses in CanPropagatePredecessorsForPHIs. @@ -997,9 +1052,31 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0)); - if (ExtendedArg) - Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, DIExpr, + if (ExtendedArg) { + // We're now only describing a subset of the variable. The piece we're + // describing will always be smaller than the variable size, because + // VariableSize == Size of Alloca described by DDI. Since SI stores + // to the alloca described by DDI, if its first operand is an extend, + // we're guaranteed that before extension, the value was narrower than + // the size of the alloca, hence the size of the described variable. + SmallVector<uint64_t, 3> NewDIExpr; + unsigned PieceOffset = 0; + // If this already is a bit piece, we drop the bit piece from the expression + // and record the offset. + if (DIExpr->isBitPiece()) { + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); + PieceOffset = DIExpr->getBitPieceOffset(); + } else { + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); + } + NewDIExpr.push_back(dwarf::DW_OP_bit_piece); + NewDIExpr.push_back(PieceOffset); // Offset + const DataLayout &DL = DDI->getModule()->getDataLayout(); + NewDIExpr.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size + Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, + Builder.createExpression(NewDIExpr), DDI->getDebugLoc(), SI); + } else Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, DDI->getDebugLoc(), SI); @@ -1017,8 +1094,13 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (LdStHasDebugValue(DIVar, LI)) return true; - Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, - DDI->getDebugLoc(), LI); + // We are now tracking the loaded value instead of the address. In the + // future if multi-location support is added to the IR, it might be + // preferable to keep tracking both the loaded value and the original + // address in case the alloca cannot be elided.
+ Instruction *DbgValue = Builder.insertDbgValueIntrinsic( + LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr); + DbgValue->insertAfter(LI); return true; } @@ -1034,8 +1116,8 @@ bool llvm::LowerDbgDeclare(Function &F) { DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<DbgDeclareInst *, 4> Dbgs; for (auto &FI : F) - for (BasicBlock::iterator BI : FI) - if (auto DDI = dyn_cast<DbgDeclareInst>(BI)) + for (Instruction &BI : FI) + if (auto DDI = dyn_cast<DbgDeclareInst>(&BI)) Dbgs.push_back(DDI); if (Dbgs.empty()) @@ -1060,9 +1142,13 @@ bool llvm::LowerDbgDeclare(Function &F) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. + SmallVector<uint64_t, 1> NewDIExpr; + auto *DIExpr = DDI->getExpression(); + NewDIExpr.push_back(dwarf::DW_OP_deref); + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), - DDI->getExpression(), DDI->getDebugLoc(), - CI); + DIB.createExpression(NewDIExpr), + DDI->getDebugLoc(), CI); } DDI->eraseFromParent(); } @@ -1082,9 +1168,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1092,29 +1179,40 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); - if (Deref) { + if (Deref || Offset) { // Create a copy of the original DIDescriptor for user variable, prepending // "deref" operation to a list of address elements, as new llvm.dbg.declare // will take a value storing address of the memory for variable, not // alloca itself. SmallVector<uint64_t, 4> NewDIExpr; - NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Deref) + NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Offset > 0) { + NewDIExpr.push_back(dwarf::DW_OP_plus); + NewDIExpr.push_back(Offset); + } else if (Offset < 0) { + NewDIExpr.push_back(dwarf::DW_OP_minus); + NewDIExpr.push_back(-Offset); + } if (DIExpr) NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIExpr = Builder.createExpression(NewDIExpr); } - // Insert llvm.dbg.declare in the same basic block as the original alloca, - // and remove old llvm.dbg.declare. - BasicBlock *BB = AI->getParent(); - Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, BB); + // Insert llvm.dbg.declare immediately after the original alloca, and remove + // old llvm.dbg.declare. + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. 
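replaceDbgDeclare's expression surgery above prepends DW_OP_deref and an optional signed offset to whatever expression the old declare carried. The same construction as a standalone sketch (hypothetical helper):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Dwarf.h"
using namespace llvm;

static DIExpression *prependDerefAndOffset(DIBuilder &Builder,
                                           DIExpression *Expr, int Offset) {
  SmallVector<uint64_t, 4> Ops;
  Ops.push_back(dwarf::DW_OP_deref); // the new declare holds an address
  if (Offset > 0) {
    Ops.push_back(dwarf::DW_OP_plus);
    Ops.push_back(Offset);
  } else if (Offset < 0) {
    Ops.push_back(dwarf::DW_OP_minus);
    Ops.push_back(-Offset);
  }
  if (Expr) // keep whatever expression was already attached
    Ops.append(Expr->elements_begin(), Expr->elements_end());
  return Builder.createExpression(Ops);
}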
-static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + +void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. @@ -1132,7 +1230,7 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { new UnreachableInst(I->getContext(), I); // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); + BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end(); while (BBI != BBE) { if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); @@ -1142,8 +1240,11 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// changeToCall - Convert the specified invoke into a normal call. static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); @@ -1162,7 +1263,7 @@ static bool markAliveBlocks(Function &F, SmallPtrSetImpl<BasicBlock*> &Reachable) { SmallVector<BasicBlock*, 128> Worklist; - BasicBlock *BB = F.begin(); + BasicBlock *BB = &F.front(); Worklist.push_back(BB); Reachable.insert(BB); bool Changed = false; @@ -1187,7 +1288,7 @@ static bool markAliveBlocks(Function &F, if (MakeUnreachable) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; break; } @@ -1201,7 +1302,7 @@ static bool markAliveBlocks(Function &F, ++BBI; if (!isa<UnreachableInst>(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; } break; @@ -1227,8 +1328,9 @@ static bool markAliveBlocks(Function &F, } } - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + TerminatorInst *Terminator = BB->getTerminator(); + if (auto *II = dyn_cast<InvokeInst>(Terminator)) { + // Turn invokes that call 'nounwind' functions into ordinary calls. Value *Callee = II->getCalledValue(); if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { changeToUnreachable(II, true); @@ -1243,6 +1345,44 @@ static bool markAliveBlocks(Function &F, changeToCall(II); Changed = true; } + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) { + // Remove catchpads which cannot be reached. 
+ struct CatchPadDenseMapInfo { + static CatchPadInst *getEmptyKey() { + return DenseMapInfo<CatchPadInst *>::getEmptyKey(); + } + static CatchPadInst *getTombstoneKey() { + return DenseMapInfo<CatchPadInst *>::getTombstoneKey(); + } + static unsigned getHashValue(CatchPadInst *CatchPad) { + return static_cast<unsigned>(hash_combine_range( + CatchPad->value_op_begin(), CatchPad->value_op_end())); + } + static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) { + if (LHS == getEmptyKey() || LHS == getTombstoneKey() || + RHS == getEmptyKey() || RHS == getTombstoneKey()) + return LHS == RHS; + return LHS->isIdenticalTo(RHS); + } + }; + + // Set of unique CatchPads. + SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4, + CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>> + HandlerSet; + detail::DenseSetEmpty Empty; + for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(), + E = CatchSwitch->handler_end(); + I != E; ++I) { + BasicBlock *HandlerBB = *I; + auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI()); + if (!HandlerSet.insert({CatchPad, Empty}).second) { + CatchSwitch->removeHandler(I); + --I; + --E; + Changed = true; + } + } } Changed |= ConstantFoldTerminator(BB, true); @@ -1253,10 +1393,44 @@ static bool markAliveBlocks(Function &F, return Changed; } +void llvm::removeUnwindEdge(BasicBlock *BB) { + TerminatorInst *TI = BB->getTerminator(); + + if (auto *II = dyn_cast<InvokeInst>(TI)) { + changeToCall(II); + return; + } + + TerminatorInst *NewTI; + BasicBlock *UnwindDest; + + if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { + NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI); + UnwindDest = CRI->getUnwindDest(); + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(), + CatchSwitch->getName(), CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + + NewTI = NewCatchSwitch; + UnwindDest = CatchSwitch->getUnwindDest(); + } else { + llvm_unreachable("Could not find unwind successor"); + } + + NewTI->takeName(TI); + NewTI->setDebugLoc(TI->getDebugLoc()); + UnwindDest->removePredecessor(BB); + TI->replaceAllUsesWith(NewTI); + TI->eraseFromParent(); +} + /// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. -bool llvm::removeUnreachableBlocks(Function &F) { +bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { SmallPtrSet<BasicBlock*, 128> Reachable; bool Changed = markAliveBlocks(F, Reachable); @@ -1270,17 +1444,20 @@ bool llvm::removeUnreachableBlocks(Function &F) { // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... 
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) + if (Reachable.count(&*BB)) continue; - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; + ++SI) if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); + (*SI)->removePredecessor(&*BB); + if (LVI) + LVI->eraseBlock(&*BB); BB->dropAllReferences(); } for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) + if (!Reachable.count(&*I)) I = F.getBasicBlockList().erase(I); else ++I; @@ -1288,9 +1465,10 @@ bool llvm::removeUnreachableBlocks(Function &F) { return true; } -void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs) { +void llvm::combineMetadata(Instruction *K, const Instruction *J, + ArrayRef<unsigned> KnownIDs) { SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; - K->dropUnknownMetadata(KnownIDs); + K->dropUnknownNonDebugMetadata(KnownIDs); K->getAllMetadataOtherThanDebugLoc(Metadata); for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { unsigned Kind = Metadata[i].first; @@ -1326,8 +1504,29 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign // Only set the !nonnull if it is present in both instructions. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_invariant_group: + // Preserve !invariant.group in K. + break; + case LLVMContext::MD_align: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; } } + // Set !invariant.group from J if J has it. If both instructions have it + // then we will just pick it from J - even when they are different. + // Also make sure that K is load or store - f.e. combining bitcast with load + // could produce bitcast with invariant.group metadata, which is invalid. + // FIXME: we should try to preserve both invariant.group md if they are + // different, but right now instruction can only have one invariant.group. + if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group)) + if (isa<LoadInst>(K) || isa<StoreInst>(K)) + K->setMetadata(LLVMContext::MD_invariant_group, JMD); } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, @@ -1349,3 +1548,40 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, } return Count; } + +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlock *BB) { + assert(From->getType() == To->getType()); + + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *I = cast<Instruction>(U.getUser()); + if (DT.dominates(BB, I->getParent())) { + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; + } + } + return Count; +} + +bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { + if (isa<IntrinsicInst>(CS.getInstruction())) + // Most LLVM intrinsics are things which can never take a safepoint. + // As a result, we don't need to have the stack parsable at the + // callsite. This is a highly useful optimization since intrinsic + // calls are fairly prevalent, particularly in debug builds. 
+ return true; + + // Check if the function is specifically marked as a gc leaf function. + if (CS.hasFnAttr("gc-leaf-function")) + return true; + if (const Function *F = CS.getCalledFunction()) + return F->hasFnAttribute("gc-leaf-function"); + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 5c98043..1fa4695 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -44,11 +44,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -78,7 +81,7 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, SmallVectorImpl<BasicBlock *> &SplitPreds, Loop *L) { // Check to see if NewBB is already well placed. - Function::iterator BBI = NewBB; --BBI; + Function::iterator BBI = --NewBB->getIterator(); for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { if (&*BBI == SplitPreds[i]) return; @@ -92,9 +95,8 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, // block that neighbors a BB actually in the loop. BasicBlock *FoundBB = nullptr; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { - Function::iterator BBI = SplitPreds[i]; - if (++BBI != NewBB->getParent()->end() && - L->contains(BBI)) { + Function::iterator BBI = SplitPreds[i]->getIterator(); + if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) { FoundBB = SplitPreds[i]; break; } @@ -112,17 +114,10 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Header = L->getHeader(); - // Get analyses that we try to update. - auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); - auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -141,8 +136,10 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Split out the loop pre-header. 
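// Callers of InsertPreheaderForLoop migrate from handing in a Pass* to
// passing the analyses directly; a sketch of a call under the new
// signature above (header assumed: llvm/Transforms/Utils/LoopUtils.h).
// Note the function can now fail and return null, as the check just
// below shows.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

static llvm::BasicBlock *ensurePreheader(llvm::Loop *L,
                                         llvm::DominatorTree *DT,
                                         llvm::LoopInfo *LI) {
  // PreserveLCSSA replaces the old mustPreserveAnalysisID(LCSSAID) query.
  return llvm::InsertPreheaderForLoop(L, DT, LI, /*PreserveLCSSA=*/true);
}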
BasicBlock *PreheaderBB; - PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - AA, DT, LI, PreserveLCSSA); + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT, + LI, PreserveLCSSA); + if (!PreheaderBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating pre-header " << PreheaderBB->getName() << "\n"); @@ -159,8 +156,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { /// This method is used to split exit blocks that have predecessors outside of /// the loop. static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, Pass *PP) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; @@ -175,10 +172,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewExitBB = nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, - LI, PreserveLCSSA); + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, + PreserveLCSSA); + if (!NewExitBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); @@ -206,8 +203,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, /// \brief The first part of loop-nestification is to find a PHI node that tells /// us how to partition the loops. -static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, - DominatorTree *DT, +static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT, AssumptionCache *AC) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { @@ -216,7 +212,6 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); - if (AA) AA->deleteValue(PN); PN->eraseFromParent(); continue; } @@ -251,18 +246,18 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, /// created. /// static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, ScalarEvolution *SE, Pass *PP, + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, bool PreserveLCSSA, AssumptionCache *AC) { // Don't try to separate loops without a preheader. if (!Preheader) return nullptr; // The header is not a landing pad; preheader insertion should ensure this. - assert(!L->getHeader()->isLandingPad() && - "Can't insert backedge to landing pad"); + BasicBlock *Header = L->getHeader(); + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); - PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC); + PHINode *PN = findPHIToPartitionLoops(L, DT, AC); if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. 
This @@ -286,11 +281,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (SE) SE->forgetLoop(L); - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - BasicBlock *Header = L->getHeader(); BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", - AA, DT, LI, PreserveLCSSA); + DT, LI, PreserveLCSSA); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -357,7 +349,6 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, /// and have that block branch to the loop header. This ensures that loops /// have exactly one backedge. static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI) { assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); @@ -369,8 +360,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (!Preheader) return nullptr; - // The header is not a landing pad; preheader insertion should ensure this. - assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); + // The header is not an EH pad; preheader insertion should ensure this. + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); // Figure out which basic blocks contain back-edges to the loop header. std::vector<BasicBlock*> BackedgeBlocks; @@ -394,7 +385,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, << BEBlock->getName() << "\n"); // Move the new backedge block to right after the last backedge block. - Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos; + Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator(); F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock); // Now that the block has been inserted into the function, create PHI nodes in @@ -443,7 +434,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // eliminate the PHI Node. if (HasUniqueIncomingValue) { NewPN->replaceAllUsesWith(UniqueValue); - if (AA) AA->deleteValue(NewPN); BEBlock->getInstList().erase(NewPN); } } @@ -470,15 +460,10 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, } /// \brief Simplify one loop and queue further loops for simplification. -/// -/// FIXME: Currently this accepts both lots of analyses that it uses and a raw -/// Pass pointer. The Pass pointer is used by numerous utilities to update -/// specific analyses. Rather than a pass it would be much cleaner and more -/// explicit if they accepted the analysis directly and then updated it. static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, Pass *PP, - AssumptionCache *AC) { + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; ReprocessLoop: @@ -544,7 +529,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L, PP); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (Preheader) { ++NumInserted; Changed = true; @@ -568,7 +553,7 @@ ReprocessLoop: // Must be exactly this loop: no subloops, parent loops, or non-loop preds // allowed. 
if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { + if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { ++NumInserted; Changed = true; } @@ -585,7 +570,7 @@ ReprocessLoop: // common backedge instead. if (L->getNumBackEdges() < 8) { if (Loop *OuterL = - separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) { + separateNestedLoop(L, Preheader, DT, LI, SE, PreserveLCSSA, AC)) { ++NumNested; // Enqueue the outer loop as it should be processed next in our // depth-first nest walk. @@ -602,7 +587,7 @@ ReprocessLoop: // If we either couldn't, or didn't want to, identify nesting of the loops, // insert a new block that all backedges target, then make it jump to the // loop header. - LoopLatch = insertUniqueBackedgeBlock(L, Preheader, AA, DT, LI); + LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI); if (LoopLatch) { ++NumInserted; Changed = true; @@ -618,7 +603,6 @@ ReprocessLoop: for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { - if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); PN->eraseFromParent(); @@ -654,7 +638,7 @@ ReprocessLoop: bool AllInvariant = true; bool AnyInvariant = false; for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Skip debug info intrinsics. if (isa<DbgInfoIntrinsic>(Inst)) continue; @@ -716,9 +700,9 @@ ReprocessLoop: return Changed; } -bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, - AliasAnalysis *AA, ScalarEvolution *SE, - AssumptionCache *AC) { +bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; // Worklist maintains our depth-first queue of loops in this nest to process. @@ -734,8 +718,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, } while (!Worklist.empty()) - Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, - SE, PP, AC); + Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE, + AC, PreserveLCSSA); return Changed; } @@ -747,9 +731,6 @@ namespace { initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); } - // AA - If we have an alias analysis object to update, this is it, otherwise - // this is null. - AliasAnalysis *AA; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; @@ -767,8 +748,11 @@ namespace { AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addPreserved<DependenceAnalysis>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. 
} @@ -784,6 +768,9 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -796,15 +783,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; - AA = getAnalysisIfAvailable<AliasAnalysis>(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC); + Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA); return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1dbce47..eea9237 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -73,7 +73,7 @@ static inline void RemapInstruction(Instruction *I, /// of loops that have already been forgotten to prevent redundant, expensive /// calls to ScalarEvolution::forgetLoop. Returns the new combined block. static BasicBlock * -FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, +FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, SmallPtrSetImpl<Loop *> &ForgottenLoops) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and @@ -109,12 +109,10 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. - if (LPM) { - if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) { - if (ForgottenLoops.insert(L).second) - SE->forgetLoop(L); - } + if (SE) { + if (Loop *L = LI->getLoopFor(BB)) { + if (ForgottenLoops.insert(L).second) + SE->forgetLoop(L); } } LI->removeBlock(BB); @@ -155,15 +153,13 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// /// The LoopInfo Analysis that is passed will be kept consistent. /// -/// If a LoopPassManager is passed in, and the loop is fully removed, it will be -/// removed from the LoopPassManager as well. LPM can also be NULL. -/// -/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are -/// available from the Pass it must also preserve those analyses. +/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and +/// DominatorTree if they are non-null. 
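// The runOnFunction above reduces to a small driver once the analyses are
// explicit; a sketch of calling the new simplifyLoop entry point (header
// assumed: llvm/Transforms/Utils/LoopUtils.h). Passing a null
// ScalarEvolution simply means "not available, do not update it".
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

static bool simplifyAllLoops(llvm::LoopInfo &LI, llvm::DominatorTree &DT,
                             llvm::AssumptionCache &AC) {
  bool Changed = false;
  for (llvm::Loop *L : LI) // top-level loops; simplifyLoop walks each nest
    Changed |= llvm::simplifyLoop(L, &DT, &LI, /*SE=*/nullptr, &AC,
                                  /*PreserveLCSSA=*/false);
  return Changed;
}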
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, Pass *PP, - LPPassManager *LPM, AssumptionCache *AC) { + unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -220,6 +216,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getExitBlocks(ExitBlocks); + Loop *ParentL = L->getParentLoop(); + bool AllExitsAreInsideParentLoop = !ParentL || + std::all_of(ExitBlocks.begin(), ExitBlocks.end(), + [&](BasicBlock *BB) { return ParentL->contains(BB); }); // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime @@ -227,13 +229,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); if (RuntimeTripCount && - !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, LPM)) + !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, + PreserveLCSSA)) return false; // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = - PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr; if (SE) SE->forgetLoop(L); @@ -392,7 +393,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - ::RemapInstruction(I, LastValueMap); + ::RemapInstruction(&*I, LastValueMap); } // Loop over the PHI nodes in the original block, setting incoming values. @@ -432,8 +433,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // For a complete unroll, make the last iteration end with a branch // to the exit block. - if (CompletelyUnroll && j == 0) { - Dest = LoopExit; + if (CompletelyUnroll) { + if (j == 0) + Dest = LoopExit; NeedConditional = false; } @@ -473,7 +475,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); - if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } @@ -483,29 +485,24 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // whole function's cache. AC->clear(); - DominatorTree *DT = nullptr; - if (PP) { - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTreeWrapperPass *DTWP = - PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DT = &DTWP->getDomTree(); - DT->recalculate(*L->getHeader()->getParent()); - } - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. 
Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); - } + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DT) + DT->recalculate(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); } + // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. @@ -514,7 +511,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); @@ -529,29 +526,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, ++NumUnrolled; Loop *OuterL = L->getParentLoop(); - // Remove the loop from the LoopPassManager if it's completely removed. - if (CompletelyUnroll && LPM != nullptr) - LPM->deleteLoopFromQueue(L); + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->markAsRemoved(L); // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. - if (PP && DT) { + if (DT) { if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC); + bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after - // deleteLoopFromQueue updates LoopInfo. + // LoopInfo's been updated by markAsRemoved. 
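// A sketch of a call under the new UnrollLoop signature above; the unroll
// factor and trip count are made-up values, and SE/DT may be null when
// unavailable (header assumed: llvm/Transforms/Utils/UnrollLoop.h).
#include "llvm/Transforms/Utils/UnrollLoop.h"

static bool unrollByFour(llvm::Loop *L, llvm::LoopInfo *LI,
                         llvm::ScalarEvolution *SE, llvm::DominatorTree *DT,
                         llvm::AssumptionCache *AC) {
  return llvm::UnrollLoop(L, /*Count=*/4, /*TripCount=*/128,
                          /*AllowRuntime=*/false,
                          /*AllowExpensiveTripCount=*/false,
                          /*TripMultiple=*/128, LI, SE, DT, AC,
                          /*PreserveLCSSA=*/true);
}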
Loop *LatchLoop = LI->getLoopFor(Latches.back()); if (!OuterL->contains(LatchLoop)) while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); - formLCSSARecursively(*OuterL, *DT, LI, SE); + if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified)) + formLCSSARecursively(*OuterL, *DT, LI, SE); + else + assert(OuterL->isLCSSAForm(*DT) && + "Loops should be in LCSSA form after loop-unroll."); } } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index add5432..0d68f18 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -62,8 +62,8 @@ STATISTIC(NumRuntimeUnrolled, static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, Pass *P) { + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -127,8 +127,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); - SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, - P->mustPreserveAnalysisID(LCSSAID)); + SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, + PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) B.CreateCondBr(BrLoopExit, Exit, NewPH); InsertPt->eraseFromParent(); @@ -150,7 +150,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); - Loop *NewLoop = 0; + Loop *NewLoop = nullptr; Loop *ParentLoop = L->getParentLoop(); if (!UnrollProlog) { NewLoop = new Loop(); @@ -206,9 +206,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // Change the incoming values to the ones defined in the preheader or // cloned loop. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[I]); + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); if (UnrollProlog) { - VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); @@ -279,7 +279,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, bool AllowExpensiveTripCount, LoopInfo *LI, - LPPassManager *LPM) { + ScalarEvolution *SE, DominatorTree *DT, + bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; @@ -291,9 +292,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Use Scalar Evolution to compute the trip count. 
This allows more // loops to be unrolled than relying on induction var simplification - if (!LPM) - return false; - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; @@ -308,7 +306,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; @@ -333,10 +331,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); - // Grab analyses that we preserve. - auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the @@ -397,8 +391,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, VMap, LI); // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], - F->end()); + F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. @@ -406,7 +400,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { - RemapInstruction(I, VMap, + RemapInstruction(&*I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } @@ -414,8 +408,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Connect the prolog code to the original loop and update the // PHI functions. 
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); - ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, - /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, + PreserveLCSSA); NumRuntimeUnrolled++; return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp index 5cbde94..fa958e9 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -12,13 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -34,6 +34,124 @@ bool RecurrenceDescriptor::areAllUsesIn(Instruction *I, return true; } +bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_IntegerOr: + case RK_IntegerAnd: + case RK_IntegerXor: + case RK_IntegerMinMax: + return true; + } + return false; +} + +bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) { + return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind); +} + +bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_FloatAdd: + case RK_FloatMult: + return true; + } + return false; +} + +Instruction * +RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + if (!Phi->hasOneUse()) + return Phi; + + const APInt *M = nullptr; + Instruction *I, *J = cast<Instruction>(Phi->use_begin()->getUser()); + + // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT + // with a new integer type of the corresponding bit width. + if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), + m_And(m_APInt(M), m_Instruction(I))))) { + int32_t Bits = (*M + 1).exactLogBase2(); + if (Bits > 0) { + RT = IntegerType::get(Phi->getContext(), Bits); + Visited.insert(Phi); + CI.insert(J); + return J; + } + } + return Phi; +} + +bool RecurrenceDescriptor::getSourceExtensionKind( + Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + + SmallVector<Instruction *, 8> Worklist; + bool FoundOneOperand = false; + unsigned DstSize = RT->getPrimitiveSizeInBits(); + Worklist.push_back(Exit); + + // Traverse the instructions in the reduction expression, beginning with the + // exit value. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (Use &U : I->operands()) { + + // Terminate the traversal if the operand is not an instruction, or we + // reach the starting value. + Instruction *J = dyn_cast<Instruction>(U.get()); + if (!J || J == Start) + continue; + + // Otherwise, investigate the operation if it is also in the expression. 
+ if (Visited.count(J)) { + Worklist.push_back(J); + continue; + } + + // If the operand is not in Visited, it is not a reduction operation, but + // it does feed into one. Make sure it is either a single-use sign- or + // zero-extend instruction. + CastInst *Cast = dyn_cast<CastInst>(J); + bool IsSExtInst = isa<SExtInst>(J); + if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst)) + return false; + + // Ensure the source type of the extend is no larger than the reduction + // type. It is not necessary for the types to be identical. + unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + if (SrcSize > DstSize) + return false; + + // Furthermore, ensure that all such extends are of the same kind. + if (FoundOneOperand) { + if (IsSigned != IsSExtInst) + return false; + } else { + FoundOneOperand = true; + IsSigned = IsSExtInst; + } + + // Lastly, if the source type of the extend matches the reduction type, + // add the extend to CI so that we can avoid accounting for it in the + // cost model. + if (SrcSize == DstSize) + CI.insert(Cast); + } + } + return true; +} + bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes) { @@ -68,10 +186,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, unsigned NumCmpSelectPatternInst = 0; InstDesc ReduxDesc(false, nullptr); + // Data used for determining if the recurrence has been type-promoted. + Type *RecurrenceType = Phi->getType(); + SmallPtrSet<Instruction *, 4> CastInsts; + Instruction *Start = Phi; + bool IsSigned = false; + SmallPtrSet<Instruction *, 8> VisitedInsts; SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(Phi); - VisitedInsts.insert(Phi); + + // Return early if the recurrence kind does not match the type of Phi. If the + // recurrence kind is arithmetic, we attempt to look through AND operations + // resulting from the type promotion performed by InstCombine. Vector + // operations are not limited to the legal integer widths, so we may be able + // to evaluate the reduction in the narrower width. + if (RecurrenceType->isFloatingPointTy()) { + if (!isFloatingPointRecurrenceKind(Kind)) + return false; + } else { + if (!isIntegerRecurrenceKind(Kind)) + return false; + if (isArithmeticRecurrenceKind(Kind)) + Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts); + } + + Worklist.push_back(Start); + VisitedInsts.insert(Start); // A value in the reduction can be used: // - By the reduction: @@ -110,10 +250,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) return false; - // Any reduction instruction must be of one of the allowed kinds. - ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); - if (!ReduxDesc.isRecurrence()) - return false; + // Any reduction instruction must be of one of the allowed kinds. We ignore + // the starting value (the Phi or an AND instruction if the Phi has been + // type-promoted). + if (Cur != Start) { + ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); + if (!ReduxDesc.isRecurrence()) + return false; + } // A reduction operation must only have one use of the reduction value. if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && @@ -131,7 +275,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. 
- FoundReduxOp |= !IsAPhi; + FoundReduxOp |= !IsAPhi && Cur != Start; // Process users of current instruction. Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI @@ -193,6 +337,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + // If we think Phi may have been type-promoted, we also need to ensure that + // all source operands of the reduction are either SExtInsts or ZEstInsts. If + // so, we will be able to evaluate the reduction in the narrower bit width. + if (Start != Phi) + if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, + IsSigned, VisitedInsts, CastInsts)) + return false; + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -200,9 +352,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.getMinMaxKind()); - + RecurrenceDescriptor RD( + RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(), + ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts); RedDes = RD; return true; @@ -263,14 +415,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = FP && I->hasUnsafeAlgebra(); + Instruction *UAI = Prev.getUnsafeAlgebraInst(); + if (!UAI && FP && !I->hasUnsafeAlgebra()) + UAI = I; // Found an unsafe (unvectorizable) algebra instruction. + switch (I->getOpcode()) { default: return InstDesc(false, I); case Instruction::PHI: - if (FP && - (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) - return InstDesc(false, I); return InstDesc(I, Prev.getMinMaxKind()); case Instruction::Sub: case Instruction::Add: @@ -284,10 +436,10 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, case Instruction::Xor: return InstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return InstDesc(Kind == RK_FloatMult && FastMath, I); + return InstDesc(Kind == RK_FloatMult, I, UAI); case Instruction::FSub: case Instruction::FAdd: - return InstDesc(Kind == RK_FloatAdd && FastMath, I); + return InstDesc(Kind == RK_FloatAdd, I, UAI); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: @@ -442,6 +594,13 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, break; } + // We only match FP sequences with unsafe algebra, so we can unconditionally + // set it on any generated instructions. 
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + Builder.setFastMathFlags(FMF); + Value *Cmp; if (RK == MRK_FloatMin || RK == MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); @@ -452,8 +611,54 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, return Select; } -bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, - ConstantInt *&StepValue) { +InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, + ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); +} + +int InductionDescriptor::getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; +} + +Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + assert(Index->getType() == StepValue->getType() && + "Index type does not match StepValue type"); + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(nullptr, StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + +bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + InductionDescriptor &D) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -467,6 +672,10 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, return false; } + assert(AR->getLoop()->getHeader() == Phi->getParent() && + "PHI is an AddRec for a different loop?!"); + Value *StartValue = + Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -475,7 +684,7 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, ConstantInt *CV = C->getValue(); if (PhiTy->isIntegerTy()) { - StepValue = CV; + D = InductionDescriptor(StartValue, IK_IntInduction, CV); return true; } @@ -494,6 +703,27 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, int64_t CVSize = CV->getSExtValue(); if (CVSize % Size) return false; - StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + + D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue); return true; } + +/// \brief Returns the instructions that use values defined in the loop. 
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { + SmallVector<Instruction *, 8> UsedOutside; + + for (auto *Block : L->getBlocks()) + // FIXME: I believe that this could use copy_if if the Inst reference could + // be adapted into a pointer. + for (auto &Inst : *Block) { + auto Users = Inst.users(); + if (std::any_of(Users.begin(), Users.end(), [&](User *U) { + auto *Use = cast<Instruction>(U); + return !L->contains(Use->getParent()); + })) + UsedOutside.push_back(&Inst); + } + + return UsedOutside; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 832079d..9a2a06c 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -13,43 +13,81 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" using namespace llvm; LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, - DominatorTree *DT, - const SmallVector<int, 8> *PtrToPartition) - : VersionedLoop(L), NonVersionedLoop(nullptr), - PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) { + DominatorTree *DT, ScalarEvolution *SE, + bool UseLAIChecks) + : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), + SE(SE) { assert(L->getExitBlock() && "No single exit block"); assert(L->getLoopPreheader() && "No preheader"); + if (UseLAIChecks) { + setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); + setSCEVChecks(LAI.PSE.getUnionPredicate()); + } } -bool LoopVersioning::needsRuntimeChecks() const { - return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition); +void LoopVersioning::setAliasChecks( + const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) { + AliasChecks = std::move(Checks); } -void LoopVersioning::versionLoop(Pass *P) { +void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) { + Preds = std::move(Check); +} + +void LoopVersioning::versionLoop( + const SmallVectorImpl<Instruction *> &DefsUsedOutside) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; + Value *SCEVRuntimeCheck; + Value *RuntimeCheck = nullptr; + // Add the memcheck in the original preheader (this is empty initially). - BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); + BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); std::tie(FirstCheckInst, MemRuntimeCheck) = - LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); + LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks); assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), + "scev.check"); + SCEVRuntimeCheck = + Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator()); + auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); + + // Discard the SCEV runtime check if it is always true. 
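// The shape of findDefsUsedOutsideOfLoop above, as a standalone sketch: a
// definition is a live-out when any of its users sits outside the region
// (toy types; the real code walks LLVM's use lists).
#include <algorithm>
#include <vector>

struct Inst {
  std::vector<Inst *> Users;
  bool InLoop;
};

static std::vector<Inst *>
defsUsedOutside(const std::vector<Inst *> &LoopInsts) {
  std::vector<Inst *> UsedOutside;
  for (Inst *I : LoopInsts)
    if (std::any_of(I->Users.begin(), I->Users.end(),
                    [](const Inst *U) { return !U->InLoop; }))
      UsedOutside.push_back(I);
  return UsedOutside;
}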
+ if (CI && CI->isZero()) + SCEVRuntimeCheck = nullptr; + + if (MemRuntimeCheck && SCEVRuntimeCheck) { + RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, + SCEVRuntimeCheck, "ldist.safe"); + if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) + I->insertBefore(RuntimeCheckBB->getTerminator()); + } else + RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; + + assert(RuntimeCheck && "called even though we don't need " + "any runtime checks"); + // Rename the block to make the IR more readable. - MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck"); + RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() + + ".lver.check"); // Create empty preheader for the loop (and after cloning for the // non-versioned loop). - BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); + BasicBlock *PH = + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. @@ -58,20 +96,23 @@ void LoopVersioning::versionLoop(Pass *P) { // block is a join between the two loops. SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks; NonVersionedLoop = - cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig", - LI, DT, NonVersionedLoopBlocks); + cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap, + ".lver.orig", LI, DT, NonVersionedLoopBlocks); remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap); // Insert the conditional branch based on the result of the memchecks. - Instruction *OrigTerm = MemCheckBB->getTerminator(); + Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), MemRuntimeCheck, - OrigTerm); + VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the // memchecking block. - DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); + DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB); + + // Adds the necessary PHI nodes for the versioned loops based on the + // loop-defined values used outside of the loop. + addPHINodes(DefsUsedOutside); } void LoopVersioning::addPHINodes( @@ -94,7 +135,7 @@ void LoopVersioning::addPHINodes( // If not create it. if (!PN) { PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", - PHIBlock->begin()); + &PHIBlock->front()); for (auto *User : Inst->users()) if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) User->replaceUsesOfWith(Inst, PN); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 66d57b0..b0ad4d5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -69,7 +69,7 @@ bool LowerInvoke::runOnFunction(Function &F) { BranchInst::Create(II->getNormalDest(), II); // Remove any PHI node entries from the exception destination. - II->getUnwindDest()->removePredecessor(BB); + II->getUnwindDest()->removePredecessor(&*BB); // Remove the invoke instruction now. 
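// The control flow versionLoop produces, sketched as straight-line C++:
// one combined guard ("ldist.safe" above, the OR of the memcheck and the
// SCEV check) selects between the two loop copies, and the shared exit
// merges live-outs through the PHIs addPHINodes inserts.
static void versionedShape(bool MemCheckFired, bool SCEVCheckFired) {
  if (MemCheckFired || SCEVCheckFired) {
    // .lver.orig: conservatively run the unmodified loop copy.
  } else {
    // Versioned copy: the memory/SCEV assumptions are known to hold.
  }
  // Shared exit block: one PHI per loop-defined value used outside.
}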
BB->getInstList().erase(II); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 4acd988..52beb15 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -49,8 +49,7 @@ namespace { return I != Ranges.end() && I->Low <= R.Low; } - /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch - /// instructions. + /// Replace all SwitchInst instructions with chained branch instructions. class LowerSwitch : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid @@ -78,7 +77,7 @@ namespace { typedef std::vector<CaseRange> CaseVector; typedef std::vector<CaseRange>::iterator CaseItr; private: - void processSwitchInst(SwitchInst *SI); + void processSwitchInst(SwitchInst *SI, SmallPtrSetImpl<BasicBlock*> &DeleteList); BasicBlock *switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, @@ -116,21 +115,30 @@ FunctionPass *llvm::createLowerSwitchPass() { bool LowerSwitch::runOnFunction(Function &F) { bool Changed = false; + SmallPtrSet<BasicBlock*, 8> DeleteList; for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks + BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks + + // If the block is a dead Default block that will be deleted later, don't + // waste time processing it. + if (DeleteList.count(Cur)) + continue; if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) { Changed = true; - processSwitchInst(SI); + processSwitchInst(SI, DeleteList); } } + for (BasicBlock* BB: DeleteList) { + DeleteDeadBlock(BB); + } + return Changed; } -// operator<< - Used for debugging purposes. -// +/// Used for debugging purposes. static raw_ostream& operator<<(raw_ostream &O, const LowerSwitch::CaseVector &C) LLVM_ATTRIBUTE_USED; @@ -147,23 +155,24 @@ static raw_ostream& operator<<(raw_ostream &O, return O << "]"; } -// \brief Update the first occurrence of the "switch statement" BB in the PHI -// node with the "new" BB. The other occurrences will: -// -// 1) Be updated by subsequent calls to this function. Switch statements may -// have more than one outcoming edge into the same BB if they all have the same -// value. When the switch statement is converted these incoming edges are now -// coming from multiple BBs. -// 2) Removed if subsequent incoming values now share the same case, i.e., -// multiple outcome edges are condensed into one. This is necessary to keep the -// number of phi values equal to the number of branches to SuccBB. +/// \brief Update the first occurrence of the "switch statement" BB in the PHI +/// node with the "new" BB. The other occurrences will: +/// +/// 1) Be updated by subsequent calls to this function. Switch statements may +/// have more than one outcoming edge into the same BB if they all have the same +/// value. When the switch statement is converted these incoming edges are now +/// coming from multiple BBs. +/// 2) Removed if subsequent incoming values now share the same case, i.e., +/// multiple outcome edges are condensed into one. This is necessary to keep the +/// number of phi values equal to the number of branches to SuccBB. 
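// Why runOnFunction above queues blocks in DeleteList instead of erasing
// them on the spot, as a standalone sketch: deleting while iterating would
// invalidate the iterator, so dead default blocks are condemned during the
// walk and swept afterwards (toy ownership model).
#include <algorithm>
#include <unordered_set>
#include <vector>

struct Block { bool Dead = false; };

static void processAll(std::vector<Block *> &Blocks) {
  std::unordered_set<Block *> DeleteList;
  for (Block *B : Blocks) // safe: no erasures happen during the walk
    if (!DeleteList.count(B) && B->Dead)
      DeleteList.insert(B);
  Blocks.erase(std::remove_if(Blocks.begin(), Blocks.end(),
                              [&](Block *B) { return DeleteList.count(B); }),
               Blocks.end());
  for (Block *B : DeleteList) // deferred sweep after iteration ends
    delete B;
}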
static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, unsigned NumMergedCases) { - for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI(); + for (BasicBlock::iterator I = SuccBB->begin(), + IE = SuccBB->getFirstNonPHI()->getIterator(); I != IE; ++I) { PHINode *PN = cast<PHINode>(I); - // Only update the first occurence. + // Only update the first occurrence. unsigned Idx = 0, E = PN->getNumIncomingValues(); unsigned LocalNumMergedCases = NumMergedCases; for (; Idx != E; ++Idx) { @@ -173,7 +182,7 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } - // Remove additional occurences coming from condensed cases and keep the + // Remove additional occurrences coming from condensed cases and keep the // number of incoming values equal to the number of branches to SuccBB. SmallVector<unsigned, 8> Indices; for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) @@ -188,11 +197,11 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } -// switchConvert - Convert the switch statement into a binary lookup of -// the case values. The function recursively builds this tree. -// LowerBound and UpperBound are used to keep track of the bounds for Val -// that have already been checked by a block emitted by one of the previous -// calls to switchConvert in the call stack. +/// Convert the switch statement into a binary lookup of the case values. +/// The function recursively builds this tree. LowerBound and UpperBound are +/// used to keep track of the bounds for Val that have already been checked by +/// a block emitted by one of the previous calls to switchConvert in the call +/// stack. BasicBlock * LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, Value *Val, @@ -278,28 +287,24 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, UpperBound, Val, NewNode, OrigBlock, Default, UnreachableRanges); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewNode); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); return NewNode; } -// newLeafBlock - Create a new leaf block for the binary lookup tree. It -// checks if the switch's value == the case's value. If not, then it -// jumps to the default branch. At this point in the tree, the value -// can't be another valid case value, so the jump to the "default" branch -// is warranted. -// +/// Create a new leaf block for the binary lookup tree. It checks if the +/// switch's value == the case's value. If not, then it jumps to the default +/// branch. At this point in the tree, the value can't be another valid case +/// value, so the jump to the "default" branch is warranted. BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, BasicBlock* OrigBlock, BasicBlock* Default) { Function* F = OrigBlock->getParent(); BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewLeaf); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf); // Emit comparison ICmpInst* Comp = nullptr; @@ -352,7 +357,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, return NewLeaf; } -// Clusterify - Transform simple list of Cases into list of CaseRange's +/// Transform simple list of Cases into list of CaseRange's. 
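// What switchConvert builds, reduced to a standalone sketch: a balanced
// binary search over the sorted case ranges, with the LowerBound and
// UpperBound pruning mirrored here by the Lo/Hi recursion bounds.
#include <cstddef>
#include <vector>

struct Range { int Low, High, Dest; };

static int dispatch(const std::vector<Range> &Cases, std::size_t Lo,
                    std::size_t Hi, int Val, int DefaultDest) {
  if (Lo >= Hi)
    return DefaultDest;                 // fell off the tree: default
  std::size_t Mid = Lo + (Hi - Lo) / 2; // pivot case range
  if (Val < Cases[Mid].Low)
    return dispatch(Cases, Lo, Mid, Val, DefaultDest);     // left subtree
  if (Val > Cases[Mid].High)
    return dispatch(Cases, Mid + 1, Hi, Val, DefaultDest); // right subtree
  return Cases[Mid].Dest;               // leaf: Low <= Val <= High
}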
unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { unsigned numCmps = 0; @@ -394,10 +399,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { return numCmps; } -// processSwitchInst - Replace the specified switch instruction with a sequence -// of chained if-then insts in a balanced binary search. -// -void LowerSwitch::processSwitchInst(SwitchInst *SI) { +/// Replace the specified switch instruction with a sequence of chained if-then +/// insts in a balanced binary search. +void LowerSwitch::processSwitchInst(SwitchInst *SI, + SmallPtrSetImpl<BasicBlock*> &DeleteList) { BasicBlock *CurBlock = SI->getParent(); BasicBlock *OrigBlock = CurBlock; Function *F = CurBlock->getParent(); @@ -424,7 +429,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { std::vector<IntRange> UnreachableRanges; if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { - // Make the bounds tightly fitted around the case value range, becase we + // Make the bounds tightly fitted around the case value range, because we // know that the value passed to the switch must be exactly one of the case // values. assert(!Cases.empty()); @@ -495,7 +500,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); + F->getBasicBlockList().insert(Default->getIterator(), NewDefault); BranchInst::Create(Default, NewDefault); // If there is an entry in any PHI nodes for the default edge, make sure @@ -518,7 +523,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { BasicBlock *OldDefault = SI->getDefaultDest(); CurBlock->getInstList().erase(SI); - // If the Default block has no more predecessors just remove it. + // If the Default block has no more predecessors just add it to DeleteList. 
if (pred_begin(OldDefault) == pred_end(OldDefault)) - DeleteDeadBlock(OldDefault); + DeleteList.insert(OldDefault); } diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 00cf4e6..aa1e35d 100644 --- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -63,6 +63,9 @@ bool PromotePass::runOnFunction(Function &F) { BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + bool Changed = false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 395a46b..c999bd0 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -42,6 +42,24 @@ namespace { } }; + static const char *const metaNames[] = { + // See http://en.wikipedia.org/wiki/Metasyntactic_variable + "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", + "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" + }; + + struct Renamer { + Renamer(unsigned int seed) { + prng.srand(seed); + } + + const char *newName() { + return metaNames[prng.rand() % array_lengthof(metaNames)]; + } + + PRNG prng; + }; + struct MetaRenamer : public ModulePass { static char ID; // Pass identification, replacement for typeid MetaRenamer() : ModulePass(ID) { @@ -53,36 +71,26 @@ namespace { } bool runOnModule(Module &M) override { - static const char *const metaNames[] = { - // See http://en.wikipedia.org/wiki/Metasyntactic_variable - "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", - "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" - }; - // Seed our PRNG with simple additive sum of ModuleID. We're looking to // simply avoid always having the same function names, and we need to // remain deterministic. 
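// The seeding scheme spelled out (standalone sketch of the loop just
// below): a plain additive hash of the module identifier, deterministic
// across runs of the same module but varying between modules.
#include <string>

static unsigned seedFromModuleId(const std::string &Id) {
  unsigned Seed = 0;
  for (char C : Id)
    Seed += static_cast<unsigned char>(C);
  return Seed;
}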
unsigned int randSeed = 0; - for (std::string::const_iterator I = M.getModuleIdentifier().begin(), - E = M.getModuleIdentifier().end(); I != E; ++I) - randSeed += *I; + for (auto C : M.getModuleIdentifier()) + randSeed += C; - PRNG prng; - prng.srand(randSeed); + Renamer renamer(randSeed); // Rename all aliases - for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); - AI != AE; ++AI) { + for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) { StringRef Name = AI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; AI->setName("alias"); } - + // Rename all global variables - for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); - GI != GE; ++GI) { + for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) { StringRef Name = GI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; @@ -93,40 +101,37 @@ namespace { // Rename all struct types TypeFinder StructTypes; StructTypes.run(M, true); - for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { - StructType *STy = StructTypes[i]; + for (StructType *STy : StructTypes) { if (STy->isLiteral() || STy->getName().empty()) continue; SmallString<128> NameStorage; - STy->setName((Twine("struct.") + metaNames[prng.rand() % - array_lengthof(metaNames)]).toStringRef(NameStorage)); + STy->setName((Twine("struct.") + + renamer.newName()).toStringRef(NameStorage)); } // Rename all functions - for (Module::iterator FI = M.begin(), FE = M.end(); - FI != FE; ++FI) { - StringRef Name = FI->getName(); + for (auto &F : M) { + StringRef Name = F.getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; - FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); - runOnFunction(*FI); + F.setName(renamer.newName()); + runOnFunction(F); } return true; } bool runOnFunction(Function &F) { - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); - AI != AE; ++AI) + for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) if (!AI->getType()->isVoidTy()) AI->setName("arg"); - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - BB->setName("bb"); + for (auto &BB : F) { + BB.setName("bb"); - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (!I->getType()->isVoidTy()) - I->setName("tmp"); + for (auto &I : BB) + if (!I.getType()->isVoidTy()) + I.setName("tmp"); } return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d69a81e..9ec28a3 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array, } GVCtor->eraseFromParent(); } else { - // Use a simple two-field struct if there isn't one already. + // Use the new three-field struct if there isn't one already. EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), - nullptr); + IRB.getInt8PtrTy(), nullptr); } // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. 
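The ModuleUtils hunk above switches freshly created llvm.global_ctors entries from the two-field {priority, function} form to a three-field form with a trailing i8* data slot. A sketch of building that element type against the 3.8-era varargs StructType::get the patch itself uses; this mirrors the patched appendToGlobalArray but is not a drop-in copy of it:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// The element type of the new-style llvm.global_ctors entry:
// { i32 priority, void ()* ctor, i8* associated data }.
static StructType *ctorEltTy(LLVMContext &C) {
  FunctionType *FnTy = FunctionType::get(Type::getVoidTy(C), false);
  return StructType::get(Type::getInt32Ty(C),          // priority
                         PointerType::getUnqual(FnTy), // ctor function
                         Type::getInt8PtrTy(C),        // associated data
                         nullptr);                     // vararg terminator
}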
@@ -107,7 +107,8 @@ Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( Module &M, StringRef CtorName, StringRef InitName, - ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs) { + ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs, + StringRef VersionCheckName) { assert(!InitName.empty() && "Expected init function name"); assert(InitArgTypes.size() == InitArgTypes.size() && "Sanitizer's init function expects different number of arguments"); @@ -122,6 +123,13 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( AttributeSet())); InitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(InitFunction, InitArgs); + if (!VersionCheckName.empty()) { + Function *VersionCheckFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false), + AttributeSet())); + IRB.CreateCall(VersionCheckFunction, {}); + } return std::make_pair(Ctor, InitFunction); } diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index a87f850..c4f9b9f 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -205,10 +205,9 @@ public: // avoid gratuitus rescans. const BasicBlock *BB = I->getParent(); unsigned InstNo = 0; - for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E; - ++BBI) - if (isInterestingInstruction(BBI)) - InstNumbers[BBI] = InstNo++; + for (const Instruction &BBI : *BB) + if (isInterestingInstruction(&BBI)) + InstNumbers[&BBI] = InstNo++; It = InstNumbers.find(I); assert(It != InstNumbers.end() && "Didn't insert instruction?"); @@ -402,8 +401,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, // Record debuginfo for the store and remove the declaration's // debuginfo. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB); DDI->eraseFromParent(); LBI.deleteValue(DDI); @@ -425,14 +423,17 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, /// using the Alloca. /// /// If we cannot promote this alloca (because it is read before it is written), -/// return true. This is necessary in cases where, due to control flow, the -/// alloca is potentially undefined on some control flow paths. e.g. code like -/// this is potentially correct: -/// -/// for (...) { if (c) { A = undef; undef = B; } } -/// -/// ... so long as A is not used before undef is set. -static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, +/// return false. This is necessary in cases where, due to control flow, the +/// alloca is undefined only on some control flow paths. e.g. code like +/// this is correct in LLVM IR: +/// // A is an alloca with no stores so far +/// for (...) { +/// int t = *A; +/// if (!first_iteration) +/// use(t); +/// *A = 42; +/// } +static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, AliasSetTracker *AST) { // The trickiest case to handle is when we have large blocks. 
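promoteSingleBlockAlloca's new bool result exists because the single-block fast path can now bail out: when no store precedes a load, the load may only be folded to undef if the block contains no stores at all, since a later store could reach the load around a loop. The hunk continuing below implements this with a store list sorted by instruction index; a standalone model of that lookup, using plain std types and illustrative names:

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <utility>
#include <vector>

// (instruction index, stored value), kept sorted by index, mirroring
// the patch's StoresByIndex.
using StoreList = std::vector<std::pair<unsigned, int>>;

// Find what a load at LoadIdx would read. Mirrors the lower_bound plus
// "I == StoresByIndex.begin()" logic: no earlier store means we must
// bail out unless the block has no stores at all.
static bool valueAtLoad(const StoreList &Stores, unsigned LoadIdx, int &Out) {
  auto I = std::lower_bound(Stores.begin(), Stores.end(), LoadIdx,
                            [](const std::pair<unsigned, int> &S, unsigned Idx) {
                              return S.first < Idx;
                            });
  if (I == Stores.begin())
    return false; // load precedes every store: not safe to fold
  Out = std::prev(I)->second; // the closest store above the load wins
  return true;
}

int main() {
  StoreList S = {{5, 42}};
  int V = -1;
  std::printf("%d\n", valueAtLoad(S, 7, V)); // 1: load at index 7 sees 42
  std::printf("%d\n", valueAtLoad(S, 3, V)); // 0: store follows, bail out
}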
Because of this, @@ -467,10 +468,15 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)), less_first()); - - if (I == StoresByIndex.begin()) - // If there is no store before this load, the load takes the undef value. - LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + if (I == StoresByIndex.begin()) { + if (StoresByIndex.empty()) + // If there are no stores, the load takes the undef value. + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + else + // There is no store before this load, bail out (load may be affected + // by the following stores - see main comment). + return false; + } else // Otherwise, there was a store before this load, the load takes its value. LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0)); @@ -486,8 +492,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoreInst *SI = cast<StoreInst>(AI->user_back()); // Record debuginfo for the store before removing it. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, SI, DIB); } SI->eraseFromParent(); @@ -506,6 +511,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, } ++NumLocalPromoted; + return true; } void PromoteMem2Reg::run() { @@ -557,9 +563,8 @@ void PromoteMem2Reg::run() { // If the alloca is only read and written in one basic block, just perform a // linear sweep over the block to eliminate it. - if (Info.OnlyUsedInOneBlock) { - promoteSingleBlockAlloca(AI, Info, LBI, AST); - + if (Info.OnlyUsedInOneBlock && + promoteSingleBlockAlloca(AI, Info, LBI, AST)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -636,7 +641,7 @@ void PromoteMem2Reg::run() { // and inserting the phi nodes we marked as necessary // std::vector<RenamePassData> RenamePassWorkList; - RenamePassWorkList.emplace_back(F.begin(), nullptr, std::move(Values)); + RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values)); do { RenamePassData RPD; RPD.swap(RenamePassWorkList.back()); @@ -854,7 +859,7 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, // BasicBlock. PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB), Allocas[AllocaNo]->getName() + "." 
+ Twine(Version++), - BB->begin()); + &BB->front()); ++NumPHIInsert; PhiToAllocaMap[PN] = AllocaNo; @@ -919,7 +924,7 @@ NextIteration: return; for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) { - Instruction *I = II++; // get the instruction, increment iterator + Instruction *I = &*II++; // get the instruction, increment iterator if (LoadInst *LI = dyn_cast<LoadInst>(I)) { AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand()); diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 36781c1..3125a2c 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -14,11 +14,13 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -43,7 +45,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <map> @@ -73,6 +74,22 @@ static cl::opt<bool> HoistCondStores( "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt<bool> MergeCondStores( + "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores even if an unconditional store does not " + "precede - hoist multiple conditional stores into a single " + "predicated store")); + +static cl::opt<bool> MergeCondStoresAggressively( + "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false), + cl::desc("When merging conditional stores, do so even if the resultant " + "basic blocks are unlikely to be if-converted as a result")); + +static cl::opt<bool> SpeculateOneExpensiveInst( + "speculate-one-expensive-inst", cl::Hidden, cl::init(true), + cl::desc("Allow exactly one expensive instruction to be speculatively " + "executed")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -83,13 +100,13 @@ STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { // The first field contains the value that the switch produces when a certain - // case group is selected, and the second field is a vector containing the cases - // composing the case group. + // case group is selected, and the second field is a vector containing the + // cases composing the case group. typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2> SwitchCaseResultVectorTy; // The first field contains the phi node that generates a result of the switch - // and the second field contains the value generated for a certain case in the switch - // for that PHI. + // and the second field contains the value generated for a certain case in the + // switch for that PHI. 
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy; /// ValueEqualityComparisonCase - Represents a case of a switch. @@ -124,6 +141,9 @@ class SimplifyCFGOpt { bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); + bool SimplifySingleResume(ResumeInst *RI); + bool SimplifyCommonResume(ResumeInst *RI); + bool SimplifyCleanupReturn(CleanupReturnInst *RI); bool SimplifyUnreachable(UnreachableInst *UI); bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); @@ -226,6 +246,7 @@ static unsigned ComputeSpeculationCost(const User *I, "Instruction is not safe to speculatively execute!"); return TTI.getUserCost(I); } + /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -246,7 +267,8 @@ static unsigned ComputeSpeculationCost(const User *I, static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction*> *AggressiveInsts, unsigned &CostRemaining, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + unsigned Depth = 0) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -284,15 +306,24 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned Cost = ComputeSpeculationCost(I, TTI); - if (Cost > CostRemaining) + // Allow exactly one instruction to be speculated regardless of its cost + // (as long as it is safe to do so). + // This is intended to flatten the CFG even if the instruction is a division + // or other expensive operation. The speculation of an expensive instruction + // is expected to be undone in CodeGenPrepare if the speculation has not + // enabled further IR optimizations. + if (Cost > CostRemaining && + (!SpeculateOneExpensiveInst || !AggressiveInsts->empty() || Depth > 0)) return false; - CostRemaining -= Cost; + // Avoid unsigned wrap. + CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -970,8 +1001,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, at this point, we know which new successor Pred will get. Make // sure we update the number of entries in the PHI nodes for these // successors. - for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i) - AddPredecessorToBlock(NewSuccessors[i], Pred, BB); + for (BasicBlock *NewSuccessor : NewSuccessors) + AddPredecessorToBlock(NewSuccessor, Pred, BB); Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. 
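The DominatesMergePoint changes above add a recursion depth and a one-time exemption: with -speculate-one-expensive-inst, a single over-budget instruction is still admitted, but only at depth zero and only while nothing has been admitted yet, and the budget subtraction now saturates instead of wrapping. Distilled into a tiny model with illustrative names (Budget plays the role of CostRemaining):

#include <cstdio>

static bool admit(unsigned Cost, unsigned &Budget, unsigned Depth,
                  unsigned AdmittedSoFar, bool AllowOneExpensive) {
  if (Cost > Budget &&
      (!AllowOneExpensive || AdmittedSoFar != 0 || Depth > 0))
    return false; // over budget, and the one-expensive-inst credit is spent
  Budget = (Cost > Budget) ? 0 : Budget - Cost; // saturate, never wrap
  return true;
}

int main() {
  unsigned Budget = 4;
  std::printf("%d\n", admit(10, Budget, 0, 0, true)); // 1: the one freebie
  std::printf("%d\n", admit(10, Budget, 0, 1, true)); // 0: no second one
}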
@@ -984,8 +1015,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); - for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); + for (ValueEqualityComparisonCase &V : PredCases) + NewSI->addCase(V.Value, V.Dest); if (PredHasWeights || SuccHasWeights) { // Halve the weights if any of them cannot fit in an uint32_t @@ -1059,15 +1090,15 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, BasicBlock::iterator BB1_Itr = BB1->begin(); BasicBlock::iterator BB2_Itr = BB2->begin(); - Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++; + Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) || (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) @@ -1088,31 +1119,30 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, // For a normal instruction, we just move one to right before the branch, // then replace all uses of the other with the first. Finally, we remove // the now redundant second instruction. - BIParent->getInstList().splice(BI, BB1->getInstList(), I1); + BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1); if (!I2->use_empty()) I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull - }; + LLVMContext::MD_tbaa, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group, + LLVMContext::MD_align, LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(I1, I2, KnownIDs); I2->eraseFromParent(); Changed = true; - I1 = BB1_Itr++; - I2 = BB2_Itr++; + I1 = &*BB1_Itr++; + I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } } while (I1->isIdenticalToWhenDefined(I2)); @@ -1147,7 +1177,7 @@ HoistTerminator: // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); - BIParent->getInstList().insert(BI, NT); + BIParent->getInstList().insert(BI->getIterator(), NT); if (!NT->getType()->isVoidTy()) { I1->replaceAllUsesWith(NT); I2->replaceAllUsesWith(NT); @@ -1265,7 +1295,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // Cannot move control-flow-involving, volatile loads, vaarg, etc. 
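For context on the HoistThenElseCodeToIf hunks above: the loop walks both successors in lockstep, splices one copy of each identical leading instruction up into the parent block, and redirects all uses of the duplicate to it. A deliberately simplified source-level model of the effect (its counterpart, SinkThenElseCodeToEnd, continues with its safety checks just below):

#include <cstdio>

static int g(int X) { return X + 1; } // stands in for the identical leading inst

// Before: both arms begin with the same computation on the same operands.
static int beforeHoist(bool C, int X) { return C ? g(X) * 2 : g(X) * 3; }

// After: one copy is hoisted above the branch, the duplicate deleted.
static int afterHoist(bool C, int X) {
  int T = g(X); // hoisted once
  return C ? T * 2 : T * 3;
}

int main() { std::printf("%d %d\n", beforeHoist(true, 1), afterHoist(true, 1)); }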
if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || - isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) || + I1->isEHPad() || I2->isEHPad() || isa<AllocaInst>(I1) || isa<AllocaInst>(I2) || I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || @@ -1324,7 +1354,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { if (!NewPN) { NewPN = PHINode::Create(DifferentOp1->getType(), 2, - DifferentOp1->getName() + ".sink", BBEnd->begin()); + DifferentOp1->getName() + ".sink", &BBEnd->front()); NewPN->addIncoming(DifferentOp1, BB1); NewPN->addIncoming(DifferentOp2, BB2); DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); @@ -1339,7 +1369,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // instruction in the basic block down. bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); // Sink the instruction. - BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1); + BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(), + BB1->getInstList(), I1); if (!OldPN->use_empty()) OldPN->replaceAllUsesWith(I1); OldPN->eraseFromParent(); @@ -1355,7 +1386,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { RE1 = BB1->getInstList().rend(); if (UpdateRE2) RE2 = BB2->getInstList().rend(); - FirstNonPhiInBBEnd = I1; + FirstNonPhiInBBEnd = &*I1; NumSinkCommons++; Changed = true; } @@ -1491,7 +1522,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, for (BasicBlock::iterator BBI = ThenBB->begin(), BBE = std::prev(ThenBB->end()); BBI != BBE; ++BBI) { - Instruction *I = BBI; + Instruction *I = &*BBI; // Skip debug info. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -1604,9 +1635,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SpeculatedStore->setOperand(0, S); } + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + for (auto &I: *ThenBB) + I.dropUnknownNonDebugMetadata(); + // Hoist the instructions. - BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), - std::prev(ThenBB->end())); + BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), + ThenBB->begin(), std::prev(ThenBB->end())); // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); @@ -1747,13 +1783,13 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { // Check for trivial simplification. if (Value *V = SimplifyInstruction(N, DL)) { - TranslateMap[BBI] = V; + TranslateMap[&*BBI] = V; delete N; // Instruction folded away, don't need actual inst } else { // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); if (!BBI->use_empty()) - TranslateMap[BBI] = N; + TranslateMap[&*BBI] = N; } } @@ -1850,7 +1886,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock1); for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. 
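SinkThenElseCodeToEnd, touched above, is the mirror image of hoisting: identical trailing instructions from both arms are sunk into the join block, and any operand they disagree on is merged through a new PHI (named "*.sink" in the patch). A simplified model of the result, with the ternary playing the role of that PHI:

#include <cstdio>

static int f(int X) { return X * 2; } // the identical trailing instruction

static int joinBlock(bool CameFromBB1, int Op1, int Op2) {
  int Sunk = CameFromBB1 ? Op1 : Op2; // PHI for the differing operand
  return f(Sunk);                     // single sunk instruction
}

int main() { std::printf("%d\n", joinBlock(true, 3, 4)); } // 6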
@@ -1863,7 +1899,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock2); for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. @@ -1882,13 +1918,13 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. if (IfBlock1) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock1->getInstList(), IfBlock1->begin(), - IfBlock1->getTerminator()); + IfBlock1->getTerminator()->getIterator()); if (IfBlock2) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock2->getInstList(), IfBlock2->begin(), - IfBlock2->getTerminator()); + IfBlock2->getTerminator()->getIterator()); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { // Change the PHI node into a select instruction. @@ -2057,7 +2093,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { BI->getSuccessor(0) == PBI->getSuccessor(1))) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Curr = I++; + Instruction *Curr = &*I++; if (isa<CmpInst>(Curr)) { Cond = Curr; break; @@ -2077,7 +2113,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; // Make sure the instruction after the condition is the cond branch. - BasicBlock::iterator CondIt = Cond; ++CondIt; + BasicBlock::iterator CondIt = ++Cond->getIterator(); // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; @@ -2095,7 +2131,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(I)) continue; - if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) + if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) return false; // I has only one use and can be executed unconditionally. Instruction *User = dyn_cast<Instruction>(I->user_back()); @@ -2192,17 +2228,17 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *NewBonusInst = BonusInst->clone(); RemapInstruction(NewBonusInst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - VMap[BonusInst] = NewBonusInst; + VMap[&*BonusInst] = NewBonusInst; // If we moved a load, we cannot any longer claim any knowledge about // its potential value. The previous information might have been valid // only given the branch precondition. // For an analogous reason, we must also drop all the metadata whose // semantics we don't understand. 
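The replacement just below swaps dropUnknownMetadata(LLVMContext::MD_dbg) for dropUnknownNonDebugMetadata(): a cloned bonus instruction executes under a different condition than the original, so metadata the pass cannot re-justify (tbaa, range, nonnull, ...) must be stripped while debug info is kept. A sketch of that clone-then-strip step, assuming a 3.8-era LLVM; the helper name is illustrative and the real code additionally remaps operands through a ValueToValueMapTy:

#include "llvm/IR/Instruction.h"
using namespace llvm;

static Instruction *cloneForNewPredicate(Instruction *I,
                                         Instruction *InsertBefore) {
  Instruction *NewI = I->clone();
  NewI->insertBefore(InsertBefore);
  NewI->dropUnknownNonDebugMetadata(); // as in the patched code below
  return NewI;
}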
- NewBonusInst->dropUnknownMetadata(LLVMContext::MD_dbg); + NewBonusInst->dropUnknownNonDebugMetadata(); - PredBlock->getInstList().insert(PBI, NewBonusInst); - NewBonusInst->takeName(BonusInst); + PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); + NewBonusInst->takeName(&*BonusInst); BonusInst->setName(BonusInst->getName() + ".old"); } @@ -2211,7 +2247,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *New = Cond->clone(); RemapInstruction(New, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - PredBlock->getInstList().insert(PBI, New); + PredBlock->getInstList().insert(PBI->getIterator(), New); New->takeName(Cond); Cond->setName(New->getName() + ".old"); @@ -2332,11 +2368,297 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; } +// If there is only one store in BB1 and BB2, return it, otherwise return +// nullptr. +static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { + StoreInst *S = nullptr; + for (auto *BB : {BB1, BB2}) { + if (!BB) + continue; + for (auto &I : *BB) + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (S) + // Multiple stores seen. + return nullptr; + else + S = SI; + } + } + return S; +} + +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, + Value *AlternativeV = nullptr) { + // PHI is going to be a PHI node that allows the value V that is defined in + // BB to be referenced in BB's only successor. + // + // If AlternativeV is nullptr, the only value we care about in PHI is V. It + // doesn't matter to us what the other operand is (it'll never get used). We + // could just create a new PHI with an undef incoming value, but that could + // increase register pressure if EarlyCSE/InstCombine can't fold it with some + // other PHI. So here we directly look for some PHI in BB's successor with V + // as an incoming operand. If we find one, we use it, else we create a new + // one. + // + // If AlternativeV is not nullptr, we care about both incoming values in PHI. + // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV] + // where OtherBB is the single other predecessor of BB's only successor. + PHINode *PHI = nullptr; + BasicBlock *Succ = BB->getSingleSuccessor(); + + for (auto I = Succ->begin(); isa<PHINode>(I); ++I) + if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) { + PHI = cast<PHINode>(I); + if (!AlternativeV) + break; + + assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2); + auto PredI = pred_begin(Succ); + BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; + if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) + break; + PHI = nullptr; + } + if (PHI) + return PHI; + + // If V is not an instruction defined in BB, just return it. + if (!AlternativeV && + (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB)) + return V; + + PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); + PHI->addIncoming(V, BB); + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB) + PHI->addIncoming(AlternativeV ? 
AlternativeV : UndefValue::get(V->getType()), + PredBB); + return PHI; +} + +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, + BasicBlock *QTB, BasicBlock *QFB, + BasicBlock *PostBB, Value *Address, + bool InvertPCond, bool InvertQCond) { + auto IsaBitcastOfPointerType = [](const Instruction &I) { + return Operator::getOpcode(&I) == Instruction::BitCast && + I.getType()->isPointerTy(); + }; + + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + unsigned N = 0; + for (auto &I : *BB) { + // Cheap instructions viable for folding. + if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || + isa<StoreInst>(I)) + ++N; + // Free instructions. + else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + IsaBitcastOfPointerType(I)) + continue; + else + return false; + } + return N <= PHINodeFoldingThreshold; + }; + + if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) || + !IsWorthwhile(PFB) || + !IsWorthwhile(QTB) || + !IsWorthwhile(QFB))) + return false; + + // For every pointer, there must be exactly two stores, one coming from + // PTB or PFB, and the other from QTB or QFB. We don't support more than one + // store (to any address) in PTB,PFB or QTB,QFB. + // FIXME: We could relax this restriction with a bit more work and performance + // testing. + StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); + StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); + if (!PStore || !QStore) + return false; + + // Now check the stores are compatible. + if (!QStore->isUnordered() || !PStore->isUnordered()) + return false; + + // Check that sinking the store won't cause program behavior changes. Sinking + // the store out of the Q blocks won't change any behavior as we're sinking + // from a block to its unconditional successor. But we're moving a store from + // the P blocks down through the middle block (QBI) and past both QFB and QTB. + // So we need to check that there are no aliasing loads or stores in + // QBI, QTB and QFB. We also need to check there are no conflicting memory + // operations between PStore and the end of its parent block. + // + // The ideal way to do this is to query AliasAnalysis, but we don't + // preserve AA currently so that is dangerous. Be super safe and just + // check there are no other memory operations at all. + for (auto &I : *QFB->getSinglePredecessor()) + if (I.mayReadOrWriteMemory()) + return false; + for (auto &I : *QFB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + if (QTB) + for (auto &I : *QTB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); + I != E; ++I) + if (&*I != PStore && I->mayReadOrWriteMemory()) + return false; + + // OK, we're going to sink the stores to PostBB. The store has to be + // conditional though, so first create the predicate. 
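Once the checks above pass, the code below predicates one merged store: it ORs the two (possibly inverted) branch conditions, splits PostBB with SplitBlockAndInsertIfThen, and stores the PHI-selected value. The observable semantics, reduced to scalars (an illustrative model, not the pass itself):

#include <cstdio>

// Both conditional stores to Addr collapse into one store guarded by
// (P || Q). Q's store executes later in the original program, so its
// value wins when both conditions fire.
static void mergedStore(bool P, bool Q, int PVal, int QVal, int *Addr) {
  if (P || Q)
    *Addr = Q ? QVal : PVal;
}

int main() {
  int X = 0;
  mergedStore(/*P=*/true, /*Q=*/true, 1, 2, &X);
  std::printf("%d\n", X); // 2: the later (Q) store wins
}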
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + + Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), + PStore->getParent()); + Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), + QStore->getParent(), PPHI); + + IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); + + Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); + Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + + if (InvertPCond) + PPred = QB.CreateNot(PPred); + if (InvertQCond) + QPred = QB.CreateNot(QPred); + Value *CombinedPred = QB.CreateOr(PPred, QPred); + + auto *T = + SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(), false); + QB.SetInsertPoint(T); + StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address)); + AAMDNodes AAMD; + PStore->getAAMetadata(AAMD, /*Merge=*/false); + PStore->getAAMetadata(AAMD, /*Merge=*/true); + SI->setAAMetadata(AAMD); + + QStore->eraseFromParent(); + PStore->eraseFromParent(); + + return true; +} + +static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { + // The intention here is to find diamonds or triangles (see below) where each + // conditional block contains a store to the same address. Both of these + // stores are conditional, so they can't be unconditionally sunk. But it may + // be profitable to speculatively sink the stores into one merged store at the + // end, and predicate the merged store on the union of the two conditions of + // PBI and QBI. + // + // This can reduce the number of stores executed if both of the conditions are + // true, and can allow the blocks to become small enough to be if-converted. + // This optimization will also chain, so that ladders of test-and-set + // sequences can be if-converted away. + // + // We only deal with simple diamonds or triangles: + // + // PBI or PBI or a combination of the two + // / \ | \ + // PTB PFB | PFB + // \ / | / + // QBI QBI + // / \ | \ + // QTB QFB | QFB + // \ / | / + // PostBB PostBB + // + // We model triangles as a type of diamond with a nullptr "true" block. + // Triangles are canonicalized so that the fallthrough edge is represented by + // a true condition, as in the diagram above. + // + BasicBlock *PTB = PBI->getSuccessor(0); + BasicBlock *PFB = PBI->getSuccessor(1); + BasicBlock *QTB = QBI->getSuccessor(0); + BasicBlock *QFB = QBI->getSuccessor(1); + BasicBlock *PostBB = QFB->getSingleSuccessor(); + + bool InvertPCond = false, InvertQCond = false; + // Canonicalize fallthroughs to the true branches. + if (PFB == QBI->getParent()) { + std::swap(PFB, PTB); + InvertPCond = true; + } + if (QFB == PostBB) { + std::swap(QFB, QTB); + InvertQCond = true; + } + + // From this point on we can assume PTB or QTB may be fallthroughs but PFB + // and QFB may not. Model fallthroughs as a nullptr block. + if (PTB == QBI->getParent()) + PTB = nullptr; + if (QTB == PostBB) + QTB = nullptr; + + // Legality bailouts. We must have at least the non-fallthrough blocks and + // the post-dominating block, and the non-fallthroughs must only have one + // predecessor. 
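The canonicalization above keeps the rest of mergeConditionalStores simple: triangles are modeled as diamonds with a null true block, and whenever a fallthrough sits on the false edge the successors are swapped and the matching Invert flag is set so the store predicate can be negated later. In miniature (standalone model, illustrative names):

#include <cstdio>
#include <utility>

struct Diamond {
  int TB, FB;  // successor block ids; 0 models the fallthrough edge
  bool Invert; // must the branch condition be negated later?
};

static void canonicalize(Diamond &D) {
  if (D.FB == 0) { // fallthrough on the false edge: swap and invert
    std::swap(D.TB, D.FB);
    D.Invert = true;
  }
}

int main() {
  Diamond D{7, 0, false};
  canonicalize(D);
  std::printf("%d %d %d\n", D.TB, D.FB, D.Invert); // 0 7 1
}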
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { + return BB->getSinglePredecessor() == P && + BB->getSingleSuccessor() == S; + }; + if (!PostBB || + !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || + !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) + return false; + if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || + (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB))) + return false; + if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2) + return false; + + // OK, this is a sequence of two diamonds or triangles. + // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. + SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses; + for (auto *BB : {PTB, PFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + PStoreAddresses.insert(SI->getPointerOperand()); + } + for (auto *BB : {QTB, QFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + QStoreAddresses.insert(SI->getPointerOperand()); + } + + set_intersect(PStoreAddresses, QStoreAddresses); + // set_intersect mutates PStoreAddresses in place. Rename it here to make it + // clear what it contains. + auto &CommonAddresses = PStoreAddresses; + + bool Changed = false; + for (auto *Address : CommonAddresses) + Changed |= mergeConditionalStoreToAddress( + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond); + return Changed; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. -static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { +static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, + const DataLayout &DL) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -2360,10 +2682,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // simplifycfg will thread the block. if (BlockIsSimpleEnoughToThreadThrough(BB)) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); - PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()), - std::distance(PB, PE), - BI->getCondition()->getName() + ".pr", - BB->begin()); + PHINode *NewPN = PHINode::Create( + Type::getInt1Ty(BB->getContext()), std::distance(PB, PE), + BI->getCondition()->getName() + ".pr", &BB->front()); // Okay, we're going to insert the PHI node. Since PBI is not the only // predecessor, compute the PHI'd conditional value for all of the preds. // Any predecessor where the condition is not computable we keep symbolic. @@ -2386,6 +2707,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { } } + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) + if (CE->canTrap()) + return false; + + // If BI is reached from the true path of PBI and PBI's condition implies + // BI's condition, we know the direction of the BI branch. + if (PBI->getSuccessor(0) == BI->getParent() && + isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) && + PBI->getSuccessor(0) != PBI->getSuccessor(1) && + BB->getSinglePredecessor()) { + // Turn this into a branch on constant. 
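The branch-on-constant rewrite continues just below: when BI is only reachable over PBI's true edge and isImpliedCondition proves PBI's condition implies BI's, BI's condition is replaced with true and the old condition deleted as dead. The effect in source terms, using (x > 10) implies (x > 5) as the example:

#include <cstdio>

static int nested(int X) {
  if (X > 10) {
    if (X > 5) // implied: becomes br true, the else path is statically dead
      return 1;
    return 0;
  }
  return -1;
}

int main() { std::printf("%d\n", nested(42)); } // 1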
+ auto *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::getTrue(BB->getContext())); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + return true; // Nuke the branch on constant. + } + + // If both branches are conditional and both contain stores to the same + // address, remove the stores from the conditionals and create a conditional + // merged store at the end. + if (MergeCondStores && mergeConditionalStores(PBI, BI)) + return true; + // If this is a conditional branch in an empty block, and if any // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -2396,11 +2740,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { if (&*BBI != BI) return false; - - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition())) - if (CE->canTrap()) - return false; - int PBIOp, BIOp; if (PBI->getSuccessor(0) == BI->getSuccessor(0)) PBIOp = BIOp = 0; @@ -2565,15 +2904,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; // Then remove the rest. - for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { - BasicBlock *Succ = OldTerm->getSuccessor(I); + for (BasicBlock *Succ : OldTerm->successors()) { // Make sure only to keep exactly one copy of each edge. if (Succ == KeepEdge1) KeepEdge1 = nullptr; else if (Succ == KeepEdge2) KeepEdge2 = nullptr; else - Succ->removePredecessor(OldTerm->getParent()); + Succ->removePredecessor(OldTerm->getParent(), + /*DontDeleteUselessPHIs=*/true); } IRBuilder<> Builder(OldTerm); @@ -2827,7 +3166,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, Values.erase(std::unique(Values.begin(), Values.end()), Values.end()); // If Extra was used, we require at least two switch values to do the - // transformation. A switch with one value is just an cond branch. + // transformation. A switch with one value is just a conditional branch. if (ExtraCase && Values.size() < 2) return false; // TODO: Preserve branch weight metadata, similarly to how @@ -2847,7 +3186,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { - BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); + BasicBlock *NewBB = + BB->splitBasicBlock(BI->getIterator(), "switch.early.test"); // Remove the uncond branch added to the old block. TerminatorInst *OldTI = BB->getTerminator(); Builder.SetInsertPoint(OldTI); @@ -2901,47 +3241,233 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, } bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { - // If this is a trivial landing pad that just continues unwinding the caught - // exception then zap the landing pad, turning its invokes into calls. + if (isa<PHINode>(RI->getValue())) + return SimplifyCommonResume(RI); + else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) && + RI->getValue() == RI->getParent()->getFirstNonPHI()) + // The resume must unwind the exception that caused control to branch here. + return SimplifySingleResume(RI); + + return false; +} + +// Simplify resume that is shared by several landing pads (phi of landing pad). 
+bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { + BasicBlock *BB = RI->getParent(); + + // Check that there are no other instructions except for debug intrinsics + // between the phi of landing pads (RI->getValue()) and resume instruction. + BasicBlock::iterator I = cast<Instruction>(RI->getValue())->getIterator(), + E = RI->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + SmallSet<BasicBlock *, 4> TrivialUnwindBlocks; + auto *PhiLPInst = cast<PHINode>(RI->getValue()); + + // Check incoming blocks to see if any of them are trivial. + for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); + Idx != End; Idx++) { + auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx); + auto *IncomingValue = PhiLPInst->getIncomingValue(Idx); + + // If the block has other successors, we can not delete it because + // it has other dependents. + if (IncomingBB->getUniqueSuccessor() != BB) + continue; + + auto *LandingPad = + dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI()); + // Not the landing pad that caused the control to branch here. + if (IncomingValue != LandingPad) + continue; + + bool isTrivial = true; + + I = IncomingBB->getFirstNonPHI()->getIterator(); + E = IncomingBB->getTerminator()->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) { + isTrivial = false; + break; + } + + if (isTrivial) + TrivialUnwindBlocks.insert(IncomingBB); + } + + // If no trivial unwind blocks, don't do any simplifications. + if (TrivialUnwindBlocks.empty()) return false; + + // Turn all invokes that unwind here into calls. + for (auto *TrivialBB : TrivialUnwindBlocks) { + // Blocks that will be simplified should be removed from the phi node. + // Note there could be multiple edges to the resume block, and we need + // to remove them all. + while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1) + BB->removePredecessor(TrivialBB, true); + + for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB); + PI != PE;) { + BasicBlock *Pred = *PI++; + removeUnwindEdge(Pred); + } + + // In each SimplifyCFG run, only the current processed block can be erased. + // Otherwise, it will break the iteration of SimplifyCFG pass. So instead + // of erasing TrivialBB, we only remove the branch to the common resume + // block so that we can later erase the resume block since it has no + // predecessors. + TrivialBB->getTerminator()->eraseFromParent(); + new UnreachableInst(RI->getContext(), TrivialBB); + } + + // Delete the resume block if all its predecessors have been removed. + if (pred_empty(BB)) + BB->eraseFromParent(); + + return !TrivialUnwindBlocks.empty(); +} + +// Simplify resume that is only used by a single (non-phi) landing pad. +bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); - if (RI->getValue() != LPInst) - // Not a landing pad, or the resume is not unwinding the exception that - // caused control to branch here. - return false; + assert (RI->getValue() == LPInst && + "Resume must unwind the exception that caused control to here"); // Check that there are no other instructions except for debug intrinsics. - BasicBlock::iterator I = LPInst, E = RI; + BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator(); while (++I != E) if (!isa<DbgInfoIntrinsic>(I)) return false; // Turn all invokes that unwind here into calls and delete the basic block. 
for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { - InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); - Call->takeName(II); - Call->setCallingConv(II->getCallingConv()); - Call->setAttributes(II->getAttributes()); - Call->setDebugLoc(II->getDebugLoc()); + BasicBlock *Pred = *PI++; + removeUnwindEdge(Pred); + } - // Anything that used the value produced by the invoke instruction now uses - // the value produced by the call instruction. Note that we do this even - // for void functions and calls with no uses so that the callgraph edge is - // updated. - II->replaceAllUsesWith(Call); - BB->removePredecessor(II->getParent()); + // The landingpad is now unreachable. Zap it. + BB->eraseFromParent(); + return true; +} - // Insert a branch to the normal destination right before the invoke. - BranchInst::Create(II->getNormalDest(), II); +bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { + // If this is a trivial cleanup pad that executes no instructions, it can be + // eliminated. If the cleanup pad continues to the caller, any predecessor + // that is an EH pad will be updated to continue to the caller and any + // predecessor that terminates with an invoke instruction will have its invoke + // instruction converted to a call instruction. If the cleanup pad being + // simplified does not continue to the caller, each predecessor will be + // updated to continue to the unwind destination of the cleanup pad being + // simplified. + BasicBlock *BB = RI->getParent(); + CleanupPadInst *CPInst = RI->getCleanupPad(); + if (CPInst->getParent() != BB) + // This isn't an empty cleanup. + return false; - // Finally, delete the invoke instruction! - II->eraseFromParent(); + // Check that there are no other instructions except for debug intrinsics. + BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + // If the cleanup return we are simplifying unwinds to the caller, this will + // set UnwindDest to nullptr. + BasicBlock *UnwindDest = RI->getUnwindDest(); + Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr; + + // We're about to remove BB from the control flow. Before we do, sink any + // PHINodes into the unwind destination. Doing this before changing the + // control flow avoids some potentially slow checks, since we can currently + // be certain that UnwindDest and BB have no common predecessors (since they + // are both EH pads). + if (UnwindDest) { + // First, go through the PHI nodes in UnwindDest and update any nodes that + // reference the block we are removing + for (BasicBlock::iterator I = UnwindDest->begin(), + IE = DestEHPad->getIterator(); + I != IE; ++I) { + PHINode *DestPN = cast<PHINode>(I); + + int Idx = DestPN->getBasicBlockIndex(BB); + // Since BB unwinds to UnwindDest, it has to be in the PHI node. + assert(Idx != -1); + // This PHI node has an incoming value that corresponds to a control + // path through the cleanup pad we are removing. If the incoming + // value is in the cleanup pad, it must be a PHINode (because we + // verified above that the block is otherwise empty). Otherwise, the + // value is either a constant or a value that dominates the cleanup + // pad being removed. 
+ // + // Because BB and UnwindDest are both EH pads, all of their + // predecessors must unwind to these blocks, and since no instruction + // can have multiple unwind destinations, there will be no overlap in + // incoming blocks between SrcPN and DestPN. + Value *SrcVal = DestPN->getIncomingValue(Idx); + PHINode *SrcPN = dyn_cast<PHINode>(SrcVal); + + // Remove the entry for the block we are deleting. + DestPN->removeIncomingValue(Idx, false); + + if (SrcPN && SrcPN->getParent() == BB) { + // If the incoming value was a PHI node in the cleanup pad we are + // removing, we need to merge that PHI node's incoming values into + // DestPN. + for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); + SrcIdx != SrcE; ++SrcIdx) { + DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), + SrcPN->getIncomingBlock(SrcIdx)); + } + } else { + // Otherwise, the incoming value came from above BB and + // so we can just reuse it. We must associate all of BB's + // predecessors with this value. + for (auto *pred : predecessors(BB)) { + DestPN->addIncoming(SrcVal, pred); + } + } + } + + // Sink any remaining PHI nodes directly into UnwindDest. + Instruction *InsertPt = DestEHPad; + for (BasicBlock::iterator I = BB->begin(), + IE = BB->getFirstNonPHI()->getIterator(); + I != IE;) { + // The iterator must be incremented here because the instructions are + // being moved to another block. + PHINode *PN = cast<PHINode>(I++); + if (PN->use_empty()) + // If the PHI node has no uses, just leave it. It will be erased + // when we erase BB below. + continue; + + // Otherwise, sink this PHI node into UnwindDest. + // Any predecessors to UnwindDest which are not already represented + // must be back edges which inherit the value from the path through + // BB. In this case, the PHI value must reference itself. + for (auto *pred : predecessors(UnwindDest)) + if (pred != BB) + PN->addIncoming(PN, pred); + PN->moveBefore(InsertPt); + } } - // The landingpad is now unreachable. Zap it. + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { + // The iterator must be updated here because we are removing this pred. + BasicBlock *PredBB = *PI++; + if (UnwindDest == nullptr) { + removeUnwindEdge(PredBB); + } else { + TerminatorInst *TI = PredBB->getTerminator(); + TI->replaceUsesOfWith(BB, UnwindDest); + } + } + + // The cleanup pad is now unreachable. Zap it. BB->eraseFromParent(); return true; } @@ -3003,8 +3529,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // If there are any instructions immediately before the unreachable that can // be removed, do so. 
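The sweep that follows walks backwards from the unreachable, deleting instructions whose effects can no longer be observed; calls and volatile accesses stop the scan, while fences and non-volatile atomics may still be dropped (and, per the new code, a catchpad also stops it unless the personality is CoreCLR). A simplified model of the loop:

#include <cstdio>
#include <vector>

enum Kind { Arithmetic, Fence, NonVolatileAtomic, VolatileAccess, Call };

static void pruneBeforeUnreachable(std::vector<Kind> &Block) {
  while (!Block.empty()) {
    Kind K = Block.back();
    if (K == Call || K == VolatileAccess)
      break; // observable effects keep everything above alive
    Block.pop_back(); // result unused and the end is unreachable: delete
  }
}

int main() {
  std::vector<Kind> BB{Call, Arithmetic, Fence, NonVolatileAtomic};
  pruneBeforeUnreachable(BB);
  std::printf("%zu\n", BB.size()); // 1: only the call survives
}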
- while (UI != BB->begin()) { - BasicBlock::iterator BBI = UI; + while (UI->getIterator() != BB->begin()) { + BasicBlock::iterator BBI = UI->getIterator(); --BBI; // Do not delete instructions that can have side effects which might cause // the unreachable to not be reachable; specifically, calls and volatile @@ -3012,18 +3538,26 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break; if (BBI->mayHaveSideEffects()) { - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + if (auto *SI = dyn_cast<StoreInst>(BBI)) { if (SI->isVolatile()) break; - } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { + } else if (auto *LI = dyn_cast<LoadInst>(BBI)) { if (LI->isVolatile()) break; - } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) { + } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) { if (RMWI->isVolatile()) break; - } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) { + } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) { if (CXI->isVolatile()) break; + } else if (isa<CatchPadInst>(BBI)) { + // A catchpad may invoke exception object constructors and such, which + // in some languages can be arbitrary code, so be conservative by + // default. + // For CoreCLR, it just involves a type test, so can be removed. + if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) != + EHPersonality::CoreCLR) + break; } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) && !isa<LandingPadInst>(BBI)) { break; @@ -3049,7 +3583,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { for (unsigned i = 0, e = Preds.size(); i != e; ++i) { TerminatorInst *TI = Preds[i]->getTerminator(); IRBuilder<> Builder(TI); - if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (auto *BI = dyn_cast<BranchInst>(TI)) { if (BI->isUnconditional()) { if (BI->getSuccessor(0) == BB) { new UnreachableInst(TI->getContext(), TI); @@ -3066,7 +3600,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { Changed = true; } } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + } else if (auto *SI = dyn_cast<SwitchInst>(TI)) { for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) if (i.getCaseSuccessor() == BB) { @@ -3075,25 +3609,48 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { --i; --e; Changed = true; } - } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { + } else if (auto *II = dyn_cast<InvokeInst>(TI)) { if (II->getUnwindDest() == BB) { - // Convert the invoke to a call instruction. This would be a good - // place to note that the call does not throw though. - BranchInst *BI = Builder.CreateBr(II->getNormalDest()); - II->removeFromParent(); // Take out of symbol table - - // Insert the call now... - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3); - Builder.SetInsertPoint(BI); - CallInst *CI = Builder.CreateCall(II->getCalledValue(), - Args, II->getName()); - CI->setCallingConv(II->getCallingConv()); - CI->setAttributes(II->getAttributes()); - // If the invoke produced a value, the call does now instead. 
- II->replaceAllUsesWith(CI); - delete II; + removeUnwindEdge(TI->getParent()); Changed = true; } + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) { + if (CSI->getUnwindDest() == BB) { + removeUnwindEdge(TI->getParent()); + Changed = true; + continue; + } + + for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(), + E = CSI->handler_end(); + I != E; ++I) { + if (*I == BB) { + CSI->removeHandler(I); + --I; + --E; + Changed = true; + } + } + if (CSI->getNumHandlers() == 0) { + BasicBlock *CatchSwitchBB = CSI->getParent(); + if (CSI->hasUnwindDest()) { + // Redirect preds to the unwind dest + CatchSwitchBB->replaceAllUsesWith(CSI->getUnwindDest()); + } else { + // Rewrite all preds to unwind to caller (or from invoke to call). + SmallVector<BasicBlock *, 8> EHPreds(predecessors(CatchSwitchBB)); + for (BasicBlock *EHPred : EHPreds) + removeUnwindEdge(EHPred); + } + // The catchswitch is no longer reachable. + new UnreachableInst(CSI->getContext(), CSI); + CSI->eraseFromParent(); + Changed = true; + } + } else if (isa<CleanupReturnInst>(TI)) { + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; } } @@ -3249,6 +3806,29 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, } } + // If we can prove that the cases must cover all possible values, the + // default destination becomes dead and we can remove it. If we know some + // of the bits in the value, we can use that to more precisely compute the + // number of possible unique case values. + bool HasDefault = + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + const unsigned NumUnknownBits = Bits - + (KnownZero.Or(KnownOne)).countPopulation(); + assert(NumUnknownBits <= Bits); + if (HasDefault && DeadCases.empty() && + NumUnknownBits < 64 /* avoid overflow */ && + SI->getNumCases() == (1ULL << NumUnknownBits)) { + DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(), + SI->getParent(), ""); + SI->setDefaultDest(&*NewDefault); + SplitBlock(&*NewDefault, &NewDefault->front()); + auto *OldTI = NewDefault->getTerminator(); + new UnreachableInst(SI->getContext(), OldTI); + EraseTerminatorInstAndDCECond(OldTI); + return true; + } + SmallVector<uint64_t, 8> Weights; bool HasWeight = HasBranchWeights(SI); if (HasWeight) { @@ -3439,7 +4019,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, } else if (isa<DbgInfoIntrinsic>(I)) { // Skip debug intrinsic. continue; - } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) { + } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) { // Instruction is side-effect free and constant. // If the instruction has uses outside this block or a phi node slot for @@ -3456,7 +4036,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, return false; } - ConstantPool.insert(std::make_pair(I, C)); + ConstantPool.insert(std::make_pair(&*I, C)); } else { break; } @@ -3664,7 +4244,7 @@ namespace { /// Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. 
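The EliminateDeadSwitchCases hunk above uses known bits to kill dead defaults: if KnownZero/KnownOne pin all but N bits of the switch operand, only 2^N distinct values are feasible, and a switch listing exactly 2^N cases can never take its default edge. The arithmetic as a standalone check (illustrative names; assumes KnownZero/KnownOne are confined to the low Bits):

#include <bitset>
#include <cstdint>
#include <cstdio>

static bool defaultIsDead(uint64_t KnownZero, uint64_t KnownOne,
                          unsigned Bits, uint64_t NumCases) {
  unsigned Known = std::bitset<64>(KnownZero | KnownOne).count();
  unsigned NumUnknownBits = Bits - Known;
  return NumUnknownBits < 64 /* avoid shift overflow */ &&
         NumCases == (1ULL << NumUnknownBits);
}

int main() {
  // An i8 whose top six bits are known zero has four possible values.
  std::printf("%d\n", defaultIsDead(0xFC, 0x00, /*Bits=*/8, /*NumCases=*/4));
}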
static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, - const Type *ElementType); + Type *ElementType); private: // Depending on the contents of the table, it can be represented in @@ -3880,8 +4460,8 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, - const Type *ElementType) { - const IntegerType *IT = dyn_cast<IntegerType>(ElementType); + Type *ElementType) { + auto *IT = dyn_cast<IntegerType>(ElementType); if (!IT) return false; // FIXME: If the type is wider than it needs to be, e.g. i8 but all values @@ -3992,7 +4572,7 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); } - + // Check if the branch instruction dominates the phi node. It's a simple // dominance check, but sufficient for our needs. // Although this check is invariant in the calling loops, it's better to do it @@ -4422,7 +5002,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; // If the Terminator is the only non-phi instruction, simplify the block. - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -4457,6 +5037,16 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return false; } +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { + BasicBlock *PredPred = nullptr; + for (auto *P : predecessors(BB)) { + BasicBlock *PPred = P->getSinglePredecessor(); + if (!PPred || (PredPred && PredPred != PPred)) + return nullptr; + PredPred = PPred; + } + return PredPred; +} bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -4537,9 +5127,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + // Look for diamond patterns. 
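The "diamond patterns" mentioned above are what allPredecessorsComeFromSameSource recognizes: every predecessor of BB must funnel out of one common block. When MergeCondStores is enabled (it is off by default), two arms that each conditionally store to the same address can then be merged into a single guarded store. Shown schematically -- this is the shape of the transform, not the pass's literal output:

    //        PrevBB              <- returned by allPredecessorsComeFromSameSource
    //        /    \
    //    LeftBB  RightBB         <- each has PrevBB as its single predecessor
    //        \    /
    //          BB                <- the block handed in
    //
    // Conditional-store merge, conceptually:
    //   if (a) *p = v1;                  if (a | b)
    //   if (b) *p = v2;          ==>       *p = b ? v2 : v1;
    //
    // Valid only when nothing between the two stores can read or write *p;
    // any extra edge into LeftBB or RightBB defeats the pattern (nullptr).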
+ if (MergeCondStores) + if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) + if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (mergeConditionalStores(PBI, BI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + return false; } @@ -4663,6 +5261,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (SimplifyReturn(RI, Builder)) return true; } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { if (SimplifyResume(RI, Builder)) return true; + } else if (CleanupReturnInst *RI = + dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (SimplifyCleanupReturn(RI)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { if (SimplifySwitch(SI, Builder)) return true; } else if (UnreachableInst *UI = diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index ab30aa1..ddd8775 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -47,15 +47,16 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; + DominatorTree *DT; SmallVectorImpl<WeakVH> &DeadInsts; bool Changed; public: - SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, - SmallVectorImpl<WeakVH> &Dead) - : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI,SmallVectorImpl<WeakVH> &Dead) + : L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -63,11 +64,13 @@ namespace { /// Iteratively perform simplification on a worklist of users of the /// specified induction variable. This is the top-level driver that applies - /// all simplicitions to users of an IV. + /// all simplifications to users of an IV. void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr); Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, @@ -166,19 +169,65 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { S = SE->getSCEVAtScope(S, ICmpLoop); X = SE->getSCEVAtScope(X, ICmpLoop); + ICmpInst::Predicate InvariantPredicate; + const SCEV *InvariantLHS, *InvariantRHS; + // If the condition is always true or always false, replace it with // a constant value. 
- if (SE->isKnownPredicate(Pred, S, X)) + if (SE->isKnownPredicate(Pred, S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (isa<PHINode>(IVOperand) && + SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop, + InvariantPredicate, InvariantLHS, + InvariantRHS)) { + + // Rewrite the comparison to a loop invariant comparison if it can be done + // cheaply, where cheaply means "we don't need to emit any new + // instructions". + + Value *NewLHS = nullptr, *NewRHS = nullptr; + + if (S == InvariantLHS || X == InvariantLHS) + NewLHS = + ICmp->getOperand(S == InvariantLHS ? IVOperIdx : (1 - IVOperIdx)); + + if (S == InvariantRHS || X == InvariantRHS) + NewRHS = + ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx)); + + for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) { + if (NewLHS && NewRHS) + break; + + const SCEV *IncomingS = SE->getSCEV(Incoming); + + if (!NewLHS && IncomingS == InvariantLHS) + NewLHS = Incoming; + if (!NewRHS && IncomingS == InvariantRHS) + NewRHS = Incoming; + } + + if (!NewLHS || !NewRHS) + // We could not find an existing value to replace either LHS or RHS. + // Generating new instructions has subtler tradeoffs, so avoid doing that + // for now. + return; + + DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n'); + ICmp->setPredicate(InvariantPredicate); + ICmp->setOperand(0, NewLHS); + ICmp->setOperand(1, NewRHS); + } else return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); ++NumElimCmp; Changed = true; - DeadInsts.emplace_back(ICmp); } /// SimplifyIVUsers helper for eliminating useless @@ -207,8 +256,7 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, Rem->replaceAllUsesWith(Rem->getOperand(0)); else { // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + const SCEV *LessOne = SE->getMinusSCEV(S, SE->getOne(S->getType())); if (IsSigned && !SE->isKnownNonNegative(LessOne)) return; @@ -232,9 +280,9 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, DeadInsts.emplace_back(Rem); } -/// Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -/// IVOperand is guaranteed SCEVable, but UseInst may not be. +/// Eliminate an operation that consumes a simple IV and has no observable +/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable, +/// but UseInst may not be. bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, Instruction *IVOperand) { if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { @@ -249,12 +297,45 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, } } - // Eliminate any operation that SCEV can prove is an identity function. + if (eliminateIdentitySCEV(UseInst, IVOperand)) + return true; + + return false; +} + +/// Eliminate any operation that SCEV can prove is an identity function. 
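The surrounding remainder rewrite, (i+1) % n --> (i+1)==n ? 0 : (i+1), rests on a small range argument worth spelling out. A standalone illustration (hypothetical function, not pass code):

    #include <cassert>

    // For 0 <= i < n, the sum i+1 lies in [1, n], so the only input the
    // remainder actually reduces is i+1 == n; everything else passes through.
    static unsigned remOfIncrement(unsigned i, unsigned n) {
      assert(i < n && "rewrite is only valid when i is in [0, n)");
      return i + 1 == n ? 0 : i + 1; // same as (i + 1) % n, with no udiv/urem
    }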
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, + Instruction *IVOperand) { if (!SE->isSCEVable(UseInst->getType()) || (UseInst->getType() != IVOperand->getType()) || (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) return false; + // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the + // dominator tree, even if X is an operand to Y. For instance, in + // + // %iv = phi i32 {0,+,1} + // br %cond, label %left, label %merge + // + // left: + // %X = add i32 %iv, 0 + // br label %merge + // + // merge: + // %M = phi (%X, %iv) + // + // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and + // %M.replaceAllUsesWith(%X) would be incorrect. + + if (isa<PHINode>(UseInst)) + // If UseInst is not a PHI node then we know that IVOperand dominates + // UseInst directly from the legality of SSA. + if (!DT || !DT->dominates(IVOperand, UseInst)) + return false; + + if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) + return false; + DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); UseInst->replaceAllUsesWith(IVOperand); @@ -436,8 +517,8 @@ static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { /// This algorithm does not require IVUsers analysis. Instead, it simplifies /// instructions in-place during analysis. Rather than rewriting induction /// variables bottom-up from their users, it transforms a chain of IVUsers -/// top-down, updating the IR only when it encouters a clear optimization -/// opportunitiy. +/// top-down, updating the IR only when it encounters a clear optimization +/// opportunity. /// /// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. /// @@ -513,22 +594,21 @@ void IVVisitor::anchor() { } /// Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. -bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) -{ - LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); +bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead, + IVVisitor *V) { + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } /// Simplify users of induction variables within this /// loop. This does not actually change or add IVs. 
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead) { +bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead) { bool Changed = false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead); + Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead); } return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index c499c87..d5377f9 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -20,12 +20,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -64,7 +64,7 @@ namespace { // Here be subtlety: the iterator must be incremented before the loop // body (not sure why), so a range-for loop won't work here. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not // empty and we only bother simplifying instructions that are in it. diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 6bbf828..dc07440 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -30,8 +31,8 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -52,20 +53,11 @@ static cl::opt<bool> //===----------------------------------------------------------------------===// static bool ignoreCallingConv(LibFunc::Func Func) { - switch (Func) { - case LibFunc::abs: - case LibFunc::labs: - case LibFunc::llabs: - case LibFunc::strlen: - return true; - default: - return false; - } - llvm_unreachable("All cases should be covered in the switch."); + return Func == LibFunc::abs || Func == LibFunc::labs || + Func == LibFunc::llabs || Func == LibFunc::strlen; } -/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. +/// Return true if it only matters that the value is equal or not-equal to zero. 
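The loop above ("the iterator must be incremented before the loop body") is the standard idiom for mutating a basic block while walking it: once BI has stepped past the current instruction, the body may erase that instruction without invalidating the iterator. A minimal self-contained version (illustrative; isInstructionTriviallyDead is the real helper from Transforms/Utils/Local.h):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    static void sweepBlock(BasicBlock *BB) {
      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
        Instruction *I = &*BI++; // advance first: BI no longer refers to I,
        if (isInstructionTriviallyDead(I))
          I->eraseFromParent();  // so erasing I leaves the iterator valid
      }
    }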
static bool isOnlyUsedInZeroEqualityComparison(Value *V) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -79,8 +71,7 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) { return true; } -/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality -/// comparisons with With. +/// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -93,16 +84,13 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { } static bool callHasFloatingPointArgument(const CallInst *CI) { - for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); - it != e; ++it) { - if ((*it)->getType()->isFloatingPointTy()) - return true; - } - return false; + return std::any_of(CI->op_begin(), CI->op_end(), [](const Use &OI) { + return OI->getType()->isFloatingPointTy(); + }); } /// \brief Check whether the overloaded unary floating point function -/// corresponing to \a Ty is available. +/// corresponding to \a Ty is available. static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, LibFunc::Func DoubleFn, LibFunc::Func FloatFn, LibFunc::Func LongDoubleFn) { @@ -116,6 +104,23 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } +/// \brief Check whether we can use unsafe floating point math for +/// the function passed as input. +static bool canUseUnsafeFPMath(Function *F) { + + // FIXME: For finer-grain optimization, we need intrinsics to have the same + // fast-math flag decorations that are applied to FP instructions. For now, + // we have to rely on the function-level unsafe-fp-math attribute to do this + // optimization because there's no other way to express that the call can be + // relaxed. + if (F->hasFnAttribute("unsafe-fp-math")) { + Attribute Attr = F->getFnAttribute("unsafe-fp-math"); + if (Attr.getValueAsString() == "true") + return true; + } + return false; +} + /// \brief Returns whether \p F matches the signature expected for the /// string/memory copying library function \p Func. /// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. @@ -242,12 +247,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { !FT->getParamType(2)->isIntegerTy()) return nullptr; - // Extract some information from the instruction + // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); uint64_t Len; - // We don't do anything if length is not constant + // We don't do anything if length is not constant. if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else @@ -265,12 +270,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { if (SrcLen == 0 || Len == 0) return Dst; - // We don't optimize this case + // We don't optimize this case. if (Len < SrcLen) return nullptr; // strncat(x, s, c) -> strcat(x, s) - // s is constant so the strcat can be optimized further + // s is constant so the strcat can be optimized further. 
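The emitStrLenMemCpy call that follows is the whole trick behind the strcat/strncat folds: once the source length is a known constant, appending reduces to locating the destination's terminator and copying the source plus its nul in one memcpy. A C-level equivalent (illustrative, not the LLVM helper):

    #include <cstring>

    static char *strcatWithKnownLen(char *Dst, const char *Src, size_t SrcLen) {
      char *End = Dst + strlen(Dst); // find Dst's terminating nul
      memcpy(End, Src, SrcLen + 1);  // copy Src including its own nul byte
      return Dst;                    // strcat and strncat both return Dst
    }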
return emitStrLenMemCpy(Src, Dst, SrcLen, B); } @@ -303,7 +308,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); + return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), + "strchr"); return nullptr; } @@ -467,9 +473,6 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Verify the "stpcpy" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy)) return nullptr; @@ -484,10 +487,10 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { if (Len == 0) return nullptr; - Type *PT = FT->getParamType(0); + Type *PT = Callee->getFunctionType()->getParamType(0); Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); - Value *DstEnd = - B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); + Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst, + ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -497,8 +500,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy)) return nullptr; @@ -531,7 +532,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { if (Len > SrcLen + 1) return nullptr; - Type *PT = FT->getParamType(0); + Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); @@ -597,7 +598,8 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { if (I == StringRef::npos) // No match. 
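The stpcpy simplification above follows the same recipe with one twist in the return value: stpcpy yields a pointer to the copied terminator rather than to the start of the destination. In C terms (a sketch of the emitted code, assuming Len counts the nul byte):

    #include <cstring>

    static char *stpcpyWithKnownLen(char *Dst, const char *Src, size_t Len) {
      memcpy(Dst, Src, Len); // a single memcpy covers the nul byte
      return Dst + Len - 1;  // stpcpy returns the address of that nul
    }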
return Constant::getNullValue(CI->getType()); - return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), + "strpbrk"); } // strpbrk(s, "a") -> strchr(s, 'a') @@ -862,6 +864,29 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return B.CreateSub(LHSV, RHSV, "chardiff"); } + // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 + if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { + + IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); + unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); + + if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment && + getKnownAlignment(RHS, DL, CI) >= PrefAlignment) { + + Type *LHSPtrTy = + IntType->getPointerTo(LHS->getType()->getPointerAddressSpace()); + Type *RHSPtrTy = + IntType->getPointerTo(RHS->getType()->getPointerAddressSpace()); + + Value *LHSV = + B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv"); + Value *RHSV = + B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv"); + + return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp"); + } + } + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) StringRef LHSStr, RHSStr; if (getConstantStringInfo(LHS, LHSStr) && @@ -969,10 +994,14 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, Value *V = valueHasFloatPrecision(CI->getArgOperand(0)); if (V == nullptr) return nullptr; + + // Propagate fast-math flags from the existing call to the new call. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); // floor((double)floatval) -> (double)floorf(floatval) if (Callee->isIntrinsic()) { - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Intrinsic::ID IID = Callee->getIntrinsicID(); Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); V = B.CreateCall(F, V); @@ -1004,6 +1033,10 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { if (V2 == nullptr) return nullptr; + // Propagate fast-math flags from the existing call to the new call. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + // fmin((double)floatval1, (double)floatval2) // -> (double)fminf(floatval1, floatval2) // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP(). @@ -1015,9 +1048,9 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1035,13 +1068,37 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { return Ret; } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { + // Multiplications calculated using Addition Chains. + // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + + assert(Exp != 0 && "Incorrect exponent 0 not handled"); + + if (InnerChain[Exp]) + return InnerChain[Exp]; + + static const unsigned AddChain[33][2] = { + {0, 0}, // Unused. 
+ {0, 0}, // Unused (base case = pow1). + {1, 1}, // Unused (pre-computed). + {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, + {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, + {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, + {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, + {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, + }; + + InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), + getPow(InnerChain, AddChain[Exp][1], B)); + return InnerChain[Exp]; +} + Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the @@ -1060,7 +1117,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (Op1C->isExactlyValue(2.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) - return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, + Callee->getAttributes()); // pow(10.0, x) -> exp10(x) if (Op1C->isExactlyValue(10.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, @@ -1069,6 +1127,29 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Callee->getAttributes()); } + // FIXME: Use instruction-level FMF. + bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); + + // pow(exp(x), y) -> exp(x * y) + // pow(exp2(x), y) -> exp2(x * y) + // We enable these only with fast-math. Besides rounding differences, the + // transformation changes overflow and underflow behavior quite dramatically. + // Example: x = 1000, y = 0.001. + // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1). + auto *OpC = dyn_cast<CallInst>(Op1); + if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) { + LibFunc::Func Func; + Function *OpCCallee = OpC->getCalledFunction(); + if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) && + TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"); + return EmitUnaryFloatFnCall(FMul, OpCCallee->getName(), B, + OpCCallee->getAttributes()); + } + } + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); if (!Op2C) return Ret; @@ -1081,10 +1162,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { LibFunc::sqrtl) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, LibFunc::fabsl)) { + + // In -ffast-math, pow(x, 0.5) -> sqrt(x). + if (CI->hasUnsafeAlgebra()) { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, + Callee->getAttributes()); + } + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero // and negative infinity correctly. - // TODO: In fast-math mode, this could be just sqrt(x). // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
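The general pow(x, 0.5) expansion referenced above needs both the fabs and the select to stay bit-accurate at the two awkward inputs; the special values below follow C99 Annex F. Illustrative C++:

    #include <cmath>

    // pow(-0.0, 0.5) == +0.0  while sqrt(-0.0) == -0.0  -> fabs repairs it
    // pow(-INF, 0.5) == +INF  while sqrt(-INF) is  NaN  -> select repairs it
    static double powHalf(double x) {
      return x == -INFINITY ? INFINITY : std::fabs(std::sqrt(x));
    }

    // With unsafe algebra the new early exit above emits a bare sqrt call
    // instead, accepting the -0.0 and -INF differences.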
Value *Inf = ConstantFP::getInfinity(CI->getType()); Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); @@ -1102,18 +1191,42 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + + // In -ffast-math, generate repeated fmul instead of generating pow(x, n). + if (UnsafeFPMath) { + APFloat V = abs(Op2C->getValueAPF()); + // We limit to a max of 7 fmul(s). Thus max exponent is 32. + // This transformation applies to integer exponents only. + if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || + !V.isInteger()) + return nullptr; + + // We will memoize intermediate products of the Addition Chain. + Value *InnerChain[33] = {nullptr}; + InnerChain[1] = Op1; + InnerChain[2] = B.CreateFMul(Op1, Op1); + + // We cannot readily convert a non-double type (like float) to a double. + // So we first convert V to something which could be converted to double. + bool ignored; + V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + Value *FMul = getPow(InnerChain, V.convertToDouble(), B); + // For negative exponents simply compute the reciprocal. + if (Op2C->isNegative()) + FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); + return FMul; + } + return nullptr; } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Function *Caller = CI->getParent()->getParent(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2f)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1162,11 +1275,10 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (Callee->getName() == "fabs" && TLI->has(LibFunc::fabsf)) { + StringRef Name = Callee->getName(); + if (Name == "fabs" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, false); - } FunctionType *FT = Callee->getFunctionType(); // Make sure this has 1 argument of FP type which matches the result type. @@ -1184,6 +1296,102 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { return Ret; } +Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { + // If we can shrink the call to a float function rather than a double + // function, do that first. + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) + if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) + return Ret; + + // Make sure this has 2 arguments of FP type which match the result type. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return nullptr; + + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + if (CI->hasUnsafeAlgebra()) { + // Unsafe algebra sets all fast-math-flags to true. 
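The repeated-fmul lowering above is easiest to see on a concrete exponent. Hand-expanding pow(x, 7.0) through the AddChain table (the pass memoizes each power in InnerChain; this function is just the unrolled result):

    static double pow7(double x) {
      double x2 = x * x;   // InnerChain[2], seeded before the recursion
      double x3 = x * x2;  // AddChain[3] = {1, 2}
      double x5 = x2 * x3; // AddChain[5] = {2, 3}
      return x2 * x5;      // AddChain[7] = {2, 5}
    }

    // Four fmuls replace the libm call; pow(x, -7.0) adds one final fdiv,
    // 1.0 / pow7(x), per the reciprocal case above.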
+ FMF.setUnsafeAlgebra(); + } else { + // At a minimum, no-nans-fp-math must be true. + if (!CI->hasNoNaNs()) + return nullptr; + // No-signed-zeros is implied by the definitions of fmax/fmin themselves: + // "Ideally, fmax would be sensitive to the sign of zero, for example + // fmax(-0. 0, +0. 0) would return +0; however, implementation in software + // might be impractical." + FMF.setNoSignedZeros(); + FMF.setNoNaNs(); + } + B.setFastMathFlags(FMF); + + // We have a relaxed floating-point environment. We can ignore NaN-handling + // and transform to a compare and select. We do not have to consider errno or + // exceptions, because fmin/fmax do not have those. + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + Value *Cmp = Callee->getName().startswith("fmin") ? + B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1); + return B.CreateSelect(Cmp, Op0, Op1); +} + +Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + Value *Ret = nullptr; + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && hasFloatVersion(Name)) + Ret = optimizeUnaryDoubleFP(CI, B, true); + FunctionType *FT = Callee->getFunctionType(); + + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + if (!CI->hasUnsafeAlgebra()) + return Ret; + Value *Op1 = CI->getArgOperand(0); + auto *OpC = dyn_cast<CallInst>(Op1); + + // The earlier call must also be unsafe in order to do these transforms. + if (!OpC || !OpC->hasUnsafeAlgebra()) + return Ret; + + // log(pow(x,y)) -> y*log(x) + // This is only applicable to log, log2, log10. + if (Name != "log" && Name != "log2" && Name != "log10") + return Ret; + + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + B.setFastMathFlags(FMF); + + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) + return B.CreateFMul(OpC->getArgOperand(1), + EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, + Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); + return Ret; +} + Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); @@ -1192,73 +1400,99 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); - // FIXME: For finer-grain optimization, we need intrinsics to have the same - // fast-math flag decorations that are applied to FP instructions. For now, - // we have to rely on the function-level unsafe-fp-math attribute to do this - // optimization because there's no other way to express that the sqrt can be - // reassociated. - Function *F = CI->getParent()->getParent(); - if (F->hasFnAttribute("unsafe-fp-math")) { - // Check for unsafe-fp-math = true. 
- Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() != "true") - return Ret; - } - Value *Op = CI->getArgOperand(0); - if (Instruction *I = dyn_cast<Instruction>(Op)) { - if (I->getOpcode() == Instruction::FMul && I->hasUnsafeAlgebra()) { - // We're looking for a repeated factor in a multiplication tree, - // so we can do this fold: sqrt(x * x) -> fabs(x); - // or this fold: sqrt(x * x * y) -> fabs(x) * sqrt(y). - Value *Op0 = I->getOperand(0); - Value *Op1 = I->getOperand(1); - Value *RepeatOp = nullptr; - Value *OtherOp = nullptr; - if (Op0 == Op1) { - // Simple match: the operands of the multiply are identical. - RepeatOp = Op0; - } else { - // Look for a more complicated pattern: one of the operands is itself - // a multiply, so search for a common factor in that multiply. - // Note: We don't bother looking any deeper than this first level or for - // variations of this pattern because instcombine's visitFMUL and/or the - // reassociation pass should give us this form. - Value *OtherMul0, *OtherMul1; - if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) { - // Pattern: sqrt((x * y) * z) - if (OtherMul0 == OtherMul1) { - // Matched: sqrt((x * x) * z) - RepeatOp = OtherMul0; - OtherOp = Op1; - } - } - } - if (RepeatOp) { - // Fast math flags for any created instructions should match the sqrt - // and multiply. - // FIXME: We're not checking the sqrt because it doesn't have - // fast-math-flags (see earlier comment). - IRBuilder<true, ConstantFolder, - IRBuilderDefaultInserter<true> >::FastMathFlagGuard Guard(B); - B.SetFastMathFlags(I->getFastMathFlags()); - // If we found a repeated factor, hoist it out of the square root and - // replace it with the fabs of that factor. - Module *M = Callee->getParent(); - Type *ArgType = Op->getType(); - Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); - Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); - if (OtherOp) { - // If we found a non-repeated factor, we still need to get its square - // root. We then multiply that by the value that was simplified out - // of the square root calculation. - Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); - Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); - return B.CreateFMul(FabsCall, SqrtCall); - } - return FabsCall; + if (!CI->hasUnsafeAlgebra()) + return Ret; + + Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0)); + if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) + return Ret; + + // We're looking for a repeated factor in a multiplication tree, + // so we can do this fold: sqrt(x * x) -> fabs(x); + // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y). + Value *Op0 = I->getOperand(0); + Value *Op1 = I->getOperand(1); + Value *RepeatOp = nullptr; + Value *OtherOp = nullptr; + if (Op0 == Op1) { + // Simple match: the operands of the multiply are identical. + RepeatOp = Op0; + } else { + // Look for a more complicated pattern: one of the operands is itself + // a multiply, so search for a common factor in that multiply. + // Note: We don't bother looking any deeper than this first level or for + // variations of this pattern because instcombine's visitFMUL and/or the + // reassociation pass should give us this form. 
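Written out, the two shapes this rewritten matcher accepts are below; fabs rather than the raw factor is required because the repeated operand may be negative, and the fold is now gated on fast-math flags carried by both the call and the multiply. Illustrative C++:

    #include <cmath>

    // sqrt(x * x) -> fabs(x): for negative x, sqrt(x*x) is |x|, not x.
    static double sqrtXX(double x) { return std::fabs(x); }

    // sqrt((x * x) * y) -> fabs(x) * sqrt(y): hoist the repeated factor.
    static double sqrtXXY(double x, double y) {
      return std::fabs(x) * std::sqrt(y);
    }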
+ Value *OtherMul0, *OtherMul1; + if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) { + // Pattern: sqrt((x * y) * z) + if (OtherMul0 == OtherMul1 && + cast<Instruction>(Op0)->hasUnsafeAlgebra()) { + // Matched: sqrt((x * x) * z) + RepeatOp = OtherMul0; + OtherOp = Op1; } } } + if (!RepeatOp) + return Ret; + + // Fast math flags for any created instructions should match the sqrt + // and multiply. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(I->getFastMathFlags()); + + // If we found a repeated factor, hoist it out of the square root and + // replace it with the fabs of that factor. + Module *M = Callee->getParent(); + Type *ArgType = I->getType(); + Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); + Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); + if (OtherOp) { + // If we found a non-repeated factor, we still need to get its square + // root. We then multiply that by the value that was simplified out + // of the square root calculation. + Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); + Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); + return B.CreateFMul(FabsCall, SqrtCall); + } + return FabsCall; +} + +// TODO: Generalize to handle any trig function and its inverse. +Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + Value *Ret = nullptr; + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) + Ret = optimizeUnaryDoubleFP(CI, B, true); + FunctionType *FT = Callee->getFunctionType(); + + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + Value *Op1 = CI->getArgOperand(0); + auto *OpC = dyn_cast<CallInst>(Op1); + if (!OpC) + return Ret; + + // Both calls must allow unsafe optimizations in order to remove them. 
+ if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra()) + return Ret; + + // tan(atan(x)) -> x + // tanf(atanf(x)) -> x + // tanl(atanl(x)) -> x + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + ((Func == LibFunc::atan && Callee->getName() == "tan") || + (Func == LibFunc::atanf && Callee->getName() == "tanf") || + (Func == LibFunc::atanl && Callee->getName() == "tanl"))) + Ret = OpC->getArgOperand(0); return Ret; } @@ -1329,9 +1563,9 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, return; Function *Callee = CI->getCalledFunction(); - StringRef FuncName = Callee->getName(); LibFunc::Func Func; - if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || !isTrigLibCall(CI)) + if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || + !isTrigLibCall(CI)) return; if (IsFloat) { @@ -1353,10 +1587,8 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls, Value *Res) { - for (SmallVectorImpl<CallInst *>::iterator I = Calls.begin(), E = Calls.end(); - I != E; ++I) { - replaceAllUsesWith(*I, Res); - } + for (CallInst *C : Calls) + replaceAllUsesWith(C, Res); } void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, @@ -1387,8 +1619,7 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { // If the argument is an instruction, it must dominate all uses so put our // sincos call there. - BasicBlock::iterator Loc = ArgInst; - B.SetInsertPoint(ArgInst->getParent(), ++Loc); + B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); } else { // Otherwise (e.g. for a constant) the beginning of the function is as // good a place as any. @@ -1413,15 +1644,16 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // Integer Library Call Optimizations //===----------------------------------------------------------------------===// +static bool checkIntUnaryReturnAndParam(Function *Callee) { + FunctionType *FT = Callee->getFunctionType(); + return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0)->isIntegerTy(); +} + Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || - !FT->getParamType(0)->isIntegerTy()) + if (!checkIntUnaryReturnAndParam(Callee)) return nullptr; - Value *Op = CI->getArgOperand(0); // Constant fold. 
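The tan/atan fold above is exact over the reals: atan maps every finite x into (-pi/2, pi/2), where tan is its inverse, so tan(atan(x)) == x mathematically. The two libcalls still round twice, which is why both must carry unsafe algebra before the pair collapses to its argument:

    // What the composition becomes once both calls are marked fast; only
    // double rounding distinguishes tan(atan(x)) from x for finite x.
    static double tanOfAtan(double x) { return x; }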
@@ -1436,7 +1668,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); - Value *V = B.CreateCall(F, {Op, B.getFalse()}, "cttz"); + Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1461,11 +1693,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isdigit(c) -> (c-'0') <u 10 @@ -1476,11 +1704,7 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isascii(c) -> c <u 128 @@ -1490,11 +1714,7 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require i32(i32) - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // toascii(c) -> c & 0x7f @@ -1529,10 +1749,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, } static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { - if (!ColdErrorCalls) - return false; - - if (!Callee || !Callee->isDeclaration()) + if (!ColdErrorCalls || !Callee || !Callee->isDeclaration()) return false; if (StreamArg < 0) @@ -1962,22 +2179,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); - IRBuilder<> Builder(CI); + + SmallVector<OperandBundleDef, 2> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // Command-line parameter overrides function attribute. if (EnableUnsafeFPShrink.getNumOccurrences() > 0) UnsafeFPShrink = EnableUnsafeFPShrink; - else if (Callee->hasFnAttribute("unsafe-fp-math")) { - // FIXME: This is the same problem as described in optimizeSqrt(). - // If calls gain access to IR-level FMF, then use that instead of a - // function attribute. - - // Check for unsafe-fp-math = true. - Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() == "true") - UnsafeFPShrink = true; - } + else if (canUseUnsafeFPMath(Callee)) + UnsafeFPShrink = true; // First, check for intrinsics. 
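The integer libcall folds in this stretch all become tiny branch-free expressions, collected here as illustrative C++ equivalents. The ffs zero case is handled by a separate select in the emitted IR, which is what lets the cttz intrinsic's is-zero-undef flag above be true:

    #include <cstdint>

    static int ffsFold(uint32_t X) {  // ffs(x) -> x ? cttz(x)+1 : 0
      return X ? __builtin_ctz(X) + 1 : 0;
    }
    static bool isdigitFold(int c) {  // isdigit(c) -> (c-'0') <u 10
      return (unsigned)(c - '0') < 10u;
    }
    static bool isasciiFold(int c) {  // isascii(c) -> c <u 128
      return (unsigned)c < 128u;
    }
    static int toasciiFold(int c) {   // toascii(c) -> c & 0x7f
      return c & 0x7f;
    }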
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { @@ -1990,6 +2202,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeExp2(CI, Builder); case Intrinsic::fabs: return optimizeFabs(CI, Builder); + case Intrinsic::log: + return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); default: @@ -2001,13 +2215,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { // Try to further simplify the result. CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); - if (SimplifiedCI && SimplifiedCI->getCalledFunction()) - if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { + if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { + // Use an IR Builder from SimplifiedCI if available instead of CI + // to guarantee we reach all uses we might replace later on. + IRBuilder<> TmpBuilder(SimplifiedCI); + if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. SimplifiedCI->replaceAllUsesWith(V); SimplifiedCI->eraseFromParent(); return V; } + } return SimplifiedFortifiedCI; } @@ -2068,8 +2286,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeFWrite(CI, Builder); case LibFunc::fputs: return optimizeFPuts(CI, Builder); + case LibFunc::log: + case LibFunc::log10: + case LibFunc::log1p: + case LibFunc::log2: + case LibFunc::logb: + return optimizeLog(CI, Builder); case LibFunc::puts: return optimizePuts(CI, Builder); + case LibFunc::tan: + case LibFunc::tanf: + case LibFunc::tanl: + return optimizeTan(CI, Builder); case LibFunc::perror: return optimizeErrorReporting(CI, Builder); case LibFunc::vfprintf: @@ -2097,24 +2325,23 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case LibFunc::exp: case LibFunc::exp10: case LibFunc::expm1: - case LibFunc::log: - case LibFunc::log10: - case LibFunc::log1p: - case LibFunc::log2: - case LibFunc::logb: case LibFunc::sin: case LibFunc::sinh: - case LibFunc::tan: case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, true); return nullptr; case LibFunc::copysign: - case LibFunc::fmin: - case LibFunc::fmax: if (hasFloatVersion(FuncName)) return optimizeBinaryDoubleFP(CI, Builder); return nullptr; + case LibFunc::fminf: + case LibFunc::fmin: + case LibFunc::fminl: + case LibFunc::fmaxf: + case LibFunc::fmax: + case LibFunc::fmaxl: + return optimizeFMinFMax(CI, Builder); default: return nullptr; } @@ -2133,37 +2360,27 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { Replacer(I, With); } -/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, - Value *With) { - I->replaceAllUsesWith(With); - I->eraseFromParent(); -} - // TODO: // Additional cases that we need to add to this file: // // cbrt: // * cbrt(expN(X)) -> expN(x/3) // * cbrt(sqrt(x)) -> pow(x,1/6) -// * cbrt(sqrt(x)) -> pow(x,1/9) +// * cbrt(cbrt(x)) -> pow(x,1/9) // // exp, expf, expl: // * exp(log(x)) -> x // // log, logf, logl: // * log(exp(x)) -> x -// * log(x**y) -> y*log(x) // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) -// * log(pow(x,y)) -> y*log(x) // // lround, lroundf, lroundl: // * lround(cnst) -> cnst' // // pow, powf, powl: -// * pow(exp(x),y) -> exp(x*y) // * pow(sqrt(x),y) -> pow(x,y*0.5) // * pow(pow(x,y),z)-> pow(x,y*z) // @@ -2179,9 
+2396,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// tan, tanf, tanl: -// * tan(atan(x)) -> x -// // trunc, truncf, truncl: // * trunc(cnst) -> cnst' // @@ -2218,7 +2432,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, return false; } -Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk)) @@ -2232,7 +2447,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> & return nullptr; } -Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk)) @@ -2246,7 +2462,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> return nullptr; } -Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk)) @@ -2338,7 +2555,10 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); - IRBuilder<> Builder(CI); + + SmallVector<OperandBundleDef, 2> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // First, check that this is a known library functions. diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp new file mode 100644 index 0000000..ad6b782 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -0,0 +1,85 @@ +//===- SplitModule.cpp - Split a module into partitions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the function llvm::SplitModule, which splits a module +// into multiple linkable partitions. It can be used to implement parallel code +// generation for link-time optimization. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +static void externalize(GlobalValue *GV) { + if (GV->hasLocalLinkage()) { + GV->setLinkage(GlobalValue::ExternalLinkage); + GV->setVisibility(GlobalValue::HiddenVisibility); + } + + // Unnamed entities must be named consistently between modules. setName will + // give a distinct name to each such entity. 
+ if (!GV->hasName()) + GV->setName("__llvmsplit_unnamed"); +} + +// Returns whether GV should be in partition (0-based) I of N. +static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { + if (auto GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalObject *Base = GA->getBaseObject()) + GV = Base; + + StringRef Name; + if (const Comdat *C = GV->getComdat()) + Name = C->getName(); + else + Name = GV->getName(); + + // Partition by MD5 hash. We only need a few bits for evenness as the number + // of partitions will generally be in the 1-2 figure range; the low 16 bits + // are enough. + MD5 H; + MD5::MD5Result R; + H.update(Name); + H.final(R); + return (R[0] | (R[1] << 8)) % N == I; +} + +void llvm::SplitModule( + std::unique_ptr<Module> M, unsigned N, + std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) { + for (Function &F : *M) + externalize(&F); + for (GlobalVariable &GV : M->globals()) + externalize(&GV); + for (GlobalAlias &GA : M->aliases()) + externalize(&GA); + + // FIXME: We should be able to reuse M as the last partition instead of + // cloning it. + for (unsigned I = 0; I != N; ++I) { + ValueToValueMapTy VMap; + std::unique_ptr<Module> MPart( + CloneModule(M.get(), VMap, [=](const GlobalValue *GV) { + return isInPartition(GV, I, N); + })); + if (I != 0) + MPart->setModuleInlineAsm(""); + ModuleCallback(std::move(MPart)); + } +} diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index a2a54da..1d1f602 100644 --- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,7 +69,6 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7e00a80..6b1d1da 100644 --- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -50,11 +50,11 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // std::vector<BasicBlock*> ReturningBlocks; std::vector<BasicBlock*> UnreachableBlocks; - for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - if (isa<ReturnInst>(I->getTerminator())) - ReturningBlocks.push_back(I); - else if (isa<UnreachableInst>(I->getTerminator())) - UnreachableBlocks.push_back(I); + for (BasicBlock &I : F) + if (isa<ReturnInst>(I.getTerminator())) + ReturningBlocks.push_back(&I); + else if (isa<UnreachableInst>(I.getTerminator())) + UnreachableBlocks.push_back(&I); // Then unreachable blocks. if (UnreachableBlocks.empty()) { diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp index 8c72641..f47ddb9 100644 --- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -19,11 +19,14 @@ #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" using namespace llvm; // Out of line method to get vtable etc for class. 
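isInPartition above buckets a global by hashing its comdat name when it has one, so all members of a comdat land in the same partition, and by its symbol name otherwise; with partition counts in the single or double digits, the low 16 bits of the MD5 are plenty. A standalone mirror of the computation (hypothetical helper name):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/MD5.h"

    static unsigned partitionOf(llvm::StringRef Name, unsigned N) {
      llvm::MD5 H;
      llvm::MD5::MD5Result R;
      H.update(Name);
      H.final(R);
      return (R[0] | (R[1] << 8)) % N; // same low-16-bit bucket as above
    }

    // Two globals in one comdat hash the same Name, hence the same bucket,
    // so CloneModule never splits a comdat across output partitions.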
void ValueMapTypeRemapper::anchor() {} void ValueMaterializer::anchor() {} +void ValueMaterializer::materializeInitFor(GlobalValue *New, GlobalValue *Old) { +} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, @@ -35,15 +38,28 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // If we have a materializer and it can materialize a value, use that. if (Materializer) { - if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) - return VM[V] = NewV; + if (Value *NewV = + Materializer->materializeDeclFor(const_cast<Value *>(V))) { + VM[V] = NewV; + if (auto *NewGV = dyn_cast<GlobalValue>(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V))); + return NewV; + } } // Global values do not need to be seeded into the VM if they // are using the identity mapping. - if (isa<GlobalValue>(V)) + if (isa<GlobalValue>(V)) { + if (Flags & RF_NullMapMissingGlobalValues) { + assert(!(Flags & RF_IgnoreMissingEntries) && + "Illegal to specify both RF_NullMapMissingGlobalValues and " + "RF_IgnoreMissingEntries"); + return nullptr; + } return VM[V] = const_cast<Value*>(V); - + } + if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { // Inline asm may need *type* remapping. FunctionType *NewTy = IA->getFunctionType(); @@ -73,7 +89,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // correct. For now, just match behaviour from before the metadata/value // split. // - // assert(MappedMD && "Referenced metadata value not in value map"); + // assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) && + // "Referenced metadata value not in value map"); return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD); } @@ -127,9 +144,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, Flags, TypeMapper, Materializer)); } - + Type *NewSrcTy = nullptr; + if (TypeMapper) + if (auto *GEPO = dyn_cast<GEPOperator>(C)) + NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - return VM[V] = CE->getWithOperands(Ops, NewTy); + return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); if (isa<ConstantArray>(C)) return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops); if (isa<ConstantStruct>(C)) @@ -146,29 +167,42 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, } static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, - Metadata *Val) { + Metadata *Val, ValueMaterializer *Materializer, + RemapFlags Flags) { VM.MD()[Key].reset(Val); + if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) { + auto *N = dyn_cast_or_null<MDNode>(Val); + // Need to invoke this once we have non-temporary MD. 
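The materializer hook is now split in two around the value-map insertion: materializeDeclFor produces just a declaration, the mapping is recorded, and only then does materializeInitFor fill in the body or initializer. The payoff is that a self-referential global (or a mutually referential pair) already finds itself in the map while its initializer is being copied. A hypothetical client, purely to show the shape:

    #include "llvm/Transforms/Utils/ValueMapper.h"
    using namespace llvm;

    class LazyGlobalMaterializer : public ValueMaterializer {
      Value *materializeDeclFor(Value *V) override {
        // Create and return a bare declaration for V in the destination
        // module, or nullptr to decline; MapValue installs the mapping.
        return nullptr; // stub: decline everything
      }
      void materializeInitFor(GlobalValue *New, GlobalValue *Old) override {
        // Runs after New is mapped, so copying Old's initializer here can
        // resolve references back through the value map, even cycles.
      }
    };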
+ if (!N || !N->isTemporary()) + Materializer->replaceTemporaryMetadata(Key, Val); + } return Val; } -static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { - return mapToMetadata(VM, MD, const_cast<Metadata *>(MD)); +static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD, + ValueMaterializer *Materializer, RemapFlags Flags) { + return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags); } static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer); -static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, +static Metadata *mapMetadataOp(Metadata *Op, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { if (!Op) return nullptr; - if (Metadata *MappedOp = - MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer)) + + if (Materializer && !Materializer->isMetadataNeeded(Op)) + return nullptr; + + if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags, + TypeMapper, Materializer)) return MappedOp; // Use identity map if MappedOp is null and we can ignore missing entries. if (Flags & RF_IgnoreMissingEntries) @@ -178,89 +212,122 @@ static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, // correct. For now, just match behaviour from before the metadata/value // split. // - // llvm_unreachable("Referenced metadata not in value map!"); + // assert((Flags & RF_NullMapMissingGlobalValues) && + // "Referenced metadata not in value map!"); return nullptr; } -/// \brief Remap nodes. +/// Resolve uniquing cycles involving the given metadata. +static void resolveCycles(Metadata *MD, bool AllowTemps) { + if (auto *N = dyn_cast_or_null<MDNode>(MD)) { + if (AllowTemps && N->isTemporary()) + return; + if (!N->isResolved()) { + if (AllowTemps) + // Note that this will drop RAUW support on any temporaries, which + // blocks uniquing. If this ends up being an issue, in the future + // we can experiment with delaying resolving these nodes until + // after metadata is fully materialized (i.e. when linking metadata + // as a postpass after function importing). + N->resolveNonTemporaries(); + else + N->resolveCycles(); + } + } +} + +/// Remap the operands of an MDNode. /// -/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. -/// Assumes that \c NewNode is already a clone of \c OldNode. +/// If \c Node is temporary, uniquing cycles are ignored. If \c Node is +/// distinct, uniquing cycles are resolved as they're found. /// -/// \pre \c NewNode is a clone of \c OldNode. -static bool remap(const MDNode *OldNode, MDNode *NewNode, - SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(OldNode->getNumOperands() == NewNode->getNumOperands() && - "Expected nodes to match"); - assert(OldNode->isResolved() && "Expected resolved node"); - assert(!NewNode->isUniqued() && "Expected non-uniqued node"); - - // Map the node upfront so it's available for cyclic references. 
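To make the two-phase materialization protocol above concrete, here is a hedged sketch of a ValueMaterializer subclass: materializeDeclFor() creates a bare declaration, and materializeInitFor() runs after the mapping is recorded. The class, its destination module, and the verbatim initializer copy are hypothetical, not taken from this patch.

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Hypothetical materializer: declarations first, initializers second, so
// initializers that refer back to the global resolve through the value map.
class LazyGlobalMaterializer final : public llvm::ValueMaterializer {
  llvm::Module &Dst;

public:
  explicit LazyGlobalMaterializer(llvm::Module &Dst) : Dst(Dst) {}

  llvm::Value *materializeDeclFor(llvm::Value *V) override {
    if (auto *GV = llvm::dyn_cast<llvm::GlobalVariable>(V))
      return new llvm::GlobalVariable(Dst, GV->getValueType(),
                                      GV->isConstant(), GV->getLinkage(),
                                      /*Initializer=*/nullptr, GV->getName());
    return nullptr; // Anything else falls through to the default mapping.
  }

  void materializeInitFor(llvm::GlobalValue *New,
                          llvm::GlobalValue *Old) override {
    // A real linker would remap the initializer through the value map; a
    // verbatim copy is enough here to show where that work belongs.
    auto *NewGV = llvm::cast<llvm::GlobalVariable>(New);
    auto *OldGV = llvm::cast<llvm::GlobalVariable>(Old);
    if (OldGV->hasInitializer())
      NewGV->setInitializer(OldGV->getInitializer());
  }
};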
- mapToMetadata(VM, OldNode, NewNode); - bool AnyChanged = false; - for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { - Metadata *Old = OldNode->getOperand(I); - assert(NewNode->getOperand(I) == Old && - "Expected old operands to already be in place"); +/// \pre \c Node.isDistinct() or \c Node.isTemporary(). +static bool remapOperands(MDNode &Node, + SmallVectorImpl<MDNode *> &DistinctWorklist, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(!Node.isUniqued() && "Expected temporary or distinct node"); + const bool IsDistinct = Node.isDistinct(); - Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, - TypeMapper, Materializer); + bool AnyChanged = false; + for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) { + Metadata *Old = Node.getOperand(I); + Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); if (Old != New) { AnyChanged = true; - NewNode->replaceOperandWith(I, New); + Node.replaceOperandWith(I, New); + + // Resolve uniquing cycles underneath distinct nodes on the fly so they + // don't infect later operands. + if (IsDistinct) + resolveCycles(New, Flags & RF_HaveUnmaterializedMetadata); } } return AnyChanged; } -/// \brief Map a distinct MDNode. +/// Map a distinct MDNode. /// -/// Distinct nodes are not uniqued, so they must always recreated. +/// Whether distinct nodes change is independent of their operands. If \a +/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in +/// place; effectively, they're moved from one graph to another. Otherwise, +/// they're cloned/duplicated, and the new copy's operands are remapped. static Metadata *mapDistinctNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { assert(Node->isDistinct() && "Expected distinct node"); - MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone()); - remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer); + MDNode *NewMD; + if (Flags & RF_MoveDistinctMDs) + NewMD = const_cast<MDNode *>(Node); + else + NewMD = MDNode::replaceWithDistinct(Node->clone()); - // Track any cycles beneath this node. - for (Metadata *Op : NewMD->operands()) - if (auto *Node = dyn_cast_or_null<MDNode>(Op)) - if (!Node->isResolved()) - Cycles.push_back(Node); - - return NewMD; + // Remap operands later. + DistinctWorklist.push_back(NewMD); + return mapToMetadata(VM, Node, NewMD, Materializer, Flags); } /// \brief Map a uniqued MDNode. /// /// Uniqued nodes may not need to be recreated (they may map to themselves). static Metadata *mapUniquedNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - assert(Node->isUniqued() && "Expected uniqued node"); + assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) && + "Expected uniqued node"); - // Create a temporary node upfront in case we have a metadata cycle. + // Create a temporary node and map it upfront in case we have a uniquing + // cycle. If necessary, this mapping will get updated by RAUW logic before + // returning. 
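From the caller's side, the RF_MoveDistinctMDs choice documented above looks like this. A minimal hedged sketch; the helper name is hypothetical.

#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// With RF_MoveDistinctMDs the mapped result is the original distinct node,
// reused in place with remapped operands; without it, a fresh distinct
// clone is created and the original graph is left untouched.
static llvm::Metadata *remapDistinct(const llvm::MDNode *N,
                                     llvm::ValueToValueMapTy &VM,
                                     bool MoveDistinct) {
  llvm::RemapFlags Flags =
      MoveDistinct ? llvm::RF_MoveDistinctMDs : llvm::RF_None;
  return llvm::MapMetadata(N, VM, Flags);
}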
 auto ClonedMD = Node->clone();
-  if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer))
-    // No operands changed, so use the identity mapping.
-    return mapToSelf(VM, Node);
+  mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags);
+  if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper,
+                     Materializer)) {
+    // No operands changed, so use the original.
+    ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node));
+    // Even though replaceAllUsesWith would have replaced the value map
+    // entry, we need to explicitly map with the final non-temporary node
+    // to replace any temporary metadata via the callback.
+    return mapToSelf(VM, Node, Materializer, Flags);
+  }
 
-  // At least one operand has changed, so uniquify the cloned node.
+  // Uniquify the cloned node. Explicitly map it with the final non-temporary
+  // node so that replacement of temporary metadata via the callback occurs.
   return mapToMetadata(VM, Node,
-                       MDNode::replaceWithUniqued(std::move(ClonedMD)));
+                       MDNode::replaceWithUniqued(std::move(ClonedMD)),
+                       Materializer, Flags);
 }
 
 static Metadata *MapMetadataImpl(const Metadata *MD,
-                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 SmallVectorImpl<MDNode *> &DistinctWorklist,
                                  ValueToValueMapTy &VM, RemapFlags Flags,
                                  ValueMapTypeRemapper *TypeMapper,
                                  ValueMaterializer *Materializer) {
@@ -269,26 +336,28 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
     return NewMD;
 
   if (isa<MDString>(MD))
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   if (isa<ConstantAsMetadata>(MD))
     if ((Flags & RF_NoModuleLevelChanges))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
   if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {
     Value *MappedV =
         MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);
     if (VMD->getValue() == MappedV ||
         (!MappedV && (Flags & RF_IgnoreMissingEntries)))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
     // FIXME: This assert crashes during bootstrap, but I think it should be
     // correct. For now, just match behaviour from before the metadata/value
     // split.
     //
-    // assert(MappedV && "Referenced metadata not in value map!");
+    // assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) &&
+    //        "Referenced metadata not in value map!");
     if (MappedV)
-      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV));
+      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer,
+                           Flags);
     return nullptr;
   }
 
@@ -299,37 +368,54 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
   // If this is a module-level metadata and we know that nothing at the
   // module level is changing, then use an identity mapping.
   if (Flags & RF_NoModuleLevelChanges)
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   // Require resolved nodes whenever metadata might be remapped.
-  assert(Node->isResolved() && "Unexpected unresolved node");
+  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) &&
+         "Unexpected unresolved node");
+
+  if (Materializer && Node->isTemporary()) {
+    assert(Flags & RF_HaveUnmaterializedMetadata);
+    Metadata *TempMD =
+        Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD));
+    // If the above callback returned an existing temporary node, use it
+    // instead of the current temporary node. This happens when earlier
+    // function importing passes already created and saved a temporary
+    // metadata node for the same value id.
+ if (TempMD) { + mapToMetadata(VM, MD, TempMD, Materializer, Flags); + return TempMD; + } + } if (Node->isDistinct()) - return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); + return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); - return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); + return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); } Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - SmallVector<MDNode *, 8> Cycles; - Metadata *NewMD = - MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer); - - // Resolve cycles underneath MD. - if (NewMD && NewMD != MD) { - if (auto *N = dyn_cast<MDNode>(NewMD)) - if (!N->isResolved()) - N->resolveCycles(); + SmallVector<MDNode *, 8> DistinctWorklist; + Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); - for (MDNode *N : Cycles) - if (!N->isResolved()) - N->resolveCycles(); - } else { - // Shouldn't get unresolved cycles if nothing was remapped. - assert(Cycles.empty() && "Expected no unresolved cycles"); - } + // When there are no module-level changes, it's possible that the metadata + // graph has temporaries. Skip the logic to resolve cycles, since it's + // unnecessary (and invalid) in that case. + if (Flags & RF_NoModuleLevelChanges) + return NewMD; + + // Resolve cycles involving the entry metadata. + resolveCycles(NewMD, Flags & RF_HaveUnmaterializedMetadata); + + // Remap the operands of distinct MDNodes. + while (!DistinctWorklist.empty()) + remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags, + TypeMapper, Materializer); return NewMD; } @@ -374,14 +460,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, // Remap attached metadata. 
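A small worked example of the cycle handling described above; the self-referential node mimics a loop ID. Sketch only, assuming the MapMetadata entry point shown in this same file.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Build a distinct node whose first operand is itself, then remap it.
// Distinct nodes land on DistinctWorklist, so the self-reference is walked
// iteratively rather than by unbounded recursion.
static void remapSelfReferentialNode(llvm::LLVMContext &Ctx) {
  llvm::Metadata *Ops[] = {nullptr};
  llvm::MDNode *Loop = llvm::MDNode::getDistinct(Ctx, Ops);
  Loop->replaceOperandWith(0, Loop); // Self-reference, as loop IDs use.

  llvm::ValueToValueMapTy VM;
  // Without RF_MoveDistinctMDs this yields a distinct clone whose
  // operand 0 points at the clone, not at the original node.
  llvm::Metadata *Mapped = llvm::MapMetadata(Loop, VM);
  (void)Mapped;
}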
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; I->getAllMetadata(MDs); - for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator - MI = MDs.begin(), - ME = MDs.end(); - MI != ME; ++MI) { - MDNode *Old = MI->second; + for (const auto &MI : MDs) { + MDNode *Old = MI.second; MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) - I->setMetadata(MI->first, New); + I->setMetadata(MI.first, New); } if (!TypeMapper) diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 215d6f9..8844d57 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -25,8 +25,11 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -204,9 +207,10 @@ namespace { BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { - AA = &P->getAnalysis<AliasAnalysis>(); + AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &P->getAnalysis<ScalarEvolution>(); + SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -221,6 +225,7 @@ namespace { AliasAnalysis *AA; DominatorTree *DT; ScalarEvolution *SE; + const TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; // FIXME: const correct? @@ -437,9 +442,10 @@ namespace { bool runOnBasicBlock(BasicBlock &BB) override { // OptimizeNone check deferred to vectorizeBB(). - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( @@ -450,13 +456,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { BasicBlockPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.setPreservesCFG(); } @@ -842,7 +850,7 @@ namespace { // It is important to cleanup here so that future iterations of this // function have less work to do. 
- (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo()); + (void)SimplifyInstructionsInBlock(&BB, TLI); return true; } @@ -1239,20 +1247,23 @@ namespace { if (I == Start) IAfterStart = true; bool IsSimpleLoadStore; - if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; + if (!isInstVectorizable(&*I, IsSimpleLoadStore)) + continue; // Look for an instruction with which to pair instruction *I... DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); + if (I->mayWriteToMemory()) + WriteSet.add(&*I); bool JAfterStart = IAfterStart; BasicBlock::iterator J = std::next(I); for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { - if (J == Start) JAfterStart = true; + if (&*J == Start) + JAfterStart = true; // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); + bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); if (Config.FastDep) { // Note: For this heuristic to be effective, independent operations // must tend to be intermixed. This is likely to be true from some @@ -1269,25 +1280,26 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. int CostSavings, FixedOrder; - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, - CostSavings, FixedOrder)) continue; + if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, + CostSavings, FixedOrder)) + continue; // J is a candidate for merging with I. if (PairableInsts.empty() || - PairableInsts[PairableInsts.size()-1] != I) { - PairableInsts.push_back(I); + PairableInsts[PairableInsts.size() - 1] != &*I) { + PairableInsts.push_back(&*I); } - CandidatePairs[I].push_back(J); + CandidatePairs[&*I].push_back(&*J); ++TotalPairs; if (TTI) - CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), - CostSavings)); + CandidatePairCostSavings.insert( + ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); if (FixedOrder == 1) - FixedOrderPairs.insert(ValuePair(I, J)); + FixedOrderPairs.insert(ValuePair(&*I, &*J)); else if (FixedOrder == -1) - FixedOrderPairs.insert(ValuePair(J, I)); + FixedOrderPairs.insert(ValuePair(&*J, &*I)); // The next call to this function must start after the last instruction // selected during this invocation. 
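The analysis plumbing changes above all follow one idiom: the old AliasAnalysis analysis group is gone, and results are now fetched out of wrapper passes. A hedged sketch of that pattern in a stand-alone legacy pass (the pass itself is hypothetical):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

namespace {
// Requires the wrapper passes, then pulls the actual result objects out of
// them, exactly as BBVectorize now does in runOnBasicBlock().
struct WrapperIdiomPass : public llvm::FunctionPass {
  static char ID;
  WrapperIdiomPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    auto &AA = getAnalysis<llvm::AAResultsWrapperPass>().getAAResults();
    auto &SE = getAnalysis<llvm::ScalarEvolutionWrapperPass>().getSE();
    auto &TLI = getAnalysis<llvm::TargetLibraryInfoWrapperPass>().getTLI();
    (void)AA; (void)SE; (void)TLI;
    return false;
  }

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::AAResultsWrapperPass>();
    AU.addRequired<llvm::ScalarEvolutionWrapperPass>();
    AU.addRequired<llvm::TargetLibraryInfoWrapperPass>();
    AU.setPreservesAll();
  }
};
}
char WrapperIdiomPass::ID = 0;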
@@ -1468,14 +1480,16 @@ namespace { BasicBlock::iterator E = BB.end(), EL = BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { - if (IsInPair.find(I) == IsInPair.end()) continue; + if (IsInPair.find(&*I) == IsInPair.end()) + continue; DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); + if (I->mayWriteToMemory()) + WriteSet.add(&*I); for (BasicBlock::iterator J = std::next(I); J != E; ++J) { - (void) trackUsesOfI(Users, WriteSet, I, J); + (void)trackUsesOfI(Users, WriteSet, &*I, &*J); if (J == EL) break; @@ -1484,7 +1498,7 @@ namespace { for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); U != E; ++U) { if (IsInPair.find(*U) == IsInPair.end()) continue; - PairableInstUsers.insert(ValuePair(I, *U)); + PairableInstUsers.insert(ValuePair(&*I, *U)); } if (I == EL) @@ -2806,55 +2820,51 @@ namespace { Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, Instruction *&K2) { - if (isa<StoreInst>(I)) { - AA->replaceWithNewValue(I, K); - AA->replaceWithNewValue(J, K); - } else { - Type *IType = I->getType(); - Type *JType = J->getType(); + if (isa<StoreInst>(I)) + return; - VectorType *VType = getVecTypeForPair(IType, JType); - unsigned numElem = VType->getNumElements(); + Type *IType = I->getType(); + Type *JType = J->getType(); - unsigned numElemI = getNumScalarElements(IType); - unsigned numElemJ = getNumScalarElements(JType); + VectorType *VType = getVecTypeForPair(IType, JType); + unsigned numElem = VType->getNumElements(); - if (IType->isVectorTy()) { - std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); - for (unsigned v = 0; v < numElemI; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); - } + unsigned numElemI = getNumScalarElements(IType); + unsigned numElemJ = getNumScalarElements(JType); - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( Mask1), - getReplacementName(K, false, 1)); - } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - K1 = ExtractElementInst::Create(K, CV0, - getReplacementName(K, false, 1)); + if (IType->isVectorTy()) { + std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); + for (unsigned v = 0; v < numElemI; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); } - if (JType->isVectorTy()) { - std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); - for (unsigned v = 0; v < numElemJ; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); - } + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get(Mask1), + getReplacementName(K, false, 1)); + } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); + } - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( Mask2), - getReplacementName(K, false, 2)); - } else { - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K2 = ExtractElementInst::Create(K, CV1, - getReplacementName(K, false, 2)); + if (JType->isVectorTy()) { + std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); + for (unsigned v = 0; v < numElemJ; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + 
Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); } - K1->insertAfter(K); - K2->insertAfter(K1); - InsertionPt = K2; + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get(Mask2), + getReplacementName(K, false, 2)); + } else { + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); + K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); } + + K1->insertAfter(K); + K2->insertAfter(K1); + InsertionPt = K2; } // Move all uses of the function I (including pairing-induced uses) after J. @@ -2869,7 +2879,7 @@ namespace { if (I->mayWriteToMemory()) WriteSet.add(I); for (; cast<Instruction>(L) != J; ++L) - (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); + (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); assert(cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies"); @@ -2891,9 +2901,9 @@ namespace { if (I->mayWriteToMemory()) WriteSet.add(I); for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { + if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { // Move this instruction - Instruction *InstToMove = L; ++L; + Instruction *InstToMove = &*L++; DEBUG(dbgs() << "BBV: moving: " << *InstToMove << " to after " << *InsertionPt << "\n"); @@ -2924,11 +2934,11 @@ namespace { // Note: We cannot end the loop when we reach J because J could be moved // farther down the use chain by another instruction pairing. Also, J // could be before I if this is an inverted input. - for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { - if (trackUsesOfI(Users, WriteSet, I, L)) { + for (BasicBlock::iterator E = BB.end(); L != E; ++L) { + if (trackUsesOfI(Users, WriteSet, I, &*L)) { if (L->mayReadFromMemory()) { - LoadMoveSet[L].push_back(I); - LoadMoveSetPairs.insert(ValuePair(L, I)); + LoadMoveSet[&*L].push_back(I); + LoadMoveSetPairs.insert(ValuePair(&*L, I)); } } } @@ -2991,7 +3001,7 @@ namespace { DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); if (P == ChosenPairs.end()) { ++PI; continue; @@ -3116,12 +3126,9 @@ namespace { } else if (!isa<StoreInst>(K)) K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_fpmath - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_fpmath, + LLVMContext::MD_invariant_group}; combineMetadata(K, H, KnownIDs); K->intersectOptionalDataWith(H); @@ -3145,8 +3152,6 @@ namespace { if (!isa<StoreInst>(I)) { L->replaceAllUsesWith(K1); H->replaceAllUsesWith(K2); - AA->replaceWithNewValue(L, K1); - AA->replaceWithNewValue(H, K2); } // Instructions that may read from memory may be in the load move set. 
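The mask bookkeeping in the restructured block above splits a fused vector K = <I lanes, J lanes> back into its halves. A hedged sketch of the first-half extraction; the helper name is hypothetical, and Mask2 above does the same for the J-part with indices offset by numElemI.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include <vector>

// An identity mask over the first NumElemI lanes, shuffled against undef,
// recovers the I-part of the fused vector K.
static llvm::Value *extractFirstHalf(llvm::IRBuilder<> &B, llvm::Value *K,
                                     unsigned NumElemI) {
  llvm::LLVMContext &Ctx = B.getContext();
  std::vector<llvm::Constant *> Mask(NumElemI);
  for (unsigned v = 0; v < NumElemI; ++v)
    Mask[v] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), v);
  return B.CreateShuffleVector(K, llvm::UndefValue::get(K->getType()),
                               llvm::ConstantVector::get(Mask));
}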
@@ -3197,10 +3202,14 @@ namespace { char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 69ca268..2c0d317 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" @@ -58,10 +57,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -99,6 +101,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> +#include <functional> #include <map> #include <tuple> @@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), "trip count that is smaller than this " "value.")); +static cl::opt<bool> MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// ... 
static cl::opt<bool> EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, - cl::desc("Enable symblic stride memory access versioning")); + cl::desc("Enable symbolic stride memory access versioning")); static cl::opt<bool> EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, @@ -214,12 +222,27 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + +static cl::opt<unsigned> VectorizeSCEVCheckThreshold( + "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( + "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed with a " + "vectorize(enable) pragma")); + namespace { // Forward declarations. +class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; -class LoopVectorizeHints; +class LoopVectorizationRequirements; /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. @@ -245,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } +/// A helper function that returns GEP instruction and knows to skip a +/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination +/// pointee types of the 'bitcast' have the same size. +/// For example: +/// bitcast double** %var to i64* - can be skipped +/// bitcast double** %var to i8* - can not +static GetElementPtrInst *getGEPInstruction(Value *Ptr) { + + if (isa<GetElementPtrInst>(Ptr)) + return cast<GetElementPtrInst>(Ptr); + + if (isa<BitCastInst>(Ptr) && + isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { + Type *BitcastTy = Ptr->getType(); + Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); + if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) + return nullptr; + Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); + Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); + const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); + if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) + return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); + } + return nullptr; +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -261,25 +310,30 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { /// and reduction variables that were found to a given vectorization factor. 
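The size test at the heart of getGEPInstruction() above is simple; a minimal sketch, assuming a DataLayout is at hand:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// A bitcast is transparent for GEP recovery only when both pointee types
// occupy the same number of bits: double vs. i64 is 64 == 64 and may be
// skipped, while double vs. i8 is 64 != 8 and may not.
static bool pointeeSizesMatch(const llvm::DataLayout &DL, llvm::Type *A,
                              llvm::Type *B) {
  return DL.getTypeSizeInBits(A) == DL.getTypeSizeInBits(B);
}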
class InnerLoopVectorizer { public: - InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, + InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned VecWidth, unsigned UnrollFactor) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr), AddedSafetyChecks(false) {} + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *L) { + // MinimumBitWidths maps scalar integer values to the smallest bitwidth they + // can be validly truncated to. The cost model has assumed this truncation + // will happen when vectorizing. + void vectorize(LoopVectorizationLegality *L, + MapVector<Instruction*,uint64_t> MinimumBitWidths) { + MinBWs = MinimumBitWidths; Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); - // Register the new loop and update the analysis passes. - updateAnalysis(); } // Return true if any runtime check is added. @@ -302,14 +356,11 @@ protected: typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts> EdgeMaskCache; - /// \brief Add checks for strides that were assumed to be 1. - /// - /// Returns the last check instruction and the first check instruction in the - /// pair as (first, last). - std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); - /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); + /// Create a new induction variable inside L. + PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, + Value *Step, Instruction *DL); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); @@ -319,6 +370,9 @@ protected: /// See PR14725. void fixLCSSAPHIs(); + /// Shrinks vector element sizes based on information in "MinBWs". + void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. @@ -329,7 +383,7 @@ protected: /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); - + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -374,6 +428,23 @@ protected: /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); + /// Returns (and creates if needed) the original loop trip count. + Value *getOrCreateTripCount(Loop *NewLoop); + + /// Returns (and creates if needed) the trip count of the widened loop. 
+  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+  /// Emit a bypass check to see if the trip count would overflow, or we
+  /// wouldn't have enough iterations to execute one vector loop.
+  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if the vector trip count is nonzero.
+  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if all of the SCEV assumptions we've
+  /// had to make are correct.
+  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit bypass checks to check any memory assumptions we may have made.
+  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
   /// This is a helper class that holds the vectorizer state. It maps scalar
   /// instructions to vector instructions. When the code is 'unrolled' then
   /// a single scalar value is mapped to multiple vector parts. The parts
@@ -416,8 +487,10 @@ protected:
   /// The original loop.
   Loop *OrigLoop;
-  /// Scev analysis to use.
-  ScalarEvolution *SE;
+  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+  /// dynamic knowledge to simplify SCEV expressions and converts them to a
+  /// more usable form.
+  PredicatedScalarEvolution &PSE;
   /// Loop Info.
   LoopInfo *LI;
   /// Dominator Tree.
@@ -462,12 +535,21 @@ protected:
   PHINode *Induction;
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
-  /// Holds the extended (to the widest induction type) start index.
-  Value *ExtendedIdx;
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
+  /// Store instructions that should be predicated, as a pair
+  /// <StoreInst, Predicate>
+  SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
   EdgeMaskCache MaskCache;
-
+  /// Trip count of the original loop.
+  Value *TripCount;
+  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+  Value *VectorTripCount;
+
+  /// Map of scalar integer values to the smallest bitwidth they can be legally
+  /// represented as. The vector equivalents of these values should be truncated
+  /// to this type.
+  MapVector<Instruction*,uint64_t> MinBWs;
   LoopVectorizationLegality *Legal;
 
   // Record whether runtime check is added.
@@ -476,10 +558,11 @@
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
-  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, const TargetLibraryInfo *TLI,
+  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                    LoopInfo *LI, DominatorTree *DT,
+                    const TargetLibraryInfo *TLI,
                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -551,7 +634,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
     if (Kind != LLVMContext::MD_tbaa &&
         Kind != LLVMContext::MD_alias_scope &&
         Kind != LLVMContext::MD_noalias &&
-        Kind != LLVMContext::MD_fpmath)
+        Kind != LLVMContext::MD_fpmath &&
+        Kind != LLVMContext::MD_nontemporal)
       continue;
 
     To->setMetadata(Kind, M.second);
@@ -559,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
 }
 
 /// \brief Propagate known metadata from one instruction to a vector of others.
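The VectorTripCount comment above encodes a round-down: the widened loop runs N - (N urem VF*UF) iterations, and the scalar epilogue covers the remainder. A hedged sketch of that computation; the helper name is hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *roundDownToVFUF(llvm::IRBuilder<> &B, llvm::Value *N,
                                    unsigned VF, unsigned UF) {
  llvm::Value *Step = llvm::ConstantInt::get(N->getType(), VF * UF);
  llvm::Value *Rem = B.CreateURem(N, Step, "n.mod.vf");
  return B.CreateSub(N, Rem, "n.vec"); // Trip count of the vector loop.
}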
-static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl<Value *> &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast<Instruction>(V)) propagateMetadata(I, From); @@ -699,8 +784,9 @@ private: /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) - : SE(SE), TheLoop(L), DT(DT) {} + InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, + DominatorTree *DT) + : PSE(PSE), TheLoop(L), DT(DT) {} ~InterleavedAccessInfo() { SmallSet<InterleaveGroup *, 4> DelSet; @@ -730,7 +816,11 @@ public: } private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. + /// Simplifies SCEV expressions in the context of existing SCEV assumptions. + /// The interleaved access analysis can also add new predicates (for example + /// by versioning strides of pointers). + PredicatedScalarEvolution &PSE; Loop *TheLoop; DominatorTree *DT; @@ -778,6 +868,304 @@ private: const ValueToValueMap &Strides); }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. +class LoopVectorizeHints { + enum HintKind { + HK_WIDTH, + HK_UNROLL, + HK_FORCE + }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char * Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char * Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) { } + + bool validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + } + return false; + } + }; + + /// Vectorization width. + Hint Width; + /// Vectorization interleave factor. + Hint Interleave; + /// Vectorization forced + Hint Force; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "llvm.loop."; } + +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; + + LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; + + DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); + } + + /// Mark the loop L as already vectorized by setting the width to 1. 
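For orientation, the hints managed by this class live in loop metadata of a fixed shape. A hedged sketch of building a width hint by hand, mirroring the writeHintsToMetadata() logic further below; the function name is hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"

// Encodes the equivalent of "#pragma clang loop vectorize_width(W)":
// a self-referential loop ID whose payload is
// !{!"llvm.loop.vectorize.width", i32 W}.
static llvm::MDNode *makeWidthHint(llvm::LLVMContext &Ctx, unsigned W) {
  llvm::Metadata *HintOps[] = {
      llvm::MDString::get(Ctx, "llvm.loop.vectorize.width"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), W))};
  llvm::SmallVector<llvm::Metadata *, 2> Ops(1); // Slot 0: loop id itself.
  Ops.push_back(llvm::MDNode::get(Ctx, HintOps));
  llvm::MDNode *LoopID = llvm::MDNode::get(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID);
  return LoopID;
}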
+  void setAlreadyVectorized() {
+    Width.Value = Interleave.Value = 1;
+    Hint Hints[] = {Width, Interleave};
+    writeHintsToMetadata(Hints);
+  }
+
+  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+    if (getForce() == LoopVectorizeHints::FK_Disabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (getWidth() == 1 && getInterleave() == 1) {
+      // FIXME: Add a separate metadata to indicate when the loop has already
+      // been vectorized instead of setting width and count to 1.
+      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      // FIXME: Add interleave.disable metadata. This will allow
+      // vectorize.disable to be used without disabling the pass and errors
+      // to differentiate between disabled vectorization and a width of 1.
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
+          "loop not vectorized: vectorization and interleaving are explicitly "
+          "disabled, or vectorize width and interleave count are both set to "
+          "1");
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Dumps all the hint information.
+  std::string emitRemark() const {
+    VectorizationReport R;
+    if (Force.Value == LoopVectorizeHints::FK_Disabled)
+      R << "vectorization is explicitly disabled";
+    else {
+      R << "use -Rpass-analysis=loop-vectorize for more info";
+      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+        R << " (Force=true";
+        if (Width.Value != 0)
+          R << ", Vector Width=" << Width.Value;
+        if (Interleave.Value != 0)
+          R << ", Interleave Count=" << Interleave.Value;
+        R << ")";
+      }
+    }
+
+    return R.str();
+  }
+
+  unsigned getWidth() const { return Width.Value; }
+  unsigned getInterleave() const { return Interleave.Value; }
+  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+  const char *vectorizeAnalysisPassName() const {
+    // If hints are provided that don't disable vectorization, use the
+    // AlwaysPrint pass name to force the frontend to print the diagnostic.
+    if (getWidth() == 1)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Disabled)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+      return LV_NAME;
+    return DiagnosticInfo::AlwaysPrint;
+  }
+
+  bool allowReordering() const {
+    // When hints that enable vectorization are provided, we allow the
+    // vectorizer to change the order of operations given by the scalar loop.
+    // This is not enabled by default because it can be unsafe or inefficient.
+    // For example, reordering floating-point operations will change the way
+    // round-off error accumulates in the loop.
+    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+  }
+
+private:
+  /// Find hints specified in the loop metadata and update local values.
+  void getHintsFromMetadata() {
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (!LoopID)
+      return;
+
+    // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector<Metadata *, 4> Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } + } + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Width, &Interleave, &Force}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } + } + } + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { + MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. 
+ const Loop *TheLoop; +}; + +static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, + const LoopVectorizeHints &Hints, + const LoopAccessReport &Message) { + const char *Name = Hints.vectorizeAnalysisPassName(); + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); +} + +static void emitMissedWarning(Function *F, Loop *L, + const LoopVectorizeHints &LH) { + emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), + LH.emitRemark()); + + if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { + if (LH.getWidth() != 1) + emitLoopVectorizeWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop vectorization"); + else if (LH.getInterleave() != 1) + emitLoopInterleaveWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop interleaving"); + } +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -793,87 +1181,17 @@ private: /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, const TargetTransformInfo *TTI, - LoopAccessAnalysis *LAA) - : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} - - /// This enum represents the kinds of inductions that we support. - enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). - }; - - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) - : StartValue(Start), IK(K), StepValue(Step) { - assert(IK != IK_NoInduction && "Not an induction"); - assert(StartValue && "StartValue is null"); - assert(StepValue && !StepValue->isZero() && "StepValue is zero"); - assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && - "StartValue is not a pointer for pointer induction"); - assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && - "StartValue is not an integer for integer induction"); - assert(StepValue->getType()->isIntegerTy() && - "StepValue is not an integer"); - } - InductionInfo() - : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} - - /// Get the consecutive direction. Returns: - /// 0 - unknown or non-consecutive. - /// 1 - consecutive and increasing. - /// -1 - consecutive and decreasing. - int getConsecutiveDirection() const { - if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) - return StepValue->getSExtValue(); - return 0; - } - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. 
- Value *transform(IRBuilder<> &B, Value *Index) const { - switch (IK) { - case IK_IntInduction: - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (StepValue->isMinusOne()) - return B.CreateSub(StartValue, Index); - if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateAdd(StartValue, Index); - - case IK_PtrInduction: - assert(Index->getType() == StepValue->getType() && - "Index type does not match StepValue type"); - if (StepValue->isMinusOne()) - Index = B.CreateNeg(Index); - else if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateGEP(nullptr, StartValue, Index); - - case IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); - } - - /// Start value. - TrackingVH<Value> StartValue; - /// Induction kind. - InductionKind IK; - /// Step value. - ConstantInt *StepValue; - }; + LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA, + LoopVectorizationRequirements *R, + const LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -881,7 +1199,7 @@ public: /// InductionList saves induction variables and maps them to the /// induction descriptor. - typedef MapVector<PHINode*, InductionInfo> InductionList; + typedef MapVector<PHINode*, InductionDescriptor> InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -903,6 +1221,9 @@ public: /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -954,12 +1275,12 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -999,10 +1320,6 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - /// Returns the induction kind of Phi and record the step. This function may - /// return NoInduction if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); - /// \brief Collect memory access with loop invariant strides. 
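The arithmetic in the removed transform() above is worth keeping in view, since InductionDescriptor now carries it: the induction's value at iteration Index is Start + Index * Step, with Step of +/-1 folded to a plain add or sub. A hedged re-statement of the integer case; the helper name is hypothetical, and this is not the InductionDescriptor replacement itself.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *intInductionAt(llvm::IRBuilder<> &B, llvm::Value *Start,
                                   llvm::Value *Index,
                                   llvm::ConstantInt *Step) {
  if (Step->isMinusOne())
    return B.CreateSub(Start, Index); // Start - Index, for Step == -1.
  if (!Step->isOne())
    Index = B.CreateMul(Index, Step); // General case: Index * Step.
  return B.CreateAdd(Start, Index);   // Start + Index * Step.
}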
/// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1013,16 +1330,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } unsigned NumPredStores; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; /// Parent function @@ -1065,12 +1386,18 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + /// Vectorization requirements that will go through late-evaluation. + LoopVectorizationRequirements *Requirements; + + /// Used to emit an analysis of any legality issues. + const LoopVectorizeHints *Hints; + ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic - SmallPtrSet<const Instruction*, 8> MaskedOp; + SmallPtrSet<const Instruction *, 8> MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1082,15 +1409,14 @@ private: /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), - TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - } + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints) + : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1103,10 +1429,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1133,8 +1459,13 @@ public: unsigned NumInstructions; }; - /// \return information about the register usage of the loop. 
- RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); + + /// Collect values we want to ignore in the cost model. + void collectValuesToIgnore(); private: /// Returns the expected execution cost. The unit of the cost does @@ -1155,17 +1486,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } - /// Values used only by @llvm.assume calls. - SmallPtrSet<const Value *, 32> EphValues; +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector<Instruction*,uint64_t> MinBWs; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// Predicated scalar evolution analysis. + PredicatedScalarEvolution &PSE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. @@ -1174,247 +1508,78 @@ private: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis. + DemandedBits *DB; + /// Assumption cache. + AssumptionCache *AC; const Function *TheFunction; - // Loop Vectorize Hint. + /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// Values to ignore in the cost model. + SmallPtrSet<const Value *, 16> ValuesToIgnore; + /// Values to ignore in the cost model when VF > 1. + SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { - HK_WIDTH, - HK_UNROLL, - HK_FORCE - }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char * Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char * Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) { } - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - /// Vectorization interleave factor. - Hint Interleave; - /// Vectorization forced - Hint Force; - - /// Return the loop metadata prefix. 
- static StringRef Prefix() { return "llvm.loop."; } - +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by legalize and costmodel. Once +/// vectorization has been determined to be possible and profitable the +/// requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tells the user what need to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements { public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - TheLoop(L) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - Width.Value = Interleave.Value = 1; - Hint Hints[] = {Width, Interleave}; - writeHintsToMetadata(Hints); - } - - /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; - if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; - else { - R << "use -Rpass-analysis=loop-vectorize for more info"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; - if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; - if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; - R << ")"; - } - } - - return R.str(); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector<Metadata *, 4> Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. 
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast<MDString>(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast<MDString>(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); + LoopVectorizationRequirements() + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + + void addUnsafeAlgebraInst(Instruction *I) { + // First unsafe algebra instruction. + if (!UnsafeAlgebraInst) + UnsafeAlgebraInst = I; + } + + void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *Name = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + emitOptimizationRemarkAnalysisFPCommute( + F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); + Failed = true; } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) return; - unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + emitOptimizationRemarkAnalysisAliasing( + F->getContext(), Name, *F, L->getStartLoc(), + VectorizationReport() + << "cannot prove it is safe to reorder memory operations"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; } - } - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); + return Failed; } - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { - MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.size() == 0) - return; - - // Reserve the first element to LoopID (see below). - SmallVector<Metadata *, 4> MDs(1); - // If the loop already has metadata, then ignore the existing operands. 
- MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; +private: + unsigned NumRuntimePointerChecks; + Instruction *UnsafeAlgebraInst; }; -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), LH.emitRemark()); - - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); - } -} - static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { if (L.empty()) return V.push_back(&L); @@ -1441,6 +1606,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1450,16 +1616,17 @@ struct LoopVectorize : public FunctionPass { BlockFrequency ColdEntryFreq; bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - BFI = &getAnalysis<BlockFrequencyInfo>(); + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); LAA = &getAnalysis<LoopAccessAnalysis>(); + DB = &getAnalysis<DemandedBits>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1562,26 +1729,8 @@ struct LoopVectorize : public FunctionPass { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. 
- if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized: vector width and interleave count are " - "explicitly set to 1"); + if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -1595,15 +1744,19 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "vectorization is not beneficial and is not explicitly forced"); + emitAnalysisDiag(F, L, Hints, VectorizationReport() + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } + PredicatedScalarEvolution PSE(*SE); + // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); + LoopVectorizationRequirements Requirements; + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1611,16 +1764,18 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + &Hints); + CM.collectValuesToIgnore(); // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && - F->hasFnAttribute(Attribute::OptimizeForSize); + F->optForSize(); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectoriez. + // always have a canonical loop here because we think we *can* vectorize. // FIXME: This is hidden behind a flag due to pervasive problems with // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { @@ -1630,16 +1785,17 @@ struct LoopVectorize : public FunctionPass { OptForSize = true; } - // Check the function attributes to see if implicit floats are allowed.a + // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer // vector instructions? 
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized due to NoImplicitFloat attribute"); + emitAnalysisDiag( + F, L, Hints, + VectorizationReport() + << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints); return false; } @@ -1651,32 +1807,86 @@ struct LoopVectorize : public FunctionPass { // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + // Get user interleave count. + unsigned UserIC = Hints.getInterleave(); + + // Identify the diagnostic messages that should be produced. + std::string VecDiagMsg, IntDiagMsg; + bool VectorizeLoop = true, InterleaveLoop = true; + + if (Requirements.doesNotMeet(F, L, Hints)) { + DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + emitMissedWarning(F, L, Hints); + return false; + } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + VecDiagMsg = + "the cost-model indicates that vectorization is not beneficial"; + VectorizeLoop = false; + } - if (IC == 1) { - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "not beneficial to vectorize and user disabled interleaving"); - return false; - } - DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + if (IC == 1 && UserIC <= 1) { + // Tell the user interleaving is not beneficial. + DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + IntDiagMsg = + "the cost-model indicates that interleaving is not beneficial"; + InterleaveLoop = false; + if (UserIC == 1) + IntDiagMsg += + " and is explicitly disabled or interleave count is set to 1"; + } else if (IC > 1 && UserIC == 1) { + // Tell the user interleaving is beneficial, but it explicitly disabled. + DEBUG(dbgs() + << "LV: Interleaving is beneficial but is explicitly disabled."); + IntDiagMsg = "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"; + InterleaveLoop = false; + } - // Report the unrolling decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("interleaved by " + Twine(IC) + - " (vectorization not beneficial)")); + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + + // Emit diagnostic messages, if any. + const char *VAPassName = Hints.vectorizeAnalysisPassName(); + if (!VectorizeLoop && !InterleaveLoop) { + // Do not vectorize or interleaving the loop. 
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + return false; + } else if (!VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + } else if (VectorizeLoop && !InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + } else if (VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + } + + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop then + // interleave it. + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); + Unroller.vectorize(&LVL, CM.MinBWs); - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); - Unroller.vectorize(&LVL); + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), + Twine("interleaved loop (interleaved count: ") + + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1686,7 +1896,7 @@ struct LoopVectorize : public FunctionPass { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), Twine("vectorized loop (vectorization width: ") + Twine(VF.Width) + ", interleaved count: " + Twine(IC) + ")"); @@ -1703,16 +1913,19 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<BlockFrequencyInfo>(); + AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<DemandedBits>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; @@ -1773,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. 
if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1780,11 +1994,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -1802,10 +2016,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1815,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. @@ -1833,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) Last = @@ -2177,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2191,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. 
For example A[I+1]; @@ -2209,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2237,14 +2453,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { - // If we store to reverse consecutive memory locations then we need + // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the @@ -2298,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; @@ -2318,7 +2535,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Try using previously calculated values. Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); - // If the src is an instruction that appeared earlier in the basic block + // If the src is an instruction that appeared earlier in the basic block, // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); @@ -2343,19 +2560,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -2367,12 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. 
- Builder.SetInsertPoint(InsertPt); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2396,85 +2602,223 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), + Cmp)); } } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast<Instruction>(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, + Value *End, Value *Step, + Instruction *DL) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // As we're just creating this loop, it's possible no latch exists + // yet. If so, use the header as this will be a single block loop. + if (!Latch) + Latch = Header; + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + + Builder.SetInsertPoint(Latch->getTerminator()); + + // Create i+1 and fill the PHINode. + Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Start, L->getLoopPreheader()); + Induction->addIncoming(Next, Latch); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(Next, End); + Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + + // Now we have two terminators. Remove the old one from the block. + Latch->getTerminator()->eraseFromParent(); + + return Induction; } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - IRBuilder<> ChkBuilder(Loc); - - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. 
- LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); - return std::make_pair(FirstInst, TheCheck); + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, + "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + return TripCount; } +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; +} + +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. 
+ Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check that the SCEV assumptions that we made. + // We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + + if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) + if (C->isZero()) + return; + + // Create a new block containing the stride check. + BB->setName("vector.scevcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, SCEVCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; + + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- Back-edge taken count overflow check. + [ ] <-- loop iteration number check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks). 
| / | | / v || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. @@ -2498,65 +2842,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - VectorPH->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - VectorPH->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", VectorPH->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(VectorPH->getTerminator()); - Value *StartIdx = ExtendedIdx = - OldInduction - ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), - IdxTy) - : ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - // Split the single block loop into the two loop structure described above. 
BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2580,118 +2875,36 @@ void InnerLoopVectorizer::createEmptyLoop() { } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); - VectorPH = NewVectorPH; - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. 
- Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } + emitVectorLoopEnteredCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, ScalarPH); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } + emitMemRuntimeChecks(Lp, ScalarPH); + + // Generate the induction variable. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2701,152 +2914,60 @@ void InnerLoopVectorizer::createEmptyLoop() { // If we come from a bypass edge then we need to start from the original // start value. - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; + // This variable saves the new starting index for the scalar loop. It is used + // to test if there are any tail iterations left once the vector loop has + // completed. LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. 
- PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; + InductionDescriptor II = I->second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, + "bc.resume.val", ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; + Value *EndValue; if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); + // We know what the end value is. + EndValue = CountRoundDown; + } else { + IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, + II.getStepValue()->getType(), + "cast.crd"); + EndValue = II.transform(B, CRD); EndValue->setName("ind.end"); - break; } - case LoopVectorizationLegality::IK_PtrInduction: { - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StepValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ptr.ind.end"); - break; - } - }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); + BCResumeVal->addIncoming(EndValue, MiddleBlock); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. 
- if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); - } - - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BranchInst::Create(ExitBlock, ScalarPH, CmpN)); - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); - // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; @@ -2899,7 +3020,7 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3021,6 +3142,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? 
T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. + // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast<Instruction>(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast<ZExtInst>(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast<BinaryOperator>(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast<CastInst>(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa<LoadInst>(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast<Instruction>(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast<Instruction>(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. 
+ for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast<ZExtInst>(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3051,6 +3283,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did @@ -3066,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3141,21 +3378,33 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts; + VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -3208,13 +3457,22 @@ void InnerLoopVectorizer::vectorizeLoop() { // The result is in the first element of the vector. 
ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) + : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -3252,6 +3510,20 @@ void InnerLoopVectorizer::vectorizeLoop() { fixLCSSAPHIs(); + // Make sure DomTree is updated. + updateAnalysis(); + + // Predicate any stores. + for (auto KV : PredicatedStores) { + BasicBlock::iterator I(KV.first); + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT); + I->moveBefore(T); + I->getParent()->setName("pred.store.if"); + BB->setName("pred.store.continue"); + } + DEBUG(DT->verifyDomTree()); // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3326,18 +3598,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast<PHINode>(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3385,53 +3657,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars()->lookup(P); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: + switch (II.getKind()) { + case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. 
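Worth pausing on the "Predicate any stores" hunk above: instead of splitting blocks while vectorizing (the code deleted later in InnerLoopUnroller::scalarizeInstruction), scalarized conditional stores are queued in PredicatedStores and wrapped in control flow in one pass at the end, via SplitBlock and SplitBlockAndInsertIfThen. The emitted shape corresponds to the following scalar C++; the function is illustrative, not from the patch, and the comments mirror the block names it assigns:

    // One predicated lane: the store executes only when its mask bit is set.
    void predicatedStoreLane(int *A, const bool *Mask, const int *Val, int i) {
      if (Mask[i]) {      // "pred.store.if"
        A[i] = Val[i];
      }
                          // "pred.store.continue": control rejoins here
    }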
- Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = II.transform(Builder, NormalizedIdx); - Broadcasted->setName("offset.idx"); + case InductionDescriptor::IK_IntInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + Value *V = Induction; + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V); + V->setName("offset.idx"); } - Broadcasted = getBroadcastInstrs(Broadcasted); + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); return; } - case LoopVectorizationLegality::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = - Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); - NormalizedIdx = - Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); + Value *PtrInd = Induction; + PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = part; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); Entry[part] = SclrGep; @@ -3441,8 +3704,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3458,7 +3721,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3466,7 +3730,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. 
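The IK_IntInduction path above now derives every integer induction from the single canonical IV rather than a separately maintained normalized index: the scalar value is rebuilt as StartValue + Induction * Step ("offset.idx"), broadcast, and made consecutive with per-lane step offsets. A standalone sketch of that arithmetic for VF = 4; the getStepVector semantics (lane L of part P gets offset (VF * P + L) * Step) are assumed from the surrounding code, not quoted from it:

    #include <array>
    #include <cassert>

    std::array<long, 4> widenInduction(long Induction, long Start, long Step,
                                       unsigned Part) {
      long Offset = Start + Induction * Step;   // "offset.idx"
      std::array<long, 4> Lanes;
      for (unsigned L = 0; L < 4; ++L)
        Lanes[L] = Offset + long(4 * Part + L) * Step;  // VF = 4
      return Lanes;
    }

    int main() {
      auto V = widenInduction(/*Induction=*/8, /*Start=*/100, /*Step=*/2,
                              /*Part=*/0);
      assert(V[0] == 116 && V[3] == 122);  // lanes {116, 118, 120, 122}
      return 0;
    }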
- widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3504,16 +3768,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); - setDebugLocFromInst(Builder, it); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3522,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3533,7 +3798,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3542,25 +3807,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast<CmpInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast<FCmpInst>(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3575,7 +3842,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast<CastInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. 
sext/zext may wrap, @@ -3585,13 +3852,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - LoopVectorizationLegality::InductionInfo II = + InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3601,7 +3868,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3609,7 +3876,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); @@ -3625,7 +3892,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3636,7 +3903,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3677,13 +3944,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3691,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -3701,19 +3968,12 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } + // We don't predicate stores by this point, so the vector body should be a + // single loop. 
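One detail in the Instruction::Select case above deserves a gloss: a loop-invariant condition lets the vectorizer emit a single select over whole vector operands, while a condition defined inside the loop needs a per-lane vector select. In scalar C++ terms (illustrative types, not the patch's):

    #include <array>
    using Vec4 = std::array<int, 4>;

    // Invariant condition: one scalar decision selects whole vectors.
    Vec4 widenSelect(bool ScalarCond, const Vec4 &A, const Vec4 &B) {
      return ScalarCond ? A : B;
    }

    // Varying condition: each lane decides independently (vector select).
    Vec4 widenSelect(const std::array<bool, 4> &Cond, const Vec4 &A,
                     const Vec4 &B) {
      Vec4 R;
      for (int i = 0; i < 4; ++i)
        R[i] = Cond[i] ? A[i] : B[i];
      return R;
    }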
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -3850,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3879,10 +4139,28 @@ bool LoopVectorizationLegality::canVectorize() { : "") << "!\n"); + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + // Analyze interleaved memory accesses. - if (EnableInterleavedMemAccesses) + if (UseInterleaved) InterleaveInfo.analyzeInterleaving(Strides); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + emitAnalysis(VectorizationReport() + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + return false; + } + // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. @@ -3929,7 +4207,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. @@ -3953,7 +4230,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3965,9 +4242,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -3975,19 +4252,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. 
if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); - ConstantInt *StepValue = nullptr; - // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi, StepValue); - - if (IK_NoInduction != IK) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { + Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -3995,21 +4268,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction && StepValue->isOne()) { + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getStepValue()->isOne() && + isa<Constant>(ID.getStartValue()) && + cast<Constant>(ID.getStartValue())->isNullValue()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other - // than it is expedient). + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4018,13 +4294,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, - Reductions[Phi])) { - AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { + if (RedDes.hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4039,8 +4318,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4049,8 +4328,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. 
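The rewritten induction handling above replaces the pass-local isInductionVariable/InductionInfo machinery with the shared InductionDescriptor, and tightens the canonical-IV choice: only an integer induction proven to start at zero and step by one may become Induction. A standalone sketch of what the descriptor encodes (the field and method names here are illustrative, not LLVM's):

    #include <cassert>

    // An integer induction is value(i) = Start + i * Step; the canonical
    // induction variable is the special case Start == 0, Step == 1.
    struct InductionSketch {
      long Start, Step;
      long transform(long i) const { return Start + i * Step; }
      bool isCanonical() const { return Start == 0 && Step == 1; }
    };

    int main() {
      InductionSketch Canonical{0, 1}, Strided{8, 4};
      assert(Canonical.isCanonical() && Canonical.transform(5) == 5);
      assert(!Strided.isCanonical() && Strided.transform(3) == 20);
      return 0;
    }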
loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4061,7 +4341,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4085,8 +4365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4104,6 +4384,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (Induction && WidestIndTy != Induction->getType()) + Induction = nullptr; + return true; } @@ -4116,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4142,7 +4428,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4179,30 +4465,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - if (LAI->getNumRuntimePointerChecks() > - VectorizerParams::RuntimeMemoryCheckThreshold) { - emitAnalysis(VectorizationReport() - << LAI->getNumRuntimePointerChecks() << " exceeds limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - return false; - } - return true; -} + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { - if (!isInductionPHI(Phi, SE, StepValue)) - return IK_NoInduction; - - Type *PhiTy = Phi->getType(); - // Found an Integer induction variable. - if (PhiTy->isIntegerTy()) - return IK_IntInduction; - // Found an Pointer induction variable. 
- return IK_PtrInduction; + return true; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4256,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4315,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast<StoreInst>(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4324,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4411,12 +4677,12 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; - int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + int DistanceToA = DistToA->getAPInt().getSExtValue(); // Skip if the distance is not multiple of size as they are not in the // same group. @@ -4454,8 +4720,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis(VectorizationReport() << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + "compiling with -Os/-Oz"); + DEBUG(dbgs() << + "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return Factor; } @@ -4467,10 +4734,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { } // Find the trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4478,7 +4747,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 
WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4491,6 +4762,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector<unsigned, 8> VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4499,7 +4790,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis (VectorizationReport() << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -4515,8 +4806,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { "cannot optimize for size and vectorize at the " "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } } @@ -4566,7 +4857,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair<unsigned, unsigned> +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4579,18 +4872,22 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) continue; - // Examine PHI nodes that are reduction variables. - if (PHINode *PN = dyn_cast<PHINode>(it)) - if (!Legal->getReductionVars()->count(PN)) + // Examine PHI nodes that are reduction variables. Update the type to + // account for the recurrence type. + if (PHINode *PN = dyn_cast<PHINode>(it)) { + if (!Legal->isReductionVariable(PN)) continue; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + T = RdxDesc.getRecurrenceType(); + } // Examine the stored values. 
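The MaximizeBandwidth block above is why the cost model now tracks the smallest type as well as the widest: when enabled, it considers VFs beyond WidestRegister / WidestType, doubling up to WidestRegister / SmallestType, and keeps the largest candidate whose estimated register pressure still fits the target's register file. A sketch of that selection, with a callback standing in for calculateRegisterUsage (an assumption for illustration):

    #include <vector>

    unsigned pickVF(unsigned WidestRegister, unsigned SmallestType,
                    unsigned WidestType, unsigned TargetNumRegisters,
                    unsigned (*maxLocalUsers)(unsigned VF)) {
      unsigned VF = WidestRegister / WidestType;
      // Collect all viable factors, as the hunk above does.
      std::vector<unsigned> VFs;
      for (unsigned VS = VF; VS <= WidestRegister / SmallestType; VS *= 2)
        VFs.push_back(VS);
      // Walk from the widest candidate down, keeping the first that fits.
      for (int i = int(VFs.size()) - 1; i >= 0; --i)
        if (maxLocalUsers(VFs[i]) <= TargetNumRegisters)
          return VFs[i];
      return VF;
    }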
if (StoreInst *ST = dyn_cast<StoreInst>(it)) @@ -4599,15 +4896,17 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4628,11 +4927,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // Use the user preference, unless 'auto' is selected. - int UserUF = Hints->getInterleave(); - if (UserUF != 0) - return UserUF; - // When we optimize for size, we don't interleave. if (OptForSize) return 1; @@ -4642,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; // Do not interleave loops with a relatively small trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; @@ -4658,7 +4952,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4756,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4768,8 +5061,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector<unsigned, 8> &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4790,8 +5084,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. 
The values // of the map are the index of the 'last seen' usage of the @@ -4810,15 +5104,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() { unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + RU.NumInstructions += (*bb)->size(); + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast<Instruction>(U); // Ignore non-instruction values such as arguments, constants, etc. @@ -4847,42 +5139,85 @@ LoopVectorizationCostModel::calculateRegisterUsage() { TransposeEnds[it->second].push_back(it->first); SmallSet<Instruction*, 8> OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector<RegisterUsage, 8> RUs(VFs.size()); + SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + // A lambda that gets the register usage for the given type and VF. + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + }; + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Ignore ephemeral values. - if (EphValues.count(I)) - continue; - // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + // Skip ignored values. + if (ValuesToIgnore.count(I)) + continue; - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + // For each VF find the maximum usage of registers. + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + if (VFs[j] == 1) { + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; + } + + // Count the number of live intervals. + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.count(Inst)) + continue; + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + } + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. 
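The GetRegUsage lambda above is the core of the new per-VF estimate: a live value of TypeSize bits at factor VF occupies VF * TypeSize bits, i.e. that many WidestRegister-bit registers, never fewer than one. The same computation as a free function, with a few spot checks:

    #include <algorithm>
    #include <cassert>

    unsigned getRegUsage(unsigned TypeSize, unsigned VF,
                         unsigned WidestRegister) {
      return std::max(1u, VF * TypeSize / WidestRegister);
    }

    int main() {
      assert(getRegUsage(32, 4, 128) == 1); // <4 x i32> fits one 128-bit reg
      assert(getRegUsage(32, 8, 128) == 2); // <8 x i32> needs two
      assert(getRegUsage(8, 4, 128) == 1);  // rounds up to one register
      return 0;
    }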
OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } + + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { @@ -4900,11 +5235,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa<DbgInfoIntrinsic>(it)) continue; - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(it, VF); + unsigned C = getInstructionCost(&*it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -4969,7 +5304,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, if (!C) return true; - const APInt &APStepVal = C->getValue()->getValue(); + const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) @@ -4981,9 +5316,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) - return true; - return false; + return Legal->hasStride(I->getOperand(0)) || + Legal->hasStride(I->getOperand(1)); } unsigned @@ -4994,7 +5328,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VF = 1; Type *RetTy = I->getType(); + if (VF > 1 && MinBWs.count(I)) + RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); + auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -5076,6 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); + Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); + auto It = MinBWs.find(Op0AsInstruction); + if (VF > 1 && It != MinBWs.end()) + ValTy = IntegerType::get(ValTy->getContext(), It->second); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5199,8 +5540,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); - - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + Type *SrcScalarTy = I->getOperand(0)->getType(); + Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + if (VF > 1 && MinBWs.count(I)) { + // This cast is going to be shrunk. 
This may remove the cast or it might + // turn it into slightly different cast. For example, if MinBW == 16, + // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". + // + // Calculate the modified src and dest types. + Type *MinVecTy = VectorTy; + if (I->getOpcode() == Instruction::Trunc) { + SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } else if (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt) { + SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } + } + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { @@ -5240,15 +5601,18 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5269,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } +void LoopVectorizationCostModel::collectValuesToIgnore() { + // Ignore ephemeral values. + CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + + // Ignore type-promoting instructions we identified during reduction + // detection. + for (auto &Reduction : *Legal->getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + + // Ignore induction phis that are only used in either GetElementPtr or ICmp + // instruction to exit loop. Induction variables usually have large types and + // can have big impact when estimating register usage. + // This is for when VF > 1. + for (auto &Induction : *Legal->getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + + // Check that the PHI is only used by the induction increment (UpdateV) or + // by GEPs. Then check that UpdateV is only used by a compare instruction or + // the loop header PHI. + // FIXME: Need precise def-use analysis to determine if this instruction + // variable will be vectorized. + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](const User *U) -> bool { + return U == UpdateV || isa<GetElementPtrInst>(U); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](const User *U) -> bool { + return U == PN || isa<ICmpInst>(U); + })) { + VecValuesToIgnore.insert(PN); + VecValuesToIgnore.insert(UpdateV); + } + } + + // Ignore instructions that will not be vectorized. + // This is for when VF > 1. 
+ for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; + ++bb) { + for (auto &Inst : **bb) { + switch (Inst.getOpcode()) { + case Instruction::GetElementPtr: { + // Ignore GEP if its last operand is an induction variable so that it is + // a consecutive load/store and won't be vectorized as scatter/gather + // pattern. + + GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + bool GepToIgnore = true; + + // Check that all of the gep indices are uniform except for the + // induction operand. + for (unsigned i = 0; i != NumOperands; ++i) { + if (i != InductionOperand && + !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + TheLoop)) { + GepToIgnore = false; + break; + } + } + + if (GepToIgnore) + VecValuesToIgnore.insert(&Inst); + break; + } + } + } + } +} void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { @@ -5316,19 +5753,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -5343,11 +5773,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -5367,16 +5792,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (!IsVoidRetTy) VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + // End if-block. 
+ if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), + Cmp)); } } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b180c97..27d3337 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" @@ -61,7 +62,7 @@ static cl::opt<int> "number ")); static cl::opt<bool> -ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, +ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); static cl::opt<bool> ShouldStartVectorizeHorAtStore( @@ -73,6 +74,14 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. +static cl::opt<int> +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, + cl::desc("Limit the size of the SLP scheduling region per block")); + namespace { // FIXME: Set this via cl::opt to allow overriding. @@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10; // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) { /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. static bool canCombineAsAltInst(unsigned Op) { - if (Op == Instruction::FAdd || Op == Instruction::FSub || - Op == Instruction::Sub || Op == Instruction::Add) - return true; - return false; + return Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add; } -/// \returns ShuffleVector instruction if intructions in \p VL have +/// \returns ShuffleVector instruction if instructions in \p VL have /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) static unsigned isAltInst(ArrayRef<Value *> VL) { @@ -211,7 +222,7 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { } } } - + /// \returns \p I after propagating metadata from \p VL. static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { Instruction *I0 = cast<Instruction>(VL[0]); @@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { case LLVMContext::MD_fpmath: MD = MDNode::getMostGenericFPMath(MD, IMD); break; + case LLVMContext::MD_nontemporal: + MD = MDNode::intersect(MD, IMD); + break; } } I->setMetadata(Kind, MD); @@ -393,7 +407,7 @@ public: /// \brief Perform LICM and CSE on the newly generated gather sequences. 
void optimizeGatherSequence(); - /// \returns true if it is benefitial to reverse the vector order. + /// \returns true if it is beneficial to reverse the vector order. bool shouldReorder() const { return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } @@ -441,7 +455,7 @@ private: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); - /// \returns whether the VectorizableTree is fully vectoriable and will + /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); @@ -492,7 +506,7 @@ private: } return Last; } - + /// -- Vectorization State -- /// Holds all of the tree entries. std::vector<TreeEntry> VectorizableTree; @@ -506,7 +520,7 @@ private: /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser (Value *S, llvm::User *U, int L) : - Scalar(S), User(U), Lane(L){}; + Scalar(S), User(U), Lane(L){} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. @@ -717,6 +731,8 @@ private: : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), + ScheduleRegionSize(0), + ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} @@ -728,6 +744,13 @@ private: FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + // Reduce the maximum schedule region size by the size of the + // previous scheduling run. + ScheduleRegionSizeLimit -= ScheduleRegionSize; + if (ScheduleRegionSizeLimit < MinScheduleRegionSize) + ScheduleRegionSizeLimit = MinScheduleRegionSize; + ScheduleRegionSize = 0; + // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; @@ -804,7 +827,8 @@ private: void cancelScheduling(ArrayRef<Value *> VL); /// Extends the scheduling region so that V is inside the region. - void extendSchedulingRegion(Value *V); + /// \returns true if the region size is within the limit. + bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -858,6 +882,12 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion; + /// The current size of the scheduling region. + int ScheduleRegionSize; + + /// The maximum size allowed for the scheduling region. + int ScheduleRegionSizeLimit; + /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; @@ -1059,7 +1089,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { newTreeEntry(VL, false); return; } - + // Check that every instructions appears once in this bundle. 
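The new scheduling-region budget above bounds SLP compile time on very large blocks: each scheduling run consumes part of a per-block budget, and once it is spent, later regions are capped at a small minimum size rather than disabled outright. The update arithmetic, extracted from BlockScheduling::resetSchedule() with the constants the patch defines:

    // Shrink the remaining limit by what the previous region used, but
    // never below MinScheduleRegionSize (16 in the patch).
    int nextRegionLimit(int RemainingLimit, int PreviousRegionSize,
                        int MinRegionSize = 16) {
      RemainingLimit -= PreviousRegionSize;
      return RemainingLimit < MinRegionSize ? MinRegionSize : RemainingLimit;
    }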
for (unsigned i = 0, e = VL.size(); i < e; ++i) for (unsigned j = i+1; j < e; ++j) @@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - BS.cancelScheduling(VL); + assert((!BS.getScheduleData(VL[0]) || + !BS.getScheduleData(VL[0])->isPartOfBundle()) && + "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } @@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { return; } case Instruction::Load: { + // Check that a vectorized load would load the same memory as a scalar + // load. + // For example we don't want vectorize loads that are smaller than 8 bit. + // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats + // loading/storing it as an i8 struct. If we vectorize loads/stores from + // such a struct we read/write packed bits disagreeing with the + // unvectorized version. + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *ScalarTy = VL[0]->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != + DL.getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL); + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + return; + } // Check if the loads are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast<LoadInst>(VL[i]); @@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - const DataLayout &DL = F->getParent()->getDataLayout(); + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; @@ -1662,7 +1711,7 @@ int BoUpSLP::getSpillCost() { int Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; - Instruction *PrevInst = nullptr; + Instruction *PrevInst = nullptr; for (unsigned N = 0; N < VectorizableTree.size(); ++N) { Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); @@ -1687,10 +1736,11 @@ int BoUpSLP::getSpillCost() { for (auto &J : PrevInst->operands()) { if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) LiveValues.insert(cast<Instruction>(&*J)); - } + } // Now find the sequence of instructions between PrevInst and Inst. - BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); + BasicBlock::reverse_iterator InstIt(Inst->getIterator()), + PrevInstIt(PrevInst->getIterator()); --PrevInstIt; while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { @@ -1730,30 +1780,29 @@ int BoUpSLP::getTreeCost() { unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { - int C = getEntryCost(&VectorizableTree[i]); + for (TreeEntry &TE : VectorizableTree) { + int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *VectorizableTree[i].Scalars[0] << " .\n"); + << TE.Scalars[0] << " .\n"); Cost += C; } SmallSet<Value *, 16> ExtractCostCalculated; int ExtractCost = 0; - for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); - I != E; ++I) { + for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
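The new guard in the Instruction::Load case above compares a type's in-register size with its in-memory allocation size; vectorizing loads of a type where the two differ (the i2 example in its comment) would read or write padding bits the scalar code never touches. The test itself reduces to one comparison, with both sizes supplied by DataLayout in the real code:

    // A scalar type is safe to re-slice into vector loads/stores only when
    // its bit size equals its alloc size; LLVM's i2 has size 2 but alloc
    // size 8, so it fails this test and the bundle is gathered instead.
    bool isPackedLoadSafe(unsigned TypeSizeInBits,
                          unsigned TypeAllocSizeInBits) {
      return TypeSizeInBits == TypeAllocSizeInBits;
    }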
- if (!ExtractCostCalculated.insert(I->Scalar).second) + if (!ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be // removed as well). - if (EphValues.count(I->User)) + if (EphValues.count(EU.User)) continue; - VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - I->Lane); + EU.Lane); } Cost += getSpillCost(); @@ -1890,106 +1939,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, } } +// Return true if I should be commuted before adding it's left and right +// operands to the arrays Left and Right. +// +// The vectorizer is trying to either have all elements one side being +// instruction with the same opcode to enable further vectorization, or having +// a splat to lower the vectorizing cost. +static bool shouldReorderOperands(int i, Instruction &I, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + bool AllSameOpcodeLeft, + bool AllSameOpcodeRight, bool SplatLeft, + bool SplatRight) { + Value *VLeft = I.getOperand(0); + Value *VRight = I.getOperand(1); + // If we have "SplatRight", try to see if commuting is needed to preserve it. + if (SplatRight) { + if (VRight == Right[i - 1]) + // Preserve SplatRight + return false; + if (VLeft == Right[i - 1]) { + // Commuting would preserve SplatRight, but we don't want to break + // SplatLeft either, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (SplatLeft && VLeft == Left[i - 1]) + return false; + return true; + } + } + // Symmetrically handle Right side. + if (SplatLeft) { + if (VLeft == Left[i - 1]) + // Preserve SplatLeft + return false; + if (VRight == Left[i - 1]) + return true; + } + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // If we have "AllSameOpcodeRight", try to see if the left operands preserves + // it and not the right, in this case we want to commute. + if (AllSameOpcodeRight) { + unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); + if (IRight && RightPrevOpcode == IRight->getOpcode()) + // Do not commute, a match on the right preserves AllSameOpcodeRight + return false; + if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { + // We have a match and may want to commute, but first check if there is + // not also a match on the existing operands on the Left to preserve + // AllSameOpcodeLeft, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (AllSameOpcodeLeft && ILeft && + cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) + return false; + return true; + } + } + // Symmetrically handle Left side. 
+ if (AllSameOpcodeLeft) { + unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); + if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) + return false; + if (IRight && LeftPrevOpcode == IRight->getOpcode()) + return true; + } + return false; +} + void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right) { - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - - OrigLeft.push_back(VLeft); - OrigRight.push_back(VRight); - - Instruction *ILeft = dyn_cast<Instruction>(VLeft); - Instruction *IRight = dyn_cast<Instruction>(VRight); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - if (i && AllSameOpcodeLeft && ILeft) { - if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { - if (PLeft->getOpcode() != ILeft->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight && IRight) { - if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { - if (PRight->getOpcode() != IRight->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (ILeft && IRight) { - if (!i && ILeft->getOpcode() > IRight->getOpcode()) { - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() > IRight->getOpcode() && - Right[i - 1] != IRight) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Right[i - 1] == ILeft) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Left[i - 1] == IRight) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else { - Left.push_back(ILeft); - Right.push_back(IRight); - } - continue; - } - // One opcode, put the instruction on the right. - if (ILeft) { - Left.push_back(VRight); - Right.push_back(ILeft); - continue; - } + if (VL.size()) { + // Peel the first iteration out of the loop since there's nothing + // interesting to do anyway and it simplifies the checks in the loop. + auto VLeft = cast<Instruction>(VL[0])->getOperand(0); + auto VRight = cast<Instruction>(VL[0])->getOperand(1); + if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) + // Favor having instruction to the right. FIXME: why? 
+ std::swap(VLeft, VRight); Left.push_back(VLeft); Right.push_back(VRight); } - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // If operands end up being broadcast return this operand order. - if (LeftBroadcast || RightBroadcast) - return; + // Keep track if we have instructions with all the same opcode on one side. + bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); + bool AllSameOpcodeRight = isa<Instruction>(Right[0]); + // Keep track if we have one side with all the same value (broadcast). + bool SplatLeft = true; + bool SplatRight = true; - // Don't reorder if the operands where good to begin. - if (AllSameOpcodeRight || AllSameOpcodeLeft) { - Left = OrigLeft; - Right = OrigRight; + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + assert(I->isCommutative() && "Can only process commutative instruction"); + // Commute to favor either a splat or maximizing having the same opcodes on + // one side. + if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, + AllSameOpcodeRight, SplatLeft, SplatRight)) { + Left.push_back(I->getOperand(1)); + Right.push_back(I->getOperand(0)); + } else { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } + // Update Splat* and AllSameOpcode* after the insertion. + SplatRight = SplatRight && (Right[i - 1] == Right[i]); + SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); + AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && + (cast<Instruction>(Left[i - 1])->getOpcode() == + cast<Instruction>(Left[i])->getOpcode()); + AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && + (cast<Instruction>(Right[i - 1])->getOpcode() == + cast<Instruction>(Right[i])->getOpcode()); } + // If one operand end up being broadcast, return this operand order. + if (SplatRight || SplatLeft) + return; + const DataLayout &DL = F->getParent()->getDataLayout(); // Finally check if we can get longer vectorizable chain by reordering @@ -2030,7 +2099,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); - BasicBlock::iterator NextInst = VL0; + BasicBlock::iterator NextInst(VL0); ++NextInst; Builder.SetInsertPoint(VL0->getParent(), NextInst); Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); @@ -2481,13 +2550,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { - + // All blocks must be scheduled before any instructions are inserted. 
for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); vectorizeTree(&VectorizableTree[0]); DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2532,7 +2601,7 @@ Value *BoUpSLP::vectorizeTree() { User->replaceUsesOfWith(Scalar, Ex); } } else { - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); @@ -2641,7 +2710,7 @@ void BoUpSLP::optimizeGatherSequence() { BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = it++; + Instruction *In = &*it++; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -2681,8 +2750,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + + // Make sure that the scheduling region contains all + // instructions of the bundle. + for (Value *V : VL) { + if (!extendSchedulingRegion(V)) + return false; + } + for (Value *V : VL) { - extendSchedulingRegion(V); ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2743,7 +2819,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, schedule(pickedSD, ReadyInsts); } } - return Bundle->isReady(); + if (!Bundle->isReady()) { + cancelScheduling(VL); + return false; + } + return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { @@ -2772,9 +2852,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { } } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) - return; + return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); @@ -2785,21 +2865,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); - return; + return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. 
- BasicBlock::reverse_iterator UpIter(ScheduleStart); + BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { + if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { + DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + return false; + } + if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); - return; + return true; } UpIter++; } @@ -2810,13 +2895,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); - return; + return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } + return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, @@ -2896,8 +2982,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } else { // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and eventally - // disable vectorization. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); } @@ -2985,10 +3071,10 @@ void BoUpSLP::BlockScheduling::resetSchedule() { } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { - + if (!BS->ScheduleStart) return; - + DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); @@ -3003,7 +3089,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { }; std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; - // Ensure that all depencency data is updated and fill the ready-list with + // Ensure that all dependency data is updated and fill the ready-list with // initial instructions. int Idx = 0; int NumToSchedule = 0; @@ -3035,7 +3121,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst, pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), + pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; @@ -3074,11 +3161,11 @@ struct SLPVectorizer : public FunctionPass { if (skipOptnoneFunction(F)) return false; - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? 
&TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -3139,13 +3226,15 @@ struct SLPVectorizer : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } @@ -3260,15 +3349,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, // Do a quadratic search on all of the given stores and find // all of the pairs of stores that follow each other. + SmallVector<unsigned, 16> IndexQueue; for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); - if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { - Tails.insert(Stores[j]); + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + Tails.insert(Stores[k]); Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; + ConsecutiveChain[Stores[i]] = Stores[k]; + break; } } } @@ -3428,7 +3528,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned VecIdx = 0; for (auto &V : BuildVectorSlice) { IRBuilder<true, NoFolder> Builder( - ++BasicBlock::iterator(InsertAfter)); + InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); InsertElementInst *IE = cast<InsertElementInst>(V); Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); @@ -3489,7 +3589,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { /// \param NumEltsToRdx The number of elements that should be reduced in the /// vector. /// \param IsPairwise Whether the reduction is a pairwise or splitting -/// reduction. A pairwise reduction will generate a mask of +/// reduction. A pairwise reduction will generate a mask of /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. @@ -3552,16 +3652,17 @@ class HorizontalReduction { unsigned ReductionOpcode; /// The opcode of the values we perform a reduction on. unsigned ReducedValueOpcode; - /// The width of one full horizontal reduction operation. 
- unsigned ReduxWidth; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction; public: + /// The width of one full horizontal reduction operation. + unsigned ReduxWidth; + HorizontalReduction() : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), - ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} + ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -3607,11 +3708,11 @@ public: return false; // Post order traverse the reduction tree starting at B. We only handle true - // trees containing only binary operators. - SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; + // trees containing only binary operators or selects. + SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; Stack.push_back(std::make_pair(B, 0)); while (!Stack.empty()) { - BinaryOperator *TreeN = Stack.back().first; + Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; @@ -3647,9 +3748,10 @@ public: // Visit left or right. Value *NextV = TreeN->getOperand(EdgeToVist); - BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV); - if (Next) - Stack.push_back(std::make_pair(Next, 0)); + // We currently only allow BinaryOperator's and SelectInst's as reduction + // values in our tree. + if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV)) + Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0)); else if (NextV != Phi) return false; } @@ -3670,7 +3772,7 @@ public: IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); - Builder.SetFastMathFlags(Unsafe); + Builder.setFastMathFlags(Unsafe); unsigned i = 0; for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { @@ -3717,9 +3819,12 @@ public: return VectorizedTree != nullptr; } -private: + unsigned numReductionValues() const { + return ReducedVals.size(); + } - /// \brief Calcuate the cost of a reduction. +private: + /// \brief Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -3825,6 +3930,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) { return V->getType() < V2->getType(); } +/// \brief Try and get a reduction value from a phi node. +/// +/// Given a phi node \p P in a block \p ParentBB, consider possible reductions +/// if they come from either \p ParentBB or a containing loop latch. +/// +/// \returns A candidate reduction value if possible, or \code nullptr \endcode +/// if not possible. +static Value *getReductionValue(const DominatorTree *DT, PHINode *P, + BasicBlock *ParentBB, LoopInfo *LI) { + // There are situations where the reduction value is not dominated by the + // reduction phi. Vectorizing such cases has been reported to cause + // miscompiles. See PR25787. + auto DominatedReduxValue = [&](Value *R) { + return ( + dyn_cast<Instruction>(R) && + DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent())); + }; + + Value *Rdx = nullptr; + + // Return the incoming value if it comes from the same BB as the phi node. 
+ if (P->getIncomingBlock(0) == ParentBB) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == ParentBB) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + // Otherwise, check whether we have a loop latch to look at. + Loop *BBL = LI->getLoopFor(ParentBB); + if (!BBL) + return nullptr; + BasicBlock *BBLatch = BBL->getLoopLatch(); + if (!BBLatch) + return nullptr; + + // There is a loop latch, return the incoming value if it comes from + // that. This reduction pattern occassionaly turns up. + if (P->getIncomingBlock(0) == BBLatch) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == BBLatch) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + return nullptr; +} + +/// \brief Attempt to reduce a horizontal reduction. +/// If it is legal to match a horizontal reduction feeding +/// the phi node P with reduction operators BI, then check if it +/// can be done. +/// \returns true if a horizontal reduction was matched and reduced. +/// \returns false if a horizontal reduction was not matched. +static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, + BoUpSLP &R, TargetTransformInfo *TTI) { + if (!ShouldVectorizeHor) + return false; + + HorizontalReduction HorRdx; + if (!HorRdx.matchAssociativeReduction(P, BI)) + return false; + + // If there is a sufficient number of reduction values, reduce + // to a nearby power-of-2. Can safely generate oversized + // vectors and rely on the backend to split them to legal sizes. + HorRdx.ReduxWidth = + std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + + return HorRdx.tryToReduce(R, TTI); +} + bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; @@ -3836,9 +4017,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Collect the incoming values from the PHIs. Incoming.clear(); - for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; - ++instr) { - PHINode *P = dyn_cast<PHINode>(instr); + for (Instruction &I : *BB) { + PHINode *P = dyn_cast<PHINode>(&I); if (!P) break; @@ -3881,7 +4061,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(it).second) + if (!VisitedInstrs.insert(&*it).second) continue; if (isa<DbgInfoIntrinsic>(it)) @@ -3892,20 +4072,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = - (P->getIncomingBlock(0) == BB - ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) - : nullptr)); + + Value *Rdx = getReductionValue(DT, P, BB, LI); + // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); if (!BI) continue; // Try to match and vectorize a horizontal reduction. 
- HorizontalReduction HorRdx; - if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && - HorRdx.tryToReduce(R, TTI)) { + if (canMatchHorizontalReduction(P, BI, R, TTI)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -3928,15 +4104,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - // Try to vectorize horizontal reductions feeding into a store. if (ShouldStartVectorizeHorAtStore) if (StoreInst *SI = dyn_cast<StoreInst>(it)) if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { - HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && - HorRdx.tryToReduce(R, TTI)) || - tryToVectorize(BinOp, R))) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) || + tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4037,10 +4210,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) |